* Re: [PATCH ghak90 (was ghak32) V4 02/10] audit: add container id
From: Steve Grubb @ 2018-08-24 16:01 UTC (permalink / raw)
To: linux-audit
Cc: Richard Guy Briggs, containers, linux-api, linux-fsdevel, LKML,
netdev, netfilter-devel, ebiederm, luto, carlos, dhowells, viro,
simo, eparis, serge
In-Reply-To: <12396e378a78ee8dd38c75f7730d67d8fbb08e02.1533065887.git.rgb@redhat.com>
On Tuesday, July 31, 2018 4:07:37 PM EDT Richard Guy Briggs wrote:
> Implement the proc fs write to set the audit container identifier of a
> process, emitting an AUDIT_CONTAINER_OP record to document the event.
>
> This is a write from the container orchestrator task to a proc entry of
> the form /proc/PID/audit_containerid where PID is the process ID of the
> newly created task that is to become the first task in a container, or
> an additional task added to a container.
>
> The write expects up to a u64 value (unset: 18446744073709551615).
>
> The writer must have capability CAP_AUDIT_CONTROL.
>
> This will produce a record such as this:
> type=CONTAINER_ID msg=audit(2018-06-06 12:39:29.636:26949) : op=set
> opid=2209 old-contid=18446744073709551615 contid=123456 pid=628 auid=root
> uid=root tty=ttyS0 ses=1
> subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 comm=bash
> exe=/usr/bin/bash res=yes
>
> The "op" field indicates an initial set.
The op field seems to be entirely unnecessary. There is no unset event (nor
should there be) so op=set is implied by the record type. Otherwise the event
format looks fine.
-Steve
> The "pid" to "ses" fields are
> the orchestrator while the "opid" field is the object's PID, the process
> being "contained". Old and new audit container identifier values are
> given in the "contid" fields, while res indicates its success.
>
> It is not permitted to unset the audit container identifier.
> A child inherits its parent's audit container identifier.
>
> See: https://github.com/linux-audit/audit-kernel/issues/90
> See: https://github.com/linux-audit/audit-userspace/issues/51
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See:
> https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> Acked-by: Serge Hallyn <serge@hallyn.com>
> Acked-by: Steve Grubb <sgrubb@redhat.com>
> ---
> fs/proc/base.c | 37 +++++++++++++++++++++++++
> include/linux/audit.h | 24 ++++++++++++++++
> include/uapi/linux/audit.h | 2 ++
> kernel/auditsc.c | 68
> ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 131
> insertions(+)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index b657294..1b3cda1 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1260,6 +1260,41 @@ static ssize_t proc_sessionid_read(struct file *
> file, char __user * buf, .read = proc_sessionid_read,
> .llseek = generic_file_llseek,
> };
> +
> +static ssize_t proc_contid_write(struct file *file, const char __user
> *buf, + size_t count, loff_t *ppos)
> +{
> + struct inode *inode = file_inode(file);
> + u64 contid;
> + int rv;
> + struct task_struct *task = get_proc_task(inode);
> +
> + if (!task)
> + return -ESRCH;
> + if (*ppos != 0) {
> + /* No partial writes. */
> + put_task_struct(task);
> + return -EINVAL;
> + }
> +
> + rv = kstrtou64_from_user(buf, count, 10, &contid);
> + if (rv < 0) {
> + put_task_struct(task);
> + return rv;
> + }
> +
> + rv = audit_set_contid(task, contid);
> + put_task_struct(task);
> + if (rv < 0)
> + return rv;
> + return count;
> +}
> +
> +static const struct file_operations proc_contid_operations = {
> + .write = proc_contid_write,
> + .llseek = generic_file_llseek,
> +};
> +
> #endif
>
> #ifdef CONFIG_FAULT_INJECTION
> @@ -2952,6 +2987,7 @@ static int proc_pid_patch_state(struct seq_file *m,
> struct pid_namespace *ns, #ifdef CONFIG_AUDITSYSCALL
> REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
> REG("sessionid", S_IRUGO, proc_sessionid_operations),
> + REG("audit_containerid", S_IWUSR, proc_contid_operations),
> #endif
> #ifdef CONFIG_FAULT_INJECTION
> REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> @@ -3337,6 +3373,7 @@ static int proc_tid_comm_permission(struct inode
> *inode, int mask) #ifdef CONFIG_AUDITSYSCALL
> REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
> REG("sessionid", S_IRUGO, proc_sessionid_operations),
> + REG("audit_containerid", S_IWUSR, proc_contid_operations),
> #endif
> #ifdef CONFIG_FAULT_INJECTION
> REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 8964332..71a6fc6 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -222,6 +222,7 @@ static inline void audit_log_task_info(struct
> audit_buffer *ab, struct audit_task_info {
> kuid_t loginuid;
> unsigned int sessionid;
> + u64 contid;
> struct audit_context *ctx;
> };
> extern struct audit_task_info init_struct_audit;
> @@ -334,6 +335,7 @@ static inline void audit_ptrace(struct task_struct *t)
> extern int auditsc_get_stamp(struct audit_context *ctx,
> struct timespec64 *t, unsigned int *serial);
> extern int audit_set_loginuid(kuid_t loginuid);
> +extern int audit_set_contid(struct task_struct *tsk, u64 contid);
>
> static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
> {
> @@ -351,6 +353,14 @@ static inline unsigned int audit_get_sessionid(struct
> task_struct *tsk) return AUDIT_SID_UNSET;
> }
>
> +static inline u64 audit_get_contid(struct task_struct *tsk)
> +{
> + if (!tsk->audit)
> + return AUDIT_CID_UNSET;
> + else
> + return tsk->audit->contid;
> +}
> +
> extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
> extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t
> gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm);
> @@ -545,6 +555,10 @@ static inline unsigned int audit_get_sessionid(struct
> task_struct *tsk) {
> return AUDIT_SID_UNSET;
> }
> +static inline kuid_t audit_get_contid(struct task_struct *tsk)
> +{
> + return AUDIT_CID_UNSET;
> +}
> static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
> { }
> static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
> @@ -609,6 +623,16 @@ static inline bool audit_loginuid_set(struct
> task_struct *tsk) return uid_valid(audit_get_loginuid(tsk));
> }
>
> +static inline bool audit_contid_valid(u64 contid)
> +{
> + return contid != AUDIT_CID_UNSET;
> +}
> +
> +static inline bool audit_contid_set(struct task_struct *tsk)
> +{
> + return audit_contid_valid(audit_get_contid(tsk));
> +}
> +
> static inline void audit_log_string(struct audit_buffer *ab, const char
> *buf) {
> audit_log_n_string(ab, buf, strlen(buf));
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 4e3eaba..3474f57 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -71,6 +71,7 @@
> #define AUDIT_TTY_SET 1017 /* Set TTY auditing status */
> #define AUDIT_SET_FEATURE 1018 /* Turn an audit feature on or off */
> #define AUDIT_GET_FEATURE 1019 /* Get which features are enabled */
> +#define AUDIT_CONTAINER_OP 1020 /* Define the container id and information
> */
>
> #define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly
> uninteresting to kernel */ #define AUDIT_USER_AVC 1107 /* We filter this
> differently */
> @@ -468,6 +469,7 @@ struct audit_tty_status {
>
> #define AUDIT_UID_UNSET (unsigned int)-1
> #define AUDIT_SID_UNSET ((unsigned int)-1)
> +#define AUDIT_CID_UNSET ((u64)-1)
>
> /* audit_rule_data supports filter rules with both integer and string
> * fields. It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 88779a7..6125cef 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -956,6 +956,7 @@ int audit_alloc(struct task_struct *tsk)
> return -ENOMEM;
> info->loginuid = audit_get_loginuid(current);
> info->sessionid = audit_get_sessionid(current);
> + info->contid = audit_get_contid(current);
> tsk->audit = info;
>
> if (likely(!audit_ever_enabled))
> @@ -985,6 +986,7 @@ int audit_alloc(struct task_struct *tsk)
> struct audit_task_info init_struct_audit = {
> .loginuid = INVALID_UID,
> .sessionid = AUDIT_SID_UNSET,
> + .contid = AUDIT_CID_UNSET,
> .ctx = NULL,
> };
>
> @@ -2112,6 +2114,72 @@ int audit_set_loginuid(kuid_t loginuid)
> }
>
> /**
> + * audit_set_contid - set current task's audit_context contid
> + * @contid: contid value
> + *
> + * Returns 0 on success, -EPERM on permission failure.
> + *
> + * Called (set) from fs/proc/base.c::proc_contid_write().
> + */
> +int audit_set_contid(struct task_struct *task, u64 contid)
> +{
> + u64 oldcontid;
> + int rc = 0;
> + struct audit_buffer *ab;
> + uid_t uid;
> + struct tty_struct *tty;
> + char comm[sizeof(current->comm)];
> +
> + task_lock(task);
> + /* Can't set if audit disabled */
> + if (!task->audit) {
> + task_unlock(task);
> + return -ENOPROTOOPT;
> + }
> + oldcontid = audit_get_contid(task);
> + read_lock(&tasklist_lock);
> + /* Don't allow the audit containerid to be unset */
> + if (!audit_contid_valid(contid))
> + rc = -EINVAL;
> + /* if we don't have caps, reject */
> + else if (!capable(CAP_AUDIT_CONTROL))
> + rc = -EPERM;
> + /* if task has children or is not single-threaded, deny */
> + else if (!list_empty(&task->children))
> + rc = -EBUSY;
> + else if (!(thread_group_leader(task) && thread_group_empty(task)))
> + rc = -EALREADY;
> + read_unlock(&tasklist_lock);
> + if (!rc)
> + task->audit->contid = contid;
> + task_unlock(task);
> +
> + if (!audit_enabled)
> + return rc;
> +
> + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONTAINER_OP);
> + if (!ab)
> + return rc;
> +
> + uid = from_kuid(&init_user_ns, task_uid(current));
> + tty = audit_get_tty(current);
> + audit_log_format(ab, "op=set opid=%d old-contid=%llu contid=%llu pid=%d
> uid=%u auid=%u tty=%s ses=%u", + task_tgid_nr(task), oldcontid,
contid,
> + task_tgid_nr(current), uid,
> + from_kuid(&init_user_ns, audit_get_loginuid(current)),
> + tty ? tty_name(tty) : "(none)",
> + audit_get_sessionid(current));
> + audit_put_tty(tty);
> + audit_log_task_context(ab);
> + audit_log_format(ab, " comm=");
> + audit_log_untrustedstring(ab, get_task_comm(comm, current));
> + audit_log_d_path_exe(ab, current->mm);
> + audit_log_format(ab, " res=%d", !rc);
> + audit_log_end(ab);
> + return rc;
> +}
> +
> +/**
> * __audit_mq_open - record audit data for a POSIX MQ open
> * @oflag: open flag
> * @mode: mode bits
^ permalink raw reply
* Re: [PATCH ghak90 (was ghak32) V4 03/10] audit: log container info of syscalls
From: Steve Grubb @ 2018-08-24 16:01 UTC (permalink / raw)
To: linux-audit
Cc: simo, Richard Guy Briggs, linux-api, containers, LKML, dhowells,
carlos, netfilter-devel, ebiederm, luto, netdev, linux-fsdevel,
eparis, serge, viro
In-Reply-To: <34017c395d03a213d6b0d49b9964429bd32b283d.1533065887.git.rgb@redhat.com>
On Tuesday, July 31, 2018 4:07:38 PM EDT Richard Guy Briggs wrote:
> Create a new audit record AUDIT_CONTAINER to document the audit
> container identifier of a process if it is present.
>
> Called from audit_log_exit(), syscalls are covered.
>
> A sample raw event:
> type=SYSCALL msg=audit(1519924845.499:257): arch=c000003e syscall=257
> success=yes exit=3 a0=ffffff9c a1=56374e1cef30 a2=241 a3=1b6 items=2
> ppid=606 pid=635 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0
> fsgid=0 tty=pts0 ses=3 comm="bash" exe="/usr/bin/bash"
> subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
> key="tmpcontainerid" type=CWD msg=audit(1519924845.499:257): cwd="/root"
> type=PATH msg=audit(1519924845.499:257): item=0 name="/tmp/" inode=13863
> dev=00:27 mode=041777 ouid=0 ogid=0 rdev=00:00
> obj=system_u:object_r:tmp_t:s0 nametype= PARENT cap_fp=0000000000000000
> cap_fi=0000000000000000 cap_fe=0 cap_fver=0 type=PATH
> msg=audit(1519924845.499:257): item=1 name="/tmp/tmpcontainerid"
> inode=17729 dev=00:27 mode=0100644 ouid=0 ogid=0 rdev=00:00
> obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE
> cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
> type=PROCTITLE msg=audit(1519924845.499:257):
> proctitle=62617368002D6300736C65657020313B206563686F2074657374203E202F746D
> 702F746D70636F6E7461696E65726964 type=CONTAINER
> msg=audit(1519924845.499:257): op=task contid=123458
Ack for the event format.
-Steve
> See: https://github.com/linux-audit/audit-kernel/issues/90
> See: https://github.com/linux-audit/audit-userspace/issues/51
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See:
> https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> Acked-by: Serge Hallyn <serge@hallyn.com>
> Acked-by: Steve Grubb <sgrubb@redhat.com>
> ---
> include/linux/audit.h | 7 +++++++
> include/uapi/linux/audit.h | 1 +
> kernel/audit.c | 24 ++++++++++++++++++++++++
> kernel/auditsc.c | 3 +++
> 4 files changed, 35 insertions(+)
>
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index 71a6fc6..d5a48dc 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -155,6 +155,9 @@ extern void audit_log_key(struct audit_buffer
*ab,
> extern int audit_log_task_context(struct audit_buffer *ab);
> extern void audit_log_task_info(struct audit_buffer *ab,
> struct task_struct *tsk);
> +extern int audit_log_contid(struct task_struct *tsk,
> + struct audit_context *context,
> + char *op);
>
> extern int audit_update_lsm_rules(void);
>
> @@ -205,6 +208,10 @@ static inline int audit_log_task_context(struct
> audit_buffer *ab) static inline void audit_log_task_info(struct
> audit_buffer *ab,
> struct task_struct *tsk)
> { }
> +static inline int audit_log_contid(struct task_struct *tsk,
> + struct audit_context *context,
> + char *op)
> +{ }
> #define audit_enabled AUDIT_OFF
> #endif /* CONFIG_AUDIT */
>
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 3474f57..dc259c7 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -115,6 +115,7 @@
> #define AUDIT_REPLACE 1329 /* Replace auditd if this packet unanswerd
*/
> #define AUDIT_KERN_MODULE 1330 /* Kernel Module events */
> #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */
> +#define AUDIT_CONTAINER 1332 /* Container ID */
>
> #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */
> #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */
> diff --git a/kernel/audit.c b/kernel/audit.c
> index 2a80587..15f54c7 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -2045,6 +2045,30 @@ void audit_log_session_info(struct audit_buffer *ab)
> audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
> }
>
> +/*
> + * audit_log_contid - report container info
> + * @tsk: task to be recorded
> + * @context: task or local context for record
> + * @op: contid string description
> + */
> +int audit_log_contid(struct task_struct *tsk,
> + struct audit_context *context, char *op)
> +{
> + struct audit_buffer *ab;
> +
> + if (!audit_contid_set(tsk))
> + return 0;
> + /* Generate AUDIT_CONTAINER record with container ID */
> + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER);
> + if (!ab)
> + return -ENOMEM;
> + audit_log_format(ab, "op=%s contid=%llu",
> + op, audit_get_contid(tsk));
> + audit_log_end(ab);
> + return 0;
> +}
> +EXPORT_SYMBOL(audit_log_contid);
> +
> void audit_log_key(struct audit_buffer *ab, char *key)
> {
> audit_log_format(ab, " key=");
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 6125cef..39e5633 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -1488,10 +1488,13 @@ static void audit_log_exit(struct audit_context
> *context, struct task_struct *ts
>
> audit_log_proctitle(tsk, context);
>
> + audit_log_contid(tsk, context, "task");
> +
> /* Send end of event record to help user space know we are finished */
> ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
> if (ab)
> audit_log_end(ab);
> +
> if (call_panic)
> audit_panic("error converting sid to string");
> }
^ permalink raw reply
* [PATCH net] mlxsw: spectrum_switchdev: Do not leak RIFs when removing bridge
From: Ido Schimmel @ 2018-08-24 12:41 UTC (permalink / raw)
To: netdev; +Cc: davem, jiri, petrm, alexpe, mlxsw, Ido Schimmel
When a bridge device is removed, the VLANs are flushed from each
configured port. This causes the ports to decrement the reference count
on the associated FIDs (filtering identifier). If the reference count of
a FID is 1 and it has a RIF (router interface), then this RIF is
destroyed.
However, if no port is member in the VLAN for which a RIF exists, then
the RIF will continue to exist after the removal of the bridge. To
reproduce:
# ip link add name br0 type bridge vlan_filtering 1
# ip link set dev swp1 master br0
# ip link add link br0 name br0.10 type vlan id 10
# ip address add 192.0.2.0/24 dev br0.10
# ip link del dev br0
The RIF associated with br0.10 continues to exist.
Fix this by iterating over all the bridge device uppers when it is
destroyed and take care of destroying their RIFs.
Fixes: 99f44bb3527b ("mlxsw: spectrum: Enable L3 interfaces on top of bridge devices")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
.../net/ethernet/mellanox/mlxsw/spectrum.h | 2 ++
.../ethernet/mellanox/mlxsw/spectrum_router.c | 11 ++++++++++
.../mellanox/mlxsw/spectrum_switchdev.c | 20 +++++++++++++++++++
3 files changed, 33 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 3ae930196741..3cdb7aca90b7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -414,6 +414,8 @@ mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp,
void
mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif);
+void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
+ struct net_device *dev);
/* spectrum_kvdl.c */
enum mlxsw_sp_kvdl_entry_type {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 3a96307f51b0..2ab9cf25a08a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -6234,6 +6234,17 @@ void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
mlxsw_sp_vr_put(mlxsw_sp, vr);
}
+void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
+ struct net_device *dev)
+{
+ struct mlxsw_sp_rif *rif;
+
+ rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+ if (!rif)
+ return;
+ mlxsw_sp_rif_destroy(rif);
+}
+
static void
mlxsw_sp_rif_subport_params_init(struct mlxsw_sp_rif_params *params,
struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 0d8444aaba01..db715da7bab7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -127,6 +127,24 @@ bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
}
+static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev,
+ void *data)
+{
+ struct mlxsw_sp *mlxsw_sp = data;
+
+ mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev);
+ return 0;
+}
+
+static void mlxsw_sp_bridge_device_rifs_destroy(struct mlxsw_sp *mlxsw_sp,
+ struct net_device *dev)
+{
+ mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev);
+ netdev_walk_all_upper_dev_rcu(dev,
+ mlxsw_sp_bridge_device_upper_rif_destroy,
+ mlxsw_sp);
+}
+
static struct mlxsw_sp_bridge_device *
mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
struct net_device *br_dev)
@@ -165,6 +183,8 @@ static void
mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge,
struct mlxsw_sp_bridge_device *bridge_device)
{
+ mlxsw_sp_bridge_device_rifs_destroy(bridge->mlxsw_sp,
+ bridge_device->dev);
list_del(&bridge_device->list);
if (bridge_device->vlan_enabled)
bridge->vlan_enabled_exists = false;
--
2.17.1
^ permalink raw reply related
* Re: [PATCH 4.4 18/31] r8152: napi hangup fix after disconnect
From: Ben Hutchings @ 2018-08-24 16:38 UTC (permalink / raw)
To: Jiri Slaby, linux-usb, netdev, David S. Miller
Cc: stable, Greg Kroah-Hartman, LKML
In-Reply-To: <20180720121340.812996969@linuxfoundation.org>
On Fri, 2018-07-20 at 14:13 +0200, Greg Kroah-Hartman wrote:
> 4.4-stable review patch. If anyone has any objections, please let me know.
>
> ------------------
>
> From: Jiri Slaby <jslaby@suse.cz>
>
> [ Upstream commit 0ee1f4734967af8321ecebaf9c74221ace34f2d5 ]
[...]
> --- a/drivers/net/usb/r8152.c
> +++ b/drivers/net/usb/r8152.c
> @@ -3139,7 +3139,8 @@ static int rtl8152_close(struct net_devi
> #ifdef CONFIG_PM_SLEEP
> unregister_pm_notifier(&tp->pm_notifier);
> #endif
> - napi_disable(&tp->napi);
> + if (!test_bit(RTL8152_UNPLUG, &tp->flags))
> + napi_disable(&tp->napi);
> clear_bit(WORK_ENABLE, &tp->flags);
> usb_kill_urb(tp->intr_urb);
> cancel_delayed_work_sync(&tp->schedule);
This flag appears to be set only if the USB device is actually
disconnected. In case the driver is unbound for some other reason
(like the module is removed), the same problem will occur.
What I think might work is to do:
if (!list_empty(&tp->napi.dev_list)
napi_disable(&tp->napi);
Ben.
--
Ben Hutchings, Software Developer Codethink Ltd
https://www.codethink.co.uk/ Dale House, 35 Dale Street
Manchester, M1 2HF, United Kingdom
^ permalink raw reply
* [PATCH] ixgb: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Colin King @ 2018-08-24 16:52 UTC (permalink / raw)
To: Jeff Kirsher, David S . Miller, intel-wired-lan, netdev
Cc: kernel-janitors, linux-kernel
From: Colin Ian King <colin.king@canonical.com>
Use dma_zalloc_coherent rather than dam_alloc_coherent and memset.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/net/ethernet/intel/ixgb/ixgb_main.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
index 43664adf7a3c..c71bc0237108 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
@@ -771,14 +771,12 @@ ixgb_setup_rx_resources(struct ixgb_adapter *adapter)
rxdr->size = rxdr->count * sizeof(struct ixgb_rx_desc);
rxdr->size = ALIGN(rxdr->size, 4096);
- rxdr->desc = dma_alloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma,
- GFP_KERNEL);
-
+ rxdr->desc = dma_zalloc_coherent(&pdev->dev, rxdr->size, &rxdr->dma,
+ GFP_KERNEL);
if (!rxdr->desc) {
vfree(rxdr->buffer_info);
return -ENOMEM;
}
- memset(rxdr->desc, 0, rxdr->size);
rxdr->next_to_clean = 0;
rxdr->next_to_use = 0;
--
2.17.1
^ permalink raw reply related
* Re: [PATCH NET 3/3] net: hns: add configuration constraints for 1000M half
From: Andrew Lunn @ 2018-08-24 13:28 UTC (permalink / raw)
To: lipeng (Y)
Cc: davem, netdev, linux-kernel, linuxarm, yisen.zhuang, salil.mehta
In-Reply-To: <194ac663-94d8-55e2-0231-a6be53e35f04@huawei.com>
On Fri, Aug 24, 2018 at 02:39:36PM +0800, lipeng (Y) wrote:
>
>
> On 2018/8/24 11:41, Andrew Lunn wrote:
> >On Fri, Aug 24, 2018 at 11:42:23AM +0800, Peng Li wrote:
> >>Hisilicon hip05 and hip06 board network card do not support
> >>1000M half configuration. Driver can not config gmac as
> >>1000M half.
> >>
> >>Signed-off-by: Peng Li <lipeng321@huawei.com>
> >Hi Peng
> >
> >Does the driver remove SUPPORTED_1000baseT_Half from
> >phydev->supported? If you do that, the PHY should never negotiate
> >this speed.
> >
> > Andrew
> Hi, Andrew,
>
> The driver has removed SUPPORTED_1000baseT_Half from
>
> phydev->supported.
>
> the code is :
> #define MAC_GMAC_SUPPORTED \
> (SUPPORTED_10baseT_Half \
> | SUPPORTED_10baseT_Full \
> | SUPPORTED_100baseT_Half \
> | SUPPORTED_100baseT_Full \
> | SUPPORTED_Autoneg)
> h->if_support = MAC_GMAC_SUPPORTED;
> h->if_support |= SUPPORTED_1000baseT_Full;
> phydev->supported &= h->if_support;
>
> As gmac do not support 1000M half, we add this patch to
> make sure that no users can set 1000M half in any case.
Well, not quite. What this patch does is protect the hardware when it
is asked to change to an unsupported mode. This patch has nothing to
do with user APIs.
The user API for setting link modes is hns_nic_set_link_ksettings().
What you do have is
if (speed == SPEED_1000 && cmd->base.duplex == DUPLEX_HALF)
return -EINVAL;
which should stop somebody setting up a fixed speed link at 1000Half.
But you don't do anything with cmd->link_modes.advertising. However,
when you call phy_ethtool_ksettings_set(), it gets AND'ed with
phydev->supported, which you have already removed 1000Half from.
So is this a patch for a theoretical problem? Or have you seen it
happen? If it did happen, how did the user configure it to cause this
problem? That user API needs to prevent it.
Andrew
^ permalink raw reply
* [PATCH] wcn36xx: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Colin King @ 2018-08-24 17:14 UTC (permalink / raw)
To: Kalle Valo, David S . Miller, wcn36xx, linux-wireless, netdev
Cc: kernel-janitors, linux-kernel
From: Colin Ian King <colin.king@canonical.com>
It is preferable to use dma_zalloc_coherent rather than dam_alloc_coherent
and memset, so use it.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/net/wireless/ath/wcn36xx/dxe.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 06cfe8d311f3..e66ddaa39523 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -174,13 +174,11 @@ static int wcn36xx_dxe_init_descs(struct device *dev, struct wcn36xx_dxe_ch *wcn
int i;
size = wcn_ch->desc_num * sizeof(struct wcn36xx_dxe_desc);
- wcn_ch->cpu_addr = dma_alloc_coherent(dev, size, &wcn_ch->dma_addr,
- GFP_KERNEL);
+ wcn_ch->cpu_addr = dma_zalloc_coherent(dev, size, &wcn_ch->dma_addr,
+ GFP_KERNEL);
if (!wcn_ch->cpu_addr)
return -ENOMEM;
- memset(wcn_ch->cpu_addr, 0, size);
-
cur_dxe = (struct wcn36xx_dxe_desc *)wcn_ch->cpu_addr;
cur_ctl = wcn_ch->head_blk_ctl;
--
2.17.1
^ permalink raw reply related
* Re: should __LINK_STATE_* enums really be restricted to generic queueing layer?
From: Andrew Lunn @ 2018-08-24 13:49 UTC (permalink / raw)
To: rpjday; +Cc: netdev
In-Reply-To: <20180824071738.Horde.kWZLgt1409E2fICBH1FiM5U@crashcourse.ca>
On Fri, Aug 24, 2018 at 07:17:38AM -0400, rpjday@crashcourse.ca wrote:
> more curious pedantry ... in netdevice.h, one reads:
>
> /* These flag bits are private to the generic network queueing
> * layer; they may not be explicitly referenced by any other
> * code.
> */
>
> enum netdev_state_t {
> __LINK_STATE_START,
> __LINK_STATE_PRESENT,
> __LINK_STATE_NOCARRIER,
> __LINK_STATE_LINKWATCH_PENDING,
> __LINK_STATE_DORMANT,
> };
>
> ok, but there are definitely a few examples of what look like
> non-generic network queueing code referencing those enums.
>
> one example, drivers/net/ethernet/amd/sun3lance.c, which openly
> (and amusingly) admits that it shouldn't be doing this:
>
> dev->netdev_ops = &lance_netdev_ops;
> // KLUDGE -- REMOVE ME
> set_bit(__LINK_STATE_PRESENT, &dev->state);
> return 1;
> }
>
> there are a few more places (admittedly, not many) -- is this
> acceptable behaviour?
Not in a new driver.
The sun3lance is a very old driver. I have misty memories of a Sun
3/60. There is nothing in this code about a driving a PHY. Maybe using
netif_carrier_on() would work, but without having access to the
hardware to test, i would just leave it alone.
Andrew
^ permalink raw reply
* Re: [PATCH] i40e: report correct statistics when XDP is enabled
From: Jesper Dangaard Brouer @ 2018-08-24 14:00 UTC (permalink / raw)
To: Björn Töpel
Cc: jeffrey.t.kirsher, intel-wired-lan, magnus.karlsson,
magnus.karlsson, jacob.e.keller, bjorn.topel, brouer,
netdev@vger.kernel.org
In-Reply-To: <20180824112159.32298-1-bjorn.topel@intel.com>
On Fri, 24 Aug 2018 13:21:59 +0200
Björn Töpel <bjorn.topel@intel.com> wrote:
> When XDP is enabled, the driver will report incorrect
> statistics. Received frames will reported as transmitted frames.
>
> This commits fixes the i40e implementation of ndo_get_stats64 (struct
> net_device_ops), so that iproute2 will report correct statistics
> (e.g. when running "ip -stats link show dev eth0") even when XDP is
> enabled.
>
> Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Fixes: 74608d17fe29 ("i40e: add support for XDP_TX action")
Stable candidate:
$ git describe --contains 74608d17fe29
v4.13-rc1~157^2~128^2~13
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
It works for me:
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
I'm explicitly _not_ ACK'ing the patch, as I think the your code changes
below makes it harder to follow whether a TX or RX ring is getting
updated. But it is 100% up to the driver maintainers to say if this is
acceptable from a maintenance PoV.
> ---
> drivers/net/ethernet/intel/i40e/i40e_main.c | 24 +++++++++++----------
> 1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index e40c023cc7b6..7c122dd3faa1 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -425,9 +425,9 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
> struct rtnl_link_stats64 *stats)
> {
> struct i40e_netdev_priv *np = netdev_priv(netdev);
> - struct i40e_ring *tx_ring, *rx_ring;
> struct i40e_vsi *vsi = np->vsi;
> struct rtnl_link_stats64 *vsi_stats = i40e_get_vsi_stats_struct(vsi);
> + struct i40e_ring *ring;
> int i;
>
> if (test_bit(__I40E_VSI_DOWN, vsi->state))
> @@ -441,24 +441,26 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
> u64 bytes, packets;
> unsigned int start;
>
> - tx_ring = READ_ONCE(vsi->tx_rings[i]);
> - if (!tx_ring)
> + ring = READ_ONCE(vsi->tx_rings[i]);
> + if (!ring)
> continue;
> - i40e_get_netdev_stats_struct_tx(tx_ring, stats);
> + i40e_get_netdev_stats_struct_tx(ring, stats);
>
> - rx_ring = &tx_ring[1];
> + if (i40e_enabled_xdp_vsi(vsi)) {
> + ring++;
> + i40e_get_netdev_stats_struct_tx(ring, stats);
> + }
>
> + ring++;
> do {
> - start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
> - packets = rx_ring->stats.packets;
> - bytes = rx_ring->stats.bytes;
> - } while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
> + start = u64_stats_fetch_begin_irq(&ring->syncp);
> + packets = ring->stats.packets;
> + bytes = ring->stats.bytes;
> + } while (u64_stats_fetch_retry_irq(&ring->syncp, start));
>
> stats->rx_packets += packets;
> stats->rx_bytes += bytes;
>
> - if (i40e_enabled_xdp_vsi(vsi))
> - i40e_get_netdev_stats_struct_tx(&rx_ring[1], stats);
> }
> rcu_read_unlock();
>
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH] wcn36xx: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Kalle Valo @ 2018-08-24 17:35 UTC (permalink / raw)
To: Colin King
Cc: David S . Miller, wcn36xx, linux-wireless, netdev,
kernel-janitors, linux-kernel
In-Reply-To: <20180824171410.22539-1-colin.king@canonical.com>
Colin King <colin.king@canonical.com> writes:
> From: Colin Ian King <colin.king@canonical.com>
>
> It is preferable to use dma_zalloc_coherent rather than dam_alloc_coherent
> and memset, so use it.
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
> drivers/net/wireless/ath/wcn36xx/dxe.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
> index 06cfe8d311f3..e66ddaa39523 100644
> --- a/drivers/net/wireless/ath/wcn36xx/dxe.c
> +++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
> @@ -174,13 +174,11 @@ static int wcn36xx_dxe_init_descs(struct device *dev, struct wcn36xx_dxe_ch *wcn
> int i;
>
> size = wcn_ch->desc_num * sizeof(struct wcn36xx_dxe_desc);
> - wcn_ch->cpu_addr = dma_alloc_coherent(dev, size, &wcn_ch->dma_addr,
> - GFP_KERNEL);
> + wcn_ch->cpu_addr = dma_zalloc_coherent(dev, size, &wcn_ch->dma_addr,
> + GFP_KERNEL);
> if (!wcn_ch->cpu_addr)
> return -ENOMEM;
>
> - memset(wcn_ch->cpu_addr, 0, size);
> -
> cur_dxe = (struct wcn36xx_dxe_desc *)wcn_ch->cpu_addr;
> cur_ctl = wcn_ch->head_blk_ctl;
Hehe, I think you are the third person submitting a similar patch :)
Anyway, I just applied this few hours ago:
https://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git/commit/?h=ath-next&id=d410e28f3ae476e1572b8893c646ef44fae7bbbd
--
Kalle Valo
^ permalink raw reply
* Re: ixgbe hangs when XDP_TX is enabled
From: Jesper Dangaard Brouer @ 2018-08-24 14:25 UTC (permalink / raw)
To: Jeff Kirsher
Cc: brouer, Alexander Duyck, tehnerd, Netdev, tytus.a.wasilewski,
Tymoteusz Kielan, John Fastabend, Daniel Borkmann,
Alexei Starovoitov
In-Reply-To: <e9762fc08e0c62c52ee5169e43e512f5d65087a2.camel@intel.com>
On Wed, 22 Aug 2018 09:22:58 -0700 Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:
> On Tue, 2018-08-21 at 11:13 -0700, Alexander Duyck wrote:
> > On Tue, Aug 21, 2018 at 9:59 AM Nikita V. Shirokov <tehnerd@tehnerd.com> wrote:
> > >
> > > On Tue, Aug 21, 2018 at 08:58:15AM -0700, Alexander Duyck wrote:
> > > > On Mon, Aug 20, 2018 at 12:32 PM Nikita V. Shirokov <tehnerd@tehnerd.com> wrote:
> > > > >
> > > > > we are getting such errors:
> > > > >
> > > > > [ 408.737313] ixgbe 0000:03:00.0 eth0: Detected Tx Unit Hang (XDP)
> > > > > Tx Queue <46>
> > > > > TDH, TDT <0>, <2>
> > > > > next_to_use <2>
> > > > > next_to_clean <0>
> > > > > tx_buffer_info[next_to_clean]
> > > > > time_stamp <0>
> > > > > jiffies <1000197c0>
> > > > > [ 408.804438] ixgbe 0000:03:00.0 eth0: tx hang 1 detected on queue 46, resetting adapter
> > > > > [ 408.804440] ixgbe 0000:03:00.0 eth0: initiating reset due to tx timeout
> > > > > [ 408.817679] ixgbe 0000:03:00.0 eth0: Reset adapter
> > > > > [ 408.866091] ixgbe 0000:03:00.0 eth0: TXDCTL.ENABLE for one or more queues not cleared within the polling period
> > > > > [ 409.345289] ixgbe 0000:03:00.0 eth0: detected SFP+: 3
> > > > > [ 409.497232] ixgbe 0000:03:00.0 eth0: NIC Link is Up 10 Gbps, Flow Control: RX/TX
> > > > >
> > > > > while running XDP prog on ixgbe nic.
> > > > > right now i'm seing this on bpfnext kernel
> > > > > (latest commit from Wed Aug 15 15:04:25 2018 -0700 ;
> > > > > 9a76aba02a37718242d7cdc294f0a3901928aa57)
> > > > >
> > > > > looks like this is the same issue as reported by Brenden in
> > > > > https://www.spinics.net/lists/netdev/msg439438.html
> > > > >
> > > > [...] The total number of CPUs on the system
> > > > would be useful to know as well.
[...]
> > > # nproc
> > > 48
> > >
[...]
> > > ethtool -l eth0
> > > Channel parameters for eth0:
> > > Pre-set maximums:
> > > RX: 0
> > > TX: 0
> > > Other: 1
> > > Combined: 63
> > > Current hardware settings:
> > > RX: 0
> > > TX: 0
> > > Other: 1
> > > Combined: 48
[...]
> > >
> > > workaround for now is to do the same, as Brenden did in his
> > > original
> > > finding: make sure that combined + xdp queues < max_tx_queues
> > > (e.g. w/ combined == 14 the issue goes away).
> > >
> > > > the issue with one of the sample XDP programs provided with the
> > > > kernel such as the xdp2 which I believe uses the XDP_TX
> > > > function. We need to try and create a similar setup in our own
> > > > environment for reproduction and debugging.
> > >
> > > will try but this could take a while, because i'm not sure that we
> > > have ixgbe in our test lab (and it would be hard to run such test
> > > in prod)
Notice to reproduce you need a system with 48 cores. (I predict: for
less than 33 cores it will not show, and above 48 cores the XDP prog
should be rejected loading).
> >
> > So I have been reading the datasheet
> > (
> > https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82599-10-gbe-controller-datasheet.pdf
> > )
> > and it looks like the assumption that Brenden came to in the earlier
> > referenced link is probably correct. From what I can tell there is a
> > limit of 64 queues in the base RSS mode of the device, so while it
> > supports more than 64 queues you can only make use of 64 as per table
> > 7-25.
> >
As far as I can remember, the driver code assumes up-to 96 queue are
avail. It sounds like the driver XDP code that allocates 'one XDP
TX-queue per core' in the system are causing this.
I have previously complained that the ixgbe driver will be able to
enable XDP on machines with many CPU cores, due to the 'one XDP TX-queue
per core' design requirement.
> > For now I think the workaround you are using is probably the only
> > viable solution. I myself don't have time to work on resolving this,
> > but I am sure on of the maintainers for ixgbe will be responding
> > shortly.
>
> I have notified the 10GbE maintainers, and we are working to reproduce
> the issue currently.
For reproducers, notice the correlation with the number of cores the
system have.
> > One possible solution we may want to look at would be to make use of
> > the 32 pool/VF mode in the MTQC register. That should enable us to
> > make use of all 128 queues but I am sure there would be other side
> > effects such as having to set the bits in the PFVFTE register in
> > order to enable the extra Tx queues.
Getting access to more queue is of-cause good, as it move the bar for
how many cores a system can have before XDP will no-longer work with
the ixgbe driver.
An alternative solution is also possible, but there will be a
performance trade off. After merge commit 10f678683e4 ("Merge branch
'xdp_xmit-bulking'") the ndo_xdp_xmit() gets bulks of 16 frames (limit
within devmap). Thus, on systems that cannot allocate a NIC HW queue
foreach CPU core, can alternatively use locked XDP TX queue(s) (which
will be amortized due to bulking). This mode will be slower, thus the
question is how do we "warn" the user, that this will be operating in a
slightly less optimal XDP-TX mode? (will a simple pr_info be enough,
like when there is insufficient PCIe BW).
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH] ixgb: replace dma_alloc_coherent + memset with dma_zalloc_coherent
From: Jeff Kirsher @ 2018-08-24 18:21 UTC (permalink / raw)
To: Colin King, David S . Miller, intel-wired-lan, netdev
Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180824165232.21585-1-colin.king@canonical.com>
[-- Attachment #1: Type: text/plain, Size: 547 bytes --]
On Fri, 2018-08-24 at 17:52 +0100, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> Use dma_zalloc_coherent rather than dam_alloc_coherent and memset.
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
> drivers/net/ethernet/intel/ixgb/ixgb_main.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
Thanks Colin, I already have this change queued up from another community
member and I am just about to send a pull request to Dave for this change,
as well as several other fixes.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [PATCH v2] net: macb: do not disable MDIO bus at open/close time
From: Claudiu Beznea @ 2018-08-24 14:47 UTC (permalink / raw)
To: Anssi Hannula, netdev, Nicolas Ferre; +Cc: David S. Miller, Andrew Lunn
In-Reply-To: <20180823074522.5663-1-anssi.hannula@bitwise.fi>
On 23.08.2018 10:45, Anssi Hannula wrote:
> macb_reset_hw() is called from macb_close() and indirectly from
> macb_open(). macb_reset_hw() zeroes the NCR register, including the MPE
> (Management Port Enable) bit.
>
> This will prevent accessing any other PHYs for other Ethernet MACs on
> the MDIO bus, which remains registered at macb_reset_hw() time, until
> macb_init_hw() is called from macb_open() which sets the MPE bit again.
>
> I.e. currently the MDIO bus has a short disruption at open time and is
> disabled at close time until the interface is opened again.
>
> Fix that by only touching the RE and TE bits when enabling and disabling
> RX/TX.
>
> v2: Make macb_init_hw() NCR write a single statement.
This usually goes after the tree '-' below.
>
> Fixes: 6c36a7074436 ("macb: Use generic PHY layer")
> Signed-off-by: Anssi Hannula <anssi.hannula@bitwise.fi>
> ---
Here:
v2: Make macb_init_hw() NCR write a single statement.
Other than this:
Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Checked that link goes up and ping is working on a SAMA5D2 Xplained:
Tested-by: Claudiu Beznea <claudiu.beznea@microchip.com>
> drivers/net/ethernet/cadence/macb_main.c | 9 ++++++---
> 1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
> index dc09f9a8a49b..225a7c8bad2d 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -2028,14 +2028,17 @@ static void macb_reset_hw(struct macb *bp)
> {
> struct macb_queue *queue;
> unsigned int q;
> + u32 ctrl = macb_readl(bp, NCR);
>
> /* Disable RX and TX (XXX: Should we halt the transmission
> * more gracefully?)
> */
> - macb_writel(bp, NCR, 0);
> + ctrl &= ~(MACB_BIT(RE) | MACB_BIT(TE));
>
> /* Clear the stats registers (XXX: Update stats first?) */
> - macb_writel(bp, NCR, MACB_BIT(CLRSTAT));
> + ctrl |= MACB_BIT(CLRSTAT);
> +
> + macb_writel(bp, NCR, ctrl);
>
> /* Clear all status flags */
> macb_writel(bp, TSR, -1);
> @@ -2223,7 +2226,7 @@ static void macb_init_hw(struct macb *bp)
> }
>
> /* Enable TX and RX */
> - macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));
> + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(RE) | MACB_BIT(TE));
> }
>
> /* The hash address register is 64 bits long and takes up two
>
^ permalink raw reply
* Re: [PATCH v2 01/29] nvmem: add support for cell lookups
From: Boris Brezillon @ 2018-08-24 15:08 UTC (permalink / raw)
To: Bartosz Golaszewski
Cc: Jonathan Corbet, Sekhar Nori, Kevin Hilman, Russell King,
Arnd Bergmann, Greg Kroah-Hartman, David Woodhouse, Brian Norris,
Marek Vasut, Richard Weinberger, Grygorii Strashko,
David S . Miller, Srinivas Kandagatla, Naren,
Mauro Carvalho Chehab, Andrew Morton, Lukas Wunner, Dan Carpenter,
Florian Fainelli
In-Reply-To: <20180810080526.27207-2-brgl@bgdev.pl>
Hi Bartosz,
On Fri, 10 Aug 2018 10:04:58 +0200
Bartosz Golaszewski <brgl@bgdev.pl> wrote:
> +struct nvmem_cell_lookup {
> + struct nvmem_cell_info info;
> + struct list_head list;
> + const char *nvmem_name;
> +};
Hm, maybe I don't get it right, but this looks suspicious. Usually the
consumer lookup table is here to attach device specific names to
external resources.
So what I'd expect here is:
struct nvmem_cell_lookup {
/* The nvmem device name. */
const char *nvmem_name;
/* The nvmem cell name */
const char *nvmem_cell_name;
/*
* The local resource name. Basically what you have in the
* nvmem-cell-names prop.
*/
const char *conid;
};
struct nvmem_cell_lookup_table {
struct list_head list;
/* ID of the consumer device. */
const char *devid;
/* Array of cell lookup entries. */
unsigned int ncells;
const struct nvmem_cell_lookup *cells;
};
Looks like your nvmem_cell_lookup is more something used to attach cells
to an nvmem device, which is NVMEM provider's responsibility not the
consumer one.
Regards,
Boris
^ permalink raw reply
* RE: [net 07/13] ice: Use order_base_2 to calculate higher power of 2
From: Keller, Jacob E @ 2018-08-24 15:39 UTC (permalink / raw)
To: Sergei Shtylyov, Kirsher, Jeffrey T, davem@davemloft.net
Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
jogreene@redhat.com, Venkataramanan, Anirudh
In-Reply-To: <9c66a3d0-9c2a-c9dc-b3e1-d7e9ea12e402@cogentembedded.com>
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Sergei Shtylyov
> Sent: Friday, August 24, 2018 2:03 AM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; davem@davemloft.net
> Cc: Keller, Jacob E <jacob.e.keller@intel.com>; netdev@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jogreene@redhat.com;
> Venkataramanan, Anirudh <anirudh.venkataramanan@intel.com>
> Subject: Re: [net 07/13] ice: Use order_base_2 to calculate higher power of 2
>
> Hello!
>
> On 8/23/2018 10:14 PM, Jeff Kirsher wrote:
>
> > From: Jacob Keller <jacob.e.keller@intel.com>
> >
> > Currently, we use a combination of ilog2 and is_power_of_2() to
> > calculate the next power of 2 for the qcount. This appears to be causing
> > a warning on some combinations of GCC and the Linux kernel:
> >
> > MODPOST 1 modules
> > WARNING: "____ilog2_NaN" [ice.ko] undefined!
> >
> > This appears to because because GCC realizes that qcount could be zero
>
> One "because" is enough. :-)
Woops! Thanks for catching it.
Regards,
Jake
^ permalink raw reply
* Re: [net 07/13] ice: Use order_base_2 to calculate higher power of 2
From: Sergei Shtylyov @ 2018-08-24 15:53 UTC (permalink / raw)
To: Keller, Jacob E, Kirsher, Jeffrey T, davem@davemloft.net
Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
jogreene@redhat.com, Venkataramanan, Anirudh
In-Reply-To: <02874ECE860811409154E81DA85FBB5884C6E3BF@ORSMSX115.amr.corp.intel.com>
On 08/24/2018 06:39 PM, Keller, Jacob E wrote:
>>> From: Jacob Keller <jacob.e.keller@intel.com>
>>>
>>> Currently, we use a combination of ilog2 and is_power_of_2() to
>>> calculate the next power of 2 for the qcount. This appears to be causing
>>> a warning on some combinations of GCC and the Linux kernel:
>>>
>>> MODPOST 1 modules
>>> WARNING: "____ilog2_NaN" [ice.ko] undefined!
>>>
>>> This appears to because because GCC realizes that qcount could be zero
>>
>> One "because" is enough. :-)
>
> Woops! Thanks for catching it.
You probably meant to type "happen" instead of the 1st because...
> Regards,
> Jake
MBR, Sergei
^ permalink raw reply
* Re: [RFC 1/3 net] lorawan: Add LoRaWAN class module
From: Jian-Hong Pan @ 2018-08-24 15:58 UTC (permalink / raw)
To: rdunlap
Cc: Andreas Färber, netdev,
<linux-arm-kernel@lists.infradead.org\,
linux-kernel@vger.kernel.org>,, Jiri Pirko, Marcel Holtmann,
David S. Miller, Matthias Brugger, Janus Piwek,
Michael Röder, Dollar Chen, Ken Yu, Konstantin Böhm,
Jan Jongboom, Jon Ortego, linux-kernel@vger.kernel.org>,,
Ben Whitten
In-Reply-To: <30b75248-7589-fc98-819f-2b68844522f1@infradead.org>
Randy Dunlap <rdunlap@infradead.org> 於 2018年8月24日 週五 上午1:43寫道:
>
> Hi,
>
> On 08/23/2018 10:15 AM, Jian-Hong Pan wrote:
> > diff --git a/net/maclorawan/Kconfig b/net/maclorawan/Kconfig
> > new file mode 100644
> > index 000000000000..741992b059c6
> > --- /dev/null
> > +++ b/net/maclorawan/Kconfig
> > @@ -0,0 +1,14 @@
> > +config LORAWAN
> > + tristate "MAC layer of LoRaWAN Network support"
> > + select CRYPTO
> > + select CRYPTO_CMAC
> > + select CRYPTO_CBC
> > + select CRYPTO_AES
> > + ---help---
> > + LoRaWAN defines a low data rate, low power and long range wireless
> > + wide area networks. It was designed to organise networks of sensors,
> > + switches and actuators, etc automation devices. It could operate as
> > + multiple kilometers wide.
>
> There are several things that I would change. How about this instead?
>
>
> LoRaWAN defines low data rate, low power and long range wireless
> wide area networks. It was designed to organise networks of automation
> devices, such as sensors, switches and actuators. It can operate
> multiple kilometers wide.
That becomes better! Thank you
> > +
> > + Say Y here to compile LoRaWAN support into the kernel or say M to
> > + compile it as modules.
>
>
> --
> ~Randy
^ permalink raw reply
* Re: [PATCH] sh_eth: Add R7S9210 support
From: Sergei Shtylyov @ 2018-08-24 16:07 UTC (permalink / raw)
To: Chris Brandt, David S . Miller, Rob Herring, Mark Rutland
Cc: netdev, devicetree, linux-renesas-soc, Simon Horman
In-Reply-To: <20180822205705.20835-1-chris.brandt@renesas.com>
On 08/22/2018 11:57 PM, Chris Brandt wrote:
> Add support for the R7S9210 which is part of the RZ/A2 series.
Is the manual publicly available? I tried to find it on Renesas' website
but it appears royally broken...
> Signed-off-by: Chris Brandt <chris.brandt@renesas.com>
> ---
> Documentation/devicetree/bindings/net/sh_eth.txt | 1 +
> drivers/net/ethernet/renesas/sh_eth.c | 107 +++++++++++++++++++++++
> drivers/net/ethernet/renesas/sh_eth.h | 1 +
> 3 files changed, 109 insertions(+)
>
[...]
> diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
> index 5573199c4536..f3a575e56ae4 100644
> --- a/drivers/net/ethernet/renesas/sh_eth.c
> +++ b/drivers/net/ethernet/renesas/sh_eth.c
> @@ -216,6 +216,59 @@ static const u16 sh_eth_offset_fast_rz[SH_ETH_MAX_REGISTER_OFFSET] = {
> [RXALCR0] = 0x008C,
> };
>
> +static const u16 sh_eth_offset_fast_rza2[SH_ETH_MAX_REGISTER_OFFSET] = {
> + SH_ETH_OFFSET_DEFAULTS,
> +
> + [EDMR] = 0x0000,
> + [EDTRR] = 0x0008,
> + [EDRRR] = 0x0010,
> + [EESR] = 0x0028,
> + [EESIPR] = 0x0030,
> + [TDLAR] = 0x0018,
> + [TBRAR] = 0x00d4,
> + [TDFAR] = 0x00d8,
> + [RDLAR] = 0x0020,
> + [RBWAR] = 0x00c8,
> + [RDFAR] = 0x00cc,
> + [TRSCER] = 0x0038,
> + [RMFCR] = 0x0040,
> + [TFTR] = 0x0048,
> + [FDR] = 0x0050,
> + [RMCR] = 0x0058,
> + [TFUCR] = 0x0064,
> + [RFOCR] = 0x0068,
> + [RPADIR] = 0x0078,
> + [TRIMD] = 0x007c,
> + [FCFTR] = 0x0070,
> +
> + [ECMR] = 0x0100,
> + [RFLR] = 0x0108,
> + [ECSR] = 0x0110,
> + [ECSIPR] = 0x0118,
> + [PIR] = 0x0120,
> + [PSR] = 0x0128,
> + [RDMLR] = 0x0140,
> + [IPGR] = 0x0150,
> + [APR] = 0x0154,
> + [MPR] = 0x0158,
> + [RFCF] = 0x0160,
> + [TPAUSER] = 0x0164,
> + [TPAUSECR] = 0x0168,
> + [BCFRR] = 0x016c,
> + [MAHR] = 0x01c0,
> + [MALR] = 0x01c8,
> + [TROCR] = 0x01d0,
> + [CDCR] = 0x01d4,
> + [LCCR] = 0x01d8,
> + [CNDCR] = 0x01dc,
> + [CEFCR] = 0x01e4,
> + [FRECR] = 0x01e8,
> + [TSFRCR] = 0x01ec,
> + [TLFRCR] = 0x01f0,
> + [RFCR] = 0x01f4,
> + [MAFCR] = 0x01f8,
> +};
> +
This array is absolutely the same as sh_eth_offset_fast_sh4[], except the latter
is (more or less) sorted.
> static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = {
> SH_ETH_OFFSET_DEFAULTS,
>
> @@ -635,6 +688,56 @@ static struct sh_eth_cpu_data r7s72100_data = {
> .no_tx_cntrs = 1,
> };
>
> +/* R7S9210 */
> +static void sh_eth_set_rate_r7s9210(struct net_device *ndev)
> +{
> + struct sh_eth_private *mdp = netdev_priv(ndev);
> + unsigned long RTM = 0x00000004;
Use #define, please.
> +
> + switch (mdp->speed) {
> + case 10: /* 10BASE */
> + sh_eth_modify(ndev, ECMR, RTM, 0);
> + break;
> + case 100:/* 100BASE */
> + sh_eth_modify(ndev, ECMR, RTM, RTM);
> + break;
> + }
> +}
> +
Seems the same as sh_eth_set_rate_rcar(), except for the bit name...
> +static struct sh_eth_cpu_data r7s9210_data = {
> + .soft_reset = sh_eth_soft_reset,
> +
> + .set_duplex = sh_eth_set_duplex,
> + .set_rate = sh_eth_set_rate_r7s9210,
> +
> + .register_type = SH_ETH_REG_FAST_RZA2,
> +
> + .edtrr_trns = EDTRR_TRNS_ETHER,
> + .ecsr_value = ECSR_ICD,
> + .ecsipr_value = ECSIPR_ICDIP,
> + .eesipr_value = EESIPR_TWBIP | EESIPR_TABTIP | EESIPR_RABTIP |
> + EESIPR_RFCOFIP | EESIPR_ECIIP | EESIPR_FTCIP |
> + EESIPR_TDEIP | EESIPR_TFUFIP | EESIPR_FRIP |
> + EESIPR_RDEIP | EESIPR_RFOFIP | EESIPR_CNDIP |
> + EESIPR_DLCIP | EESIPR_CDIP | EESIPR_TROIP |
> + EESIPR_RMAFIP | EESIPR_RRFIP | EESIPR_RTLFIP |
> + EESIPR_RTSFIP | EESIPR_PREIP | EESIPR_CERFIP,
> +
> + .tx_check = EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO,
> + .eesr_err_check = EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
> + EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE,
> +
> + .fdr_value = 0x0000070f,
> +
> + .apr = 1,
> + .mpr = 1,
> + .tpauser = 1,
> + .hw_swap = 1,
> + .rpadir = 1,
> + .no_ade = 1,
> + .xdfar_rw = 1,
> +};
> +
Can't check these w/o the manual...
> static void sh_eth_chip_reset_r8a7740(struct net_device *ndev)
> {
> sh_eth_chip_reset(ndev);
[...]
MNR, Sergei
^ permalink raw reply
* Re: broken behaviour of TC filter delete
From: Roman Mashak @ 2018-08-24 16:18 UTC (permalink / raw)
To: Jiri Pirko; +Cc: netdev, jiri, Jamal Hadi Salim
In-Reply-To: <20180824081751.GA2931@nanopsycho>
Jiri Pirko <jiri@resnulli.us> writes:
> Thu, Aug 23, 2018 at 11:39:22PM CEST, mrv@mojatatu.com wrote:
>>
>>
>>It appears that the following commit changed the behaviour of scenario where a
>>filter is deleted twice:
>>
>>commit f71e0ca4db187af7c44987e9d21e9042c3046070
>>Author: Jiri Pirko <jiri@mellanox.com>
>>Date: Mon Jul 23 09:23:05 2018 +0200
>>
>> net: sched: Avoid implicit chain 0 creation
>>
>>
>>Steps to reproduce :
>>
>>1) create dummy device
>> $ ip link add dev dummy0 type dummy
>>
>>2) create qdisc
>> $ tc qdisc add dev dummy0 ingress
>>
>>3) create simple u32 filter with action attached
>> $ tc filter add dev dummy0 parent ffff: protocol ip prio 1 u32 match ip src 10.10.10.1/32 action ok
>>
>>4) list the filter
>> $ tc filter ls dev dummy0 parent ffff:
>>
>>5) delete the filter with the given protocol and priority
>> $ tc filter del dev dummy0 parent ffff: protocol ip prio 1
>>
>>6) repeat step 5, this will return -ENOENT ("Error: Filter with specified priority/protocol not found.")
>>However, before the change at step 6 we would get -EINVAL (Error: Cannot find specified filter chain.)
>>and that makes sense.
>
> Wait, this now returns:
> Error: Cannot find specified filter chain.
> So you want it to return -EINVAL (Error: Cannot find specified filter chain.) ?
> How about for other chains?
>
I must've mixed up the return codes and messages while typing the
message.
So _before_ commit f71e0ca4db187af7c44987e9d21e9042c3046070 step 6 would
return -ENOENT with "Error: Filter with specified priority/protocol not
found." and _after_ the commit it returns -EINVAL (Error: Cannot find
specified filter chain.)
ENOENT seems to be more logical to return when there's no more filter to delete.
^ permalink raw reply
* [PATCH v2 01/17] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
To: linux-kernel, netdev, davem
Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
linux-arch
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
FPU/SIMD functions over a number of calls, because FPU restoration is
quite expensive. This adds a simple header for carrying out this pattern:
simd_context_t simd_context = simd_get();
while ((item = get_item_from_queue()) != NULL) {
encrypt_item(item, simd_context);
simd_context = simd_relax(simd_context);
}
simd_put(simd_context);
The relaxation step ensures that we don't trample over preemption, and
the get/put API should be a familiar paradigm in the kernel.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: linux-arch@vger.kernel.org
---
arch/alpha/include/asm/Kbuild | 5 ++--
arch/arc/include/asm/Kbuild | 1 +
arch/arm/include/asm/simd.h | 42 ++++++++++++++++++++++++++++++
arch/arm64/include/asm/simd.h | 37 +++++++++++++++++++++-----
arch/c6x/include/asm/Kbuild | 3 ++-
arch/h8300/include/asm/Kbuild | 3 ++-
arch/hexagon/include/asm/Kbuild | 1 +
arch/ia64/include/asm/Kbuild | 1 +
arch/m68k/include/asm/Kbuild | 1 +
arch/microblaze/include/asm/Kbuild | 1 +
arch/mips/include/asm/Kbuild | 1 +
arch/nds32/include/asm/Kbuild | 7 ++---
arch/nios2/include/asm/Kbuild | 1 +
arch/openrisc/include/asm/Kbuild | 7 ++---
arch/parisc/include/asm/Kbuild | 1 +
arch/powerpc/include/asm/Kbuild | 3 ++-
arch/riscv/include/asm/Kbuild | 3 ++-
arch/s390/include/asm/Kbuild | 3 ++-
arch/sh/include/asm/Kbuild | 1 +
arch/sparc/include/asm/Kbuild | 1 +
arch/um/include/asm/Kbuild | 3 ++-
arch/unicore32/include/asm/Kbuild | 1 +
arch/x86/include/asm/simd.h | 30 ++++++++++++++++++++-
arch/xtensa/include/asm/Kbuild | 1 +
include/asm-generic/simd.h | 15 +++++++++++
include/linux/simd.h | 28 ++++++++++++++++++++
26 files changed, 180 insertions(+), 21 deletions(-)
create mode 100644 arch/arm/include/asm/simd.h
create mode 100644 include/linux/simd.h
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 0580cb8c84b2..07b2c1025d34 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -2,14 +2,15 @@
generic-y += compat.h
+generic-y += current.h
generic-y += exec.h
generic-y += export.h
generic-y += fb.h
generic-y += irq_work.h
+generic-y += kprobes.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += preempt.h
generic-y += sections.h
+generic-y += simd.h
generic-y += trace_clock.h
-generic-y += current.h
-generic-y += kprobes.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index feed50ce89fa..a7f4255f1649 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -22,6 +22,7 @@ generic-y += parport.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
+generic-y += simd.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += user.h
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
new file mode 100644
index 000000000000..bf468993bbef
--- /dev/null
+++ b/arch/arm/include/asm/simd.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
+static __must_check inline bool may_use_simd(void)
+{
+ return !in_interrupt();
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+
+static inline simd_context_t simd_get(void)
+{
+ bool have_simd = may_use_simd();
+ if (have_simd)
+ kernel_neon_begin();
+ return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+ if (prior_context != HAVE_NO_SIMD)
+ kernel_neon_end();
+}
+#else
+static inline simd_context_t simd_get(void)
+{
+ return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+#endif
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index 6495cc51246f..058c336de38d 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -1,11 +1,10 @@
-/*
- * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+/* SPDX-License-Identifier: GPL-2.0
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
+#include <linux/simd.h>
#ifndef __ASM_SIMD_H
#define __ASM_SIMD_H
@@ -16,6 +15,8 @@
#include <linux/types.h>
#ifdef CONFIG_KERNEL_MODE_NEON
+#include <asm/neon.h>
+#include <asm/simd.h>
DECLARE_PER_CPU(bool, kernel_neon_busy);
@@ -40,12 +41,36 @@ static __must_check inline bool may_use_simd(void)
!this_cpu_read(kernel_neon_busy);
}
+static inline simd_context_t simd_get(void)
+{
+ bool have_simd = may_use_simd();
+ if (have_simd)
+ kernel_neon_begin();
+ return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+ if (prior_context != HAVE_NO_SIMD)
+ kernel_neon_end();
+}
+
#else /* ! CONFIG_KERNEL_MODE_NEON */
-static __must_check inline bool may_use_simd(void) {
+static __must_check inline bool may_use_simd(void)
+{
return false;
}
+static inline simd_context_t simd_get(void)
+{
+ return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
#endif /* ! CONFIG_KERNEL_MODE_NEON */
#endif
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index 33a2c94fed0d..22f3d8333c74 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -5,8 +5,8 @@ generic-y += compat.h
generic-y += current.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += exec.h
generic-y += extable.h
@@ -30,6 +30,7 @@ generic-y += pgalloc.h
generic-y += preempt.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += tlbflush.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index a5d0b2991f47..f5c2f12d593e 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -8,8 +8,8 @@ generic-y += current.h
generic-y += delay.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += exec.h
generic-y += extable.h
@@ -39,6 +39,7 @@ generic-y += preempt.h
generic-y += scatterlist.h
generic-y += sections.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += spinlock.h
generic-y += timex.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index dd2fd9c0d292..217d4695fd8a 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -29,6 +29,7 @@ generic-y += rwsem.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index 557bbc8ba9f5..41c5ebdf79e5 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -4,6 +4,7 @@ generic-y += irq_work.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += preempt.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += vtime.h
generic-y += word-at-a-time.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index a4b8d3331a9e..73898dd1a4d0 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -19,6 +19,7 @@ generic-y += mm-arch-hooks.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += sections.h
+generic-y += simd.h
generic-y += spinlock.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index fe6a6c6e5003..9002fb24888c 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -24,6 +24,7 @@ generic-y += parport.h
generic-y += percpu.h
generic-y += preempt.h
generic-y += serial.h
+generic-y += simd.h
generic-y += syscalls.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 58351e48421e..e8868e0fb2c3 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += qrwlock.h
generic-y += qspinlock.h
generic-y += sections.h
generic-y += segment.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += unaligned.h
generic-y += user.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index dbc4e5422550..603c1d020620 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -7,14 +7,14 @@ generic-y += bug.h
generic-y += bugs.h
generic-y += checksum.h
generic-y += clkdev.h
-generic-y += cmpxchg.h
generic-y += cmpxchg-local.h
+generic-y += cmpxchg.h
generic-y += compat.h
generic-y += cputime.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += errno.h
generic-y += exec.h
@@ -46,14 +46,15 @@ generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
generic-y += shmbuf.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += stat.h
generic-y += switch_to.h
generic-y += timex.h
generic-y += topology.h
generic-y += trace_clock.h
-generic-y += xor.h
generic-y += unaligned.h
generic-y += user.h
generic-y += vga.h
generic-y += word-at-a-time.h
+generic-y += xor.h
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild
index 8fde4fa2c34f..571a9d9ad107 100644
--- a/arch/nios2/include/asm/Kbuild
+++ b/arch/nios2/include/asm/Kbuild
@@ -33,6 +33,7 @@ generic-y += preempt.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += spinlock.h
generic-y += topology.h
generic-y += trace_clock.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index 65964d390b10..81a39e274f6f 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -27,12 +27,13 @@ generic-y += module.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
-generic-y += qspinlock_types.h
-generic-y += qspinlock.h
-generic-y += qrwlock_types.h
generic-y += qrwlock.h
+generic-y += qrwlock_types.h
+generic-y += qspinlock.h
+generic-y += qspinlock_types.h
generic-y += sections.h
generic-y += segment.h
+generic-y += simd.h
generic-y += string.h
generic-y += switch_to.h
generic-y += topology.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index 2013d639e735..97970b4d05ab 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -17,6 +17,7 @@ generic-y += percpu.h
generic-y += preempt.h
generic-y += seccomp.h
generic-y += segment.h
+generic-y += simd.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += user.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d227e351..64290f48e733 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,7 +4,8 @@ generic-y += irq_regs.h
generic-y += irq_work.h
generic-y += local64.h
generic-y += mcs_spinlock.h
+generic-y += msi.h
generic-y += preempt.h
generic-y += rwsem.h
+generic-y += simd.h
generic-y += vtime.h
-generic-y += msi.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 576ffdca06ba..8d3e7aef3234 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -4,9 +4,9 @@ generic-y += checksum.h
generic-y += cputime.h
generic-y += device.h
generic-y += div64.h
-generic-y += dma.h
generic-y += dma-contiguous.h
generic-y += dma-mapping.h
+generic-y += dma.h
generic-y += emergency-restart.h
generic-y += errno.h
generic-y += exec.h
@@ -45,6 +45,7 @@ generic-y += setup.h
generic-y += shmbuf.h
generic-y += shmparam.h
generic-y += signal.h
+generic-y += simd.h
generic-y += socket.h
generic-y += sockios.h
generic-y += stat.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index e3239772887a..7a26dc6ce815 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -7,9 +7,9 @@ generated-y += unistd_nr.h
generic-y += asm-offsets.h
generic-y += cacheflush.h
generic-y += device.h
+generic-y += div64.h
generic-y += dma-contiguous.h
generic-y += dma-mapping.h
-generic-y += div64.h
generic-y += emergency-restart.h
generic-y += export.h
generic-y += fb.h
@@ -22,6 +22,7 @@ generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += preempt.h
generic-y += rwsem.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += unaligned.h
generic-y += word-at-a-time.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 6a5609a55965..8e64ff35a933 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += percpu.h
generic-y += preempt.h
generic-y += rwsem.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += trace_clock.h
generic-y += xor.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index 410b263ef5c8..72b9e08fb350 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -17,5 +17,6 @@ generic-y += msi.h
generic-y += preempt.h
generic-y += rwsem.h
generic-y += serial.h
+generic-y += simd.h
generic-y += trace_clock.h
generic-y += word-at-a-time.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b10dde6cb793..d37288b08dd2 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -16,15 +16,16 @@ generic-y += io.h
generic-y += irq_regs.h
generic-y += irq_work.h
generic-y += kdebug.h
+generic-y += kprobes.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += param.h
generic-y += pci.h
generic-y += percpu.h
generic-y += preempt.h
+generic-y += simd.h
generic-y += switch_to.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += word-at-a-time.h
generic-y += xor.h
-generic-y += kprobes.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index bfc7abe77905..98a908720bbd 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -27,6 +27,7 @@ generic-y += preempt.h
generic-y += sections.h
generic-y += segment.h
generic-y += serial.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += syscalls.h
generic-y += topology.h
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index a341c878e977..79411178988a 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,4 +1,11 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
#include <asm/fpu/api.h>
@@ -10,3 +17,24 @@ static __must_check inline bool may_use_simd(void)
{
return irq_fpu_usable();
}
+
+static inline simd_context_t simd_get(void)
+{
+ bool have_simd = false;
+#if !defined(CONFIG_UML)
+ have_simd = may_use_simd();
+ if (have_simd)
+ kernel_fpu_begin();
+#endif
+ return have_simd ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+#if !defined(CONFIG_UML)
+ if (prior_context != HAVE_NO_SIMD)
+ kernel_fpu_end();
+#endif
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index e5e1e61c538c..e3b194a187f9 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -23,6 +23,7 @@ generic-y += percpu.h
generic-y += preempt.h
generic-y += rwsem.h
generic-y += sections.h
+generic-y += simd.h
generic-y += topology.h
generic-y += trace_clock.h
generic-y += word-at-a-time.h
diff --git a/include/asm-generic/simd.h b/include/asm-generic/simd.h
index d0343d58a74a..fad899a5a92d 100644
--- a/include/asm-generic/simd.h
+++ b/include/asm-generic/simd.h
@@ -1,5 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/simd.h>
+#ifndef _ASM_SIMD_H
+#define _ASM_SIMD_H
+
#include <linux/hardirq.h>
/*
@@ -13,3 +17,14 @@ static __must_check inline bool may_use_simd(void)
{
return !in_interrupt();
}
+
+static inline simd_context_t simd_get(void)
+{
+ return HAVE_NO_SIMD;
+}
+
+static inline void simd_put(simd_context_t prior_context)
+{
+}
+
+#endif /* _ASM_SIMD_H */
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000000..f62d047188bf
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
+typedef enum {
+ HAVE_NO_SIMD,
+ HAVE_FULL_SIMD
+} simd_context_t;
+
+#include <linux/sched.h>
+#include <asm/simd.h>
+
+static inline simd_context_t simd_relax(simd_context_t prior_context)
+{
+#ifdef CONFIG_PREEMPT
+ if (prior_context != HAVE_NO_SIMD && need_resched()) {
+ simd_put(prior_context);
+ return simd_get();
+ }
+#endif
+ return prior_context;
+}
+
+#endif /* _SIMD_H */
--
2.18.0
^ permalink raw reply related
* [PATCH v2 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
To: linux-kernel, netdev, davem
Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
Jean-Philippe Aumasson, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
Zinc stands for "Zinc Is Neat Crypto" or "Zinc as IN Crypto" or maybe
just "Zx2c4's INsane Cryptolib." It's also short, easy to type, and
plays nicely with the recent trend of naming crypto libraries after
elements. The guiding principle is "don't overdo it". It's less of a
library and more of a directory tree for organizing well-curated direct
implementations of cryptography primitives.
Zinc is a new cryptography API that is much more minimal and lower-level
than the current one. It intends to complement it and provide a basis
upon which the current crypto API might build, as the provider of
software implementations of cryptographic primitives. It is motivated by
three primary observations in crypto API design:
* Highly composable "cipher modes" and related abstractions from
90s cryptographers did not turn out to be as terrific an idea as
hoped, leading to a host of API misuse problems.
* Most programmers are afraid of crypto code, and so prefer to
integrate it into libraries in a highly abstracted manner, so as to
shield themselves from implementation details. Cryptographers, on
the other hand, prefer simple direct implementations, which they're
able to verify for high assurance and optimize in accordance with
their expertise.
* Overly abstracted and flexible cryptography APIs lead to a host of
dangerous problems and performance issues. The kernel is in the
business usually not of coming up with new uses of crypto, but
rather implementing various constructions, which means it essentially
needs a library of primitives, not a highly abstracted enterprise-ready
pluggable system, with a few particular exceptions.
This last observation has seen itself play out several times over and
over again within the kernel:
* The perennial move of actual primitives away from crypto/ and into
lib/, so that users can actually call these functions directly with
no overhead and without lots of allocations, function pointers,
string specifier parsing, and general clunkiness. For example:
sha256, chacha20, siphash, sha1, and so forth live in lib/ rather
than in crypto/. Zinc intends to stop the cluttering of lib/ and
introduce these direct primitives into their proper place, lib/zinc/.
* An abundance of misuse bugs with the present crypto API that have
been very unpleasant to clean up.
* A hesitance to even use cryptography, because of the overhead and
headaches involved in accessing the routines.
Zinc goes in a rather different direction. Rather than providing a
thoroughly designed and abstracted API, Zinc gives you simple functions,
which implement some primitive, or some particular and specific
construction of primitives. It is not dynamic in the least, though one
could imagine implementing a complex dynamic dispatch mechanism (such as
the current crypto API) on top of these basic functions. After all,
dynamic dispatch is usually needed for applications with cipher agility,
such as IPsec, dm-crypt, AF_ALG, and so forth, and the existing crypto
API will continue to play that role. However, Zinc will provide a non-
haphazard way of directly utilizing crypto routines in applications
that do have neither the need nor desire for abstraction and dynamic
dispatch.
It also organizes the implementations in a simple, straight-forward,
and direct manner, making it enjoyable and intuitive to work on.
Rather than moving optimized assembly implementations into arch/, it
keeps them all together in lib/zinc/, making it simple and obvious to
compare and contrast what's happening. This is, notably, exactly what
the lib/raid6/ tree does, and that seems to work out rather well. It's
also the pattern of most successful crypto libraries. The architecture-
specific glue-code is made a part of each translation unit, rather than
being in a separate one, so that generic and architecture-optimized code
are combined at compile-time, and incompatibility branches compiled out by
the optimizer.
All implementations have been extensively tested and fuzzed, and are
selected for their quality, trustworthiness, and performance. Wherever
possible and performant, formally verified implementations are used,
such as those from HACL* [1] and Fiat-Crypto [2]. The routines also take
special care to zero out secrets using memzero_explicit (and future work
is planned to have gcc do this more reliably and performantly with
compiler plugins). The performance of the selected implementations is
state-of-the-art and unrivaled on a broad array of hardware, though of
course we will continue to fine tune these to the hardware demands
needed by kernel contributors. Each implementation also comes with
extensive self-tests and crafted test vectors, pulled from various
places such as Wycheproof [9].
Regularity of function signatures is important, so that users can easily
"guess" the name of the function they want. Though, individual
primitives are oftentimes not trivially interchangeable, having been
designed for different things and requiring different parameters and
semantics, and so the function signatures they provide will directly
reflect the realities of the primitives' usages, rather than hiding it
behind (inevitably leaky) abstractions. Also, in contrast to the current
crypto API, Zinc functions can work on stack buffers, and can be called
with different keys, without requiring allocations or locking.
SIMD is used automatically when available, though some routines may
benefit from either having their SIMD disabled for particular
invocations, or to have the SIMD initialization calls amortized over
several invocations of the function, and so Zinc utilizes function
signatures enabling that in conjunction with the recently introduced
simd_context_t.
More generally, Zinc provides function signatures that allow just what
is required by the various callers. This isn't to say that users of the
functions will be permitted to pollute the function semantics with weird
particular needs, but we are trying very hard not to overdo it, and that
means looking carefully at what's actually necessary, and doing just that,
and not much more than that. Remember: practicality and cleanliness rather
than over-zealous infrastructure.
Zinc provides also an opening for the best implementers in academia to
contribute their time and effort to the kernel, by being sufficiently
simple and inviting. In discussing this commit with some of the best and
brightest over the last few years, there are many who are eager to
devote rare talent and energy to this effort.
Following the merging of this, I expect for the primitives that
currently exist in lib/ to work their way into lib/zinc/, after intense
scrutiny of each implementation, potentially replacing them with either
formally-verified implementations, or better studied and faster
state-of-the-art implementations.
Also following the merging of this, I expect for the old crypto API
implementations to be ported over to use Zinc for their software-based
implementations.
As Zinc is simply library code, its config options are un-menued, with
the exception of CONFIG_ZINC_DEBUG, which enables various selftests and
BUG_ONs.
[1] https://github.com/project-everest/hacl-star
[2] https://github.com/mit-plv/fiat-crypto
[3] https://cr.yp.to/ecdh.html
[4] https://cr.yp.to/chacha.html
[5] https://cr.yp.to/snuffle/xsalsa-20081128.pdf
[6] https://cr.yp.to/mac.html
[7] https://blake2.net/
[8] https://tools.ietf.org/html/rfc8439
[9] https://github.com/google/wycheproof
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: linux-crypto@vger.kernel.org
---
MAINTAINERS | 8 ++++++++
lib/Kconfig | 2 ++
lib/Makefile | 2 ++
lib/zinc/Kconfig | 20 ++++++++++++++++++++
lib/zinc/Makefile | 7 +++++++
lib/zinc/main.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 70 insertions(+)
create mode 100644 lib/zinc/Kconfig
create mode 100644 lib/zinc/Makefile
create mode 100644 lib/zinc/main.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 955463f8d518..f194eda86011 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16103,6 +16103,14 @@ Q: https://patchwork.linuxtv.org/project/linux-media/list/
S: Maintained
F: drivers/media/dvb-frontends/zd1301_demod*
+ZINC CRYPTOGRAPHY LIBRARY
+M: Jason A. Donenfeld <Jason@zx2c4.com>
+M: Samuel Neves <sneves@dei.uc.pt>
+S: Maintained
+F: lib/zinc/
+F: include/zinc/
+L: linux-crypto@vger.kernel.org
+
ZPOOL COMPRESSED PAGE STORAGE API
M: Dan Streetman <ddstreet@ieee.org>
L: linux-mm@kvack.org
diff --git a/lib/Kconfig b/lib/Kconfig
index 706836ec314d..7ea5437e9f7d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -478,6 +478,8 @@ config GLOB_SELFTEST
module load) by a small amount, so you're welcome to play with
it, but you probably don't need it.
+source "lib/zinc/Kconfig"
+
#
# Netlink attribute parsing support is select'ed if needed
#
diff --git a/lib/Makefile b/lib/Makefile
index d95bb2525101..ee41151cba7a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -212,6 +212,8 @@ obj-$(CONFIG_PERCPU_TEST) += percpu_test.o
obj-$(CONFIG_ASN1) += asn1_decoder.o
+obj-$(CONFIG_ZINC) += zinc/
+
obj-$(CONFIG_FONT_SUPPORT) += fonts/
obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
new file mode 100644
index 000000000000..aa4f8d449d6b
--- /dev/null
+++ b/lib/zinc/Kconfig
@@ -0,0 +1,20 @@
+config ZINC
+ tristate
+ select CRYPTO_BLKCIPHER
+ select VFP
+ select VFPv3
+ select NEON
+ select KERNEL_MODE_NEON
+
+config ZINC_DEBUG
+ bool "Zinc cryptography library debugging and self-tests"
+ depends on ZINC
+ help
+ This builds a series of self-tests for the Zinc crypto library, which
+ help diagnose any cryptographic algorithm implementation issues that
+ might be at the root cause of potential bugs. It also adds various
+ debugging traps.
+
+ Unless you're developing and testing cryptographic routines, or are
+ especially paranoid about correctness on your hardware, you may say
+ N here.
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
new file mode 100644
index 000000000000..8e30115217db
--- /dev/null
+++ b/lib/zinc/Makefile
@@ -0,0 +1,7 @@
+ccflags-y := -O3
+ccflags-y += -Wframe-larger-than=8192
+ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
+
+zinc-y += main.o
+
+obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
new file mode 100644
index 000000000000..590872563955
--- /dev/null
+++ b/lib/zinc/main.c
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#ifdef CONFIG_ZINC_DEBUG
+#define selftest(which) do { \
+ if (!which ## _selftest()) \
+ return -ENOTRECOVERABLE; \
+} while (0)
+#else
+#define selftest(which)
+#endif
+
+static int __init mod_init(void)
+{
+ return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Zinc cryptography library");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
--
2.18.0
^ permalink raw reply related
* [PATCH v2 04/17] zinc: ChaCha20 ARM and ARM64 implementations
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
To: linux-kernel, netdev, davem
Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
Jean-Philippe Aumasson, Andy Polyakov, Russell King,
linux-arm-kernel, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
These NEON and non-NEON implementations come from Andy Polyakov's
implementation, with several modifications for being friendly to kernel
space.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-crypto@vger.kernel.org
---
lib/zinc/Makefile | 8 +
lib/zinc/chacha20/chacha20-arm-glue.h | 42 +
lib/zinc/chacha20/chacha20-arm.S | 1471 +++++++++++++++++++
lib/zinc/chacha20/chacha20-arm64.S | 1940 +++++++++++++++++++++++++
4 files changed, 3461 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-arm-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-arm.S
create mode 100644 lib/zinc/chacha20/chacha20-arm64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 6ec77feb4ab7..87be627f3354 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -4,6 +4,14 @@ ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
ifeq ($(CONFIG_ZINC_CHACHA20),y)
zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_ARM),y)
+zinc-y += chacha20/chacha20-arm.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
+ifeq ($(CONFIG_ARM64),y)
+zinc-y += chacha20/chacha20-arm64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
+endif
endif
zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-arm-glue.h b/lib/zinc/chacha20/chacha20-arm-glue.h
new file mode 100644
index 000000000000..1b072eb1e50c
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm-glue.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (!defined(__LINUX_ARM_ARCH__) || __LINUX_ARM_ARCH__ >= 7)
+#define ARM_USE_NEON
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_neon __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#if defined(CONFIG_ARM64)
+ chacha20_use_neon = elf_hwcap & HWCAP_ASIMD;
+#elif defined(CONFIG_ARM)
+ chacha20_use_neon = elf_hwcap & HWCAP_NEON;
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+ if (simd_context != HAVE_FULL_SIMD
+#if defined(ARM_USE_NEON)
+ || !chacha20_use_neon
+#endif
+ )
+ chacha20_arm(dst, src, len, key, counter);
+ else
+ chacha20_neon(dst, src, len, key, counter);
+ return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context) { return false; }
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-arm.S b/lib/zinc/chacha20/chacha20-arm.S
new file mode 100644
index 000000000000..601b4e34283d
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm.S
@@ -0,0 +1,1471 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+.word -1
+
+#if __LINUX_ARM_ARCH__ >= 7 && IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
+.arch armv7-a
+.fpu neon
+
+.align 5
+ENTRY(chacha20_neon)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_neon
+ cmp r2,#192 @ test len
+ bls .Lshort
+.Lchacha20_neon_begin:
+ adr r14,.Lsigma
+ vstmdb sp!,{d8-d15} @ ABI spec says so
+ stmdb sp!,{r0-r3}
+
+ vld1.32 {q1-q2},[r3] @ load key
+ ldmia r3,{r4-r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0-r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14] @ one
+ vst1.32 {q2-q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0-q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ vmov q5,q1
+ vmov q9,q1
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0-r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+.Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ vadd.i32 q11,q7,q12 @ counter+2
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ add r12,r12,#3 @ counter+3
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4
+ vadd.i32 q4,q4,q5
+ mov r12,r12,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r5
+ veor q3,q3,q0
+ mov r10,r10,ror#16
+ veor q7,q7,q4
+ eor r12,r12,r0,ror#16
+ veor q11,q11,q8
+ eor r10,r10,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r12
+ vrev32.16 q7,q7
+ mov r4,r4,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r10
+ vadd.i32 q2,q2,q3
+ mov r5,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r4,r4,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r5,r5,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r4
+ veor q13,q5,q6
+ mov r12,r12,ror#24
+ veor q14,q9,q10
+ add r1,r1,r5
+ vshr.u32 q1,q12,#20
+ mov r10,r10,ror#24
+ vshr.u32 q5,q13,#20
+ eor r12,r12,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r10,r10,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r12
+ vsli.32 q5,q13,#12
+ mov r4,r4,ror#25
+ vsli.32 q9,q14,#12
+ add r9,r9,r10
+ vadd.i32 q0,q0,q1
+ mov r5,r5,ror#25
+ vadd.i32 q4,q4,q5
+ str r10,[sp,#4*(16+13)]
+ vadd.i32 q8,q8,q9
+ ldr r10,[sp,#4*(16+15)]
+ veor q12,q3,q0
+ eor r4,r4,r8,ror#25
+ veor q13,q7,q4
+ eor r5,r5,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+10)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r6
+ vshr.u32 q11,q14,#24
+ mov r14,r14,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+11)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r7
+ vadd.i32 q2,q2,q3
+ mov r10,r10,ror#16
+ vadd.i32 q6,q6,q7
+ eor r14,r14,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r10,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r14
+ veor q13,q5,q6
+ mov r6,r6,ror#20
+ veor q14,q9,q10
+ add r9,r9,r10
+ vshr.u32 q1,q12,#25
+ mov r7,r7,ror#20
+ vshr.u32 q5,q13,#25
+ eor r6,r6,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r7,r7,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r6
+ vsli.32 q5,q13,#7
+ mov r14,r14,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r7
+ vext.8 q2,q2,q2,#8
+ mov r10,r10,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r14,r14,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r10,r10,r3,ror#24
+ vext.8 q1,q1,q1,#4
+ add r8,r8,r14
+ vext.8 q5,q5,q5,#4
+ mov r6,r6,ror#25
+ vext.8 q9,q9,q9,#4
+ add r9,r9,r10
+ vext.8 q3,q3,q3,#12
+ mov r7,r7,ror#25
+ vext.8 q7,q7,q7,#12
+ eor r6,r6,r8,ror#25
+ vext.8 q11,q11,q11,#12
+ eor r7,r7,r9,ror#25
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5
+ vadd.i32 q4,q4,q5
+ mov r10,r10,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r6
+ veor q3,q3,q0
+ mov r12,r12,ror#16
+ veor q7,q7,q4
+ eor r10,r10,r0,ror#16
+ veor q11,q11,q8
+ eor r12,r12,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r10
+ vrev32.16 q7,q7
+ mov r5,r5,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r12
+ vadd.i32 q2,q2,q3
+ mov r6,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r5,r5,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r6,r6,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r5
+ veor q13,q5,q6
+ mov r10,r10,ror#24
+ veor q14,q9,q10
+ add r1,r1,r6
+ vshr.u32 q1,q12,#20
+ mov r12,r12,ror#24
+ vshr.u32 q5,q13,#20
+ eor r10,r10,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r12,r12,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r10
+ vsli.32 q5,q13,#12
+ mov r5,r5,ror#25
+ vsli.32 q9,q14,#12
+ str r10,[sp,#4*(16+15)]
+ vadd.i32 q0,q0,q1
+ ldr r10,[sp,#4*(16+13)]
+ vadd.i32 q4,q4,q5
+ add r9,r9,r12
+ vadd.i32 q8,q8,q9
+ mov r6,r6,ror#25
+ veor q12,q3,q0
+ eor r5,r5,r8,ror#25
+ veor q13,q7,q4
+ eor r6,r6,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+8)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r7
+ vshr.u32 q11,q14,#24
+ mov r10,r10,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+9)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r4
+ vadd.i32 q2,q2,q3
+ mov r14,r14,ror#16
+ vadd.i32 q6,q6,q7
+ eor r10,r10,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r14,r14,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r10
+ veor q13,q5,q6
+ mov r7,r7,ror#20
+ veor q14,q9,q10
+ add r9,r9,r14
+ vshr.u32 q1,q12,#25
+ mov r4,r4,ror#20
+ vshr.u32 q5,q13,#25
+ eor r7,r7,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r4,r4,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r7
+ vsli.32 q5,q13,#7
+ mov r10,r10,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r4
+ vext.8 q2,q2,q2,#8
+ mov r14,r14,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r10,r10,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r14,r14,r3,ror#24
+ vext.8 q1,q1,q1,#12
+ add r8,r8,r10
+ vext.8 q5,q5,q5,#12
+ mov r7,r7,ror#25
+ vext.8 q9,q9,q9,#12
+ add r9,r9,r14
+ vext.8 q3,q3,q3,#4
+ mov r4,r4,ror#25
+ vext.8 q7,q7,q7,#4
+ eor r7,r7,r8,ror#25
+ vext.8 q11,q11,q11,#4
+ eor r4,r4,r9,ror#25
+ bne .Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12-q13},[sp] @ load key material
+ vld1.32 {q14-q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo .Ltail_neon
+
+ vld1.8 {q12-q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0-q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0-q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2-q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4-q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6-q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8-q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10-q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r5,r9
+ ldr r9,[r12,#-12]
+ add r6,r6,r10
+ ldr r10,[r12,#-8]
+ add r7,r7,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+#ifdef __thumb2__
+ it hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+#ifdef __thumb2__
+ it hi
+#endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r5,r9
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r6,r10
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r7,r11
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+#ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ eor r4,r4,r8
+#ifdef __thumb2__
+ it hi
+#endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8-d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0-q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2-q3},[r11]
+ mov r11,#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
+ cmp r11,#64*3
+ bhs .L192_or_more_neon
+ cmp r11,#64*2
+ bhs .L128_or_more_neon
+ cmp r11,#64*1
+ bhs .L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0-q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2-q3},[r8]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0-q1},[r14]!
+ vst1.8 {q2-q3},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4-q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6-q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0-q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2-q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4-q5},[r14]!
+ vst1.8 {q6-q7},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8-q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10-q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {q12-q13},[r12]!
+ vld1.8 {q14-q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12-q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0-q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14-q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2-q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4-q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6-q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8-q9},[r14]!
+ vst1.8 {q10-q11},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{r8-r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldmia r8,{r8-r11} @ load key material
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ stmia sp,{r0-r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r4,r4,#3 @ counter+3
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+#ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+ stmia r8,{r0-r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8-d15}
+ add sp,sp,#4*(16+3)
+.Lno_data_neon:
+ ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_neon)
+#endif
+
+.align 5
+.Lsigma2:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone2:
+.long 1,0,0,0
+.word -1
+
+.align 5
+ENTRY(chacha20_arm)
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0-r2,r4-r11,lr}
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data_arm
+.Lshort:
+ ldmia r12,{r4-r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+#if __LINUX_ARM_ARCH__ < 7 && !defined(__thumb2__)
+ sub r14,pc,#100 @ .Lsigma2
+#else
+ adr r14,.Lsigma2 @ .Lsigma2
+#endif
+ stmdb sp!,{r4-r7} @ copy counter and nonce
+ ldmia r3,{r4-r11} @ load key
+ ldmia r14,{r0-r3} @ load sigma
+ stmdb sp!,{r4-r11} @ copy key
+ stmdb sp!,{r0-r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0-r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop
+
+.align 4
+.Loop:
+ subs r11,r11,#1
+ add r0,r0,r4
+ mov r12,r12,ror#16
+ add r1,r1,r5
+ mov r10,r10,ror#16
+ eor r12,r12,r0,ror#16
+ eor r10,r10,r1,ror#16
+ add r8,r8,r12
+ mov r4,r4,ror#20
+ add r9,r9,r10
+ mov r5,r5,ror#20
+ eor r4,r4,r8,ror#20
+ eor r5,r5,r9,ror#20
+ add r0,r0,r4
+ mov r12,r12,ror#24
+ add r1,r1,r5
+ mov r10,r10,ror#24
+ eor r12,r12,r0,ror#24
+ eor r10,r10,r1,ror#24
+ add r8,r8,r12
+ mov r4,r4,ror#25
+ add r9,r9,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+13)]
+ ldr r10,[sp,#4*(16+15)]
+ eor r4,r4,r8,ror#25
+ eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)]
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6
+ mov r14,r14,ror#16
+ str r9,[sp,#4*(16+9)]
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7
+ mov r10,r10,ror#16
+ eor r14,r14,r2,ror#16
+ eor r10,r10,r3,ror#16
+ add r8,r8,r14
+ mov r6,r6,ror#20
+ add r9,r9,r10
+ mov r7,r7,ror#20
+ eor r6,r6,r8,ror#20
+ eor r7,r7,r9,ror#20
+ add r2,r2,r6
+ mov r14,r14,ror#24
+ add r3,r3,r7
+ mov r10,r10,ror#24
+ eor r14,r14,r2,ror#24
+ eor r10,r10,r3,ror#24
+ add r8,r8,r14
+ mov r6,r6,ror#25
+ add r9,r9,r10
+ mov r7,r7,ror#25
+ eor r6,r6,r8,ror#25
+ eor r7,r7,r9,ror#25
+ add r0,r0,r5
+ mov r10,r10,ror#16
+ add r1,r1,r6
+ mov r12,r12,ror#16
+ eor r10,r10,r0,ror#16
+ eor r12,r12,r1,ror#16
+ add r8,r8,r10
+ mov r5,r5,ror#20
+ add r9,r9,r12
+ mov r6,r6,ror#20
+ eor r5,r5,r8,ror#20
+ eor r6,r6,r9,ror#20
+ add r0,r0,r5
+ mov r10,r10,ror#24
+ add r1,r1,r6
+ mov r12,r12,ror#24
+ eor r10,r10,r0,ror#24
+ eor r12,r12,r1,ror#24
+ add r8,r8,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+15)]
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12
+ mov r6,r6,ror#25
+ eor r5,r5,r8,ror#25
+ eor r6,r6,r9,ror#25
+ str r8,[sp,#4*(16+10)]
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7
+ mov r10,r10,ror#16
+ str r9,[sp,#4*(16+11)]
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4
+ mov r14,r14,ror#16
+ eor r10,r10,r2,ror#16
+ eor r14,r14,r3,ror#16
+ add r8,r8,r10
+ mov r7,r7,ror#20
+ add r9,r9,r14
+ mov r4,r4,ror#20
+ eor r7,r7,r8,ror#20
+ eor r4,r4,r9,ror#20
+ add r2,r2,r7
+ mov r10,r10,ror#24
+ add r3,r3,r4
+ mov r14,r14,ror#24
+ eor r10,r10,r2,ror#24
+ eor r14,r14,r3,ror#24
+ add r8,r8,r10
+ mov r7,r7,ror#25
+ add r9,r9,r14
+ mov r4,r4,ror#25
+ eor r7,r7,r8,ror#25
+ eor r4,r4,r9,ror#25
+ bne .Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __LINUX_ARM_ARCH__ >= 6 || !defined(__ARMEB__)
+#if __LINUX_ARM_ARCH__ < 7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne .Lunaligned
+ cmp r11,#64 @ restore flags
+#else
+ ldr r10,[sp,#4*(2)]
+#endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8-r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0-r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+#ifdef __thumb2__
+ itt hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8-r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+#ifdef __thumb2__
+ itt hi
+#endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+#if __LINUX_ARM_ARCH__ >= 6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+#endif
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+#ifdef __thumb2__
+ it ne
+#endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+#ifdef __thumb2__
+ itt hs
+#endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+#ifdef __thumb2__
+ it hs
+#endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+#if __LINUX_ARM_ARCH__ < 7
+ b .Ltail
+
+.align 4
+.Lunaligned: @ unaligned endian-neutral path
+ cmp r11,#64 @ restore flags
+#endif
+#endif
+#if __LINUX_ARM_ARCH__ < 7
+ ldr r11,[sp,#4*(3)]
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8-r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+ add r6,r6,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8-r11} @ load key material
+ ldmia r0,{r0-r7} @ load second half
+#ifdef __thumb2__
+ itt hi
+#endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8-r11} @ load key material
+ add r4,r4,r8 @ accumulate key material
+#ifdef __thumb2__
+ itt hi
+#endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r5,r9
+ add r6,r6,r10
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+#ifdef __thumb2__
+ itete lo
+#endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+#ifdef __thumb2__
+ itt hs
+#endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+#ifdef __thumb2__
+ it ne
+#endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+#ifdef __thumb2__
+ it hs
+#endif
+ subhs r11,r8,#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+.Lno_data_arm:
+ ldmia sp!,{r4-r11,pc}
+ENDPROC(chacha20_arm)
diff --git a/lib/zinc/chacha20/chacha20-arm64.S b/lib/zinc/chacha20/chacha20-arm64.S
new file mode 100644
index 000000000000..c3d1243161a8
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-arm64.S
@@ -0,0 +1,1940 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.text
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+
+.align 5
+ENTRY(chacha20_arm)
+ cbz x2,.Labort
+.Lshort:
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __ARMEB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+.Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort:
+ ret
+
+.align 4
+.Ltail:
+ add x2,x2,#64
+.Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+ENDPROC(chacha20_arm)
+
+.align 5
+ENTRY(chacha20_neon)
+ cbz x2,.Labort_neon
+ cmp x2,#192
+ b.lo .Lshort
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adr x5,.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+.Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo .Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+.Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ ret
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+.Labort_neon:
+ ret
+ENDPROC(chacha20_neon)
--
2.18.0
^ permalink raw reply related
* [PATCH v2 05/17] zinc: ChaCha20 x86_64 implementation
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
To: linux-kernel, netdev, davem
Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
Jean-Philippe Aumasson, Andy Polyakov, Thomas Gleixner,
Ingo Molnar, x86, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
This provides SSSE3, AVX-2, AVX-512F, and AVX-512VL implementations for
ChaCha20. The AVX-512F implementation is disabled on Skylake, due to
throttling, and the VL ymm implementation is used instead. These come
from Andy Polyakov's implementation, with some heavy modifications from
Samuel Neves and me.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
Cc: linux-crypto@vger.kernel.org
---
lib/zinc/Makefile | 4 +
lib/zinc/chacha20/chacha20-x86_64-glue.h | 84 +
lib/zinc/chacha20/chacha20-x86_64.S | 2630 ++++++++++++++++++++++
3 files changed, 2718 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-x86_64-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-x86_64.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 87be627f3354..fc6ca1c84cd1 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -4,6 +4,10 @@ ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
ifeq ($(CONFIG_ZINC_CHACHA20),y)
zinc-y += chacha20/chacha20.o
+ifeq ($(CONFIG_X86_64),y)
+zinc-y += chacha20/chacha20-x86_64.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-x86_64-glue.h
+endif
ifeq ($(CONFIG_ARM),y)
zinc-y += chacha20/chacha20-arm.o
CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
diff --git a/lib/zinc/chacha20/chacha20-x86_64-glue.h b/lib/zinc/chacha20/chacha20-x86_64-glue.h
new file mode 100644
index 000000000000..ece8279e93e0
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64-glue.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+#ifdef CONFIG_AS_SSSE3
+asmlinkage void hchacha20_ssse3(u8 *derived_key, const u8 *nonce, const u8 *key);
+asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+#endif
+
+static bool chacha20_use_ssse3 __ro_after_init;
+static bool chacha20_use_avx2 __ro_after_init;
+static bool chacha20_use_avx512 __ro_after_init;
+static bool chacha20_use_avx512vl __ro_after_init;
+
+void __init chacha20_fpu_init(void)
+{
+#ifndef CONFIG_UML
+ chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
+ chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ chacha20_use_avx512 = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
+ chacha20_use_avx512vl = boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL);
+#endif
+}
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+ if (simd_context != HAVE_FULL_SIMD)
+ return false;
+
+#ifdef CONFIG_AS_AVX512
+ if (chacha20_use_avx512) {
+ chacha20_avx512(dst, src, len, key, counter);
+ return true;
+ }
+ if (chacha20_use_avx512vl) {
+ chacha20_avx512vl(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_AVX2
+ if (chacha20_use_avx2) {
+ chacha20_avx2(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+#ifdef CONFIG_AS_SSSE3
+ if (chacha20_use_ssse3) {
+ chacha20_ssse3(dst, src, len, key, counter);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context)
+{
+#if defined(CONFIG_AS_SSSE3)
+ if (simd_context == HAVE_FULL_SIMD && chacha20_use_ssse3) {
+ hchacha20_ssse3(derived_key, nonce, key);
+ return true;
+ }
+#endif
+ return false;
+}
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-x86_64.S b/lib/zinc/chacha20/chacha20-x86_64.S
new file mode 100644
index 000000000000..39883f3718b6
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-x86_64.S
@@ -0,0 +1,2630 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.Lzero, "aM", @progbits, 16
+.align 16
+.Lzero:
+.long 0,0,0,0
+.section .rodata.cst16.Lone, "aM", @progbits, 16
+.align 16
+.Lone:
+.long 1,0,0,0
+.section .rodata.cst16.Linc, "aM", @progbits, 16
+.align 16
+.Linc:
+.long 0,1,2,3
+.section .rodata.cst16.Lfour, "aM", @progbits, 16
+.align 16
+.Lfour:
+.long 4,4,4,4
+.section .rodata.cst32.Lincy, "aM", @progbits, 32
+.align 32
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.section .rodata.cst32.Leight, "aM", @progbits, 32
+.align 32
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.section .rodata.cst16.Lrot16, "aM", @progbits, 16
+.align 16
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.section .rodata.cst16.Lrot24, "aM", @progbits, 16
+.align 16
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.section .rodata.cst16.Lsigma, "aM", @progbits, 16
+.align 16
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.section .rodata.cst64.Lfourz, "aM", @progbits, 64
+.align 64
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.section .rodata.cst64.Lincz, "aM", @progbits, 64
+.align 64
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
+.align 64
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
+.align 64
+.Ltwoy:
+.long 2,0,0,0, 2,0,0,0
+
+.text
+
+#ifdef CONFIG_AS_SSSE3
+.align 32
+ENTRY(hchacha20_ssse3)
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rdx),%xmm1
+ movdqu 16(%rdx),%xmm2
+ movdqu (%rsi),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+ movq $10,%r8
+ .align 32
+.Loop_hssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_hssse3
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ ret
+ENDPROC(hchacha20_ssse3)
+
+.align 32
+ENTRY(chacha20_ssse3)
+.Lchacha20_ssse3:
+ cmpq $0,%rdx
+ je .Lssse3_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_4x
+
+.Ldo_sse3_after_all:
+ subq $64+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm6,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm7,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq -8(%r10),%rsp
+
+.Lssse3_epilogue:
+ ret
+
+.align 32
+.Lchacha20_4x:
+ leaq 8(%rsp),%r10
+
+.Lproceed4x:
+ subq $0x140+8,%rsp
+ andq $-32,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r9),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm7,%xmm0
+ pshufb %xmm7,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+ pshufb %xmm6,%xmm0
+ pshufb %xmm6,%xmm1
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm7,%xmm2
+ pshufb %xmm7,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+ pshufb %xmm6,%xmm2
+ pshufb %xmm6,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm7,%xmm3
+ pshufb %xmm7,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+ pshufb %xmm6,%xmm3
+ pshufb %xmm6,%xmm0
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm7,%xmm1
+ pshufb %xmm7,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+ pshufb %xmm6,%xmm1
+ pshufb %xmm6,%xmm2
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r9),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r9,%r9
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq -8(%r10),%rsp
+
+.L4x_epilogue:
+ ret
+ENDPROC(chacha20_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX2
+.align 32
+ENTRY(chacha20_avx2)
+.Lchacha20_avx2:
+ cmpq $0,%rdx
+ je .L8x_epilogue
+ leaq 8(%rsp),%r10
+
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r9
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r9),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r9),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r9,%r9
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r9,%r9
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L8x_epilogue:
+ ret
+ENDPROC(chacha20_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
+.align 32
+ENTRY(chacha20_avx512)
+.Lchacha20_avx512:
+ cmpq $0,%rdx
+ je .Lavx512_epilogue
+ leaq 8(%rsp),%r10
+
+ cmpq $512,%rdx
+ ja .Lchacha20_16x
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti32x4 .Lsigma(%rip),%zmm0
+ vbroadcasti32x4 (%rcx),%zmm1
+ vbroadcasti32x4 16(%rcx),%zmm2
+ vbroadcasti32x4 (%r8),%zmm3
+
+ vmovdqa32 %zmm0,%zmm16
+ vmovdqa32 %zmm1,%zmm17
+ vmovdqa32 %zmm2,%zmm18
+ vpaddd .Lzeroz(%rip),%zmm3,%zmm3
+ vmovdqa32 .Lfourz(%rip),%zmm20
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+ vmovdqa32 %zmm16,%zmm0
+ vmovdqa32 %zmm17,%zmm1
+ vmovdqa32 %zmm18,%zmm2
+ vpaddd %zmm20,%zmm19,%zmm3
+ movq $10,%r8
+ vmovdqa32 %zmm3,%zmm19
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $57,%zmm1,%zmm1
+ vpshufd $147,%zmm3,%zmm3
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $16,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $12,%zmm1,%zmm1
+ vpaddd %zmm1,%zmm0,%zmm0
+ vpxord %zmm0,%zmm3,%zmm3
+ vprold $8,%zmm3,%zmm3
+ vpaddd %zmm3,%zmm2,%zmm2
+ vpxord %zmm2,%zmm1,%zmm1
+ vprold $7,%zmm1,%zmm1
+ vpshufd $78,%zmm2,%zmm2
+ vpshufd $147,%zmm1,%zmm1
+ vpshufd $57,%zmm3,%zmm3
+ decq %r8
+ jnz .Loop_avx512
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $1,%zmm0,%xmm4
+ vextracti32x4 $1,%zmm1,%xmm5
+ vextracti32x4 $1,%zmm2,%xmm6
+ vextracti32x4 $1,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $2,%zmm0,%xmm4
+ vextracti32x4 $2,%zmm1,%xmm5
+ vextracti32x4 $2,%zmm2,%xmm6
+ vextracti32x4 $2,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512
+
+ vextracti32x4 $3,%zmm0,%xmm4
+ vextracti32x4 $3,%zmm1,%xmm5
+ vextracti32x4 $3,%zmm2,%xmm6
+ vextracti32x4 $3,%zmm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jnz .Loop_outer_avx512
+
+ jmp .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512
+
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone_avx512:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.Lavx512_epilogue:
+ ret
+
+.align 32
+.Lchacha20_16x:
+ leaq 8(%rsp),%r10
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti32x4 (%r9),%zmm3
+ vbroadcasti32x4 (%rcx),%zmm7
+ vbroadcasti32x4 16(%rcx),%zmm11
+ vbroadcasti32x4 (%r8),%zmm15
+
+ vpshufd $0x00,%zmm3,%zmm0
+ vpshufd $0x55,%zmm3,%zmm1
+ vpshufd $0xaa,%zmm3,%zmm2
+ vpshufd $0xff,%zmm3,%zmm3
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ vpshufd $0x00,%zmm7,%zmm4
+ vpshufd $0x55,%zmm7,%zmm5
+ vpshufd $0xaa,%zmm7,%zmm6
+ vpshufd $0xff,%zmm7,%zmm7
+ vmovdqa64 %zmm4,%zmm20
+ vmovdqa64 %zmm5,%zmm21
+ vmovdqa64 %zmm6,%zmm22
+ vmovdqa64 %zmm7,%zmm23
+
+ vpshufd $0x00,%zmm11,%zmm8
+ vpshufd $0x55,%zmm11,%zmm9
+ vpshufd $0xaa,%zmm11,%zmm10
+ vpshufd $0xff,%zmm11,%zmm11
+ vmovdqa64 %zmm8,%zmm24
+ vmovdqa64 %zmm9,%zmm25
+ vmovdqa64 %zmm10,%zmm26
+ vmovdqa64 %zmm11,%zmm27
+
+ vpshufd $0x00,%zmm15,%zmm12
+ vpshufd $0x55,%zmm15,%zmm13
+ vpshufd $0xaa,%zmm15,%zmm14
+ vpshufd $0xff,%zmm15,%zmm15
+ vpaddd .Lincz(%rip),%zmm12,%zmm12
+ vmovdqa64 %zmm12,%zmm28
+ vmovdqa64 %zmm13,%zmm29
+ vmovdqa64 %zmm14,%zmm30
+ vmovdqa64 %zmm15,%zmm31
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop_outer16x:
+ vpbroadcastd 0(%r9),%zmm0
+ vpbroadcastd 4(%r9),%zmm1
+ vpbroadcastd 8(%r9),%zmm2
+ vpbroadcastd 12(%r9),%zmm3
+ vpaddd .Lsixteen(%rip),%zmm28,%zmm28
+ vmovdqa64 %zmm20,%zmm4
+ vmovdqa64 %zmm21,%zmm5
+ vmovdqa64 %zmm22,%zmm6
+ vmovdqa64 %zmm23,%zmm7
+ vmovdqa64 %zmm24,%zmm8
+ vmovdqa64 %zmm25,%zmm9
+ vmovdqa64 %zmm26,%zmm10
+ vmovdqa64 %zmm27,%zmm11
+ vmovdqa64 %zmm28,%zmm12
+ vmovdqa64 %zmm29,%zmm13
+ vmovdqa64 %zmm30,%zmm14
+ vmovdqa64 %zmm31,%zmm15
+
+ vmovdqa64 %zmm0,%zmm16
+ vmovdqa64 %zmm1,%zmm17
+ vmovdqa64 %zmm2,%zmm18
+ vmovdqa64 %zmm3,%zmm19
+
+ movl $10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop16x:
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vpaddd %zmm4,%zmm0,%zmm0
+ vpaddd %zmm5,%zmm1,%zmm1
+ vpaddd %zmm6,%zmm2,%zmm2
+ vpaddd %zmm7,%zmm3,%zmm3
+ vpxord %zmm0,%zmm12,%zmm12
+ vpxord %zmm1,%zmm13,%zmm13
+ vpxord %zmm2,%zmm14,%zmm14
+ vpxord %zmm3,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vpaddd %zmm12,%zmm8,%zmm8
+ vpaddd %zmm13,%zmm9,%zmm9
+ vpaddd %zmm14,%zmm10,%zmm10
+ vpaddd %zmm15,%zmm11,%zmm11
+ vpxord %zmm8,%zmm4,%zmm4
+ vpxord %zmm9,%zmm5,%zmm5
+ vpxord %zmm10,%zmm6,%zmm6
+ vpxord %zmm11,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $16,%zmm15,%zmm15
+ vprold $16,%zmm12,%zmm12
+ vprold $16,%zmm13,%zmm13
+ vprold $16,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $12,%zmm5,%zmm5
+ vprold $12,%zmm6,%zmm6
+ vprold $12,%zmm7,%zmm7
+ vprold $12,%zmm4,%zmm4
+ vpaddd %zmm5,%zmm0,%zmm0
+ vpaddd %zmm6,%zmm1,%zmm1
+ vpaddd %zmm7,%zmm2,%zmm2
+ vpaddd %zmm4,%zmm3,%zmm3
+ vpxord %zmm0,%zmm15,%zmm15
+ vpxord %zmm1,%zmm12,%zmm12
+ vpxord %zmm2,%zmm13,%zmm13
+ vpxord %zmm3,%zmm14,%zmm14
+ vprold $8,%zmm15,%zmm15
+ vprold $8,%zmm12,%zmm12
+ vprold $8,%zmm13,%zmm13
+ vprold $8,%zmm14,%zmm14
+ vpaddd %zmm15,%zmm10,%zmm10
+ vpaddd %zmm12,%zmm11,%zmm11
+ vpaddd %zmm13,%zmm8,%zmm8
+ vpaddd %zmm14,%zmm9,%zmm9
+ vpxord %zmm10,%zmm5,%zmm5
+ vpxord %zmm11,%zmm6,%zmm6
+ vpxord %zmm8,%zmm7,%zmm7
+ vpxord %zmm9,%zmm4,%zmm4
+ vprold $7,%zmm5,%zmm5
+ vprold $7,%zmm6,%zmm6
+ vprold $7,%zmm7,%zmm7
+ vprold $7,%zmm4,%zmm4
+ decl %eax
+ jnz .Loop16x
+
+ vpaddd %zmm16,%zmm0,%zmm0
+ vpaddd %zmm17,%zmm1,%zmm1
+ vpaddd %zmm18,%zmm2,%zmm2
+ vpaddd %zmm19,%zmm3,%zmm3
+
+ vpunpckldq %zmm1,%zmm0,%zmm18
+ vpunpckldq %zmm3,%zmm2,%zmm19
+ vpunpckhdq %zmm1,%zmm0,%zmm0
+ vpunpckhdq %zmm3,%zmm2,%zmm2
+ vpunpcklqdq %zmm19,%zmm18,%zmm1
+ vpunpckhqdq %zmm19,%zmm18,%zmm18
+ vpunpcklqdq %zmm2,%zmm0,%zmm3
+ vpunpckhqdq %zmm2,%zmm0,%zmm0
+ vpaddd %zmm20,%zmm4,%zmm4
+ vpaddd %zmm21,%zmm5,%zmm5
+ vpaddd %zmm22,%zmm6,%zmm6
+ vpaddd %zmm23,%zmm7,%zmm7
+
+ vpunpckldq %zmm5,%zmm4,%zmm2
+ vpunpckldq %zmm7,%zmm6,%zmm19
+ vpunpckhdq %zmm5,%zmm4,%zmm4
+ vpunpckhdq %zmm7,%zmm6,%zmm6
+ vpunpcklqdq %zmm19,%zmm2,%zmm5
+ vpunpckhqdq %zmm19,%zmm2,%zmm2
+ vpunpcklqdq %zmm6,%zmm4,%zmm7
+ vpunpckhqdq %zmm6,%zmm4,%zmm4
+ vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19
+ vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
+ vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
+ vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
+ vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
+ vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
+ vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
+ vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
+ vpaddd %zmm24,%zmm8,%zmm8
+ vpaddd %zmm25,%zmm9,%zmm9
+ vpaddd %zmm26,%zmm10,%zmm10
+ vpaddd %zmm27,%zmm11,%zmm11
+
+ vpunpckldq %zmm9,%zmm8,%zmm6
+ vpunpckldq %zmm11,%zmm10,%zmm0
+ vpunpckhdq %zmm9,%zmm8,%zmm8
+ vpunpckhdq %zmm11,%zmm10,%zmm10
+ vpunpcklqdq %zmm0,%zmm6,%zmm9
+ vpunpckhqdq %zmm0,%zmm6,%zmm6
+ vpunpcklqdq %zmm10,%zmm8,%zmm11
+ vpunpckhqdq %zmm10,%zmm8,%zmm8
+ vpaddd %zmm28,%zmm12,%zmm12
+ vpaddd %zmm29,%zmm13,%zmm13
+ vpaddd %zmm30,%zmm14,%zmm14
+ vpaddd %zmm31,%zmm15,%zmm15
+
+ vpunpckldq %zmm13,%zmm12,%zmm10
+ vpunpckldq %zmm15,%zmm14,%zmm0
+ vpunpckhdq %zmm13,%zmm12,%zmm12
+ vpunpckhdq %zmm15,%zmm14,%zmm14
+ vpunpcklqdq %zmm0,%zmm10,%zmm13
+ vpunpckhqdq %zmm0,%zmm10,%zmm10
+ vpunpcklqdq %zmm14,%zmm12,%zmm15
+ vpunpckhqdq %zmm14,%zmm12,%zmm12
+ vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0
+ vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
+ vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
+ vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
+ vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
+ vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
+ vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
+ vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
+ vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16
+ vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
+ vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
+ vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
+ vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
+ vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
+ vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
+ vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
+ vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
+ vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
+ vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
+ vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
+ vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
+ vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
+ vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
+ vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
+ cmpq $1024,%rdx
+ jb .Ltail16x
+
+ vpxord 0(%rsi),%zmm16,%zmm16
+ vpxord 64(%rsi),%zmm17,%zmm17
+ vpxord 128(%rsi),%zmm14,%zmm14
+ vpxord 192(%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm16,0(%rdi)
+ vmovdqu32 %zmm17,64(%rdi)
+ vmovdqu32 %zmm14,128(%rdi)
+ vmovdqu32 %zmm8,192(%rdi)
+
+ vpxord 256(%rsi),%zmm19,%zmm19
+ vpxord 320(%rsi),%zmm1,%zmm1
+ vpxord 384(%rsi),%zmm18,%zmm18
+ vpxord 448(%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm19,256(%rdi)
+ vmovdqu32 %zmm1,320(%rdi)
+ vmovdqu32 %zmm18,384(%rdi)
+ vmovdqu32 %zmm3,448(%rdi)
+
+ vpxord 512(%rsi),%zmm0,%zmm0
+ vpxord 576(%rsi),%zmm9,%zmm9
+ vpxord 640(%rsi),%zmm6,%zmm6
+ vpxord 704(%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm0,512(%rdi)
+ vmovdqu32 %zmm9,576(%rdi)
+ vmovdqu32 %zmm6,640(%rdi)
+ vmovdqu32 %zmm11,704(%rdi)
+
+ vpxord 768(%rsi),%zmm13,%zmm13
+ vpxord 832(%rsi),%zmm10,%zmm10
+ vpxord 896(%rsi),%zmm15,%zmm15
+ vpxord 960(%rsi),%zmm12,%zmm12
+ leaq 1024(%rsi),%rsi
+ vmovdqu32 %zmm13,768(%rdi)
+ vmovdqu32 %zmm10,832(%rdi)
+ vmovdqu32 %zmm15,896(%rdi)
+ vmovdqu32 %zmm12,960(%rdi)
+ leaq 1024(%rdi),%rdi
+
+ subq $1024,%rdx
+ jnz .Loop_outer16x
+
+ jmp .Ldone16x
+
+.align 32
+.Ltail16x:
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm16,%zmm16
+ vmovdqu32 %zmm16,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm17,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm17,%zmm17
+ vmovdqu32 %zmm17,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm14,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm14,%zmm14
+ vmovdqu32 %zmm14,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm8,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm8,%zmm8
+ vmovdqu32 %zmm8,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm19,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm19,%zmm19
+ vmovdqu32 %zmm19,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm1,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm1,%zmm1
+ vmovdqu32 %zmm1,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm18,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm18,%zmm18
+ vmovdqu32 %zmm18,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm3,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $512,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm3,%zmm3
+ vmovdqu32 %zmm3,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm0,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $576,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm0,%zmm0
+ vmovdqu32 %zmm0,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm9,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $640,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm9,%zmm9
+ vmovdqu32 %zmm9,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm6,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $704,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm6,%zmm6
+ vmovdqu32 %zmm6,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm11,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $768,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm11,%zmm11
+ vmovdqu32 %zmm11,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm13,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $832,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm13,%zmm13
+ vmovdqu32 %zmm13,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm10,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $896,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm10,%zmm10
+ vmovdqu32 %zmm10,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm15,%zmm16
+ leaq 64(%rsi),%rsi
+
+ cmpq $960,%rdx
+ jb .Less_than_64_16x
+ vpxord (%rsi),%zmm15,%zmm15
+ vmovdqu32 %zmm15,(%rdi,%rsi,1)
+ je .Ldone16x
+ vmovdqa32 %zmm12,%zmm16
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_16x:
+ vmovdqa32 %zmm16,0(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail16x:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail16x
+
+ vpxord %zmm16,%zmm16,%zmm16
+ vmovdqa32 %zmm16,0(%rsp)
+
+.Ldone16x:
+ vzeroall
+ leaq -8(%r10),%rsp
+
+.L16x_epilogue:
+ ret
+ENDPROC(chacha20_avx512)
+
+.align 32
+ENTRY(chacha20_avx512vl)
+ cmpq $0,%rdx
+ je .Lavx512vl_epilogue
+
+ leaq 8(%rsp),%r10
+
+ cmpq $128,%rdx
+ ja .Lchacha20_8xvl
+
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vbroadcasti128 .Lsigma(%rip),%ymm0
+ vbroadcasti128 (%rcx),%ymm1
+ vbroadcasti128 16(%rcx),%ymm2
+ vbroadcasti128 (%r8),%ymm3
+
+ vmovdqa32 %ymm0,%ymm16
+ vmovdqa32 %ymm1,%ymm17
+ vmovdqa32 %ymm2,%ymm18
+ vpaddd .Lzeroz(%rip),%ymm3,%ymm3
+ vmovdqa32 .Ltwoy(%rip),%ymm20
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 16
+.Loop_outer_avx512vl:
+ vmovdqa32 %ymm18,%ymm2
+ vpaddd %ymm20,%ymm19,%ymm3
+ movq $10,%r8
+ vmovdqa32 %ymm3,%ymm19
+ jmp .Loop_avx512vl
+
+.align 32
+.Loop_avx512vl:
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $57,%ymm1,%ymm1
+ vpshufd $147,%ymm3,%ymm3
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+ vpshufd $78,%ymm2,%ymm2
+ vpshufd $147,%ymm1,%ymm1
+ vpshufd $57,%ymm3,%ymm3
+ decq %r8
+ jnz .Loop_avx512vl
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ subq $64,%rdx
+ jb .Ltail64_avx512vl
+
+ vpxor 0(%rsi),%xmm0,%xmm4
+ vpxor 16(%rsi),%xmm1,%xmm5
+ vpxor 32(%rsi),%xmm2,%xmm6
+ vpxor 48(%rsi),%xmm3,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ jz .Ldone_avx512vl
+
+ vextracti128 $1,%ymm0,%xmm4
+ vextracti128 $1,%ymm1,%xmm5
+ vextracti128 $1,%ymm2,%xmm6
+ vextracti128 $1,%ymm3,%xmm7
+
+ subq $64,%rdx
+ jb .Ltail_avx512vl
+
+ vpxor 0(%rsi),%xmm4,%xmm4
+ vpxor 16(%rsi),%xmm5,%xmm5
+ vpxor 32(%rsi),%xmm6,%xmm6
+ vpxor 48(%rsi),%xmm7,%xmm7
+ leaq 64(%rsi),%rsi
+
+ vmovdqu %xmm4,0(%rdi)
+ vmovdqu %xmm5,16(%rdi)
+ vmovdqu %xmm6,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ vmovdqa32 %ymm16,%ymm0
+ vmovdqa32 %ymm17,%ymm1
+ jnz .Loop_outer_avx512vl
+
+ jmp .Ldone_avx512vl
+
+.align 16
+.Ltail64_avx512vl:
+ vmovdqa %xmm0,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm2,32(%rsp)
+ vmovdqa %xmm3,48(%rsp)
+ addq $64,%rdx
+ jmp .Loop_tail_avx512vl
+
+.align 16
+.Ltail_avx512vl:
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovdqa %xmm7,48(%rsp)
+ addq $64,%rdx
+
+.Loop_tail_avx512vl:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_avx512vl
+
+ vmovdqa32 %ymm16,0(%rsp)
+ vmovdqa32 %ymm16,32(%rsp)
+
+.Ldone_avx512vl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.Lavx512vl_epilogue:
+ ret
+
+.align 32
+.Lchacha20_8xvl:
+ leaq 8(%rsp),%r10
+ subq $64+8,%rsp
+ andq $-64,%rsp
+ vzeroupper
+
+ leaq .Lsigma(%rip),%r9
+ vbroadcasti128 (%r9),%ymm3
+ vbroadcasti128 (%rcx),%ymm7
+ vbroadcasti128 16(%rcx),%ymm11
+ vbroadcasti128 (%r8),%ymm15
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vpshufd $0xaa,%ymm3,%ymm2
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpshufd $0xaa,%ymm7,%ymm6
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vpshufd $0xaa,%ymm11,%ymm10
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vpshufd $0xaa,%ymm15,%ymm14
+ vpshufd $0xff,%ymm15,%ymm15
+ vpaddd .Lincy(%rip),%ymm12,%ymm12
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop_outer8xvl:
+
+
+ vpbroadcastd 8(%r9),%ymm2
+ vpbroadcastd 12(%r9),%ymm3
+ vpaddd .Leight(%rip),%ymm28,%ymm28
+ vmovdqa64 %ymm20,%ymm4
+ vmovdqa64 %ymm21,%ymm5
+ vmovdqa64 %ymm22,%ymm6
+ vmovdqa64 %ymm23,%ymm7
+ vmovdqa64 %ymm24,%ymm8
+ vmovdqa64 %ymm25,%ymm9
+ vmovdqa64 %ymm26,%ymm10
+ vmovdqa64 %ymm27,%ymm11
+ vmovdqa64 %ymm28,%ymm12
+ vmovdqa64 %ymm29,%ymm13
+ vmovdqa64 %ymm30,%ymm14
+ vmovdqa64 %ymm31,%ymm15
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+
+ movl $10,%eax
+ jmp .Loop8xvl
+
+.align 32
+.Loop8xvl:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpxor %ymm0,%ymm12,%ymm12
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm15,%ymm15
+ vprold $16,%ymm12,%ymm12
+ vprold $16,%ymm13,%ymm13
+ vprold $16,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm5,%ymm5
+ vprold $12,%ymm6,%ymm6
+ vprold $12,%ymm7,%ymm7
+ vprold $12,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpaddd %ymm4,%ymm3,%ymm3
+ vpxor %ymm0,%ymm15,%ymm15
+ vpxor %ymm1,%ymm12,%ymm12
+ vpxor %ymm2,%ymm13,%ymm13
+ vpxor %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm15,%ymm15
+ vprold $8,%ymm12,%ymm12
+ vprold $8,%ymm13,%ymm13
+ vprold $8,%ymm14,%ymm14
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm10,%ymm5,%ymm5
+ vpxor %ymm11,%ymm6,%ymm6
+ vpxor %ymm8,%ymm7,%ymm7
+ vpxor %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm5,%ymm5
+ vprold $7,%ymm6,%ymm6
+ vprold $7,%ymm7,%ymm7
+ vprold $7,%ymm4,%ymm4
+ decl %eax
+ jnz .Loop8xvl
+
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm18
+ vpunpckldq %ymm3,%ymm2,%ymm19
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm19,%ymm18,%ymm1
+ vpunpckhqdq %ymm19,%ymm18,%ymm18
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm2
+ vpunpckldq %ymm7,%ymm6,%ymm19
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm19,%ymm2,%ymm5
+ vpunpckhqdq %ymm19,%ymm2,%ymm2
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vshufi32x4 $0,%ymm5,%ymm1,%ymm19
+ vshufi32x4 $3,%ymm5,%ymm1,%ymm5
+ vshufi32x4 $0,%ymm2,%ymm18,%ymm1
+ vshufi32x4 $3,%ymm2,%ymm18,%ymm2
+ vshufi32x4 $0,%ymm7,%ymm3,%ymm18
+ vshufi32x4 $3,%ymm7,%ymm3,%ymm7
+ vshufi32x4 $0,%ymm4,%ymm0,%ymm3
+ vshufi32x4 $3,%ymm4,%ymm0,%ymm4
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm6
+ vpunpckldq %ymm11,%ymm10,%ymm0
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm0,%ymm6,%ymm9
+ vpunpckhqdq %ymm0,%ymm6,%ymm6
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ vpunpckldq %ymm13,%ymm12,%ymm10
+ vpunpckldq %ymm15,%ymm14,%ymm0
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm0,%ymm10,%ymm13
+ vpunpckhqdq %ymm0,%ymm10,%ymm10
+ vpunpcklqdq %ymm14,%ymm12,%ymm15
+ vpunpckhqdq %ymm14,%ymm12,%ymm12
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ cmpq $512,%rdx
+ jb .Ltail8xvl
+
+ movl $0x80,%eax
+ vpxord 0(%rsi),%ymm19,%ymm19
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vpxor 64(%rsi),%ymm5,%ymm5
+ vpxor 96(%rsi),%ymm13,%ymm13
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm19,0(%rdi)
+ vmovdqu %ymm0,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm13,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm2,%ymm2
+ vpxor 96(%rsi),%ymm10,%ymm10
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm1,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm10,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vpxor 64(%rsi),%ymm7,%ymm7
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu32 %ymm18,0(%rdi)
+ vmovdqu %ymm6,32(%rdi)
+ vmovdqu %ymm7,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vpxor 64(%rsi),%ymm4,%ymm4
+ vpxor 96(%rsi),%ymm12,%ymm12
+ leaq (%rsi,%rax,1),%rsi
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vmovdqu %ymm12,96(%rdi)
+ leaq (%rdi,%rax,1),%rdi
+
+ vpbroadcastd 0(%r9),%ymm0
+ vpbroadcastd 4(%r9),%ymm1
+
+ subq $512,%rdx
+ jnz .Loop_outer8xvl
+
+ jmp .Ldone8xvl
+
+.align 32
+.Ltail8xvl:
+ vmovdqa64 %ymm19,%ymm8
+ xorq %r9,%r9
+ subq %rsi,%rdi
+ cmpq $64,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm8,%ymm8
+ vpxor 32(%rsi),%ymm0,%ymm0
+ vmovdqu %ymm8,0(%rdi,%rsi,1)
+ vmovdqu %ymm0,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm5,%ymm8
+ vmovdqa %ymm13,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $128,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm5,%ymm5
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm5,0(%rdi,%rsi,1)
+ vmovdqu %ymm13,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm1,%ymm8
+ vmovdqa %ymm9,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $192,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm1,%ymm1
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm1,0(%rdi,%rsi,1)
+ vmovdqu %ymm9,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm2,%ymm8
+ vmovdqa %ymm10,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $256,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm2,%ymm2
+ vpxor 32(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm2,0(%rdi,%rsi,1)
+ vmovdqu %ymm10,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa32 %ymm18,%ymm8
+ vmovdqa %ymm6,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $320,%rdx
+ jb .Less_than_64_8xvl
+ vpxord 0(%rsi),%ymm18,%ymm18
+ vpxor 32(%rsi),%ymm6,%ymm6
+ vmovdqu32 %ymm18,0(%rdi,%rsi,1)
+ vmovdqu %ymm6,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm7,%ymm8
+ vmovdqa %ymm15,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $384,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm7,%ymm7
+ vpxor 32(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm7,0(%rdi,%rsi,1)
+ vmovdqu %ymm15,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm3,%ymm8
+ vmovdqa %ymm11,%ymm0
+ leaq 64(%rsi),%rsi
+
+ cmpq $448,%rdx
+ jb .Less_than_64_8xvl
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi,%rsi,1)
+ vmovdqu %ymm11,32(%rdi,%rsi,1)
+ je .Ldone8xvl
+ vmovdqa %ymm4,%ymm8
+ vmovdqa %ymm12,%ymm0
+ leaq 64(%rsi),%rsi
+
+.Less_than_64_8xvl:
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm0,32(%rsp)
+ leaq (%rdi,%rsi,1),%rdi
+ andq $63,%rdx
+
+.Loop_tail8xvl:
+ movzbl (%rsi,%r9,1),%eax
+ movzbl (%rsp,%r9,1),%ecx
+ leaq 1(%r9),%r9
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r9,1)
+ decq %rdx
+ jnz .Loop_tail8xvl
+
+ vpxor %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm8,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+
+.Ldone8xvl:
+ vzeroall
+ leaq -8(%r10),%rsp
+.L8xvl_epilogue:
+ ret
+ENDPROC(chacha20_avx512vl)
+
+#endif /* CONFIG_AS_AVX512 */
--
2.18.0
^ permalink raw reply related
* [PATCH v2 06/17] zinc: ChaCha20 MIPS32r2 implementation
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
To: linux-kernel, netdev, davem
Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
Jean-Philippe Aumasson, René van Dorst, Ralf Baechle,
Paul Burton, James Hogan, linux-mips, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
This MIPS32r2 implementation comes from René van Dorst and me and
results in a nice speedup on the usual OpenWRT targets.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: René van Dorst <opensource@vdorst.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: James Hogan <jhogan@kernel.org>
Cc: linux-mips@linux-mips.org
Cc: linux-crypto@vger.kernel.org
---
lib/zinc/Makefile | 4 +
lib/zinc/chacha20/chacha20-mips-glue.h | 19 +
lib/zinc/chacha20/chacha20-mips.S | 474 +++++++++++++++++++++++++
3 files changed, 497 insertions(+)
create mode 100644 lib/zinc/chacha20/chacha20-mips-glue.h
create mode 100644 lib/zinc/chacha20/chacha20-mips.S
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index fc6ca1c84cd1..3ba830b51695 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -16,6 +16,10 @@ ifeq ($(CONFIG_ARM64),y)
zinc-y += chacha20/chacha20-arm64.o
CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-arm-glue.h
endif
+ifeq ($(CONFIG_MIPS)$(CONFIG_CPU_MIPS32_R2),yy)
+zinc-y += chacha20/chacha20-mips.o
+CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
+endif
endif
zinc-y += main.o
diff --git a/lib/zinc/chacha20/chacha20-mips-glue.h b/lib/zinc/chacha20/chacha20-mips-glue.h
new file mode 100644
index 000000000000..54c71a0b5bd0
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips-glue.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <zinc/chacha20.h>
+
+asmlinkage void chacha20_mips(u8 *out, const u8 *in, const size_t len, const u32 key[8], const u32 counter[4]);
+void __init chacha20_fpu_init(void) { }
+
+static inline bool chacha20_arch(u8 *dst, const u8 *src, const size_t len, const u32 key[8], const u32 counter[4], simd_context_t simd_context)
+{
+ chacha20_mips(dst, src, len, key, counter);
+ return true;
+}
+
+static inline bool hchacha20_arch(u8 *derived_key, const u8 *nonce, const u8 *key, simd_context_t simd_context) { return false; }
+
+#define HAVE_CHACHA20_ARCH_IMPLEMENTATION
diff --git a/lib/zinc/chacha20/chacha20-mips.S b/lib/zinc/chacha20/chacha20-mips.S
new file mode 100644
index 000000000000..77da2c2fb240
--- /dev/null
+++ b/lib/zinc/chacha20/chacha20-mips.S
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define MASK_BYTES 0x03
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 4*16
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $v1
+#define X9 $fp
+#define X10 $s7
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define OUT $a0
+#define IN $a1
+#define BYTES $a2
+/* KEY and NONCE argument must be u32 aligned */
+#define KEY $a3
+/* NONCE pointer is given via stack */
+#define NONCE $t9
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit else we don't leak clear data.
+ * They are used to handling the last bytes which are not multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $ra
+
+#define PTR_LAST_ROUND $t8
+
+/* ChaCha20 constants and stack location */
+#define CONSTANT_OFS_SP 48
+#define UNALIGNED_OFS_SP 40
+
+#define CONSTANT_1 0x61707865
+#define CONSTANT_2 0x3320646e
+#define CONSTANT_3 0x79622d32
+#define CONSTANT_4 0x6b206574
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+ wsbh n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define STORE_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_unaligned_ ## x ## _b: ; \
+ .if ((s != NONCE) || (o != 0)); \
+ lw T0, o(s); \
+ .endif; \
+ lwl T1, x-4+MSB ## (IN); \
+ lwr T1, x-4+LSB ## (IN); \
+ .if ((s == NONCE) && (o == 0)); \
+ addu X ## a, NONCE_0; \
+ .else; \
+ addu X ## a, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## a); \
+ xor X ## a, T1; \
+ swl X ## a, x-4+MSB ## (OUT); \
+ swr X ## a, x-4+LSB ## (OUT);
+
+#define STORE_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_xor_aligned_ ## x ## _b: ; \
+ .if ((s != NONCE) || (o != 0)); \
+ lw T0, o(s); \
+ .endif; \
+ lw T1, x-4 ## (IN); \
+ .if ((s == NONCE) && (o == 0)); \
+ addu X ## a, NONCE_0; \
+ .else; \
+ addu X ## a, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## a); \
+ xor X ## a, T1; \
+ sw X ## a, x-4 ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_aligned_ ## a: ; \
+ .if ((s == NONCE) && (o == 0)); \
+ move SAVED_CA, NONCE_0; \
+ .else; \
+ lw SAVED_CA, o(s);\
+ .endif; \
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ move SAVED_X, X ## a;
+
+#define JMPTBL_UNALIGNED(x, a, s, o) \
+.Lchacha20_mips_jmptbl_unaligned_ ## a: ; \
+ .if ((s == NONCE) && (o == 0)); \
+ move SAVED_CA, NONCE_0; \
+ .else; \
+ lw SAVED_CA, o(s);\
+ .endif; \
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ move SAVED_X, X ## a;
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotl X(V), S; \
+ rotl X(W), S; \
+ rotl X(Y), S; \
+ rotl X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent chacha20_mips
+chacha20_mips:
+ .frame $sp, STACK_SIZE, $ra
+ /* This is in the fifth argument */
+ lw NONCE, 16($sp)
+
+ /* Return bytes = 0. */
+ .set noreorder
+ beqz BYTES, .Lchacha20_mips_end
+ addiu $sp, -STACK_SIZE
+ .set reorder
+
+ /* Calculate PTR_LAST_ROUND */
+ addiu PTR_LAST_ROUND, BYTES, -1
+ ins PTR_LAST_ROUND, $zero, 0, 6
+ addu PTR_LAST_ROUND, OUT
+
+ /* Save s0-s7, fp, ra. */
+ sw $ra, 0($sp)
+ sw $fp, 4($sp)
+ sw $s0, 8($sp)
+ sw $s1, 12($sp)
+ sw $s2, 16($sp)
+ sw $s3, 20($sp)
+ sw $s4, 24($sp)
+ sw $s5, 28($sp)
+ sw $s6, 32($sp)
+ sw $s7, 36($sp)
+
+ lw NONCE_0, 0(NONCE)
+ /* Test IN or OUT is unaligned.
+ * UNALIGNED (T1) = ( IN | OUT ) & 0x00000003
+ */
+ or T1, IN, OUT
+ andi T1, 0x3
+
+ /* Load constant */
+ lui X0, %hi(CONSTANT_1)
+ lui X1, %hi(CONSTANT_2)
+ lui X2, %hi(CONSTANT_3)
+ lui X3, %hi(CONSTANT_4)
+ ori X0, %lo(CONSTANT_1)
+ ori X1, %lo(CONSTANT_2)
+ ori X2, %lo(CONSTANT_3)
+ ori X3, %lo(CONSTANT_4)
+
+ /* Store constant on stack. */
+ sw X0, 0+CONSTANT_OFS_SP($sp)
+ sw X1, 4+CONSTANT_OFS_SP($sp)
+ sw X2, 8+CONSTANT_OFS_SP($sp)
+ sw X3, 12+CONSTANT_OFS_SP($sp)
+
+ sw T1, UNALIGNED_OFS_SP($sp)
+
+ .set noreorder
+ b .Lchacha20_rounds_start
+ andi BYTES, (CHACHA20_BLOCK_SIZE-1)
+ .set reorder
+
+.align 4
+.Loop_chacha20_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+ lw X0, 0+CONSTANT_OFS_SP($sp)
+ lw X1, 4+CONSTANT_OFS_SP($sp)
+ lw X2, 8+CONSTANT_OFS_SP($sp)
+ lw X3, 12+CONSTANT_OFS_SP($sp)
+ lw T1, UNALIGNED_OFS_SP($sp)
+
+.Lchacha20_rounds_start:
+ lw X4, 0(KEY)
+ lw X5, 4(KEY)
+ lw X6, 8(KEY)
+ lw X7, 12(KEY)
+ lw X8, 16(KEY)
+ lw X9, 20(KEY)
+ lw X10, 24(KEY)
+ lw X11, 28(KEY)
+
+ move X12, NONCE_0
+ lw X13, 4(NONCE)
+ lw X14, 8(NONCE)
+ lw X15, 12(NONCE)
+
+ li $at, 9
+.Loop_chacha20_xor_rounds:
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ .set noreorder
+ bnez $at, .Loop_chacha20_xor_rounds
+ addiu $at, -1
+
+ /* Unaligned? Jump */
+ bnez T1, .Loop_chacha20_unaligned
+ andi $at, BYTES, MASK_U32
+
+ /* Last round? No jump */
+ bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_aligned_64_b
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Full block? Jump */
+ beqz BYTES, .Lchacha20_mips_xor_aligned_64_b
+ /* Calculate lower half jump table addr and offset */
+ ins T0, $at, 2, 6
+
+ subu T0, $at
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+ jr T0
+ /* Delay slot */
+ nop
+
+ .set reorder
+
+.Loop_chacha20_unaligned:
+ .set noreorder
+
+ /* Last round? no jump */
+ bne OUT, PTR_LAST_ROUND, .Lchacha20_mips_xor_unaligned_64_b
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Full block? Jump */
+ beqz BYTES, .Lchacha20_mips_xor_unaligned_64_b
+
+ /* Calculate lower half jump table addr and offset */
+ ins T0, $at, 2, 6
+ subu T0, $at
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ jr T0
+ /* Delay slot */
+ nop
+
+ .set reorder
+
+/* Aligned code path
+ */
+.align 4
+ STORE_ALIGNED(64, 15, NONCE,12)
+ STORE_ALIGNED(60, 14, NONCE, 8)
+ STORE_ALIGNED(56, 13, NONCE, 4)
+ STORE_ALIGNED(52, 12, NONCE, 0)
+ STORE_ALIGNED(48, 11, KEY, 28)
+ STORE_ALIGNED(44, 10, KEY, 24)
+ STORE_ALIGNED(40, 9, KEY, 20)
+ STORE_ALIGNED(36, 8, KEY, 16)
+ STORE_ALIGNED(32, 7, KEY, 12)
+ STORE_ALIGNED(28, 6, KEY, 8)
+ STORE_ALIGNED(24, 5, KEY, 4)
+ STORE_ALIGNED(20, 4, KEY, 0)
+ STORE_ALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP)
+ STORE_ALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP)
+ STORE_ALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_aligned_4_b:
+ /* STORE_ALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */
+ lw T0, 0+CONSTANT_OFS_SP($sp)
+ lw T1, 0(IN)
+ addu X0, T0
+ CPU_TO_LE32(X0)
+ xor X0, T1
+ .set noreorder
+ bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ sw X0, 0(OUT)
+ .set reorder
+
+ .set noreorder
+ bne $at, BYTES, .Lchacha20_mips_xor_bytes
+ /* Empty delayslot, Increase NONCE_0, return NONCE_0 value */
+ addiu NONCE_0, 1
+ .set noreorder
+
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ lw $ra, 0($sp)
+ lw $fp, 4($sp)
+ lw $s0, 8($sp)
+ lw $s1, 12($sp)
+ lw $s2, 16($sp)
+ lw $s3, 20($sp)
+ lw $s4, 24($sp)
+ lw $s5, 28($sp)
+ lw $s6, 32($sp)
+ lw $s7, 36($sp)
+.Lchacha20_mips_end:
+ .set noreorder
+ jr $ra
+ addiu $sp, STACK_SIZE
+ .set reorder
+
+ .set noreorder
+ /* Start jump table */
+ JMPTBL_ALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP)
+ JMPTBL_ALIGNED(16, 4, KEY, 0)
+ JMPTBL_ALIGNED(20, 5, KEY, 4)
+ JMPTBL_ALIGNED(24, 6, KEY, 8)
+ JMPTBL_ALIGNED(28, 7, KEY, 12)
+ JMPTBL_ALIGNED(32, 8, KEY, 16)
+ JMPTBL_ALIGNED(36, 9, KEY, 20)
+ JMPTBL_ALIGNED(40, 10, KEY, 24)
+ JMPTBL_ALIGNED(44, 11, KEY, 28)
+ JMPTBL_ALIGNED(48, 12, NONCE, 0)
+ JMPTBL_ALIGNED(52, 13, NONCE, 4)
+ JMPTBL_ALIGNED(56, 14, NONCE, 8)
+ JMPTBL_ALIGNED(60, 15, NONCE,12)
+ /* End jump table */
+ .set reorder
+
+/* Unaligned code path
+ */
+ STORE_UNALIGNED(64, 15, NONCE,12)
+ STORE_UNALIGNED(60, 14, NONCE, 8)
+ STORE_UNALIGNED(56, 13, NONCE, 4)
+ STORE_UNALIGNED(52, 12, NONCE, 0)
+ STORE_UNALIGNED(48, 11, KEY, 28)
+ STORE_UNALIGNED(44, 10, KEY, 24)
+ STORE_UNALIGNED(40, 9, KEY, 20)
+ STORE_UNALIGNED(36, 8, KEY, 16)
+ STORE_UNALIGNED(32, 7, KEY, 12)
+ STORE_UNALIGNED(28, 6, KEY, 8)
+ STORE_UNALIGNED(24, 5, KEY, 4)
+ STORE_UNALIGNED(20, 4, KEY, 0)
+ STORE_UNALIGNED(16, 3, $sp, 12+CONSTANT_OFS_SP)
+ STORE_UNALIGNED(12, 2, $sp, 8+CONSTANT_OFS_SP)
+ STORE_UNALIGNED( 8, 1, $sp, 4+CONSTANT_OFS_SP)
+.Lchacha20_mips_xor_unaligned_4_b:
+ /* STORE_UNALIGNED( 4, 0, $sp, 0+CONSTANT_OFS_SP) */
+ lw T0, 0+CONSTANT_OFS_SP($sp)
+ lwl T1, 0+MSB(IN)
+ lwr T1, 0+LSB(IN)
+ addu X0, T0
+ CPU_TO_LE32(X0)
+ xor X0, T1
+ swl X0, 0+MSB(OUT)
+ .set noreorder
+ bne OUT, PTR_LAST_ROUND, .Loop_chacha20_rounds
+ swr X0, 0+LSB(OUT)
+ .set reorder
+
+ /* Fall through to byte handling */
+ .set noreorder
+ beq $at, BYTES, .Lchacha20_mips_xor_done
+ /* Empty delayslot, increase NONCE_0, return NONCE_0 value */
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha20_mips_xor_bytes:
+ addu OUT, $at
+ addu IN, $at
+ addu SAVED_X, SAVED_CA
+ /* First byte */
+ lbu T1, 0(IN)
+ andi $at, BYTES, 2
+ CPU_TO_LE32(SAVED_X)
+ ROTR(SAVED_X)
+ xor T1, SAVED_X
+ .set noreorder
+ beqz $at, .Lchacha20_mips_xor_done
+ sb T1, 0(OUT)
+ .set reorder
+ /* Second byte */
+ lbu T1, 1(IN)
+ andi $at, BYTES, 1
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ .set noreorder
+ beqz $at, .Lchacha20_mips_xor_done
+ sb T1, 1(OUT)
+ .set reorder
+ /* Third byte */
+ lbu T1, 2(IN)
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ .set noreorder
+ b .Lchacha20_mips_xor_done
+ sb T1, 2(OUT)
+ .set reorder
+.set noreorder
+
+.Lchacha20_mips_jmptbl_unaligned:
+ /* Start jump table */
+ JMPTBL_UNALIGNED( 0, 0, $sp, 0+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED( 4, 1, $sp, 4+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED( 8, 2, $sp, 8+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED(12, 3, $sp, 12+CONSTANT_OFS_SP)
+ JMPTBL_UNALIGNED(16, 4, KEY, 0)
+ JMPTBL_UNALIGNED(20, 5, KEY, 4)
+ JMPTBL_UNALIGNED(24, 6, KEY, 8)
+ JMPTBL_UNALIGNED(28, 7, KEY, 12)
+ JMPTBL_UNALIGNED(32, 8, KEY, 16)
+ JMPTBL_UNALIGNED(36, 9, KEY, 20)
+ JMPTBL_UNALIGNED(40, 10, KEY, 24)
+ JMPTBL_UNALIGNED(44, 11, KEY, 28)
+ JMPTBL_UNALIGNED(48, 12, NONCE, 0)
+ JMPTBL_UNALIGNED(52, 13, NONCE, 4)
+ JMPTBL_UNALIGNED(56, 14, NONCE, 8)
+ JMPTBL_UNALIGNED(60, 15, NONCE,12)
+ /* End jump table */
+.set reorder
+
+.end chacha20_mips
+.set at
--
2.18.0
^ permalink raw reply related
* [PATCH v2 07/17] zinc: Poly1305 generic C implementation and selftest
From: Jason A. Donenfeld @ 2018-08-24 21:38 UTC (permalink / raw)
To: linux-kernel, netdev, davem
Cc: Jason A. Donenfeld, Andy Lutomirski, Greg KH, Samuel Neves,
Jean-Philippe Aumasson, Andy Polyakov, linux-crypto
In-Reply-To: <20180824213849.23647-1-Jason@zx2c4.com>
The C implementation is based on Andy Polyakov's implementation, heavily
modified by Samuel Neves.
Information: https://cr.yp.to/mac.html
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Polyakov <appro@openssl.org>
Cc: linux-crypto@vger.kernel.org
---
include/zinc/poly1305.h | 35 +
lib/zinc/Kconfig | 4 +
lib/zinc/Makefile | 4 +
lib/zinc/main.c | 5 +
lib/zinc/poly1305/poly1305.c | 257 ++++++
lib/zinc/selftest/poly1305.h | 1566 ++++++++++++++++++++++++++++++++++
6 files changed, 1871 insertions(+)
create mode 100644 include/zinc/poly1305.h
create mode 100644 lib/zinc/poly1305/poly1305.c
create mode 100644 lib/zinc/selftest/poly1305.h
diff --git a/include/zinc/poly1305.h b/include/zinc/poly1305.h
new file mode 100644
index 000000000000..9fcae37a41ee
--- /dev/null
+++ b/include/zinc/poly1305.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _ZINC_POLY1305_H
+#define _ZINC_POLY1305_H
+
+#include <linux/simd.h>
+#include <linux/types.h>
+
+enum poly1305_lengths {
+ POLY1305_BLOCK_SIZE = 16,
+ POLY1305_KEY_SIZE = 32,
+ POLY1305_MAC_SIZE = 16
+};
+
+struct poly1305_ctx {
+ u8 opaque[24 * sizeof(u64)];
+ u32 nonce[4];
+ u8 data[POLY1305_BLOCK_SIZE];
+ size_t num;
+} __aligned(8);
+
+void poly1305_fpu_init(void);
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context);
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, const size_t len, simd_context_t simd_context);
+void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE], simd_context_t simd_context);
+
+#ifdef CONFIG_ZINC_DEBUG
+bool poly1305_selftest(void);
+#endif
+
+#endif /* _ZINC_POLY1305_H */
diff --git a/lib/zinc/Kconfig b/lib/zinc/Kconfig
index 5311a0d6ba8b..86936739a05f 100644
--- a/lib/zinc/Kconfig
+++ b/lib/zinc/Kconfig
@@ -10,6 +10,10 @@ config ZINC_CHACHA20
bool
select ZINC
+config ZINC_POLY1305
+ bool
+ select ZINC
+
config ZINC_DEBUG
bool "Zinc cryptography library debugging and self-tests"
depends on ZINC
diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
index 3ba830b51695..bf7b3a75f4c2 100644
--- a/lib/zinc/Makefile
+++ b/lib/zinc/Makefile
@@ -22,6 +22,10 @@ CFLAGS_chacha20.o += -include $(srctree)/$(src)/chacha20/chacha20-mips-glue.h
endif
endif
+ifeq ($(CONFIG_ZINC_POLY1305),y)
+zinc-y += poly1305/poly1305.o
+endif
+
zinc-y += main.o
obj-$(CONFIG_ZINC) := zinc.o
diff --git a/lib/zinc/main.c b/lib/zinc/main.c
index 79b85045f22f..80a2e9f963f7 100644
--- a/lib/zinc/main.c
+++ b/lib/zinc/main.c
@@ -4,6 +4,7 @@
*/
#include <zinc/chacha20.h>
+#include <zinc/poly1305.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -21,6 +22,10 @@ static int __init mod_init(void)
{
#ifdef CONFIG_ZINC_CHACHA20
chacha20_fpu_init();
+#endif
+#ifdef CONFIG_ZINC_POLY1305
+ poly1305_fpu_init();
+ selftest(poly1305);
#endif
return 0;
}
diff --git a/lib/zinc/poly1305/poly1305.c b/lib/zinc/poly1305/poly1305.c
new file mode 100644
index 000000000000..fd4ad5bbb76c
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305.c
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: OpenSSL OR (BSD-3-Clause OR GPL-2.0)
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#include <zinc/poly1305.h>
+
+#include <asm/unaligned.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#ifndef HAVE_POLY1305_ARCH_IMPLEMENTATION
+static inline bool poly1305_init_arch(void *ctx, const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context) { return false; }
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp, const size_t len, const u32 padbit, simd_context_t simd_context) { return false; }
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE], const u32 nonce[4], simd_context_t simd_context) { return false; }
+void __init poly1305_fpu_init(void) { }
+#endif
+
+struct poly1305_internal {
+ u32 h[5];
+ u32 r[4];
+};
+
+static void poly1305_init_generic(void *ctx, const u8 key[16])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ st->r[0] = get_unaligned_le32(&key[ 0]) & 0x0fffffff;
+ st->r[1] = get_unaligned_le32(&key[ 4]) & 0x0ffffffc;
+ st->r[2] = get_unaligned_le32(&key[ 8]) & 0x0ffffffc;
+ st->r[3] = get_unaligned_le32(&key[12]) & 0x0ffffffc;
+}
+
+static void poly1305_blocks_generic(void *ctx, const u8 *inp, size_t len, const u32 padbit)
+{
+#define CONSTANT_TIME_CARRY(a,b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u32 r0, r1, r2, r3;
+ u32 s1, s2, s3;
+ u32 h0, h1, h2, h3, h4, c;
+ u64 d0, d1, d2, d3;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+
+ s1 = r1 + (r1 >> 2);
+ s2 = r2 + (r2 >> 2);
+ s3 = r3 + (r3 >> 2);
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (len >= POLY1305_BLOCK_SIZE) {
+ /* h += m[i] */
+ h0 = (u32)(d0 = (u64)h0 + (0 ) + get_unaligned_le32(&inp[ 0]));
+ h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + get_unaligned_le32(&inp[ 4]));
+ h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + get_unaligned_le32(&inp[ 8]));
+ h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + get_unaligned_le32(&inp[12]));
+ h4 += (u32)(d3 >> 32) + padbit;
+
+ /* h *= r "%" p, where "%" stands for "partial remainder" */
+ d0 = ((u64)h0 * r0) +
+ ((u64)h1 * s3) +
+ ((u64)h2 * s2) +
+ ((u64)h3 * s1);
+ d1 = ((u64)h0 * r1) +
+ ((u64)h1 * r0) +
+ ((u64)h2 * s3) +
+ ((u64)h3 * s2) +
+ (h4 * s1);
+ d2 = ((u64)h0 * r2) +
+ ((u64)h1 * r1) +
+ ((u64)h2 * r0) +
+ ((u64)h3 * s3) +
+ (h4 * s2);
+ d3 = ((u64)h0 * r3) +
+ ((u64)h1 * r2) +
+ ((u64)h2 * r1) +
+ ((u64)h3 * r0) +
+ (h4 * s3);
+ h4 = (h4 * r0);
+
+ /* last reduction step: */
+ /* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */
+ h0 = (u32)d0;
+ h1 = (u32)(d1 += d0 >> 32);
+ h2 = (u32)(d2 += d1 >> 32);
+ h3 = (u32)(d3 += d2 >> 32);
+ h4 += (u32)(d3 >> 32);
+ /* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */
+ c = (h4 >> 2) + (h4 & ~3U);
+ h4 &= 3;
+ h0 += c;
+ h1 += (c = CONSTANT_TIME_CARRY(h0, c));
+ h2 += (c = CONSTANT_TIME_CARRY(h1, c));
+ h3 += (c = CONSTANT_TIME_CARRY(h2, c));
+ h4 += CONSTANT_TIME_CARRY(h3, c);
+ /*
+ * Occasional overflows to 3rd bit of h4 are taken care of
+ * "naturally". If after this point we end up at the top of
+ * this loop, then the overflow bit will be accounted for
+ * in next iteration. If we end up in poly1305_emit, then
+ * comparison to modulus below will still count as "carry
+ * into 131st bit", so that properly reduced value will be
+ * picked in conditional move.
+ */
+
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+#undef CONSTANT_TIME_CARRY
+}
+
+static void poly1305_emit_generic(void *ctx, u8 mac[16], const u32 nonce[4])
+{
+ struct poly1305_internal *st = (struct poly1305_internal *)ctx;
+ u32 h0, h1, h2, h3, h4;
+ u32 g0, g1, g2, g3, g4;
+ u64 t;
+ u32 mask;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ /* compare to modulus by computing h + -p */
+ g0 = (u32)(t = (u64)h0 + 5);
+ g1 = (u32)(t = (u64)h1 + (t >> 32));
+ g2 = (u32)(t = (u64)h2 + (t >> 32));
+ g3 = (u32)(t = (u64)h3 + (t >> 32));
+ g4 = h4 + (u32)(t >> 32);
+
+ /* if there was carry into 131st bit, h3:h0 = g3:g0 */
+ mask = 0 - (g4 >> 2);
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ mask = ~mask;
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+
+ /* mac = (h + nonce) % (2^128) */
+ h0 = (u32)(t = (u64)h0 + nonce[0]);
+ h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);
+ h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);
+ h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);
+
+ put_unaligned_le32(h0, &mac[ 0]);
+ put_unaligned_le32(h1, &mac[ 4]);
+ put_unaligned_le32(h2, &mac[ 8]);
+ put_unaligned_le32(h3, &mac[12]);
+}
+
+void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE], simd_context_t simd_context)
+{
+ ctx->nonce[0] = get_unaligned_le32(&key[16]);
+ ctx->nonce[1] = get_unaligned_le32(&key[20]);
+ ctx->nonce[2] = get_unaligned_le32(&key[24]);
+ ctx->nonce[3] = get_unaligned_le32(&key[28]);
+
+ if (!poly1305_init_arch(ctx->opaque, key, simd_context))
+ poly1305_init_generic(ctx->opaque, key);
+ ctx->num = 0;
+}
+EXPORT_SYMBOL(poly1305_init);
+
+static inline void poly1305_blocks(void *ctx, const u8 *inp, const size_t len, const u32 padbit, simd_context_t simd_context)
+{
+ if (!poly1305_blocks_arch(ctx, inp, len, padbit, simd_context))
+ poly1305_blocks_generic(ctx, inp, len, padbit);
+}
+
+static inline void poly1305_emit(void *ctx, u8 mac[POLY1305_KEY_SIZE], const u32 nonce[4], simd_context_t simd_context)
+{
+ if (!poly1305_emit_arch(ctx, mac, nonce, simd_context))
+ poly1305_emit_generic(ctx, mac, nonce);
+}
+
+void poly1305_update(struct poly1305_ctx *ctx, const u8 *inp, size_t len, simd_context_t simd_context)
+{
+ const size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+ size_t rem;
+
+ if (num) {
+ rem = POLY1305_BLOCK_SIZE - num;
+ if (len >= rem) {
+ memcpy(ctx->data + num, inp, rem);
+ poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1, simd_context);
+ inp += rem;
+ len -= rem;
+ } else {
+ /* Still not enough data to process a block. */
+ memcpy(ctx->data + num, inp, len);
+ ctx->num = num + len;
+ return;
+ }
+ }
+
+ rem = len % POLY1305_BLOCK_SIZE;
+ len -= rem;
+
+ if (len >= POLY1305_BLOCK_SIZE) {
+ poly1305_blocks(ctx->opaque, inp, len, 1, simd_context);
+ inp += len;
+ }
+
+ if (rem)
+ memcpy(ctx->data, inp, rem);
+
+ ctx->num = rem;
+}
+EXPORT_SYMBOL(poly1305_update);
+
+void poly1305_finish(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE], simd_context_t simd_context)
+{
+ size_t num = ctx->num % POLY1305_BLOCK_SIZE;
+
+ if (num) {
+ ctx->data[num++] = 1; /* pad bit */
+ while (num < POLY1305_BLOCK_SIZE)
+ ctx->data[num++] = 0;
+ poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0, simd_context);
+ }
+
+ poly1305_emit(ctx->opaque, mac, ctx->nonce, simd_context);
+
+ /* zero out the state */
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(poly1305_finish);
+
+#include "../selftest/poly1305.h"
diff --git a/lib/zinc/selftest/poly1305.h b/lib/zinc/selftest/poly1305.h
new file mode 100644
index 000000000000..f9fbe41b75e4
--- /dev/null
+++ b/lib/zinc/selftest/poly1305.h
@@ -0,0 +1,1566 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2016-2017 The OpenSSL Project Authors. All Rights Reserved.
+ */
+
+#ifdef CONFIG_ZINC_DEBUG
+
+struct poly1305_testdata {
+ size_t size;
+ const u8 data[1024];
+};
+
+struct poly1305_testvec {
+ struct poly1305_testdata input, key, expected;
+};
+
+static const struct poly1305_testvec poly1305_testvecs[] = {
+ /*
+ * RFC7539
+ */
+ {
+ {
+ 34,
+ {
+ 0x43, 0x72, 0x79, 0x70, 0x74, 0x6f, 0x67, 0x72,
+ 0x61, 0x70, 0x68, 0x69, 0x63, 0x20, 0x46, 0x6f,
+ 0x72, 0x75, 0x6d, 0x20, 0x52, 0x65, 0x73, 0x65,
+ 0x61, 0x72, 0x63, 0x68, 0x20, 0x47, 0x72, 0x6f,
+
+ 0x75, 0x70
+ }
+ },
+ {
+ 32,
+ {
+ 0x85, 0xd6, 0xbe, 0x78, 0x57, 0x55, 0x6d, 0x33,
+ 0x7f, 0x44, 0x52, 0xfe, 0x42, 0xd5, 0x06, 0xa8,
+ 0x01, 0x03, 0x80, 0x8a, 0xfb, 0x0d, 0xb2, 0xfd,
+ 0x4a, 0xbf, 0xf6, 0xaf, 0x41, 0x49, 0xf5, 0x1b
+ }
+ },
+ {
+ 16,
+ {
+ 0xa8, 0x06, 0x1d, 0xc1, 0x30, 0x51, 0x36, 0xc6,
+ 0xc2, 0x2b, 0x8b, 0xaf, 0x0c, 0x01, 0x27, 0xa9
+ }
+ }
+ },
+ /*
+ * test vectors from "The Poly1305-AES message-authentication code"
+ */
+ {
+ {
+ 2,
+ {
+ 0xf3, 0xf6
+ }
+ },
+ {
+ 32,
+ {
+ 0x85, 0x1f, 0xc4, 0x0c, 0x34, 0x67, 0xac, 0x0b,
+ 0xe0, 0x5c, 0xc2, 0x04, 0x04, 0xf3, 0xf7, 0x00,
+ 0x58, 0x0b, 0x3b, 0x0f, 0x94, 0x47, 0xbb, 0x1e,
+ 0x69, 0xd0, 0x95, 0xb5, 0x92, 0x8b, 0x6d, 0xbc
+ }
+ },
+ {
+ 16,
+ {
+ 0xf4, 0xc6, 0x33, 0xc3, 0x04, 0x4f, 0xc1, 0x45,
+ 0xf8, 0x4f, 0x33, 0x5c, 0xb8, 0x19, 0x53, 0xde
+ }
+ }
+ },
+ {
+ {
+ 0,
+ {
+ 0
+ }
+ },
+ {
+ 32,
+ {
+ 0xa0, 0xf3, 0x08, 0x00, 0x00, 0xf4, 0x64, 0x00,
+ 0xd0, 0xc7, 0xe9, 0x07, 0x6c, 0x83, 0x44, 0x03,
+ 0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+ 0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7
+ }
+ },
+ {
+ 16,
+ {
+ 0xdd, 0x3f, 0xab, 0x22, 0x51, 0xf1, 0x1a, 0xc7,
+ 0x59, 0xf0, 0x88, 0x71, 0x29, 0xcc, 0x2e, 0xe7
+ }
+ }
+ },
+ {
+ {
+ 32,
+ {
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+ }
+ },
+ {
+ 32,
+ {
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef
+ }
+ },
+ {
+ 16,
+ {
+ 0x0e, 0xe1, 0xc1, 0x6b, 0xb7, 0x3f, 0x0f, 0x4f,
+ 0xd1, 0x98, 0x81, 0x75, 0x3c, 0x01, 0xcd, 0xbe
+ }
+ }
+ },
+ {
+ {
+ 63,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0x51, 0x54, 0xad, 0x0d, 0x2c, 0xb2, 0x6e, 0x01,
+ 0x27, 0x4f, 0xc5, 0x11, 0x48, 0x49, 0x1f, 0x1b
+ }
+ },
+ },
+ /*
+ * self-generated vectors exercise "significant" lengths, such that
+ * are handled by different code paths
+ */
+ {
+ {
+ 64,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66
+ }
+ },
+ },
+ {
+ {
+ 48,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+
+ }
+ },
+ {
+ 16,
+ {
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61
+ }
+ },
+ },
+ {
+ {
+ 96,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0xbb, 0xb6, 0x13, 0xb2, 0xb6, 0xd7, 0x53, 0xba,
+ 0x07, 0x39, 0x5b, 0x91, 0x6a, 0xae, 0xce, 0x15
+ }
+ },
+ },
+ {
+ {
+ 112,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0xc7, 0x94, 0xd7, 0x05, 0x7d, 0x17, 0x78, 0xc4,
+ 0xbb, 0xee, 0x0a, 0x39, 0xb3, 0xd9, 0x73, 0x42
+ }
+ },
+ },
+ {
+ {
+ 128,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0xff, 0xbc, 0xb9, 0xb3, 0x71, 0x42, 0x31, 0x52,
+ 0xd7, 0xfc, 0xa5, 0xad, 0x04, 0x2f, 0xba, 0xa9
+ }
+ },
+ },
+ {
+ {
+ 144,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0x06, 0x9e, 0xd6, 0xb8, 0xef, 0x0f, 0x20, 0x7b,
+ 0x3e, 0x24, 0x3b, 0xb1, 0x01, 0x9f, 0xe6, 0x32
+ }
+ },
+ },
+ {
+ {
+ 160,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0xcc, 0xa3, 0x39, 0xd9, 0xa4, 0x5f, 0xa2, 0x36,
+ 0x8c, 0x2c, 0x68, 0xb3, 0xa4, 0x17, 0x91, 0x33
+ }
+ },
+ },
+ {
+ {
+ 288,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0x53, 0xf6, 0xe8, 0x28, 0xa2, 0xf0, 0xfe, 0x0e,
+ 0xe8, 0x15, 0xbf, 0x0b, 0xd5, 0x84, 0x1a, 0x34
+ }
+ },
+ },
+ {
+ {
+ 320,
+ {
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61,
+
+ 0xab, 0x08, 0x12, 0x72, 0x4a, 0x7f, 0x1e, 0x34,
+ 0x27, 0x42, 0xcb, 0xed, 0x37, 0x4d, 0x94, 0xd1,
+ 0x36, 0xc6, 0xb8, 0x79, 0x5d, 0x45, 0xb3, 0x81,
+ 0x98, 0x30, 0xf2, 0xc0, 0x44, 0x91, 0xfa, 0xf0,
+
+ 0x99, 0x0c, 0x62, 0xe4, 0x8b, 0x80, 0x18, 0xb2,
+ 0xc3, 0xe4, 0xa0, 0xfa, 0x31, 0x34, 0xcb, 0x67,
+ 0xfa, 0x83, 0xe1, 0x58, 0xc9, 0x94, 0xd9, 0x61,
+ 0xc4, 0xcb, 0x21, 0x09, 0x5c, 0x1b, 0xf9, 0xaf,
+
+ 0x48, 0x44, 0x3d, 0x0b, 0xb0, 0xd2, 0x11, 0x09,
+ 0xc8, 0x9a, 0x10, 0x0b, 0x5c, 0xe2, 0xc2, 0x08,
+ 0x83, 0x14, 0x9c, 0x69, 0xb5, 0x61, 0xdd, 0x88,
+ 0x29, 0x8a, 0x17, 0x98, 0xb1, 0x07, 0x16, 0xef,
+
+ 0x66, 0x3c, 0xea, 0x19, 0x0f, 0xfb, 0x83, 0xd8,
+ 0x95, 0x93, 0xf3, 0xf4, 0x76, 0xb6, 0xbc, 0x24,
+ 0xd7, 0xe6, 0x79, 0x10, 0x7e, 0xa2, 0x6a, 0xdb,
+ 0x8c, 0xaf, 0x66, 0x52, 0xd0, 0x65, 0x61, 0x36,
+
+ 0x81, 0x20, 0x59, 0xa5, 0xda, 0x19, 0x86, 0x37,
+ 0xca, 0xc7, 0xc4, 0xa6, 0x31, 0xbe, 0xe4, 0x66,
+ 0x5b, 0x88, 0xd7, 0xf6, 0x22, 0x8b, 0x11, 0xe2,
+ 0xe2, 0x85, 0x79, 0xa5, 0xc0, 0xc1, 0xf7, 0x61
+ }
+ },
+ {
+ 32,
+ {
+ 0x12, 0x97, 0x6a, 0x08, 0xc4, 0x42, 0x6d, 0x0c,
+ 0xe8, 0xa8, 0x24, 0x07, 0xc4, 0xf4, 0x82, 0x07,
+ 0x80, 0xf8, 0xc2, 0x0a, 0xa7, 0x12, 0x02, 0xd1,
+ 0xe2, 0x91, 0x79, 0xcb, 0xcb, 0x55, 0x5a, 0x57
+ }
+ },
+ {
+ 16,
+ {
+ 0xb8, 0x46, 0xd4, 0x4e, 0x9b, 0xbd, 0x53, 0xce,
+ 0xdf, 0xfb, 0xfb, 0xb6, 0xb7, 0xfa, 0x49, 0x33
+ }
+ },
+ },
+ /*
+ * 4th power of the key spills to 131th bit in SIMD key setup
+ */
+ {
+ {
+ 256,
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }
+ },
+ {
+ 32,
+ {
+ 0xad, 0x62, 0x81, 0x07, 0xe8, 0x35, 0x1d, 0x0f,
+ 0x2c, 0x23, 0x1a, 0x05, 0xdc, 0x4a, 0x41, 0x06,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0x07, 0x14, 0x5a, 0x4c, 0x02, 0xfe, 0x5f, 0xa3,
+ 0x20, 0x36, 0xde, 0x68, 0xfa, 0xbe, 0x90, 0x66
+ }
+ },
+ },
+ /*
+ * OpenSSL's poly1305_ieee754.c failed this in final stage
+ */
+ {
+ {
+ 252,
+ {
+ 0x84, 0x23, 0x64, 0xe1, 0x56, 0x33, 0x6c, 0x09,
+ 0x98, 0xb9, 0x33, 0xa6, 0x23, 0x77, 0x26, 0x18,
+ 0x0d, 0x9e, 0x3f, 0xdc, 0xbd, 0xe4, 0xcd, 0x5d,
+ 0x17, 0x08, 0x0f, 0xc3, 0xbe, 0xb4, 0x96, 0x14,
+
+ 0xd7, 0x12, 0x2c, 0x03, 0x74, 0x63, 0xff, 0x10,
+ 0x4d, 0x73, 0xf1, 0x9c, 0x12, 0x70, 0x46, 0x28,
+ 0xd4, 0x17, 0xc4, 0xc5, 0x4a, 0x3f, 0xe3, 0x0d,
+ 0x3c, 0x3d, 0x77, 0x14, 0x38, 0x2d, 0x43, 0xb0,
+
+ 0x38, 0x2a, 0x50, 0xa5, 0xde, 0xe5, 0x4b, 0xe8,
+ 0x44, 0xb0, 0x76, 0xe8, 0xdf, 0x88, 0x20, 0x1a,
+ 0x1c, 0xd4, 0x3b, 0x90, 0xeb, 0x21, 0x64, 0x3f,
+ 0xa9, 0x6f, 0x39, 0xb5, 0x18, 0xaa, 0x83, 0x40,
+
+ 0xc9, 0x42, 0xff, 0x3c, 0x31, 0xba, 0xf7, 0xc9,
+ 0xbd, 0xbf, 0x0f, 0x31, 0xae, 0x3f, 0xa0, 0x96,
+ 0xbf, 0x8c, 0x63, 0x03, 0x06, 0x09, 0x82, 0x9f,
+ 0xe7, 0x2e, 0x17, 0x98, 0x24, 0x89, 0x0b, 0xc8,
+
+ 0xe0, 0x8c, 0x31, 0x5c, 0x1c, 0xce, 0x2a, 0x83,
+ 0x14, 0x4d, 0xbb, 0xff, 0x09, 0xf7, 0x4e, 0x3e,
+ 0xfc, 0x77, 0x0b, 0x54, 0xd0, 0x98, 0x4a, 0x8f,
+ 0x19, 0xb1, 0x47, 0x19, 0xe6, 0x36, 0x35, 0x64,
+
+ 0x1d, 0x6b, 0x1e, 0xed, 0xf6, 0x3e, 0xfb, 0xf0,
+ 0x80, 0xe1, 0x78, 0x3d, 0x32, 0x44, 0x54, 0x12,
+ 0x11, 0x4c, 0x20, 0xde, 0x0b, 0x83, 0x7a, 0x0d,
+ 0xfa, 0x33, 0xd6, 0xb8, 0x28, 0x25, 0xff, 0xf4,
+
+ 0x4c, 0x9a, 0x70, 0xea, 0x54, 0xce, 0x47, 0xf0,
+ 0x7d, 0xf6, 0x98, 0xe6, 0xb0, 0x33, 0x23, 0xb5,
+ 0x30, 0x79, 0x36, 0x4a, 0x5f, 0xc3, 0xe9, 0xdd,
+ 0x03, 0x43, 0x92, 0xbd, 0xde, 0x86, 0xdc, 0xcd,
+
+ 0xda, 0x94, 0x32, 0x1c, 0x5e, 0x44, 0x06, 0x04,
+ 0x89, 0x33, 0x6c, 0xb6, 0x5b, 0xf3, 0x98, 0x9c,
+ 0x36, 0xf7, 0x28, 0x2c, 0x2f, 0x5d, 0x2b, 0x88,
+ 0x2c, 0x17, 0x1e, 0x74
+ }
+ },
+ {
+ 32,
+ {
+ 0x95, 0xd5, 0xc0, 0x05, 0x50, 0x3e, 0x51, 0x0d,
+ 0x8c, 0xd0, 0xaa, 0x07, 0x2c, 0x4a, 0x4d, 0x06,
+ 0x6e, 0xab, 0xc5, 0x2d, 0x11, 0x65, 0x3d, 0xf4,
+ 0x7f, 0xbf, 0x63, 0xab, 0x19, 0x8b, 0xcc, 0x26
+ }
+ },
+ {
+ 16,
+ {
+ 0xf2, 0x48, 0x31, 0x2e, 0x57, 0x8d, 0x9d, 0x58,
+ 0xf8, 0xb7, 0xbb, 0x4d, 0x19, 0x10, 0x54, 0x31
+ }
+ },
+ },
+ /*
+ * AVX2 in OpenSSL's poly1305-x86.pl failed this with 176+32 split
+ */
+ {
+ {
+ 208,
+ {
+ 0x24, 0x8a, 0xc3, 0x10, 0x85, 0xb6, 0xc2, 0xad,
+ 0xaa, 0xa3, 0x82, 0x59, 0xa0, 0xd7, 0x19, 0x2c,
+ 0x5c, 0x35, 0xd1, 0xbb, 0x4e, 0xf3, 0x9a, 0xd9,
+ 0x4c, 0x38, 0xd1, 0xc8, 0x24, 0x79, 0xe2, 0xdd,
+
+ 0x21, 0x59, 0xa0, 0x77, 0x02, 0x4b, 0x05, 0x89,
+ 0xbc, 0x8a, 0x20, 0x10, 0x1b, 0x50, 0x6f, 0x0a,
+ 0x1a, 0xd0, 0xbb, 0xab, 0x76, 0xe8, 0x3a, 0x83,
+ 0xf1, 0xb9, 0x4b, 0xe6, 0xbe, 0xae, 0x74, 0xe8,
+
+ 0x74, 0xca, 0xb6, 0x92, 0xc5, 0x96, 0x3a, 0x75,
+ 0x43, 0x6b, 0x77, 0x61, 0x21, 0xec, 0x9f, 0x62,
+ 0x39, 0x9a, 0x3e, 0x66, 0xb2, 0xd2, 0x27, 0x07,
+ 0xda, 0xe8, 0x19, 0x33, 0xb6, 0x27, 0x7f, 0x3c,
+
+ 0x85, 0x16, 0xbc, 0xbe, 0x26, 0xdb, 0xbd, 0x86,
+ 0xf3, 0x73, 0x10, 0x3d, 0x7c, 0xf4, 0xca, 0xd1,
+ 0x88, 0x8c, 0x95, 0x21, 0x18, 0xfb, 0xfb, 0xd0,
+ 0xd7, 0xb4, 0xbe, 0xdc, 0x4a, 0xe4, 0x93, 0x6a,
+
+ 0xff, 0x91, 0x15, 0x7e, 0x7a, 0xa4, 0x7c, 0x54,
+ 0x44, 0x2e, 0xa7, 0x8d, 0x6a, 0xc2, 0x51, 0xd3,
+ 0x24, 0xa0, 0xfb, 0xe4, 0x9d, 0x89, 0xcc, 0x35,
+ 0x21, 0xb6, 0x6d, 0x16, 0xe9, 0xc6, 0x6a, 0x37,
+
+ 0x09, 0x89, 0x4e, 0x4e, 0xb0, 0xa4, 0xee, 0xdc,
+ 0x4a, 0xe1, 0x94, 0x68, 0xe6, 0x6b, 0x81, 0xf2,
+
+ 0x71, 0x35, 0x1b, 0x1d, 0x92, 0x1e, 0xa5, 0x51,
+ 0x04, 0x7a, 0xbc, 0xc6, 0xb8, 0x7a, 0x90, 0x1f,
+ 0xde, 0x7d, 0xb7, 0x9f, 0xa1, 0x81, 0x8c, 0x11,
+ 0x33, 0x6d, 0xbc, 0x07, 0x24, 0x4a, 0x40, 0xeb
+ }
+ },
+ {
+ 32,
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0xbc, 0x93, 0x9b, 0xc5, 0x28, 0x14, 0x80, 0xfa,
+ 0x99, 0xc6, 0xd6, 0x8c, 0x25, 0x8e, 0xc4, 0x2f
+ }
+ },
+ },
+ /*
+ * test vectors from Google
+ */
+ {
+ {
+ 0,
+ {
+ 0x00,
+ }
+ },
+ {
+ 32,
+ {
+ 0xc8, 0xaf, 0xaa, 0xc3, 0x31, 0xee, 0x37, 0x2c,
+ 0xd6, 0x08, 0x2d, 0xe1, 0x34, 0x94, 0x3b, 0x17,
+ 0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+ 0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c
+ }
+ },
+ {
+ 16,
+ {
+ 0x47, 0x10, 0x13, 0x0e, 0x9f, 0x6f, 0xea, 0x8d,
+ 0x72, 0x29, 0x38, 0x50, 0xa6, 0x67, 0xd8, 0x6c
+ }
+ },
+ },
+ {
+ {
+ 12,
+ {
+ 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f,
+ 0x72, 0x6c, 0x64, 0x21
+ }
+ },
+ {
+ 32,
+ {
+ 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+ 0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+ 0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+ 0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35
+ }
+ },
+ {
+ 16,
+ {
+ 0xa6, 0xf7, 0x45, 0x00, 0x8f, 0x81, 0xc9, 0x16,
+ 0xa2, 0x0d, 0xcc, 0x74, 0xee, 0xf2, 0xb2, 0xf0
+ }
+ },
+ },
+ {
+ {
+ 32,
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 32,
+ {
+ 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
+ 0x33, 0x32, 0x2d, 0x62, 0x79, 0x74, 0x65, 0x20,
+ 0x6b, 0x65, 0x79, 0x20, 0x66, 0x6f, 0x72, 0x20,
+ 0x50, 0x6f, 0x6c, 0x79, 0x31, 0x33, 0x30, 0x35
+ }
+ },
+ {
+ 16,
+ {
+ 0x49, 0xec, 0x78, 0x09, 0x0e, 0x48, 0x1e, 0xc6,
+ 0xc2, 0x6b, 0x33, 0xb9, 0x1c, 0xcc, 0x03, 0x07
+ }
+ },
+ },
+ {
+ {
+ 128,
+ {
+ 0x89, 0xda, 0xb8, 0x0b, 0x77, 0x17, 0xc1, 0xdb,
+ 0x5d, 0xb4, 0x37, 0x86, 0x0a, 0x3f, 0x70, 0x21,
+ 0x8e, 0x93, 0xe1, 0xb8, 0xf4, 0x61, 0xfb, 0x67,
+ 0x7f, 0x16, 0xf3, 0x5f, 0x6f, 0x87, 0xe2, 0xa9,
+
+ 0x1c, 0x99, 0xbc, 0x3a, 0x47, 0xac, 0xe4, 0x76,
+ 0x40, 0xcc, 0x95, 0xc3, 0x45, 0xbe, 0x5e, 0xcc,
+ 0xa5, 0xa3, 0x52, 0x3c, 0x35, 0xcc, 0x01, 0x89,
+ 0x3a, 0xf0, 0xb6, 0x4a, 0x62, 0x03, 0x34, 0x27,
+
+ 0x03, 0x72, 0xec, 0x12, 0x48, 0x2d, 0x1b, 0x1e,
+ 0x36, 0x35, 0x61, 0x69, 0x8a, 0x57, 0x8b, 0x35,
+ 0x98, 0x03, 0x49, 0x5b, 0xb4, 0xe2, 0xef, 0x19,
+ 0x30, 0xb1, 0x7a, 0x51, 0x90, 0xb5, 0x80, 0xf1,
+
+ 0x41, 0x30, 0x0d, 0xf3, 0x0a, 0xdb, 0xec, 0xa2,
+ 0x8f, 0x64, 0x27, 0xa8, 0xbc, 0x1a, 0x99, 0x9f,
+ 0xd5, 0x1c, 0x55, 0x4a, 0x01, 0x7d, 0x09, 0x5d,
+ 0x8c, 0x3e, 0x31, 0x27, 0xda, 0xf9, 0xf5, 0x95
+ }
+ },
+ {
+ 32,
+ {
+ 0x2d, 0x77, 0x3b, 0xe3, 0x7a, 0xdb, 0x1e, 0x4d,
+ 0x68, 0x3b, 0xf0, 0x07, 0x5e, 0x79, 0xc4, 0xee,
+ 0x03, 0x79, 0x18, 0x53, 0x5a, 0x7f, 0x99, 0xcc,
+ 0xb7, 0x04, 0x0f, 0xb5, 0xf5, 0xf4, 0x3a, 0xea
+ }
+ },
+ {
+ 16,
+ {
+ 0xc8, 0x5d, 0x15, 0xed, 0x44, 0xc3, 0x78, 0xd6,
+ 0xb0, 0x0e, 0x23, 0x06, 0x4c, 0x7b, 0xcd, 0x51
+ }
+ },
+ },
+ {
+ {
+ 528,
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b,
+ 0x17, 0x03, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00,
+
+ 0x06, 0xdb, 0x1f, 0x1f, 0x36, 0x8d, 0x69, 0x6a,
+ 0x81, 0x0a, 0x34, 0x9c, 0x0c, 0x71, 0x4c, 0x9a,
+ 0x5e, 0x78, 0x50, 0xc2, 0x40, 0x7d, 0x72, 0x1a,
+ 0xcd, 0xed, 0x95, 0xe0, 0x18, 0xd7, 0xa8, 0x52,
+
+ 0x66, 0xa6, 0xe1, 0x28, 0x9c, 0xdb, 0x4a, 0xeb,
+ 0x18, 0xda, 0x5a, 0xc8, 0xa2, 0xb0, 0x02, 0x6d,
+ 0x24, 0xa5, 0x9a, 0xd4, 0x85, 0x22, 0x7f, 0x3e,
+ 0xae, 0xdb, 0xb2, 0xe7, 0xe3, 0x5e, 0x1c, 0x66,
+
+ 0xcd, 0x60, 0xf9, 0xab, 0xf7, 0x16, 0xdc, 0xc9,
+ 0xac, 0x42, 0x68, 0x2d, 0xd7, 0xda, 0xb2, 0x87,
+ 0xa7, 0x02, 0x4c, 0x4e, 0xef, 0xc3, 0x21, 0xcc,
+ 0x05, 0x74, 0xe1, 0x67, 0x93, 0xe3, 0x7c, 0xec,
+
+ 0x03, 0xc5, 0xbd, 0xa4, 0x2b, 0x54, 0xc1, 0x14,
+ 0xa8, 0x0b, 0x57, 0xaf, 0x26, 0x41, 0x6c, 0x7b,
+ 0xe7, 0x42, 0x00, 0x5e, 0x20, 0x85, 0x5c, 0x73,
+ 0xe2, 0x1d, 0xc8, 0xe2, 0xed, 0xc9, 0xd4, 0x35,
+
+ 0xcb, 0x6f, 0x60, 0x59, 0x28, 0x00, 0x11, 0xc2,
+ 0x70, 0xb7, 0x15, 0x70, 0x05, 0x1c, 0x1c, 0x9b,
+ 0x30, 0x52, 0x12, 0x66, 0x20, 0xbc, 0x1e, 0x27,
+ 0x30, 0xfa, 0x06, 0x6c, 0x7a, 0x50, 0x9d, 0x53,
+
+ 0xc6, 0x0e, 0x5a, 0xe1, 0xb4, 0x0a, 0xa6, 0xe3,
+ 0x9e, 0x49, 0x66, 0x92, 0x28, 0xc9, 0x0e, 0xec,
+ 0xb4, 0xa5, 0x0d, 0xb3, 0x2a, 0x50, 0xbc, 0x49,
+ 0xe9, 0x0b, 0x4f, 0x4b, 0x35, 0x9a, 0x1d, 0xfd,
+
+ 0x11, 0x74, 0x9c, 0xd3, 0x86, 0x7f, 0xcf, 0x2f,
+ 0xb7, 0xbb, 0x6c, 0xd4, 0x73, 0x8f, 0x6a, 0x4a,
+ 0xd6, 0xf7, 0xca, 0x50, 0x58, 0xf7, 0x61, 0x88,
+ 0x45, 0xaf, 0x9f, 0x02, 0x0f, 0x6c, 0x3b, 0x96,
+
+ 0x7b, 0x8f, 0x4c, 0xd4, 0xa9, 0x1e, 0x28, 0x13,
+ 0xb5, 0x07, 0xae, 0x66, 0xf2, 0xd3, 0x5c, 0x18,
+ 0x28, 0x4f, 0x72, 0x92, 0x18, 0x60, 0x62, 0xe1,
+ 0x0f, 0xd5, 0x51, 0x0d, 0x18, 0x77, 0x53, 0x51,
+
+ 0xef, 0x33, 0x4e, 0x76, 0x34, 0xab, 0x47, 0x43,
+ 0xf5, 0xb6, 0x8f, 0x49, 0xad, 0xca, 0xb3, 0x84,
+ 0xd3, 0xfd, 0x75, 0xf7, 0x39, 0x0f, 0x40, 0x06,
+ 0xef, 0x2a, 0x29, 0x5c, 0x8c, 0x7a, 0x07, 0x6a,
+
+ 0xd5, 0x45, 0x46, 0xcd, 0x25, 0xd2, 0x10, 0x7f,
+ 0xbe, 0x14, 0x36, 0xc8, 0x40, 0x92, 0x4a, 0xae,
+ 0xbe, 0x5b, 0x37, 0x08, 0x93, 0xcd, 0x63, 0xd1,
+ 0x32, 0x5b, 0x86, 0x16, 0xfc, 0x48, 0x10, 0x88,
+
+ 0x6b, 0xc1, 0x52, 0xc5, 0x32, 0x21, 0xb6, 0xdf,
+ 0x37, 0x31, 0x19, 0x39, 0x32, 0x55, 0xee, 0x72,
+ 0xbc, 0xaa, 0x88, 0x01, 0x74, 0xf1, 0x71, 0x7f,
+ 0x91, 0x84, 0xfa, 0x91, 0x64, 0x6f, 0x17, 0xa2,
+
+ 0x4a, 0xc5, 0x5d, 0x16, 0xbf, 0xdd, 0xca, 0x95,
+ 0x81, 0xa9, 0x2e, 0xda, 0x47, 0x92, 0x01, 0xf0,
+ 0xed, 0xbf, 0x63, 0x36, 0x00, 0xd6, 0x06, 0x6d,
+ 0x1a, 0xb3, 0x6d, 0x5d, 0x24, 0x15, 0xd7, 0x13,
+
+ 0x51, 0xbb, 0xcd, 0x60, 0x8a, 0x25, 0x10, 0x8d,
+ 0x25, 0x64, 0x19, 0x92, 0xc1, 0xf2, 0x6c, 0x53,
+ 0x1c, 0xf9, 0xf9, 0x02, 0x03, 0xbc, 0x4c, 0xc1,
+ 0x9f, 0x59, 0x27, 0xd8, 0x34, 0xb0, 0xa4, 0x71,
+
+ 0x16, 0xd3, 0x88, 0x4b, 0xbb, 0x16, 0x4b, 0x8e,
+ 0xc8, 0x83, 0xd1, 0xac, 0x83, 0x2e, 0x56, 0xb3,
+ 0x91, 0x8a, 0x98, 0x60, 0x1a, 0x08, 0xd1, 0x71,
+ 0x88, 0x15, 0x41, 0xd5, 0x94, 0xdb, 0x39, 0x9c,
+
+ 0x6a, 0xe6, 0x15, 0x12, 0x21, 0x74, 0x5a, 0xec,
+ 0x81, 0x4c, 0x45, 0xb0, 0xb0, 0x5b, 0x56, 0x54,
+ 0x36, 0xfd, 0x6f, 0x13, 0x7a, 0xa1, 0x0a, 0x0c,
+ 0x0b, 0x64, 0x37, 0x61, 0xdb, 0xd6, 0xf9, 0xa9,
+
+ 0xdc, 0xb9, 0x9b, 0x1a, 0x6e, 0x69, 0x08, 0x54,
+ 0xce, 0x07, 0x69, 0xcd, 0xe3, 0x97, 0x61, 0xd8,
+ 0x2f, 0xcd, 0xec, 0x15, 0xf0, 0xd9, 0x2d, 0x7d,
+ 0x8e, 0x94, 0xad, 0xe8, 0xeb, 0x83, 0xfb, 0xe0
+ }
+ },
+ {
+ 32,
+ {
+ 0x99, 0xe5, 0x82, 0x2d, 0xd4, 0x17, 0x3c, 0x99,
+ 0x5e, 0x3d, 0xae, 0x0d, 0xde, 0xfb, 0x97, 0x74,
+ 0x3f, 0xde, 0x3b, 0x08, 0x01, 0x34, 0xb3, 0x9f,
+ 0x76, 0xe9, 0xbf, 0x8d, 0x0e, 0x88, 0xd5, 0x46
+ }
+ },
+ {
+ 16,
+ {
+ 0x26, 0x37, 0x40, 0x8f, 0xe1, 0x30, 0x86, 0xea,
+ 0x73, 0xf9, 0x71, 0xe3, 0x42, 0x5e, 0x28, 0x20
+ }
+ },
+ },
+ /*
+ * test vectors from Hanno Böck
+ */
+ {
+ {
+ 257,
+ {
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0x80, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xce, 0xcc, 0xcc, 0xcc,
+
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xc5,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe3, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xac, 0xcc, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xe6,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0x00, 0x00, 0x00,
+ 0xaf, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+
+ 0xcc, 0xcc, 0xff, 0xff, 0xff, 0xf5, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0xff, 0xff, 0xff, 0xe7, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x71, 0x92, 0x05, 0xa8, 0x52, 0x1d,
+
+ 0xfc
+ }
+ },
+ {
+ 32,
+ {
+ 0x7f, 0x1b, 0x02, 0x64, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc
+ }
+ },
+ {
+ 16,
+ {
+ 0x85, 0x59, 0xb8, 0x76, 0xec, 0xee, 0xd6, 0x6e,
+ 0xb3, 0x77, 0x98, 0xc0, 0x45, 0x7b, 0xaf, 0xf9
+ }
+ },
+ },
+ {
+ {
+ 39,
+ {
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x64
+ }
+ },
+ {
+ 32,
+ {
+ 0xe0, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+ 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa
+ }
+ },
+ {
+ 16,
+ {
+ 0x00, 0xbd, 0x12, 0x58, 0x97, 0x8e, 0x20, 0x54,
+ 0x44, 0xc9, 0xaa, 0xaa, 0x82, 0x00, 0x6f, 0xed
+ }
+ },
+ },
+ {
+ {
+ 2,
+ {
+ 0x02, 0xfc
+ }
+ },
+ {
+ 32,
+ {
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c
+ }
+ },
+ {
+ 16,
+ {
+ 0x06, 0x12, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c
+ }
+ },
+ },
+ {
+ {
+ 415,
+ {
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7a, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x7b, 0x5c, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x6e, 0x7b, 0x00, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7a, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x5c,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+ 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b, 0x7b,
+
+ 0x7b, 0x6e, 0x7b, 0x00, 0x13, 0x00, 0x00, 0x00,
+ 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00,
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x00,
+
+ 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x20, 0x00, 0xef, 0xff, 0x00, 0x09,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x7a, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+
+ 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc
+ }
+ },
+ {
+ 32,
+ {
+ 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7b, 0x7b
+ }
+ },
+ {
+ 16,
+ {
+ 0x33, 0x20, 0x5b, 0xbf, 0x9e, 0x9f, 0x8f, 0x72,
+ 0x12, 0xab, 0x9e, 0x2a, 0xb9, 0xb7, 0xe4, 0xa5
+ }
+ },
+ },
+ {
+ {
+ 118,
+ {
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0xff, 0xff, 0xff, 0xe9,
+ 0xe9, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+ 0xac, 0xac, 0xac, 0xac, 0x00, 0x00, 0xac, 0xac,
+
+ 0xec, 0x01, 0x00, 0xac, 0xac, 0xac, 0x2c, 0xac,
+ 0xa2, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac, 0xac,
+ 0xac, 0xac, 0xac, 0xac, 0x64, 0xf2
+ }
+ },
+ {
+ 32,
+ {
+ 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7f,
+ 0x01, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xcf, 0x77, 0x77, 0x77, 0x77, 0x77,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77
+ }
+ },
+ {
+ 16,
+ {
+ 0x02, 0xee, 0x7c, 0x8c, 0x54, 0x6d, 0xde, 0xb1,
+ 0xa4, 0x67, 0xe4, 0xc3, 0x98, 0x11, 0x58, 0xb9
+ }
+ },
+ },
+ /*
+ * test vectors from Andrew Moon
+ */
+ { /* nacl */
+ {
+ 131,
+ {
+ 0x8e, 0x99, 0x3b, 0x9f, 0x48, 0x68, 0x12, 0x73,
+ 0xc2, 0x96, 0x50, 0xba, 0x32, 0xfc, 0x76, 0xce,
+ 0x48, 0x33, 0x2e, 0xa7, 0x16, 0x4d, 0x96, 0xa4,
+ 0x47, 0x6f, 0xb8, 0xc5, 0x31, 0xa1, 0x18, 0x6a,
+
+ 0xc0, 0xdf, 0xc1, 0x7c, 0x98, 0xdc, 0xe8, 0x7b,
+ 0x4d, 0xa7, 0xf0, 0x11, 0xec, 0x48, 0xc9, 0x72,
+ 0x71, 0xd2, 0xc2, 0x0f, 0x9b, 0x92, 0x8f, 0xe2,
+ 0x27, 0x0d, 0x6f, 0xb8, 0x63, 0xd5, 0x17, 0x38,
+
+ 0xb4, 0x8e, 0xee, 0xe3, 0x14, 0xa7, 0xcc, 0x8a,
+ 0xb9, 0x32, 0x16, 0x45, 0x48, 0xe5, 0x26, 0xae,
+ 0x90, 0x22, 0x43, 0x68, 0x51, 0x7a, 0xcf, 0xea,
+ 0xbd, 0x6b, 0xb3, 0x73, 0x2b, 0xc0, 0xe9, 0xda,
+
+ 0x99, 0x83, 0x2b, 0x61, 0xca, 0x01, 0xb6, 0xde,
+ 0x56, 0x24, 0x4a, 0x9e, 0x88, 0xd5, 0xf9, 0xb3,
+ 0x79, 0x73, 0xf6, 0x22, 0xa4, 0x3d, 0x14, 0xa6,
+ 0x59, 0x9b, 0x1f, 0x65, 0x4c, 0xb4, 0x5a, 0x74,
+
+ 0xe3, 0x55, 0xa5
+ }
+ },
+ {
+ 32,
+ {
+ 0xee, 0xa6, 0xa7, 0x25, 0x1c, 0x1e, 0x72, 0x91,
+ 0x6d, 0x11, 0xc2, 0xcb, 0x21, 0x4d, 0x3c, 0x25,
+ 0x25, 0x39, 0x12, 0x1d, 0x8e, 0x23, 0x4e, 0x65,
+ 0x2d, 0x65, 0x1f, 0xa4, 0xc8, 0xcf, 0xf8, 0x80
+ }
+ },
+ {
+ 16,
+ {
+ 0xf3, 0xff, 0xc7, 0x70, 0x3f, 0x94, 0x00, 0xe5,
+ 0x2a, 0x7d, 0xfb, 0x4b, 0x3d, 0x33, 0x05, 0xd9
+ }
+ },
+ },
+ { /* wrap 2^130-5 */
+ {
+ 16,
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }
+ },
+ {
+ 32,
+ {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ },
+ { /* wrap 2^128 */
+ {
+ 16,
+ {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 32,
+ {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }
+ },
+ {
+ 16,
+ {
+ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ },
+ { /* limb carry */
+ {
+ 48,
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+
+ 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 32,
+ {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ },
+ { /* 2^130-5 */
+ {
+ 48,
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xfb, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+ 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
+
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
+ }
+ },
+ {
+ 32,
+ {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+
+ }
+ },
+ },
+ { /* 2^130-6 */
+ {
+ 16,
+ {
+ 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }
+ },
+ {
+ 32,
+ {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }
+ },
+ },
+ { /* 5*H+L reduction intermediate */
+ {
+ 64,
+ {
+ 0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 32,
+ {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ },
+ { /* 5*H+L reduction final */
+ {
+ 48,
+ {
+ 0xe3, 0x35, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0xb9,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x33, 0x94, 0xd7, 0x50, 0x5e, 0x43, 0x79, 0xcd,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+
+ }
+ },
+ {
+ 32,
+ {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ },
+ {
+ 16,
+ {
+ 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ }
+ }
+ }
+};
+
+bool __init poly1305_selftest(void)
+{
+ simd_context_t simd_context = simd_get();
+ bool success = true;
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(poly1305_testvecs); ++i) {
+ struct poly1305_ctx poly1305;
+ const u8 *in = poly1305_testvecs[i].input.data;
+ size_t inlen = poly1305_testvecs[i].input.size;
+ const u8 *key = poly1305_testvecs[i].key.data;
+ const u8 *expected = poly1305_testvecs[i].expected.data;
+ size_t expectedlen = poly1305_testvecs[i].expected.size;
+ u8 out[POLY1305_MAC_SIZE];
+
+ if (expectedlen != sizeof(out)) {
+ pr_info("poly1305 self-test %zu logic: FAIL\n", i + 1);
+ success = false;
+ }
+
+ memset(out, 0, sizeof(out));
+ memset(&poly1305, 0, sizeof(poly1305));
+ poly1305_init(&poly1305, key, simd_context);
+ poly1305_update(&poly1305, in, inlen, simd_context);
+ poly1305_finish(&poly1305, out, simd_context);
+ if (memcmp(out, expected, expectedlen)) {
+ pr_info("poly1305 self-test %zu: FAIL\n", i + 1);
+ success = false;
+ }
+
+ if (inlen > 16) {
+ memset(out, 0, sizeof(out));
+ memset(&poly1305, 0, sizeof(poly1305));
+ poly1305_init(&poly1305, key, simd_context);
+ poly1305_update(&poly1305, in, 1, simd_context);
+ poly1305_update(&poly1305, in + 1, inlen - 1, simd_context);
+ poly1305_finish(&poly1305, out, simd_context);
+ if (memcmp(out, expected, expectedlen)) {
+ pr_info("poly1305 self-test %zu/1+(N-1): FAIL\n", i + 1);
+ success = false;
+ }
+ }
+
+ if (inlen > 32) {
+ size_t half = inlen / 2;
+
+ memset(out, 0, sizeof(out));
+ memset(&poly1305, 0, sizeof(poly1305));
+ poly1305_init(&poly1305, key, simd_context);
+ poly1305_update(&poly1305, in, half, simd_context);
+ poly1305_update(&poly1305, in + half, inlen - half, simd_context);
+ poly1305_finish(&poly1305, out, simd_context);
+ if (memcmp(out, expected, expectedlen)) {
+ pr_info("poly1305 self-test %zu/2: FAIL\n", i + 1);
+ success = false;
+ }
+
+ for (half = 16; half < inlen; half += 16) {
+ memset(out, 0, sizeof(out));
+ memset(&poly1305, 0, sizeof(poly1305));
+ poly1305_init(&poly1305, key, simd_context);
+ poly1305_update(&poly1305, in, half, simd_context);
+ poly1305_update(&poly1305, in + half, inlen - half, simd_context);
+ poly1305_finish(&poly1305, out, simd_context);
+ if (memcmp(out, expected, expectedlen)) {
+ pr_info("poly1305 self-test %zu/%zu+%zu: FAIL\n", i + 1, half, inlen - half);
+ success = false;
+ }
+ }
+ }
+ }
+ simd_put(simd_context);
+
+ if (success)
+ pr_info("poly1305 self-tests: pass\n");
+
+ return success;
+}
+#endif
--
2.18.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox