/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifdef USE_HW_SHA1
#ifndef __ARM_FEATURE_CRYPTO
#error "Compiler option is invalid"
#endif
#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif
#include <arm_neon.h>
#include <memory.h>
#include "blapi.h"
#include "sha_fast.h"
#if !defined(SHA_PUT_W_IN_STACK)
#define H2X 11
#else
#define H2X 0
#endif
static void shaCompress(SHA_HW_t *X, const PRUint32 *datain);
/*
 * SHA: Compress the (full) block currently buffered in ctx->u.w using
 * the hardware compression function.  H2X compensates for the layout
 * difference selected by SHA_PUT_W_IN_STACK (see #define above).
 */
void
SHA1_Compress_Native(SHA1Context *ctx)
{
shaCompress(&ctx->H[H2X], ctx->u.w);
}
/*
* SHA: Add data to context.
*/
/*
 * SHA: Add data to context.
 *
 * Buffers partial input in ctx->u.b and runs the hardware compression
 * function on every complete 64-byte block.  Full blocks are hashed
 * directly out of the caller's buffer without copying.
 */
void
SHA1_Update_Native(SHA1Context *ctx, const unsigned char *dataIn, unsigned int len)
{
    unsigned int buffered;
    unsigned int fill;

    if (len == 0) {
        return;
    }

    /* Bytes already sitting in the partial-block buffer. */
    buffered = (unsigned int)(ctx->size) & 63U;
    /* Accumulate the total byte count. */
    ctx->size += len;

    /* First, top up a previously started block. */
    if (buffered > 0) {
        fill = 64U - buffered;
        if (fill > len) {
            fill = len;
        }
        memcpy(ctx->u.b + buffered, dataIn, fill);
        dataIn += fill;
        len -= fill;
        /* Compress once the buffer reaches a full 64 bytes. */
        if (((buffered + fill) & 63U) == 0) {
            shaCompress(&ctx->H[H2X], ctx->u.w);
        }
    }

    /* Hash whole blocks straight from the input. */
    for (; len >= 64U; dataIn += 64U, len -= 64U) {
        shaCompress(&ctx->H[H2X], (PRUint32 *)dataIn);
    }

    /* Stash any trailing partial block for the next call. */
    if (len > 0) {
        memcpy(ctx->u.b, dataIn, len);
    }
}
/*
* SHA: Compression function, unrolled.
*/
/*
 * SHA: Compression function, unrolled.
 *
 * Processes one 64-byte message block with the ARMv8 Cryptography
 * Extension SHA-1 instructions (ACLE intrinsics).  X points at the
 * chaining state H[0..4] (biased by H2X; XH() removes the bias) and
 * inbuf at the 16 message words of the block as stored in memory.
 *
 * Statement order is deliberate: the scalar e/tmpE values ping-pong
 * between rounds (vsha1h computes rol30 of the "a" lane one round
 * ahead) and the message-schedule updates (vsha1su0/vsha1su1) are
 * interleaved with the round computations for pipelining.
 */
static void
shaCompress(SHA_HW_t *X, const PRUint32 *inbuf)
{
/* XH(n) addresses H[n]: the caller passed &H[H2X], so undo the bias. */
#define XH(n) X[n - H2X]
/* The four SHA-1 round constants, splatted across all lanes. */
const uint32x4_t K0 = vdupq_n_u32(0x5a827999);
const uint32x4_t K1 = vdupq_n_u32(0x6ed9eba1);
const uint32x4_t K2 = vdupq_n_u32(0x8f1bbcdc);
const uint32x4_t K3 = vdupq_n_u32(0xca62c1d6);
/* abcd holds state words a..d in one vector; e stays scalar. */
uint32x4_t abcd = vld1q_u32(&XH(0));
PRUint32 e = XH(4);
/* Saved so the chaining values can be added back at the end. */
const uint32x4_t origABCD = abcd;
const PRUint32 origE = e;
uint32x4_t w0 = vld1q_u32(inbuf);
uint32x4_t w1 = vld1q_u32(inbuf + 4);
uint32x4_t w2 = vld1q_u32(inbuf + 8);
uint32x4_t w3 = vld1q_u32(inbuf + 12);
/* SHA-1 consumes big-endian words: reverse the bytes in each
 * 32-bit lane of the freshly loaded block. */
w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));
/* t0/t1 pre-add the round constant to the next schedule words. */
uint32x4_t t0 = vaddq_u32(w0, K0);
uint32x4_t t1 = vaddq_u32(w1, K0);
PRUint32 tmpE;
/*
 * Using the following ARM instructions to accelerate SHA1
 *
 * sha1c for round 0 - 20
 * sha1p for round 20 - 40
 * sha1m for round 40 - 60
 * sha1p for round 60 - 80
 * sha1su0 and shasu1 for message schedule
 * sha1h for rotate left 30
 */
/* Round 0-3 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1cq_u32(abcd, e, t0);
t0 = vaddq_u32(w2, K0);
w0 = vsha1su0q_u32(w0, w1, w2);
/* Round 4-7 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1cq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w3, K0);
w0 = vsha1su1q_u32(w0, w3);
w1 = vsha1su0q_u32(w1, w2, w3);
/* Round 8-11 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1cq_u32(abcd, e, t0);
t0 = vaddq_u32(w0, K0);
w1 = vsha1su1q_u32(w1, w0);
w2 = vsha1su0q_u32(w2, w3, w0);
/* Round 12-15 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1cq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w1, K1);
w2 = vsha1su1q_u32(w2, w1);
w3 = vsha1su0q_u32(w3, w0, w1);
/* Round 16-19 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1cq_u32(abcd, e, t0);
t0 = vaddq_u32(w2, K1);
w3 = vsha1su1q_u32(w3, w2);
w0 = vsha1su0q_u32(w0, w1, w2);
/* Round 20-23 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w3, K1);
w0 = vsha1su1q_u32(w0, w3);
w1 = vsha1su0q_u32(w1, w2, w3);
/* Round 24-27 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, e, t0);
t0 = vaddq_u32(w0, K1);
w1 = vsha1su1q_u32(w1, w0);
w2 = vsha1su0q_u32(w2, w3, w0);
/* Round 28-31 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w1, K1);
w2 = vsha1su1q_u32(w2, w1);
w3 = vsha1su0q_u32(w3, w0, w1);
/* Round 32-35 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, e, t0);
t0 = vaddq_u32(w2, K2);
w3 = vsha1su1q_u32(w3, w2);
w0 = vsha1su0q_u32(w0, w1, w2);
/* Round 36-39 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w3, K2);
w0 = vsha1su1q_u32(w0, w3);
w1 = vsha1su0q_u32(w1, w2, w3);
/* Round 40-43 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1mq_u32(abcd, e, t0);
t0 = vaddq_u32(w0, K2);
w1 = vsha1su1q_u32(w1, w0);
w2 = vsha1su0q_u32(w2, w3, w0);
/* Round 44-47 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1mq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w1, K2);
w2 = vsha1su1q_u32(w2, w1);
w3 = vsha1su0q_u32(w3, w0, w1);
/* Round 48-51 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1mq_u32(abcd, e, t0);
t0 = vaddq_u32(w2, K2);
w3 = vsha1su1q_u32(w3, w2);
w0 = vsha1su0q_u32(w0, w1, w2);
/* Round 52-55 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1mq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w3, K3);
w0 = vsha1su1q_u32(w0, w3);
w1 = vsha1su0q_u32(w1, w2, w3);
/* Round 56-59 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1mq_u32(abcd, e, t0);
t0 = vaddq_u32(w0, K3);
w1 = vsha1su1q_u32(w1, w0);
w2 = vsha1su0q_u32(w2, w3, w0);
/* Round 60-63 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w1, K3);
w2 = vsha1su1q_u32(w2, w1);
w3 = vsha1su0q_u32(w3, w0, w1);
/* Round 64-67 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, e, t0);
t0 = vaddq_u32(w2, K3);
w3 = vsha1su1q_u32(w3, w2);
w0 = vsha1su0q_u32(w0, w1, w2);
/* Round 68-71: last schedule extension -- no more su0 needed. */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, tmpE, t1);
t1 = vaddq_u32(w3, K3);
w0 = vsha1su1q_u32(w0, w3);
/* Round 72-75 */
tmpE = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, e, t0);
/* Round 76-79 */
e = vsha1h_u32(vgetq_lane_u32(abcd, 0));
abcd = vsha1pq_u32(abcd, tmpE, t1);
/* Add this block's result back into the chaining values. */
e += origE;
abcd = vaddq_u32(origABCD, abcd);
vst1q_u32(&XH(0), abcd);
XH(4) = e;
}
#endif /* USE_HW_SHA1 */