Home | History | Annotate | Download | only in sha1
      1 /*
      2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*
      7  * The basic framework for this code came from the reference
      8  * implementation for MD5.  That implementation is Copyright (C)
      9  * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
     10  *
     11  * License to copy and use this software is granted provided that it
     12  * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
     13  * Algorithm" in all material mentioning or referencing this software
     14  * or this function.
     15  *
     16  * License is also granted to make and use derivative works provided
     17  * that such works are identified as "derived from the RSA Data
     18  * Security, Inc. MD5 Message-Digest Algorithm" in all material
     19  * mentioning or referencing the derived work.
     20  *
     21  * RSA Data Security, Inc. makes no representations concerning either
     22  * the merchantability of this software or the suitability of this
     23  * software for any particular purpose. It is provided "as is"
     24  * without express or implied warranty of any kind.
     25  *
     26  * These notices must be retained in any copies of any part of this
     27  * documentation and/or software.
     28  *
     29  * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
     30  * standard, available at http://www.itl.nist.gov/fipspubs/fip180-1.htm
     31  * Not as fast as one would like -- further optimizations are encouraged
     32  * and appreciated.
     33  */
     34 
     35 #ifndef _KERNEL
     36 #include <stdint.h>
     37 #include <strings.h>
     38 #include <stdlib.h>
     39 #include <errno.h>
     40 #include <sys/systeminfo.h>
     41 #endif  /* !_KERNEL */
     42 
     43 #include <sys/types.h>
     44 #include <sys/param.h>
     45 #include <sys/systm.h>
     46 #include <sys/sysmacros.h>
     47 #include <sys/sha1.h>
     48 #include <sys/sha1_consts.h>
     49 
     50 #ifdef _LITTLE_ENDIAN
     51 #include <sys/byteorder.h>
     52 #define	HAVE_HTONL
     53 #endif
     54 
     55 static void Encode(uint8_t *, const uint32_t *, size_t);
     56 
     57 #if	defined(__sparc)
     58 
     59 #define	SHA1_TRANSFORM(ctx, in) \
     60 	SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
     61 		(ctx)->state[3], (ctx)->state[4], (ctx), (in))
     62 
     63 static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
     64     SHA1_CTX *, const uint8_t *);
     65 
     66 #elif	defined(__amd64)
     67 
     68 #define	SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
     69 #define	SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
     70 		(in), (num))
     71 
     72 void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);
     73 
     74 #else
     75 
     76 #define	SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))
     77 
     78 static void SHA1Transform(SHA1_CTX *, const uint8_t *);
     79 
     80 #endif
     81 
     82 
     83 static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
     84 
     85 /*
     86  * F, G, and H are the basic SHA1 functions.
     87  */
     88 #define	F(b, c, d)	(((b) & (c)) | ((~b) & (d)))
     89 #define	G(b, c, d)	((b) ^ (c) ^ (d))
     90 #define	H(b, c, d)	(((b) & (c)) | (((b)|(c)) & (d)))
     91 
     92 /*
     93  * ROTATE_LEFT rotates x left n bits.
     94  */
     95 
     96 #if	defined(__GNUC__) && defined(_LP64)
     97 static __inline__ uint64_t
     98 ROTATE_LEFT(uint64_t value, uint32_t n)
     99 {
    100 	uint32_t t32;
    101 
    102 	t32 = (uint32_t)value;
    103 	return ((t32 << n) | (t32 >> (32 - n)));
    104 }
    105 
    106 #else
    107 
    108 #define	ROTATE_LEFT(x, n)	\
    109 	(((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n))))
    110 
    111 #endif
    112 
    113 
    114 /*
    115  * SHA1Init()
    116  *
    117  * purpose: initializes the sha1 context and begins and sha1 digest operation
    118  *   input: SHA1_CTX *	: the context to initializes.
    119  *  output: void
    120  */
    121 
    122 void
    123 SHA1Init(SHA1_CTX *ctx)
    124 {
    125 	ctx->count[0] = ctx->count[1] = 0;
    126 
    127 	/*
    128 	 * load magic initialization constants. Tell lint
    129 	 * that these constants are unsigned by using U.
    130 	 */
    131 
    132 	ctx->state[0] = 0x67452301U;
    133 	ctx->state[1] = 0xefcdab89U;
    134 	ctx->state[2] = 0x98badcfeU;
    135 	ctx->state[3] = 0x10325476U;
    136 	ctx->state[4] = 0xc3d2e1f0U;
    137 }
    138 
    139 #ifdef VIS_SHA1
    140 #ifdef _KERNEL
    141 
    142 #include <sys/regset.h>
    143 #include <sys/vis.h>
    144 #include <sys/fpu/fpusystm.h>
    145 
    146 /* the alignment for block stores to save fp registers */
    147 #define	VIS_ALIGN	(64)
    148 
    149 extern int sha1_savefp(kfpu_t *, int);
    150 extern void sha1_restorefp(kfpu_t *);
    151 
    152 uint32_t	vis_sha1_svfp_threshold = 128;
    153 
    154 #endif /* _KERNEL */
    155 
    156 /*
    157  * VIS SHA-1 consts.
    158  */
    159 static uint64_t VIS[] = {
    160 	0x8000000080000000ULL,
    161 	0x0002000200020002ULL,
    162 	0x5a8279996ed9eba1ULL,
    163 	0x8f1bbcdcca62c1d6ULL,
    164 	0x012389ab456789abULL};
    165 
    166 extern void SHA1TransformVIS(uint64_t *, uint32_t *, uint32_t *, uint64_t *);
    167 
    168 
    169 /*
    170  * SHA1Update()
    171  *
    172  * purpose: continues an sha1 digest operation, using the message block
    173  *          to update the context.
    174  *   input: SHA1_CTX *	: the context to update
    175  *          void *	: the message block
    176  *          size_t    : the length of the message block in bytes
    177  *  output: void
    178  */
    179 
    180 void
    181 SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
    182 {
    183 	uint32_t i, buf_index, buf_len;
    184 	uint64_t X0[40], input64[8];
    185 	const uint8_t *input = inptr;
    186 #ifdef _KERNEL
    187 	int usevis = 0;
    188 #else
    189 	int usevis = 1;
    190 #endif /* _KERNEL */
    191 
    192 	/* check for noop */
    193 	if (input_len == 0)
    194 		return;
    195 
    196 	/* compute number of bytes mod 64 */
    197 	buf_index = (ctx->count[1] >> 3) & 0x3F;
    198 
    199 	/* update number of bits */
    200 	if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
    201 		ctx->count[0]++;
    202 
    203 	ctx->count[0] += (input_len >> 29);
    204 
    205 	buf_len = 64 - buf_index;
    206 
    207 	/* transform as many times as possible */
    208 	i = 0;
    209 	if (input_len >= buf_len) {
    210 #ifdef _KERNEL
    211 		kfpu_t *fpu;
    212 		if (fpu_exists) {
    213 			uint8_t fpua[sizeof (kfpu_t) + GSR_SIZE + VIS_ALIGN];
    214 			uint32_t len = (input_len + buf_index) & ~0x3f;
    215 			int svfp_ok;
    216 
    217 			fpu = (kfpu_t *)P2ROUNDUP((uintptr_t)fpua, 64);
    218 			svfp_ok = ((len >= vis_sha1_svfp_threshold) ? 1 : 0);
    219 			usevis = fpu_exists && sha1_savefp(fpu, svfp_ok);
    220 		} else {
    221 			usevis = 0;
    222 		}
    223 #endif /* _KERNEL */
    224 
    225 		/*
    226 		 * general optimization:
    227 		 *
    228 		 * only do initial bcopy() and SHA1Transform() if
    229 		 * buf_index != 0.  if buf_index == 0, we're just
    230 		 * wasting our time doing the bcopy() since there
    231 		 * wasn't any data left over from a previous call to
    232 		 * SHA1Update().
    233 		 */
    234 
    235 		if (buf_index) {
    236 			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
    237 			if (usevis) {
    238 				SHA1TransformVIS(X0,
    239 				    ctx->buf_un.buf32,
    240 				    &ctx->state[0], VIS);
    241 			} else {
    242 				SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
    243 			}
    244 			i = buf_len;
    245 		}
    246 
    247 		/*
    248 		 * VIS SHA-1: uses the VIS 1.0 instructions to accelerate
    249 		 * SHA-1 processing. This is achieved by "offloading" the
    250 		 * computation of the message schedule (MS) to the VIS units.
    251 		 * This allows the VIS computation of the message schedule
    252 		 * to be performed in parallel with the standard integer
    253 		 * processing of the remainder of the SHA-1 computation.
    254 		 * performance by up to around 1.37X, compared to an optimized
    255 		 * integer-only implementation.
    256 		 *
    257 		 * The VIS implementation of SHA1Transform has a different API
    258 		 * to the standard integer version:
    259 		 *
    260 		 * void SHA1TransformVIS(
    261 		 *	 uint64_t *, // Pointer to MS for ith block
    262 		 *	 uint32_t *, // Pointer to ith block of message data
    263 		 *	 uint32_t *, // Pointer to SHA state i.e ctx->state
    264 		 *	 uint64_t *, // Pointer to various VIS constants
    265 		 * )
    266 		 *
    267 		 * Note: the message data must by 4-byte aligned.
    268 		 *
    269 		 * Function requires VIS 1.0 support.
    270 		 *
    271 		 * Handling is provided to deal with arbitrary byte alingment
    272 		 * of the input data but the performance gains are reduced
    273 		 * for alignments other than 4-bytes.
    274 		 */
    275 		if (usevis) {
    276 			if (!IS_P2ALIGNED(&input[i], sizeof (uint32_t))) {
    277 				/*
    278 				 * Main processing loop - input misaligned
    279 				 */
    280 				for (; i + 63 < input_len; i += 64) {
    281 					bcopy(&input[i], input64, 64);
    282 					SHA1TransformVIS(X0,
    283 					    (uint32_t *)input64,
    284 					    &ctx->state[0], VIS);
    285 				}
    286 			} else {
    287 				/*
    288 				 * Main processing loop - input 8-byte aligned
    289 				 */
    290 				for (; i + 63 < input_len; i += 64) {
    291 					SHA1TransformVIS(X0,
    292 					    /* LINTED E_BAD_PTR_CAST_ALIGN */
    293 					    (uint32_t *)&input[i], /* CSTYLED */
    294 					    &ctx->state[0], VIS);
    295 				}
    296 
    297 			}
    298 #ifdef _KERNEL
    299 			sha1_restorefp(fpu);
    300 #endif /* _KERNEL */
    301 		} else {
    302 			for (; i + 63 < input_len; i += 64) {
    303 				SHA1_TRANSFORM(ctx, &input[i]);
    304 			}
    305 		}
    306 
    307 		/*
    308 		 * general optimization:
    309 		 *
    310 		 * if i and input_len are the same, return now instead
    311 		 * of calling bcopy(), since the bcopy() in this case
    312 		 * will be an expensive nop.
    313 		 */
    314 
    315 		if (input_len == i)
    316 			return;
    317 
    318 		buf_index = 0;
    319 	}
    320 
    321 	/* buffer remaining input */
    322 	bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
    323 }
    324 
    325 #else /* VIS_SHA1 */
    326 
    327 void
    328 SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
    329 {
    330 	uint32_t i, buf_index, buf_len;
    331 	const uint8_t *input = inptr;
    332 #if defined(__amd64)
    333 	uint32_t	block_count;
    334 #endif	/* __amd64 */
    335 
    336 	/* check for noop */
    337 	if (input_len == 0)
    338 		return;
    339 
    340 	/* compute number of bytes mod 64 */
    341 	buf_index = (ctx->count[1] >> 3) & 0x3F;
    342 
    343 	/* update number of bits */
    344 	if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
    345 		ctx->count[0]++;
    346 
    347 	ctx->count[0] += (input_len >> 29);
    348 
    349 	buf_len = 64 - buf_index;
    350 
    351 	/* transform as many times as possible */
    352 	i = 0;
    353 	if (input_len >= buf_len) {
    354 
    355 		/*
    356 		 * general optimization:
    357 		 *
    358 		 * only do initial bcopy() and SHA1Transform() if
    359 		 * buf_index != 0.  if buf_index == 0, we're just
    360 		 * wasting our time doing the bcopy() since there
    361 		 * wasn't any data left over from a previous call to
    362 		 * SHA1Update().
    363 		 */
    364 
    365 		if (buf_index) {
    366 			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
    367 			SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
    368 			i = buf_len;
    369 		}
    370 
    371 #if !defined(__amd64)
    372 		for (; i + 63 < input_len; i += 64)
    373 			SHA1_TRANSFORM(ctx, &input[i]);
    374 #else
    375 		block_count = (input_len - i) >> 6;
    376 		if (block_count > 0) {
    377 			SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
    378 			i += block_count << 6;
    379 		}
    380 #endif	/* !__amd64 */
    381 
    382 		/*
    383 		 * general optimization:
    384 		 *
    385 		 * if i and input_len are the same, return now instead
    386 		 * of calling bcopy(), since the bcopy() in this case
    387 		 * will be an expensive nop.
    388 		 */
    389 
    390 		if (input_len == i)
    391 			return;
    392 
    393 		buf_index = 0;
    394 	}
    395 
    396 	/* buffer remaining input */
    397 	bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
    398 }
    399 
    400 #endif /* VIS_SHA1 */
    401 
    402 /*
    403  * SHA1Final()
    404  *
    405  * purpose: ends an sha1 digest operation, finalizing the message digest and
    406  *          zeroing the context.
    407  *   input: uchar_t *	: A buffer to store the digest.
    408  *			: The function actually uses void* because many
    409  *			: callers pass things other than uchar_t here.
    410  *          SHA1_CTX *  : the context to finalize, save, and zero
    411  *  output: void
    412  */
    413 
    414 void
    415 SHA1Final(void *digest, SHA1_CTX *ctx)
    416 {
    417 	uint8_t		bitcount_be[sizeof (ctx->count)];
    418 	uint32_t	index = (ctx->count[1] >> 3) & 0x3f;
    419 
    420 	/* store bit count, big endian */
    421 	Encode(bitcount_be, ctx->count, sizeof (bitcount_be));
    422 
    423 	/* pad out to 56 mod 64 */
    424 	SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
    425 
    426 	/* append length (before padding) */
    427 	SHA1Update(ctx, bitcount_be, sizeof (bitcount_be));
    428 
    429 	/* store state in digest */
    430 	Encode(digest, ctx->state, sizeof (ctx->state));
    431 
    432 	/* zeroize sensitive information */
    433 	bzero(ctx, sizeof (*ctx));
    434 }
    435 
    436 
    437 #if !defined(__amd64)
    438 
    439 typedef uint32_t sha1word;
    440 
    441 /*
    442  * sparc optimization:
    443  *
    444  * on the sparc, we can load big endian 32-bit data easily.  note that
    445  * special care must be taken to ensure the address is 32-bit aligned.
    446  * in the interest of speed, we don't check to make sure, since
    447  * careful programming can guarantee this for us.
    448  */
    449 
    450 #if	defined(_BIG_ENDIAN)
    451 #define	LOAD_BIG_32(addr)	(*(uint32_t *)(addr))
    452 
    453 #elif	defined(HAVE_HTONL)
    454 #define	LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr)))
    455 
    456 #else
    457 /* little endian -- will work on big endian, but slowly */
    458 #define	LOAD_BIG_32(addr)	\
    459 	(((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3])
    460 #endif	/* _BIG_ENDIAN */
    461 
    462 /*
    463  * SHA1Transform()
    464  */
    465 #if	defined(W_ARRAY)
    466 #define	W(n) w[n]
    467 #else	/* !defined(W_ARRAY) */
    468 #define	W(n) w_ ## n
    469 #endif	/* !defined(W_ARRAY) */
    470 
    471 
    472 #if	defined(__sparc)
    473 
    474 /*
    475  * sparc register window optimization:
    476  *
    477  * `a', `b', `c', `d', and `e' are passed into SHA1Transform
    478  * explicitly since it increases the number of registers available to
    479  * the compiler.  under this scheme, these variables can be held in
    480  * %i0 - %i4, which leaves more local and out registers available.
    481  *
    482  * purpose: sha1 transformation -- updates the digest based on `block'
    483  *   input: uint32_t	: bytes  1 -  4 of the digest
    484  *          uint32_t	: bytes  5 -  8 of the digest
    485  *          uint32_t	: bytes  9 - 12 of the digest
    486  *          uint32_t	: bytes 13 - 16 of the digest
    487  *          uint32_t	: bytes 17 - 20 of the digest
    488  *          SHA1_CTX *	: the context to update
    489  *          uint8_t [64]: the block to use to update the digest
    490  *  output: void
    491  */
    492 
    493 void
    494 SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
    495     SHA1_CTX *ctx, const uint8_t blk[64])
    496 {
    497 	/*
    498 	 * sparc optimization:
    499 	 *
    500 	 * while it is somewhat counter-intuitive, on sparc, it is
    501 	 * more efficient to place all the constants used in this
    502 	 * function in an array and load the values out of the array
    503 	 * than to manually load the constants.  this is because
    504 	 * setting a register to a 32-bit value takes two ops in most
    505 	 * cases: a `sethi' and an `or', but loading a 32-bit value
    506 	 * from memory only takes one `ld' (or `lduw' on v9).  while
    507 	 * this increases memory usage, the compiler can find enough
    508 	 * other things to do while waiting so that the pipeline does
    509 	 * not stall.  additionally, it is likely that many of these
    510 	 * constants are cached so that later accesses do not even go
    511 	 * out to the bus.
    512 	 *
    513 	 * this array is declared `static' to keep the compiler from
    514 	 * having to bcopy() this array onto the stack frame of
    515 	 * SHA1Transform() each time it is called -- which is
    516 	 * unacceptably expensive.
    517 	 *
    518 	 * the `const' is to ensure that callers are good citizens and
    519 	 * do not try to munge the array.  since these routines are
    520 	 * going to be called from inside multithreaded kernelland,
    521 	 * this is a good safety check. -- `sha1_consts' will end up in
    522 	 * .rodata.
    523 	 *
    524 	 * unfortunately, loading from an array in this manner hurts
    525 	 * performance under Intel.  So, there is a macro,
    526 	 * SHA1_CONST(), used in SHA1Transform(), that either expands to
    527 	 * a reference to this array, or to the actual constant,
    528 	 * depending on what platform this code is compiled for.
    529 	 */
    530 
    531 	static const uint32_t sha1_consts[] = {
    532 		SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3
    533 	};
    534 
    535 	/*
    536 	 * general optimization:
    537 	 *
    538 	 * use individual integers instead of using an array.  this is a
    539 	 * win, although the amount it wins by seems to vary quite a bit.
    540 	 */
    541 
    542 	uint32_t	w_0, w_1, w_2,  w_3,  w_4,  w_5,  w_6,  w_7;
    543 	uint32_t	w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
    544 
    545 	/*
    546 	 * sparc optimization:
    547 	 *
    548 	 * if `block' is already aligned on a 4-byte boundary, use
    549 	 * LOAD_BIG_32() directly.  otherwise, bcopy() into a
    550 	 * buffer that *is* aligned on a 4-byte boundary and then do
    551 	 * the LOAD_BIG_32() on that buffer.  benchmarks have shown
    552 	 * that using the bcopy() is better than loading the bytes
    553 	 * individually and doing the endian-swap by hand.
    554 	 *
    555 	 * even though it's quite tempting to do:
    556 	 *
    557 	 * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32));
    558 	 *
    559 	 * and only have one set of LOAD_BIG_32()'s, the compiler
    560 	 * *does not* like that, so please resist the urge.
    561 	 */
    562 
    563 	if ((uintptr_t)blk & 0x3) {		/* not 4-byte aligned? */
    564 		bcopy(blk, ctx->buf_un.buf32,  sizeof (ctx->buf_un.buf32));
    565 		w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15);
    566 		w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14);
    567 		w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13);
    568 		w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12);
    569 		w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11);
    570 		w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10);
    571 		w_9  = LOAD_BIG_32(ctx->buf_un.buf32 +  9);
    572 		w_8  = LOAD_BIG_32(ctx->buf_un.buf32 +  8);
    573 		w_7  = LOAD_BIG_32(ctx->buf_un.buf32 +  7);
    574 		w_6  = LOAD_BIG_32(ctx->buf_un.buf32 +  6);
    575 		w_5  = LOAD_BIG_32(ctx->buf_un.buf32 +  5);
    576 		w_4  = LOAD_BIG_32(ctx->buf_un.buf32 +  4);
    577 		w_3  = LOAD_BIG_32(ctx->buf_un.buf32 +  3);
    578 		w_2  = LOAD_BIG_32(ctx->buf_un.buf32 +  2);
    579 		w_1  = LOAD_BIG_32(ctx->buf_un.buf32 +  1);
    580 		w_0  = LOAD_BIG_32(ctx->buf_un.buf32 +  0);
    581 	} else {
    582 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    583 		w_15 = LOAD_BIG_32(blk + 60);
    584 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    585 		w_14 = LOAD_BIG_32(blk + 56);
    586 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    587 		w_13 = LOAD_BIG_32(blk + 52);
    588 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    589 		w_12 = LOAD_BIG_32(blk + 48);
    590 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    591 		w_11 = LOAD_BIG_32(blk + 44);
    592 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    593 		w_10 = LOAD_BIG_32(blk + 40);
    594 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    595 		w_9  = LOAD_BIG_32(blk + 36);
    596 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    597 		w_8  = LOAD_BIG_32(blk + 32);
    598 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    599 		w_7  = LOAD_BIG_32(blk + 28);
    600 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    601 		w_6  = LOAD_BIG_32(blk + 24);
    602 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    603 		w_5  = LOAD_BIG_32(blk + 20);
    604 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    605 		w_4  = LOAD_BIG_32(blk + 16);
    606 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    607 		w_3  = LOAD_BIG_32(blk + 12);
    608 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    609 		w_2  = LOAD_BIG_32(blk +  8);
    610 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    611 		w_1  = LOAD_BIG_32(blk +  4);
    612 		/* LINTED E_BAD_PTR_CAST_ALIGN */
    613 		w_0  = LOAD_BIG_32(blk +  0);
    614 	}
    615 #else	/* !defined(__sparc) */
    616 
    617 void /* CSTYLED */
    618 SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64])
    619 {
    620 	/* CSTYLED */
    621 	sha1word a = ctx->state[0];
    622 	sha1word b = ctx->state[1];
    623 	sha1word c = ctx->state[2];
    624 	sha1word d = ctx->state[3];
    625 	sha1word e = ctx->state[4];
    626 
    627 #if	defined(W_ARRAY)
    628 	sha1word	w[16];
    629 #else	/* !defined(W_ARRAY) */
    630 	sha1word	w_0, w_1, w_2,  w_3,  w_4,  w_5,  w_6,  w_7;
    631 	sha1word	w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
    632 #endif	/* !defined(W_ARRAY) */
    633 
    634 	W(0)  = LOAD_BIG_32((void *)(blk +  0));
    635 	W(1)  = LOAD_BIG_32((void *)(blk +  4));
    636 	W(2)  = LOAD_BIG_32((void *)(blk +  8));
    637 	W(3)  = LOAD_BIG_32((void *)(blk + 12));
    638 	W(4)  = LOAD_BIG_32((void *)(blk + 16));
    639 	W(5)  = LOAD_BIG_32((void *)(blk + 20));
    640 	W(6)  = LOAD_BIG_32((void *)(blk + 24));
    641 	W(7)  = LOAD_BIG_32((void *)(blk + 28));
    642 	W(8)  = LOAD_BIG_32((void *)(blk + 32));
    643 	W(9)  = LOAD_BIG_32((void *)(blk + 36));
    644 	W(10) = LOAD_BIG_32((void *)(blk + 40));
    645 	W(11) = LOAD_BIG_32((void *)(blk + 44));
    646 	W(12) = LOAD_BIG_32((void *)(blk + 48));
    647 	W(13) = LOAD_BIG_32((void *)(blk + 52));
    648 	W(14) = LOAD_BIG_32((void *)(blk + 56));
    649 	W(15) = LOAD_BIG_32((void *)(blk + 60));
    650 
    651 #endif	/* !defined(__sparc) */
    652 
    653 	/*
    654 	 * general optimization:
    655 	 *
    656 	 * even though this approach is described in the standard as
    657 	 * being slower algorithmically, it is 30-40% faster than the
    658 	 * "faster" version under SPARC, because this version has more
    659 	 * of the constraints specified at compile-time and uses fewer
    660 	 * variables (and therefore has better register utilization)
    661 	 * than its "speedier" brother.  (i've tried both, trust me)
    662 	 *
    663 	 * for either method given in the spec, there is an "assignment"
    664 	 * phase where the following takes place:
    665 	 *
    666 	 *	tmp = (main_computation);
    667 	 *	e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp;
    668 	 *
    669 	 * we can make the algorithm go faster by not doing this work,
    670 	 * but just pretending that `d' is now `e', etc. this works
    671 	 * really well and obviates the need for a temporary variable.
    672 	 * however, we still explicitly perform the rotate action,
    673 	 * since it is cheaper on SPARC to do it once than to have to
    674 	 * do it over and over again.
    675 	 */
    676 
    677 	/* round 1 */
    678 	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */
    679 	b = ROTATE_LEFT(b, 30);
    680 
    681 	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */
    682 	a = ROTATE_LEFT(a, 30);
    683 
    684 	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */
    685 	e = ROTATE_LEFT(e, 30);
    686 
    687 	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */
    688 	d = ROTATE_LEFT(d, 30);
    689 
    690 	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */
    691 	c = ROTATE_LEFT(c, 30);
    692 
    693 	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */
    694 	b = ROTATE_LEFT(b, 30);
    695 
    696 	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */
    697 	a = ROTATE_LEFT(a, 30);
    698 
    699 	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */
    700 	e = ROTATE_LEFT(e, 30);
    701 
    702 	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */
    703 	d = ROTATE_LEFT(d, 30);
    704 
    705 	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */
    706 	c = ROTATE_LEFT(c, 30);
    707 
    708 	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */
    709 	b = ROTATE_LEFT(b, 30);
    710 
    711 	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */
    712 	a = ROTATE_LEFT(a, 30);
    713 
    714 	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */
    715 	e = ROTATE_LEFT(e, 30);
    716 
    717 	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */
    718 	d = ROTATE_LEFT(d, 30);
    719 
    720 	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */
    721 	c = ROTATE_LEFT(c, 30);
    722 
    723 	e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */
    724 	b = ROTATE_LEFT(b, 30);
    725 
    726 	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 16 */
    727 	d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0);
    728 	a = ROTATE_LEFT(a, 30);
    729 
    730 	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 17 */
    731 	c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0);
    732 	e = ROTATE_LEFT(e, 30);
    733 
    734 	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 18 */
    735 	b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0);
    736 	d = ROTATE_LEFT(d, 30);
    737 
    738 	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 19 */
    739 	a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0);
    740 	c = ROTATE_LEFT(c, 30);
    741 
    742 	/* round 2 */
    743 	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 20 */
    744 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1);
    745 	b = ROTATE_LEFT(b, 30);
    746 
    747 	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 21 */
    748 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1);
    749 	a = ROTATE_LEFT(a, 30);
    750 
    751 	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 22 */
    752 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1);
    753 	e = ROTATE_LEFT(e, 30);
    754 
    755 	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 23 */
    756 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1);
    757 	d = ROTATE_LEFT(d, 30);
    758 
    759 	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 24 */
    760 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1);
    761 	c = ROTATE_LEFT(c, 30);
    762 
    763 	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 25 */
    764 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1);
    765 	b = ROTATE_LEFT(b, 30);
    766 
    767 	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 26 */
    768 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1);
    769 	a = ROTATE_LEFT(a, 30);
    770 
    771 	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 27 */
    772 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1);
    773 	e = ROTATE_LEFT(e, 30);
    774 
    775 	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 28 */
    776 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1);
    777 	d = ROTATE_LEFT(d, 30);
    778 
    779 	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 29 */
    780 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1);
    781 	c = ROTATE_LEFT(c, 30);
    782 
    783 	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 30 */
    784 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1);
    785 	b = ROTATE_LEFT(b, 30);
    786 
    787 	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 31 */
    788 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1);
    789 	a = ROTATE_LEFT(a, 30);
    790 
    791 	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 32 */
    792 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1);
    793 	e = ROTATE_LEFT(e, 30);
    794 
    795 	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 33 */
    796 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1);
    797 	d = ROTATE_LEFT(d, 30);
    798 
    799 	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 34 */
    800 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1);
    801 	c = ROTATE_LEFT(c, 30);
    802 
    803 	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 35 */
    804 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1);
    805 	b = ROTATE_LEFT(b, 30);
    806 
    807 	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 36 */
    808 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1);
    809 	a = ROTATE_LEFT(a, 30);
    810 
    811 	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 37 */
    812 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1);
    813 	e = ROTATE_LEFT(e, 30);
    814 
    815 	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 38 */
    816 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1);
    817 	d = ROTATE_LEFT(d, 30);
    818 
    819 	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 39 */
    820 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1);
    821 	c = ROTATE_LEFT(c, 30);
    822 
    823 	/* round 3 */
    824 	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 40 */
    825 	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2);
    826 	b = ROTATE_LEFT(b, 30);
    827 
    828 	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 41 */
    829 	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2);
    830 	a = ROTATE_LEFT(a, 30);
    831 
    832 	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 42 */
    833 	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2);
    834 	e = ROTATE_LEFT(e, 30);
    835 
    836 	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 43 */
    837 	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2);
    838 	d = ROTATE_LEFT(d, 30);
    839 
    840 	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 44 */
    841 	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2);
    842 	c = ROTATE_LEFT(c, 30);
    843 
    844 	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 45 */
    845 	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2);
    846 	b = ROTATE_LEFT(b, 30);
    847 
    848 	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 46 */
    849 	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(14) + SHA1_CONST(2);
    850 	a = ROTATE_LEFT(a, 30);
    851 
    852 	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 47 */
    853 	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2);
    854 	e = ROTATE_LEFT(e, 30);
    855 
    856 	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 48 */
    857 	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2);
    858 	d = ROTATE_LEFT(d, 30);
    859 
    860 	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 49 */
    861 	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2);
    862 	c = ROTATE_LEFT(c, 30);
    863 
    864 	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 50 */
    865 	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2);
    866 	b = ROTATE_LEFT(b, 30);
    867 
    868 	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 51 */
    869 	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2);
    870 	a = ROTATE_LEFT(a, 30);
    871 
    872 	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 52 */
    873 	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2);
    874 	e = ROTATE_LEFT(e, 30);
    875 
    876 	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 53 */
    877 	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2);
    878 	d = ROTATE_LEFT(d, 30);
    879 
    880 	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 54 */
    881 	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2);
    882 	c = ROTATE_LEFT(c, 30);
    883 
    884 	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 55 */
    885 	e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2);
    886 	b = ROTATE_LEFT(b, 30);
    887 
    888 	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 56 */
    889 	d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2);
    890 	a = ROTATE_LEFT(a, 30);
    891 
    892 	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 57 */
    893 	c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2);
    894 	e = ROTATE_LEFT(e, 30);
    895 
    896 	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 58 */
    897 	b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2);
    898 	d = ROTATE_LEFT(d, 30);
    899 
    900 	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 59 */
    901 	a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2);
    902 	c = ROTATE_LEFT(c, 30);
    903 
    904 	/* round 4 */
    905 	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 60 */
    906 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3);
    907 	b = ROTATE_LEFT(b, 30);
    908 
    909 	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 61 */
    910 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3);
    911 	a = ROTATE_LEFT(a, 30);
    912 
    913 	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 62 */
    914 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3);
    915 	e = ROTATE_LEFT(e, 30);
    916 
    917 	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 63 */
    918 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3);
    919 	d = ROTATE_LEFT(d, 30);
    920 
    921 	W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1);		/* 64 */
    922 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3);
    923 	c = ROTATE_LEFT(c, 30);
    924 
    925 	W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1);		/* 65 */
    926 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3);
    927 	b = ROTATE_LEFT(b, 30);
    928 
    929 	W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1);	/* 66 */
    930 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3);
    931 	a = ROTATE_LEFT(a, 30);
    932 
    933 	W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1);		/* 67 */
    934 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3);
    935 	e = ROTATE_LEFT(e, 30);
    936 
    937 	W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1);		/* 68 */
    938 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3);
    939 	d = ROTATE_LEFT(d, 30);
    940 
    941 	W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1);		/* 69 */
    942 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3);
    943 	c = ROTATE_LEFT(c, 30);
    944 
    945 	W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1);		/* 70 */
    946 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3);
    947 	b = ROTATE_LEFT(b, 30);
    948 
    949 	W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1);		/* 71 */
    950 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3);
    951 	a = ROTATE_LEFT(a, 30);
    952 
    953 	W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1);		/* 72 */
    954 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3);
    955 	e = ROTATE_LEFT(e, 30);
    956 
    957 	W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1);		/* 73 */
    958 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3);
    959 	d = ROTATE_LEFT(d, 30);
    960 
    961 	W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1);	/* 74 */
    962 	a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3);
    963 	c = ROTATE_LEFT(c, 30);
    964 
    965 	W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1);	/* 75 */
    966 	e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3);
    967 	b = ROTATE_LEFT(b, 30);
    968 
    969 	W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1);	/* 76 */
    970 	d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3);
    971 	a = ROTATE_LEFT(a, 30);
    972 
    973 	W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1);	/* 77 */
    974 	c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3);
    975 	e = ROTATE_LEFT(e, 30);
    976 
    977 	W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1);	/* 78 */
    978 	b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3);
    979 	d = ROTATE_LEFT(d, 30);
    980 
    981 	W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1);	/* 79 */
    982 
    983 	ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) +
    984 	    SHA1_CONST(3);
    985 	ctx->state[1] += b;
    986 	ctx->state[2] += ROTATE_LEFT(c, 30);
    987 	ctx->state[3] += d;
    988 	ctx->state[4] += e;
    989 
    990 	/* zeroize sensitive information */
    991 	W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0;
    992 	W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0;
    993 }
    994 #endif	/* !__amd64 */
    995 
    996 
    997 /*
    998  * Encode()
    999  *
   1000  * purpose: to convert a list of numbers from little endian to big endian
   1001  *   input: uint8_t *	: place to store the converted big endian numbers
   1002  *	    uint32_t *	: place to get numbers to convert from
   1003  *          size_t	: the length of the input in bytes
   1004  *  output: void
   1005  */
   1006 
   1007 static void
   1008 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
   1009     size_t len)
   1010 {
   1011 	size_t		i, j;
   1012 
   1013 #if	defined(__sparc)
   1014 	if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
   1015 		for (i = 0, j = 0; j < len; i++, j += 4) {
   1016 			/* LINTED E_BAD_PTR_CAST_ALIGN */
   1017 			*((uint32_t *)(output + j)) = input[i];
   1018 		}
   1019 	} else {
   1020 #endif	/* little endian -- will work on big endian, but slowly */
   1021 		for (i = 0, j = 0; j < len; i++, j += 4) {
   1022 			output[j]	= (input[i] >> 24) & 0xff;
   1023 			output[j + 1]	= (input[i] >> 16) & 0xff;
   1024 			output[j + 2]	= (input[i] >>  8) & 0xff;
   1025 			output[j + 3]	= input[i] & 0xff;
   1026 		}
   1027 #if	defined(__sparc)
   1028 	}
   1029 #endif
   1030 }
   1031