Home | History | Annotate | Download | only in md5
      1 /*
      2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*
      7  * Cleaned-up and optimized version of MD5, based on the reference
      8  * implementation provided in RFC 1321.  See RSA Copyright information
      9  * below.
     10  */
     11 
     12 /*
     13  * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
     14  */
     15 
     16 /*
     17  * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
     18  * rights reserved.
     19  *
     20  * License to copy and use this software is granted provided that it
     21  * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
     22  * Algorithm" in all material mentioning or referencing this software
     23  * or this function.
     24  *
     25  * License is also granted to make and use derivative works provided
     26  * that such works are identified as "derived from the RSA Data
     27  * Security, Inc. MD5 Message-Digest Algorithm" in all material
     28  * mentioning or referencing the derived work.
     29  *
     30  * RSA Data Security, Inc. makes no representations concerning either
     31  * the merchantability of this software or the suitability of this
     32  * software for any particular purpose. It is provided "as is"
     33  * without express or implied warranty of any kind.
     34  *
     35  * These notices must be retained in any copies of any part of this
     36  * documentation and/or software.
     37  */
     38 
     39 #ifndef _KERNEL
     40 #include <stdint.h>
     41 #endif /* _KERNEL */
     42 
     43 #include <sys/types.h>
     44 #include <sys/md5.h>
     45 #include <sys/md5_consts.h>	/* MD5_CONST() optimization */
     46 #include "md5_byteswap.h"
     47 #if	!defined(_KERNEL) || defined(_BOOT)
     48 #include <strings.h>
     49 #endif /* !_KERNEL || _BOOT */
     50 
     51 #ifdef _KERNEL
     52 #include <sys/systm.h>
     53 #endif /* _KERNEL */
     54 
     55 static void Encode(uint8_t *, const uint32_t *, size_t);
     56 
     57 #if !defined(__amd64)
     58 static void MD5Transform(uint32_t, uint32_t, uint32_t, uint32_t, MD5_CTX *,
     59     const uint8_t [64]);
     60 #else
     61 void md5_block_asm_host_order(MD5_CTX *ctx, const void *inpp,
     62     unsigned int input_length_in_blocks);
     63 #endif /* !defined(__amd64) */
     64 
     65 static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
     66 
     67 /*
     68  * F, G, H and I are the basic MD5 functions.
     69  */
     70 #define	F(b, c, d)	(((b) & (c)) | ((~b) & (d)))
     71 #define	G(b, c, d)	(((b) & (d)) | ((c) & (~d)))
     72 #define	H(b, c, d)	((b) ^ (c) ^ (d))
     73 #define	I(b, c, d)	((c) ^ ((b) | (~d)))
     74 
     75 /*
     76  * ROTATE_LEFT rotates x left n bits.
     77  */
     78 #define	ROTATE_LEFT(x, n)	\
     79 	(((x) << (n)) | ((x) >> ((sizeof (x) << 3) - (n))))
     80 
     81 /*
     82  * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
     83  * Rotation is separate from addition to prevent recomputation.
     84  */
     85 
     86 #define	FF(a, b, c, d, x, s, ac) { \
     87 	(a) += F((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
     88 	(a) = ROTATE_LEFT((a), (s)); \
     89 	(a) += (b); \
     90 	}
     91 
     92 #define	GG(a, b, c, d, x, s, ac) { \
     93 	(a) += G((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
     94 	(a) = ROTATE_LEFT((a), (s)); \
     95 	(a) += (b); \
     96 	}
     97 
     98 #define	HH(a, b, c, d, x, s, ac) { \
     99 	(a) += H((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
    100 	(a) = ROTATE_LEFT((a), (s)); \
    101 	(a) += (b); \
    102 	}
    103 
    104 #define	II(a, b, c, d, x, s, ac) { \
    105 	(a) += I((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
    106 	(a) = ROTATE_LEFT((a), (s)); \
    107 	(a) += (b); \
    108 	}
    109 
    110 /*
    111  * Loading 32-bit constants on a RISC is expensive since it involves both a
    112  * `sethi' and an `or'.  thus, we instead have the compiler generate `ld's to
    113  * load the constants from an array called `md5_consts'.  however, on intel
    114  * (and other CISC processors), it is cheaper to load the constant
    115  * directly.  thus, the c code in MD5Transform() uses the macro MD5_CONST()
    116  * which either expands to a constant or an array reference, depending on the
    117  * architecture the code is being compiled for.
    118  *
    119  * Right now, i386 and amd64 are the CISC exceptions.
    120  * If we get another CISC ISA, we'll have to change the ifdef.
    121  */
    122 
    123 #if defined(__i386) || defined(__amd64)
    124 
    125 #define	MD5_CONST(x)		(MD5_CONST_ ## x)
    126 #define	MD5_CONST_e(x)		MD5_CONST(x)
    127 #define	MD5_CONST_o(x)		MD5_CONST(x)
    128 
    129 #else
    130 /*
    131  * sparc/RISC optimization:
    132  *
    133  * while it is somewhat counter-intuitive, on sparc (and presumably other RISC
    134  * machines), it is more efficient to place all the constants used in this
    135  * function in an array and load the values out of the array than to manually
    136  * load the constants.  this is because setting a register to a 32-bit value
    137  * takes two ops in most cases: a `sethi' and an `or', but loading a 32-bit
    138  * value from memory only takes one `ld' (or `lduw' on v9).  while this
    139  * increases memory usage, the compiler can find enough other things to do
 * while waiting so that the pipeline does not stall.  additionally, it is
    141  * likely that many of these constants are cached so that later accesses do
    142  * not even go out to the bus.
    143  *
    144  * this array is declared `static' to keep the compiler from having to
    145  * bcopy() this array onto the stack frame of MD5Transform() each time it is
    146  * called -- which is unacceptably expensive.
    147  *
    148  * the `const' is to ensure that callers are good citizens and do not try to
    149  * munge the array.  since these routines are going to be called from inside
    150  * multithreaded kernelland, this is a good safety check. -- `constants' will
    151  * end up in .rodata.
    152  *
    153  * unfortunately, loading from an array in this manner hurts performance under
    154  * intel (and presumably other CISC machines).  so, there is a macro,
    155  * MD5_CONST(), used in MD5Transform(), that either expands to a reference to
    156  * this array, or to the actual constant, depending on what platform this code
    157  * is compiled for.
    158  */
    159 
    160 #ifdef sun4v
    161 
    162 /*
    163  * Going to load these consts in 8B chunks, so need to enforce 8B alignment
    164  */
    165 
    166 /* CSTYLED */
    167 #pragma align 64 (md5_consts)
    168 #define	_MD5_CHECK_ALIGNMENT
    169 
    170 #endif /* sun4v */
    171 
/*
 * Table of the 64 additive step constants MD5_CONST_0 .. MD5_CONST_63,
 * in step order.  MD5Transform() reads it through MD5_CONST_e() and
 * MD5_CONST_o(); on sun4v it is additionally viewed as an array of
 * 64-bit words, which is why the alignment pragma precedes it there.
 */
static const uint32_t md5_consts[] = {
	MD5_CONST_0,	MD5_CONST_1,	MD5_CONST_2,	MD5_CONST_3,
	MD5_CONST_4,	MD5_CONST_5,	MD5_CONST_6,	MD5_CONST_7,
	MD5_CONST_8,	MD5_CONST_9,	MD5_CONST_10,	MD5_CONST_11,
	MD5_CONST_12,	MD5_CONST_13,	MD5_CONST_14,	MD5_CONST_15,
	MD5_CONST_16,	MD5_CONST_17,	MD5_CONST_18,	MD5_CONST_19,
	MD5_CONST_20,	MD5_CONST_21,	MD5_CONST_22,	MD5_CONST_23,
	MD5_CONST_24,	MD5_CONST_25,	MD5_CONST_26,	MD5_CONST_27,
	MD5_CONST_28,	MD5_CONST_29,	MD5_CONST_30,	MD5_CONST_31,
	MD5_CONST_32,	MD5_CONST_33,	MD5_CONST_34,	MD5_CONST_35,
	MD5_CONST_36,	MD5_CONST_37,	MD5_CONST_38,	MD5_CONST_39,
	MD5_CONST_40,	MD5_CONST_41,	MD5_CONST_42,	MD5_CONST_43,
	MD5_CONST_44,	MD5_CONST_45,	MD5_CONST_46,	MD5_CONST_47,
	MD5_CONST_48,	MD5_CONST_49,	MD5_CONST_50,	MD5_CONST_51,
	MD5_CONST_52,	MD5_CONST_53,	MD5_CONST_54,	MD5_CONST_55,
	MD5_CONST_56,	MD5_CONST_57,	MD5_CONST_58,	MD5_CONST_59,
	MD5_CONST_60,	MD5_CONST_61,	MD5_CONST_62,	MD5_CONST_63
};
    190 
    191 
    192 #ifdef sun4v
    193 /*
    194  * To reduce the number of loads, load consts in 64-bit
    195  * chunks and then split.
    196  *
    197  * No need to mask upper 32-bits, as just interested in
    198  * low 32-bits (saves an & operation and means that this
    199  * optimization doesn't increases the icount.
    200  */
    201 #define	MD5_CONST_e(x)		(md5_consts64[x/2] >> 32)
    202 #define	MD5_CONST_o(x)		(md5_consts64[x/2])
    203 
    204 #else
    205 
    206 #define	MD5_CONST_e(x)		(md5_consts[x])
    207 #define	MD5_CONST_o(x)		(md5_consts[x])
    208 
    209 #endif /* sun4v */
    210 
    211 #endif
    212 
    213 /*
    214  * MD5Init()
    215  *
    216  * purpose: initializes the md5 context and begins and md5 digest operation
    217  *   input: MD5_CTX *	: the context to initialize.
    218  *  output: void
    219  */
    220 
    221 void
    222 MD5Init(MD5_CTX *ctx)
    223 {
    224 	ctx->count[0] = ctx->count[1] = 0;
    225 
    226 	/* load magic initialization constants */
    227 	ctx->state[0] = MD5_INIT_CONST_1;
    228 	ctx->state[1] = MD5_INIT_CONST_2;
    229 	ctx->state[2] = MD5_INIT_CONST_3;
    230 	ctx->state[3] = MD5_INIT_CONST_4;
    231 }
    232 
    233 /*
    234  * MD5Update()
    235  *
    236  * purpose: continues an md5 digest operation, using the message block
    237  *          to update the context.
    238  *   input: MD5_CTX *	: the context to update
    239  *          uint8_t *	: the message block
    240  *          uint32_t    : the length of the message block in bytes
    241  *  output: void
    242  *
    243  * MD5 crunches in 64-byte blocks.  All numeric constants here are related to
    244  * that property of MD5.
    245  */
    246 
    247 void
    248 MD5Update(MD5_CTX *ctx, const void *inpp, unsigned int input_len)
    249 {
    250 	uint32_t		i, buf_index, buf_len;
    251 #ifdef	sun4v
    252 	uint32_t		old_asi;
    253 #endif	/* sun4v */
    254 #if defined(__amd64)
    255 	uint32_t		block_count;
    256 #endif /* !defined(__amd64) */
    257 	const unsigned char 	*input = (const unsigned char *)inpp;
    258 
    259 	/* compute (number of bytes computed so far) mod 64 */
    260 	buf_index = (ctx->count[0] >> 3) & 0x3F;
    261 
    262 	/* update number of bits hashed into this MD5 computation so far */
    263 	if ((ctx->count[0] += (input_len << 3)) < (input_len << 3))
    264 		ctx->count[1]++;
    265 	ctx->count[1] += (input_len >> 29);
    266 
    267 	buf_len = 64 - buf_index;
    268 
    269 	/* transform as many times as possible */
    270 	i = 0;
    271 	if (input_len >= buf_len) {
    272 
    273 		/*
    274 		 * general optimization:
    275 		 *
    276 		 * only do initial bcopy() and MD5Transform() if
    277 		 * buf_index != 0.  if buf_index == 0, we're just
    278 		 * wasting our time doing the bcopy() since there
    279 		 * wasn't any data left over from a previous call to
    280 		 * MD5Update().
    281 		 */
    282 
    283 #ifdef sun4v
    284 		/*
    285 		 * For N1 use %asi register. However, costly to repeatedly set
    286 		 * in MD5Transform. Therefore, set once here.
    287 		 * Should probably restore the old value afterwards...
    288 		 */
    289 		old_asi = get_little();
    290 		set_little(0x88);
    291 #endif /* sun4v */
    292 
    293 		if (buf_index) {
    294 			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
    295 
    296 #if !defined(__amd64)
    297 			MD5Transform(ctx->state[0], ctx->state[1],
    298 			    ctx->state[2], ctx->state[3], ctx,
    299 			    ctx->buf_un.buf8);
    300 #else
    301 			md5_block_asm_host_order(ctx, ctx->buf_un.buf8, 1);
    302 #endif /* !defined(__amd64) */
    303 
    304 			i = buf_len;
    305 		}
    306 
    307 #if !defined(__amd64)
    308 		for (; i + 63 < input_len; i += 64)
    309 			MD5Transform(ctx->state[0], ctx->state[1],
    310 			    ctx->state[2], ctx->state[3], ctx, &input[i]);
    311 
    312 #else
    313 		block_count = (input_len - i) >> 6;
    314 		if (block_count > 0) {
    315 			md5_block_asm_host_order(ctx, &input[i], block_count);
    316 			i += block_count << 6;
    317 		}
    318 #endif /* !defined(__amd64) */
    319 
    320 
    321 #ifdef sun4v
    322 		/*
    323 		 * Restore old %ASI value
    324 		 */
    325 		set_little(old_asi);
    326 #endif /* sun4v */
    327 
    328 		/*
    329 		 * general optimization:
    330 		 *
    331 		 * if i and input_len are the same, return now instead
    332 		 * of calling bcopy(), since the bcopy() in this
    333 		 * case will be an expensive nop.
    334 		 */
    335 
    336 		if (input_len == i)
    337 			return;
    338 
    339 		buf_index = 0;
    340 	}
    341 
    342 	/* buffer remaining input */
    343 	bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
    344 }
    345 
    346 /*
    347  * MD5Final()
    348  *
    349  * purpose: ends an md5 digest operation, finalizing the message digest and
    350  *          zeroing the context.
    351  *   input: uchar_t *	: a buffer to store the digest in
    352  *			: The function actually uses void* because many
    353  *			: callers pass things other than uchar_t here.
    354  *          MD5_CTX *   : the context to finalize, save, and zero
    355  *  output: void
    356  */
    357 
    358 void
    359 MD5Final(void *digest, MD5_CTX *ctx)
    360 {
    361 	uint8_t		bitcount_le[sizeof (ctx->count)];
    362 	uint32_t	index = (ctx->count[0] >> 3) & 0x3f;
    363 
    364 	/* store bit count, little endian */
    365 	Encode(bitcount_le, ctx->count, sizeof (bitcount_le));
    366 
    367 	/* pad out to 56 mod 64 */
    368 	MD5Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
    369 
    370 	/* append length (before padding) */
    371 	MD5Update(ctx, bitcount_le, sizeof (bitcount_le));
    372 
    373 	/* store state in digest */
    374 	Encode(digest, ctx->state, sizeof (ctx->state));
    375 
    376 	/* zeroize sensitive information */
    377 	bzero(ctx, sizeof (*ctx));
    378 }
    379 
    380 #ifndef	_KERNEL
    381 
    382 void
    383 md5_calc(unsigned char *output, unsigned char *input, unsigned int inlen)
    384 {
    385 	MD5_CTX context;
    386 
    387 	MD5Init(&context);
    388 	MD5Update(&context, input, inlen);
    389 	MD5Final(output, &context);
    390 }
    391 
    392 #endif	/* !_KERNEL */
    393 
    394 #if !defined(__amd64)
    395 /*
    396  * sparc register window optimization:
    397  *
    398  * `a', `b', `c', and `d' are passed into MD5Transform explicitly
    399  * since it increases the number of registers available to the
    400  * compiler.  under this scheme, these variables can be held in
    401  * %i0 - %i3, which leaves more local and out registers available.
    402  */
    403 
/*
 * MD5Transform()
 *
 * purpose: md5 transformation -- updates the digest based on `block'
 *   input: uint32_t	: bytes  1 -  4 of the digest
 *          uint32_t	: bytes  5 -  8 of the digest
 *          uint32_t	: bytes  9 - 12 of the digest
 *          uint32_t	: bytes 13 - 16 of the digest
 *          MD5_CTX *   : the context to update
 *          uint8_t [64]: the block to use to update the digest
 *  output: void
 *
 * The four uint32_t arguments are working copies; callers pass the
 * current ctx->state[0..3].  After the 64 steps the working values are
 * added back into ctx->state[].
 */

static void
MD5Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
    MD5_CTX *ctx, const uint8_t block[64])
{
	/*
	 * general optimization:
	 *
	 * use individual integers instead of using an array.  this is a
	 * win, although the amount it wins by seems to vary quite a bit.
	 */

	register uint32_t	x_0, x_1, x_2,  x_3,  x_4,  x_5,  x_6,  x_7;
	register uint32_t	x_8, x_9, x_10, x_11, x_12, x_13, x_14, x_15;
#ifdef sun4v
	/* 64-bit view of md5_consts[], used by MD5_CONST_e()/MD5_CONST_o() */
	unsigned long long 	*md5_consts64;

		/* LINTED E_BAD_PTR_CAST_ALIGN */
	md5_consts64 = (unsigned long long *) md5_consts;
#endif	/* sun4v */

	/*
	 * general optimization:
	 *
	 * the compiler (at least SC4.2/5.x) generates better code if
	 * variable use is localized.  in this case, swapping the integers in
	 * this order allows `x_0 'to be swapped nearest to its first use in
	 * FF(), and likewise for `x_1' and up.  note that the compiler
	 * prefers this to doing each swap right before the FF() that
	 * uses it.
	 */

	/*
	 * sparc v9/v8plus optimization:
	 *
	 * if `block' is already aligned on a 4-byte boundary, use the
	 * optimized load_little_32() directly.  otherwise, bcopy()
	 * into a buffer that *is* aligned on a 4-byte boundary and
	 * then do the load_little_32() on that buffer.  benchmarks
	 * have shown that using the bcopy() is better than loading
	 * the bytes individually and doing the endian-swap by hand.
	 *
	 * even though it's quite tempting to assign to do:
	 *
	 * blk = bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
	 *
	 * and only have one set of LOAD_LITTLE_32()'s, the compiler (at least
	 * SC4.2/5.x) *does not* like that, so please resist the urge.
	 */

#ifdef _MD5_CHECK_ALIGNMENT
	if ((uintptr_t)block & 0x3) {		/* not 4-byte aligned? */
		/* stage the block in the context's 32-bit buffer first */
		bcopy(block, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));

#ifdef sun4v
		x_15 = LOAD_LITTLE_32_f(ctx->buf_un.buf32);
		x_14 = LOAD_LITTLE_32_e(ctx->buf_un.buf32);
		x_13 = LOAD_LITTLE_32_d(ctx->buf_un.buf32);
		x_12 = LOAD_LITTLE_32_c(ctx->buf_un.buf32);
		x_11 = LOAD_LITTLE_32_b(ctx->buf_un.buf32);
		x_10 = LOAD_LITTLE_32_a(ctx->buf_un.buf32);
		x_9  = LOAD_LITTLE_32_9(ctx->buf_un.buf32);
		x_8  = LOAD_LITTLE_32_8(ctx->buf_un.buf32);
		x_7  = LOAD_LITTLE_32_7(ctx->buf_un.buf32);
		x_6  = LOAD_LITTLE_32_6(ctx->buf_un.buf32);
		x_5  = LOAD_LITTLE_32_5(ctx->buf_un.buf32);
		x_4  = LOAD_LITTLE_32_4(ctx->buf_un.buf32);
		x_3  = LOAD_LITTLE_32_3(ctx->buf_un.buf32);
		x_2  = LOAD_LITTLE_32_2(ctx->buf_un.buf32);
		x_1  = LOAD_LITTLE_32_1(ctx->buf_un.buf32);
		x_0  = LOAD_LITTLE_32_0(ctx->buf_un.buf32);
#else
		x_15 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 15);
		x_14 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 14);
		x_13 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 13);
		x_12 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 12);
		x_11 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 11);
		x_10 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 10);
		x_9  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  9);
		x_8  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  8);
		x_7  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  7);
		x_6  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  6);
		x_5  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  5);
		x_4  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  4);
		x_3  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  3);
		x_2  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  2);
		x_1  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  1);
		x_0  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  0);
#endif /* sun4v */
	} else
#endif /* _MD5_CHECK_ALIGNMENT */
	{
		/* load the sixteen message words straight from `block' */

#ifdef sun4v
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_15 = LOAD_LITTLE_32_f(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_14 = LOAD_LITTLE_32_e(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_13 = LOAD_LITTLE_32_d(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_12 = LOAD_LITTLE_32_c(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_11 = LOAD_LITTLE_32_b(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_10 = LOAD_LITTLE_32_a(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_9  = LOAD_LITTLE_32_9(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_8  = LOAD_LITTLE_32_8(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_7  = LOAD_LITTLE_32_7(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_6  = LOAD_LITTLE_32_6(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_5  = LOAD_LITTLE_32_5(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_4  = LOAD_LITTLE_32_4(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_3  = LOAD_LITTLE_32_3(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_2  = LOAD_LITTLE_32_2(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_1  = LOAD_LITTLE_32_1(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_0  = LOAD_LITTLE_32_0(block);
#else
		x_15 = LOAD_LITTLE_32(block + 60);
		x_14 = LOAD_LITTLE_32(block + 56);
		x_13 = LOAD_LITTLE_32(block + 52);
		x_12 = LOAD_LITTLE_32(block + 48);
		x_11 = LOAD_LITTLE_32(block + 44);
		x_10 = LOAD_LITTLE_32(block + 40);
		x_9  = LOAD_LITTLE_32(block + 36);
		x_8  = LOAD_LITTLE_32(block + 32);
		x_7  = LOAD_LITTLE_32(block + 28);
		x_6  = LOAD_LITTLE_32(block + 24);
		x_5  = LOAD_LITTLE_32(block + 20);
		x_4  = LOAD_LITTLE_32(block + 16);
		x_3  = LOAD_LITTLE_32(block + 12);
		x_2  = LOAD_LITTLE_32(block +  8);
		x_1  = LOAD_LITTLE_32(block +  4);
		x_0  = LOAD_LITTLE_32(block +  0);
#endif /* sun4v */
	}

	/*
	 * The trailing comment on each line below is the step number
	 * (1 - 64).  Even-indexed constants are fetched with MD5_CONST_e()
	 * and odd-indexed ones with MD5_CONST_o().
	 */

	/* round 1 */
	FF(a, b, c, d, 	x_0, MD5_SHIFT_11, MD5_CONST_e(0));  /* 1 */
	FF(d, a, b, c, 	x_1, MD5_SHIFT_12, MD5_CONST_o(1));  /* 2 */
	FF(c, d, a, b, 	x_2, MD5_SHIFT_13, MD5_CONST_e(2));  /* 3 */
	FF(b, c, d, a, 	x_3, MD5_SHIFT_14, MD5_CONST_o(3));  /* 4 */
	FF(a, b, c, d, 	x_4, MD5_SHIFT_11, MD5_CONST_e(4));  /* 5 */
	FF(d, a, b, c, 	x_5, MD5_SHIFT_12, MD5_CONST_o(5));  /* 6 */
	FF(c, d, a, b, 	x_6, MD5_SHIFT_13, MD5_CONST_e(6));  /* 7 */
	FF(b, c, d, a, 	x_7, MD5_SHIFT_14, MD5_CONST_o(7));  /* 8 */
	FF(a, b, c, d, 	x_8, MD5_SHIFT_11, MD5_CONST_e(8));  /* 9 */
	FF(d, a, b, c, 	x_9, MD5_SHIFT_12, MD5_CONST_o(9));  /* 10 */
	FF(c, d, a, b, x_10, MD5_SHIFT_13, MD5_CONST_e(10)); /* 11 */
	FF(b, c, d, a, x_11, MD5_SHIFT_14, MD5_CONST_o(11)); /* 12 */
	FF(a, b, c, d, x_12, MD5_SHIFT_11, MD5_CONST_e(12)); /* 13 */
	FF(d, a, b, c, x_13, MD5_SHIFT_12, MD5_CONST_o(13)); /* 14 */
	FF(c, d, a, b, x_14, MD5_SHIFT_13, MD5_CONST_e(14)); /* 15 */
	FF(b, c, d, a, x_15, MD5_SHIFT_14, MD5_CONST_o(15)); /* 16 */

	/* round 2 */
	GG(a, b, c, d,  x_1, MD5_SHIFT_21, MD5_CONST_e(16)); /* 17 */
	GG(d, a, b, c,  x_6, MD5_SHIFT_22, MD5_CONST_o(17)); /* 18 */
	GG(c, d, a, b, x_11, MD5_SHIFT_23, MD5_CONST_e(18)); /* 19 */
	GG(b, c, d, a,  x_0, MD5_SHIFT_24, MD5_CONST_o(19)); /* 20 */
	GG(a, b, c, d,  x_5, MD5_SHIFT_21, MD5_CONST_e(20)); /* 21 */
	GG(d, a, b, c, x_10, MD5_SHIFT_22, MD5_CONST_o(21)); /* 22 */
	GG(c, d, a, b, x_15, MD5_SHIFT_23, MD5_CONST_e(22)); /* 23 */
	GG(b, c, d, a,  x_4, MD5_SHIFT_24, MD5_CONST_o(23)); /* 24 */
	GG(a, b, c, d,  x_9, MD5_SHIFT_21, MD5_CONST_e(24)); /* 25 */
	GG(d, a, b, c, x_14, MD5_SHIFT_22, MD5_CONST_o(25)); /* 26 */
	GG(c, d, a, b,  x_3, MD5_SHIFT_23, MD5_CONST_e(26)); /* 27 */
	GG(b, c, d, a,  x_8, MD5_SHIFT_24, MD5_CONST_o(27)); /* 28 */
	GG(a, b, c, d, x_13, MD5_SHIFT_21, MD5_CONST_e(28)); /* 29 */
	GG(d, a, b, c,  x_2, MD5_SHIFT_22, MD5_CONST_o(29)); /* 30 */
	GG(c, d, a, b,  x_7, MD5_SHIFT_23, MD5_CONST_e(30)); /* 31 */
	GG(b, c, d, a, x_12, MD5_SHIFT_24, MD5_CONST_o(31)); /* 32 */

	/* round 3 */
	HH(a, b, c, d,  x_5, MD5_SHIFT_31, MD5_CONST_e(32)); /* 33 */
	HH(d, a, b, c,  x_8, MD5_SHIFT_32, MD5_CONST_o(33)); /* 34 */
	HH(c, d, a, b, x_11, MD5_SHIFT_33, MD5_CONST_e(34)); /* 35 */
	HH(b, c, d, a, x_14, MD5_SHIFT_34, MD5_CONST_o(35)); /* 36 */
	HH(a, b, c, d,  x_1, MD5_SHIFT_31, MD5_CONST_e(36)); /* 37 */
	HH(d, a, b, c,  x_4, MD5_SHIFT_32, MD5_CONST_o(37)); /* 38 */
	HH(c, d, a, b,  x_7, MD5_SHIFT_33, MD5_CONST_e(38)); /* 39 */
	HH(b, c, d, a, x_10, MD5_SHIFT_34, MD5_CONST_o(39)); /* 40 */
	HH(a, b, c, d, x_13, MD5_SHIFT_31, MD5_CONST_e(40)); /* 41 */
	HH(d, a, b, c,  x_0, MD5_SHIFT_32, MD5_CONST_o(41)); /* 42 */
	HH(c, d, a, b,  x_3, MD5_SHIFT_33, MD5_CONST_e(42)); /* 43 */
	HH(b, c, d, a,  x_6, MD5_SHIFT_34, MD5_CONST_o(43)); /* 44 */
	HH(a, b, c, d,  x_9, MD5_SHIFT_31, MD5_CONST_e(44)); /* 45 */
	HH(d, a, b, c, x_12, MD5_SHIFT_32, MD5_CONST_o(45)); /* 46 */
	HH(c, d, a, b, x_15, MD5_SHIFT_33, MD5_CONST_e(46)); /* 47 */
	HH(b, c, d, a,  x_2, MD5_SHIFT_34, MD5_CONST_o(47)); /* 48 */

	/* round 4 */
	II(a, b, c, d,  x_0, MD5_SHIFT_41, MD5_CONST_e(48)); /* 49 */
	II(d, a, b, c,  x_7, MD5_SHIFT_42, MD5_CONST_o(49)); /* 50 */
	II(c, d, a, b, x_14, MD5_SHIFT_43, MD5_CONST_e(50)); /* 51 */
	II(b, c, d, a,  x_5, MD5_SHIFT_44, MD5_CONST_o(51)); /* 52 */
	II(a, b, c, d, x_12, MD5_SHIFT_41, MD5_CONST_e(52)); /* 53 */
	II(d, a, b, c,  x_3, MD5_SHIFT_42, MD5_CONST_o(53)); /* 54 */
	II(c, d, a, b, x_10, MD5_SHIFT_43, MD5_CONST_e(54)); /* 55 */
	II(b, c, d, a,  x_1, MD5_SHIFT_44, MD5_CONST_o(55)); /* 56 */
	II(a, b, c, d,  x_8, MD5_SHIFT_41, MD5_CONST_e(56)); /* 57 */
	II(d, a, b, c, x_15, MD5_SHIFT_42, MD5_CONST_o(57)); /* 58 */
	II(c, d, a, b,  x_6, MD5_SHIFT_43, MD5_CONST_e(58)); /* 59 */
	II(b, c, d, a, x_13, MD5_SHIFT_44, MD5_CONST_o(59)); /* 60 */
	II(a, b, c, d,  x_4, MD5_SHIFT_41, MD5_CONST_e(60)); /* 61 */
	II(d, a, b, c, x_11, MD5_SHIFT_42, MD5_CONST_o(61)); /* 62 */
	II(c, d, a, b,  x_2, MD5_SHIFT_43, MD5_CONST_e(62)); /* 63 */
	II(b, c, d, a,  x_9, MD5_SHIFT_44, MD5_CONST_o(63)); /* 64 */

	/* fold the working values back into the running digest state */
	ctx->state[0] += a;
	ctx->state[1] += b;
	ctx->state[2] += c;
	ctx->state[3] += d;

	/*
	 * zeroize sensitive information -- compiler will optimize
	 * this out if everything is kept in registers
	 */

	x_0 = x_1  = x_2  = x_3  = x_4  = x_5  = x_6  = x_7 = x_8 = 0;
	x_9 = x_10 = x_11 = x_12 = x_13 = x_14 = x_15 = 0;
}
    647 #endif /* !defined(__amd64) */
    648 
    649 /*
    650  * Encode()
    651  *
    652  * purpose: to convert a list of numbers from big endian to little endian
    653  *   input: uint8_t *	: place to store the converted little endian numbers
    654  *	    uint32_t *	: place to get numbers to convert from
    655  *          size_t	: the length of the input in bytes
    656  *  output: void
    657  */
    658 
    659 static void
    660 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
    661     size_t input_len)
    662 {
    663 	size_t		i, j;
    664 
    665 	for (i = 0, j = 0; j < input_len; i++, j += sizeof (uint32_t)) {
    666 
    667 #ifdef _LITTLE_ENDIAN
    668 
    669 #ifdef _MD5_CHECK_ALIGNMENT
    670 		if ((uintptr_t)output & 0x3)	/* Not 4-byte aligned */
    671 			bcopy(input + i, output + j, 4);
    672 		else *(uint32_t *)(output + j) = input[i];
    673 #else
    674 		/*LINTED E_BAD_PTR_CAST_ALIGN*/
    675 		*(uint32_t *)(output + j) = input[i];
    676 #endif /* _MD5_CHECK_ALIGNMENT */
    677 
    678 #else	/* big endian -- will work on little endian, but slowly */
    679 
    680 		output[j] = input[i] & 0xff;
    681 		output[j + 1] = (input[i] >> 8)  & 0xff;
    682 		output[j + 2] = (input[i] >> 16) & 0xff;
    683 		output[j + 3] = (input[i] >> 24) & 0xff;
    684 #endif
    685 	}
    686 }
    687