Home | History | Annotate | Download | only in smbsrv
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Multibyte/wide-char conversion routines. Wide-char encoding provides
     28  * a fixed size character encoding that maps to the Unicode 16-bit
     29  * (UCS-2) character set standard. Multibyte or UCS transformation
     30  * format (UTF) encoding is a variable length character encoding scheme
     31  * that is compatible with existing ASCII characters and guarantees that
     32  * the resultant strings do not contain embedded null characters. Both
     33  * types of encoding provide a null terminator: single byte for UTF-8
     34  * and a wide-char null for Unicode. See RFC 2044.
     35  *
     36  * The table below illustrates the UTF-8 encoding scheme. The letter x
     37  * indicates bits available for encoding the character value.
     38  *
     39  *	UCS-2			UTF-8 octet sequence (binary)
     40  *	0x0000-0x007F	0xxxxxxx
     41  *	0x0080-0x07FF	110xxxxx 10xxxxxx
     42  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
     43  *
     44  * RFC 2044
     45  * UTF-8, a transformation format of Unicode and ISO 10646
     46  * F. Yergeau
     47  * Alis Technologies
     48  * October 1996
     49  */
     50 
     51 #ifdef _KERNEL
     52 #include <sys/types.h>
     53 #include <sys/sunddi.h>
     54 #else
     55 #include <stdio.h>
     56 #include <stdlib.h>
     57 #include <assert.h>
     58 #include <strings.h>
     59 #endif
     60 #include <smbsrv/string.h>
     61 
     62 
     63 /*
     64  * mbstowcs
     65  *
     66  * The mbstowcs() function converts a multibyte character string
     67  * mbstring into a wide character string wcstring. No more than
     68  * nwchars wide characters are stored. A terminating null wide
     69  * character is appended if there is room.
     70  *
     71  * Returns the number of wide characters converted, not counting
     72  * any terminating null wide character. Returns -1 if an invalid
     73  * multibyte character is encountered.
     74  */
     75 size_t
     76 smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
     77 {
     78 	int len;
     79 	smb_wchar_t	*start = wcstring;
     80 
     81 	while (nwchars--) {
     82 		len = smb_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
     83 		if (len < 0) {
     84 			*wcstring = 0;
     85 			return ((size_t)-1);
     86 		}
     87 
     88 		if (*mbstring == 0)
     89 			break;
     90 
     91 		++wcstring;
     92 		mbstring += len;
     93 	}
     94 
     95 	return (wcstring - start);
     96 }
     97 
     98 
     99 /*
    100  * mbtowc
    101  *
    102  * The mbtowc() function converts a multibyte character mbchar into
    103  * a wide character and stores the result in the object pointed to
    104  * by wcharp. Up to nbytes bytes are examined.
    105  *
    106  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
    107  * states are not supported.  Shift states are used to switch between
    108  * representation modes using reserved bytes to signal shifting
    109  * without them being interpreted as characters.  If mbchar is null
    110  * mbtowc should return non-zero if the current locale requires shift
    111  * states.  Otherwise it should be return 0.
    112  *
    113  * If mbchar is non-null, returns the number of bytes processed in
    114  * mbchar.  If mbchar is invalid, returns -1.
    115  */
    116 int /*ARGSUSED*/
    117 smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
    118 {
    119 	unsigned char mbyte;
    120 	smb_wchar_t wide_char;
    121 	int count;
    122 	int bytes_left;
    123 
    124 	if (mbchar == NULL)
    125 		return (0); /* no shift states */
    126 
    127 	/* 0xxxxxxx -> 1 byte ASCII encoding */
    128 	if (((mbyte = *mbchar++) & 0x80) == 0) {
    129 		if (wcharp)
    130 			*wcharp = (smb_wchar_t)mbyte;
    131 
    132 		return (mbyte ? 1 : 0);
    133 	}
    134 
    135 	/* 10xxxxxx -> invalid first byte */
    136 	if ((mbyte & 0x40) == 0)
    137 		return (-1);
    138 
    139 	wide_char = mbyte;
    140 	if ((mbyte & 0x20) == 0) {
    141 		wide_char &= 0x1f;
    142 		bytes_left = 1;
    143 	} else if ((mbyte & 0x10) == 0) {
    144 		wide_char &= 0x0f;
    145 		bytes_left = 2;
    146 	} else {
    147 		return (-1);
    148 	}
    149 
    150 	count = 1;
    151 	while (bytes_left--) {
    152 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
    153 			return (-1);
    154 
    155 		count++;
    156 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
    157 	}
    158 
    159 	if (wcharp)
    160 		*wcharp = wide_char;
    161 
    162 	return (count);
    163 }
    164 
    165 
    166 /*
    167  * wctomb
    168  *
    169  * The wctomb() function converts a wide character wchar into a multibyte
    170  * character and stores the result in mbchar. The object pointed to by
    171  * mbchar must be large enough to accommodate the multibyte character.
    172  *
    173  * Returns the numberof bytes written to mbchar.
    174  */
    175 int
    176 smb_wctomb(char *mbchar, smb_wchar_t wchar)
    177 {
    178 	if ((wchar & ~0x7f) == 0) {
    179 		*mbchar = (char)wchar;
    180 		return (1);
    181 	}
    182 
    183 	if ((wchar & ~0x7ff) == 0) {
    184 		*mbchar++ = (wchar >> 6) | 0xc0;
    185 		*mbchar = (wchar & 0x3f) | 0x80;
    186 		return (2);
    187 	}
    188 
    189 	*mbchar++ = (wchar >> 12) | 0xe0;
    190 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
    191 	*mbchar = (wchar & 0x3f) | 0x80;
    192 	return (3);
    193 }
    194 
    195 
    196 /*
    197  * wcstombs
    198  *
    199  * The wcstombs() function converts a wide character string wcstring
    200  * into a multibyte character string mbstring. Up to nbytes bytes are
    201  * stored in mbstring. Partial multibyte characters at the end of the
    202  * string are not stored. The multibyte character string is null
    203  * terminated if there is room.
    204  *
    205  * Returns the number of bytes converted, not counting the terminating
    206  * null byte.
    207  */
    208 size_t
    209 smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
    210 {
    211 	char *start = mbstring;
    212 	const smb_wchar_t *wcp = wcstring;
    213 	smb_wchar_t wide_char;
    214 	char buf[4];
    215 	size_t len;
    216 
    217 	if ((mbstring == NULL) || (wcstring == NULL))
    218 		return (0);
    219 
    220 	while (nbytes > MTS_MB_CHAR_MAX) {
    221 		wide_char = *wcp++;
    222 		len = smb_wctomb(mbstring, wide_char);
    223 
    224 		if (wide_char == 0)
    225 			/*LINTED E_PTRDIFF_OVERFLOW*/
    226 			return (mbstring - start);
    227 
    228 		mbstring += len;
    229 		nbytes -= len;
    230 	}
    231 
    232 	while (wide_char && nbytes) {
    233 		wide_char = *wcp++;
    234 		if ((len = smb_wctomb(buf, wide_char)) > nbytes) {
    235 			*mbstring = 0;
    236 			break;
    237 		}
    238 
    239 		bcopy(buf, mbstring, len);
    240 		mbstring += len;
    241 		nbytes -= len;
    242 	}
    243 
    244 	/*LINTED E_PTRDIFF_OVERFLOW*/
    245 	return (mbstring - start);
    246 }
    247 
    248 
    249 /*
    250  * Returns the number of bytes that would be written if the multi-
    251  * byte string mbs was converted to a wide character string, not
    252  * counting the terminating null wide character.
    253  */
    254 size_t
    255 smb_wcequiv_strlen(const char *mbs)
    256 {
    257 	smb_wchar_t	wide_char;
    258 	size_t bytes;
    259 	size_t len = 0;
    260 
    261 	while (*mbs) {
    262 		bytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
    263 		if (bytes == ((size_t)-1))
    264 			return ((size_t)-1);
    265 
    266 		len += sizeof (smb_wchar_t);
    267 		mbs += bytes;
    268 	}
    269 
    270 	return (len);
    271 }
    272 
    273 
    274 /*
    275  * Returns the number of bytes that would be written if the multi-
    276  * byte string mbs was converted to a single byte character string,
    277  * not counting the terminating null character.
    278  */
    279 size_t
    280 smb_sbequiv_strlen(const char *mbs)
    281 {
    282 	smb_wchar_t	wide_char;
    283 	size_t nbytes;
    284 	size_t len = 0;
    285 
    286 	while (*mbs) {
    287 		nbytes = smb_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
    288 		if (nbytes == ((size_t)-1))
    289 			return ((size_t)-1);
    290 
    291 		if (wide_char & 0xFF00)
    292 			len += sizeof (smb_wchar_t);
    293 		else
    294 			++len;
    295 
    296 		mbs += nbytes;
    297 	}
    298 
    299 	return (len);
    300 }
    301 
    302 
    303 /*
    304  * stombs
    305  *
    306  * Convert a regular null terminated string 'string' to a UTF-8 encoded
    307  * null terminated multi-byte string 'mbstring'. Only full converted
    308  * UTF-8 characters will be written 'mbstring'. If a character will not
    309  * fit within the remaining buffer space or 'mbstring' will overflow
    310  * max_mblen, the conversion process will be terminated and 'mbstring'
    311  * will be null terminated.
    312  *
    313  * Returns the number of bytes written to 'mbstring', excluding the
    314  * terminating null character.
    315  *
    316  * If either mbstring or string is a null pointer, -1 is returned.
    317  */
    318 int
    319 smb_stombs(char *mbstring, char *string, int max_mblen)
    320 {
    321 	char *start = mbstring;
    322 	unsigned char *p = (unsigned char *)string;
    323 	int space_left = max_mblen;
    324 	int	len;
    325 	smb_wchar_t	wide_char;
    326 	char buf[4];
    327 
    328 	if (!mbstring || !string)
    329 		return (-1);
    330 
    331 	while (*p && space_left > 2) {
    332 		wide_char = *p++;
    333 		len = smb_wctomb(mbstring, wide_char);
    334 		mbstring += len;
    335 		space_left -= len;
    336 	}
    337 
    338 	if (*p) {
    339 		wide_char = *p;
    340 		if ((len = smb_wctomb(buf, wide_char)) < 2) {
    341 			*mbstring = *buf;
    342 			mbstring += len;
    343 			space_left -= len;
    344 		}
    345 	}
    346 
    347 	*mbstring = '\0';
    348 
    349 	/*LINTED E_PTRDIFF_OVERFLOW*/
    350 	return (mbstring - start);
    351 }
    352 
    353 
    354 /*
    355  * mbstos
    356  *
    357  * Convert a null terminated multi-byte string 'mbstring' to a regular
    358  * null terminated string 'string'.  A 1-byte character in 'mbstring'
    359  * maps to a 1-byte character in 'string'. A 2-byte character in
    360  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
    361  * Otherwise the upper byte null will be discarded to ensure that the
    362  * output stream does not contain embedded null characters.
    363  *
    364  * If the input stream contains invalid multi-byte characters, a value
    365  * of -1 will be returned. Otherwise the length of 'string', excluding
    366  * the terminating null character, is returned.
    367  *
    368  * If either mbstring or string is a null pointer, -1 is returned.
    369  */
    370 int
    371 smb_mbstos(char *string, const char *mbstring)
    372 {
    373 	smb_wchar_t wc;
    374 	unsigned char *start = (unsigned char *)string;
    375 	int len;
    376 
    377 	if (string == NULL || mbstring == NULL)
    378 		return (-1);
    379 
    380 	while (*mbstring) {
    381 		if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
    382 			*string = 0;
    383 			return (-1);
    384 		}
    385 
    386 		if (wc & 0xFF00) {
    387 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
    388 			*((smb_wchar_t *)string) = wc;
    389 			string += sizeof (smb_wchar_t);
    390 		}
    391 		else
    392 		{
    393 			*string = (unsigned char)wc;
    394 			string++;
    395 		}
    396 
    397 		mbstring += len;
    398 	}
    399 
    400 	*string = 0;
    401 
    402 	/*LINTED E_PTRDIFF_OVERFLOW*/
    403 	return ((unsigned char *)string - start);
    404 }
    405