Home | History | Annotate | Download | only in auto_ef
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2003, by Sun Microsystems, Inc.
     23  * All rights reserved.
     24  */
     25 
     26 #ident  "@(#)auto_ef_file.c 1.18 07/04/12 SMI"
     27 
     28 #include <ctype.h>
     29 #include <fcntl.h>
     30 #include <memory.h>
     31 #include <string.h>
     32 #include <unistd.h>
     33 #include <sys/types.h>
     34 #include <sys/stat.h>
     35 #include <errno.h>
     36 
     37 #include "auto_ef_lib.h"
     38 
/* Size in bytes of the auto_ef_block accumulation buffer in execute_file(). */
#define	AUTOEF_BLOCKSIZE	8192
/* NOTE(review): not referenced anywhere in the visible part of this file. */
#define	HASHSIZE		8192
/*
 * execute_file() stops enlarging its read buffer once the buffer length
 * has reached this value; the rest of such a line is read and discarded.
 */
#define AUTO_EF_LINE_MAX	65536
/*
 * Values assigned to level_max in execute_file(), selected by
 * (auto_ef_flag & 0x3): 0, 1, 2 and "anything else" respectively.
 * A block is only handed to auto_ef_str() inside the read loop when
 * level_max is 0 or the count of blocks scored so far is a multiple of it.
 */
#define LEVEL0_LINE	1024
#define LEVEL1_LINE	256
#define LEVEL2_LINE	64
#define LEVEL3_LINE	0
     46 
/* Forward declarations for the helper functions defined later in this file. */
auto_ef_t *execute_file(const char *, int, size_t *);
void CorrectATEFO(_auto_ef_t);
void remove_encoding(_auto_ef_t, char *);
int buflength_file(char *);
int SuperSetOr2022(char *);
void ConvScore_file(_auto_ef_t *);
void CalcAutoefFile(auto_ef_t *, _auto_ef_t *);
void AutoefAddScore(char *, double, _auto_ef_t *);
int AutoefFindKeyWord(char *, _auto_ef_t *);
     56 
/*
 * Category codes returned by SuperSetOr2022() (which returns -1 for any
 * encoding name it does not recognise). The enumerators take the values
 * 0..7 in declaration order; CorrectATEFO() switches on those numeric
 * values. eISO_Other is never returned by the code visible in this file.
 */
enum eSuperSetOr2022{ eASCII, eISO_JP, eISO_CNorKR, eISO_Other,
	eCP949, eGB18030, eHKSCS, eCP874 };
     59 
     60 size_t auto_ef_file(auto_ef_t **aef, const char *convert_file_name,
     61 	int auto_ef_flag) {
     62 
     63 	struct stat filetype;
     64 
     65 	size_t auto_ef_file_size = 0;
     66 	_auto_ef_t root_autoef = (_auto_ef_t) NULL;
     67 
     68 	/*
     69 	 * Determine fine type
     70 	 */
     71 
     72 	if (convert_file_name != NULL) {
     73 		if (stat(convert_file_name, &filetype) == 0) {
     74 			if (! S_ISREG(filetype.st_mode)) {
     75 				errno = EACCES;
     76 				*aef = NULL;
     77 				return ((size_t)-1);
     78 			}
     79 		} else {
     80 			/* stat() failed, return -1 with errno from stat */
     81 			*aef = NULL;
     82 			return ((size_t)-1);
     83 		}
     84 	}
     85 
     86 	*aef = execute_file(convert_file_name, auto_ef_flag, &auto_ef_file_size);
     87 
     88 	return (auto_ef_file_size);
     89 }
     90 
     91 auto_ef_t *execute_file(const char *file,
     92 	int auto_ef_flag, size_t *return_size)
     93 {
     94 	char	*lbuf, *p;
     95 	long	count;
     96 	long	offset = 0;
     97 	char	*next_ptr = NULL;
     98 	long	next_count = 0;
     99 	char  auto_ef_block[AUTOEF_BLOCKSIZE] = "\0";
    100 	int   auto_ef_offset = 0, auto_ef_blocksize = 0;
    101 	char  *tmpbuf;
    102 	int i;
    103 	int autoef_flag = 0;
    104 	int autoef_times = 0;
    105 	int level_flag = 0;
    106 	int level_max;
    107 	int auto_ef_count = 0;
    108 
    109 	int level = 0;
    110 
    111 	long long tln;
    112 	char *prntbuf = NULL;
    113 	long fw_lPrntBufLen = 0;
    114 	int temp;
    115 	char *linebuf;
    116 	long long lnum;
    117 	char *ptr, *ptrend;
    118 	int nlflag;
    119 	auto_ef_t *rootp;
    120 	_auto_ef_t root_autoef_file = NULL;
    121 
    122 	int auto_ef_overline = 0;
    123 
    124 	tln = 0;
    125 
    126 	fw_lPrntBufLen = BUFSIZ + 1;
    127 	if ((prntbuf = malloc(fw_lPrntBufLen)) == NULL) {
    128 		errno = ENOMEM;
    129 		*return_size = (size_t)-1;
    130 		return (NULL);
    131 	}
    132 
    133 	if ((linebuf = malloc(fw_lPrntBufLen)) == NULL) {
    134 		errno = ENOMEM;
    135 		*return_size = (size_t)-1;
    136 		free(prntbuf);
    137 		return NULL;
    138 	}
    139 
    140 	if (file == NULL)
    141 		temp = 0;
    142 	else if ((temp = open(file, O_RDONLY)) == -1) {
    143 		errno = EACCES;
    144 		*return_size = (size_t)-1;
    145 		free(prntbuf);
    146 		free(linebuf);
    147 		return NULL;
    148 	}
    149 
    150 	/* read in first block of bytes */
    151 	if ((count = read(temp, prntbuf, BUFSIZ)) <= 0) {
    152 		(void) close(temp);
    153 		errno = ENOMEM;
    154 		*return_size = (size_t)-1;
    155 		free(prntbuf);
    156 		free(linebuf);
    157 		return NULL;
    158 	}
    159 
    160 	/* Mac file format */
    161 	for (i = 0; i < count; i++) {
    162 		char a = prntbuf[i];
    163 		char b = (i + 1) < count ? prntbuf[i + 1] : 0;
    164 		if (a == '\r' && b != '\n')
    165 			prntbuf[i] = '\n';
    166 	}
    167 
    168 	lnum = 0;
    169 	ptr = prntbuf;
    170 
    171 	level = auto_ef_flag & 0x3;
    172 
    173 	switch (level) {
    174 	case 0:
    175 		level_max = LEVEL0_LINE;
    176 		break;
    177 	case 1:
    178 		level_max = LEVEL1_LINE;
    179 		break;
    180 	case 2:
    181 		level_max = LEVEL2_LINE;
    182 		break;
    183 	default:
    184 		level_max = LEVEL3_LINE;
    185 		break;
    186 	}
    187 
    188 	for (;;) {
    189 		if (level_max != 0) {
    190 			level_flag++;
    191 		}
    192 		/* look for next newline */
    193 		if ((ptrend = memchr(ptr + offset, '\n', count)) == NULL) {
    194 			offset += count;
    195 
    196 			/*
    197 			 * shift unused data to the beginning of the buffer
    198 			 */
    199 
    200 			if (ptr > prntbuf) {
    201 				(void) memmove(prntbuf, ptr, offset);
    202 				ptr = prntbuf;
    203 			}
    204 
    205 			/*
    206 			 * re-allocate a larger buffer if this one is full
    207 			 */
    208 
    209 			if (fw_lPrntBufLen < AUTO_EF_LINE_MAX) {
    210 
    211 				if (offset + BUFSIZ > fw_lPrntBufLen) {
    212 					/*
    213 					 * allocate a new buffer and preserve the
    214 					 * contents...
    215 					 */
    216 					fw_lPrntBufLen += BUFSIZ;
    217 
    218 					if ((prntbuf = realloc(prntbuf, fw_lPrntBufLen))
    219 						== NULL) {
    220 
    221 						errno = ENOMEM;
    222 						*return_size = (size_t)-1;
    223 						free(prntbuf);
    224 						free(linebuf);
    225 						return NULL;
    226 					}
    227 					/*
    228 					 * set up a bigger linebuffer
    229 					 * (this is only used
    230 					 * for case insensitive
    231 					 * operations). Contents do
    232 					 * not have to be preserved.
    233 					 */
    234 					free(linebuf);
    235 					if ((linebuf = malloc(fw_lPrntBufLen))
    236 						== NULL) {
    237 
    238 						errno = ENOMEM;
    239 						*return_size = (size_t)-1;
    240 						free(prntbuf);
    241 						free(linebuf);
    242 						return NULL;
    243 					}
    244 
    245 					ptr = prntbuf;
    246 				}
    247 
    248 				p = prntbuf + offset;
    249 				if ((count = read(temp, p, BUFSIZ)) > 0) {
    250 					/* Mac file format */
    251 					for (i = 0; i < BUFSIZ; i++) {
    252 						char a = p[i];
    253 						char b = (i + 1) < count ? p[i + 1] : 0;
    254 						if (a == '\r' && b != '\n')
    255 							p[i] = '\n';
    256 					}
    257 					continue;
    258 				}
    259 
    260 				if (offset == 0)
    261 					/* end of file already reached */
    262 					break;
    263 
    264 				/* last line of file has no newline */
    265 				ptrend = ptr + offset;
    266 				nlflag = 0;
    267 
    268 			} else {
    269 				/*
    270 				char tmpbuf[BUFSIZ+1];
    271 				*/
    272 				char *a;
    273 
    274 				a = prntbuf;
    275 
    276 				auto_ef_overline = 1;
    277 				if ((tmpbuf = (char *)malloc(fw_lPrntBufLen + 1)) == NULL) {
    278 					errno = ENOMEM;
    279 					*return_size = (size_t)-1;
    280 					free(prntbuf);
    281 					free(linebuf);
    282 					return NULL;
    283 				}
    284 
    285 				strlcpy(tmpbuf, ptr, fw_lPrntBufLen + 1);
    286 
    287 				for(;;){
    288 					count = read(temp, a, count);
    289 					if ((ptrend = memchr(a, '\n', count)) != NULL)
    290 						break;
    291 					offset += count;
    292 					if (count == 0) break;
    293 				}
    294 				if (ptrend != NULL){
    295 					next_ptr = ptrend + 1;
    296 					next_count = count - (next_ptr - ptr);
    297 					nlflag = 1;
    298 				} else {
    299 					next_count = 0;
    300 					nlflag = 0;
    301 				}
    302 			}
    303 
    304 		} else {
    305 			next_ptr = ptrend + 1;
    306 			next_count = offset + count - (next_ptr - ptr);
    307 			nlflag = 1;
    308 		}
    309 		lnum++;
    310 		if (ptrend != NULL)
    311 			*ptrend = '\0';
    312 
    313 		if (auto_ef_overline){
    314 			auto_ef_overline = 0;
    315 
    316 		} else {
    317 
    318 			if ((tmpbuf = (char *)malloc(fw_lPrntBufLen + 1)) == NULL) {
    319 				errno = ENOMEM;
    320 				*return_size = (size_t)-1;
    321 				free(prntbuf);
    322 				free(linebuf);
    323 				return NULL;
    324 			}
    325 
    326 			strlcpy(tmpbuf, ptr, fw_lPrntBufLen + 1);
    327 		}
    328 
    329 		auto_ef_offset = buflength_file(tmpbuf);
    330 		auto_ef_offset++;
    331 		auto_ef_blocksize = auto_ef_offset + auto_ef_blocksize;
    332 
    333 		if (auto_ef_blocksize > AUTOEF_BLOCKSIZE) {
    334 			size_t atefsize = 0;
    335 			if (auto_ef_offset < AUTOEF_BLOCKSIZE) {
    336 
    337 				/*
    338 				 * In case of auto_ef_block is
    339 				 * full and next get block is not
    340 				 * larger than auto_ef_block
    341 				 */
    342 				if (level_max == 0 ||
    343 					(autoef_times % level_max == 0) ||
    344 					autoef_times == 0) {
    345 
    346 					atefsize = auto_ef_str(&rootp,
    347 						auto_ef_block, AUTOEF_BLOCKSIZE,
    348 						level);
    349 					if (atefsize == (size_t)0) {
    350 						auto_ef_free(rootp);
    351 					} else if (atefsize == -1) {
    352 						auto_ef_free(rootp);
    353 					} else {
    354 						CalcAutoefFile(rootp, &root_autoef_file);
    355 						auto_ef_free(rootp);
    356 						autoef_flag = 0;
    357 						autoef_times++;
    358 					}
    359 				}
    360 				strncpy(auto_ef_block, (const char *)tmpbuf, AUTOEF_BLOCKSIZE);
    361 				auto_ef_blocksize = auto_ef_offset;
    362 			} else {
    363 				/*
    364 				 * In case of auto_ef_block
    365 				 * is full and next get block
    366 				 * is larger than auto_ef_block
    367 				 */
    368 				if (level_max == 0 ||
    369 					(autoef_times % level_max == 0) ||
    370 					autoef_times == 0) {
    371 
    372 					atefsize = auto_ef_str(&rootp, tmpbuf,
    373 						auto_ef_offset, level);
    374 
    375 					if (atefsize == (size_t)0) {
    376 						auto_ef_free(rootp);
    377 					} else if (atefsize == -1) {
    378 						auto_ef_free(rootp);
    379 					} else {
    380 						CalcAutoefFile(rootp, &root_autoef_file);
    381 						auto_ef_free(rootp);
    382 						auto_ef_blocksize = 0;
    383 						autoef_flag = 0;
    384 						autoef_times++;
    385 					}
    386 				}
    387 				auto_ef_block[0] = '\0';
    388 				auto_ef_blocksize = 0;
    389 			}
    390 		} else {
    391 			strncat(auto_ef_block, (const char *)tmpbuf, AUTOEF_BLOCKSIZE);
    392 			autoef_flag = 1;
    393 		}
    394 
    395 		free(tmpbuf);
    396 		if (!nlflag)
    397 			break;
    398 
    399 		ptr = next_ptr;
    400 		count = next_count;
    401 		offset = 0;
    402 		if (fw_lPrntBufLen > AUTO_EF_LINE_MAX)
    403 			fw_lPrntBufLen = BUFSIZ + 1;
    404 	}
    405 
    406 	free(tmpbuf);
    407 	free(prntbuf);
    408 	free(linebuf);
    409 
    410 	(void) close(temp);
    411 	if (autoef_flag == 1) {
    412 		size_t atefsize = 0;
    413 		autoef_times++;
    414 		atefsize = auto_ef_str(&rootp, auto_ef_block,
    415 			AUTOEF_BLOCKSIZE, level);
    416 		if (atefsize == (size_t)0) {
    417 			auto_ef_free(rootp);
    418 		} else if (atefsize == -1) {
    419 			auto_ef_free(rootp);
    420 		} else {
    421 			CalcAutoefFile(rootp, &root_autoef_file);
    422 			auto_ef_free(rootp);
    423 		}
    424 	}
    425 
    426 	if (root_autoef_file != NULL) {
    427 		auto_ef_t *tmp = NULL;
    428 		size_t autoef_size;
    429 		_auto_ef_t tmpautoef;
    430 
    431 		CorrectATEFO(root_autoef_file);
    432 		tmpautoef = SortATEFO(root_autoef_file);
    433 		Free_AUTOEF(&root_autoef_file);
    434 		ConvScore_file(&tmpautoef);
    435 		tmp = ATEFO2AUTOEF(tmpautoef, &autoef_size);
    436 		Free_AUTOEF(&tmpautoef);
    437 		*return_size = autoef_size;
    438 		return (tmp);
    439 	} else {
    440 		*return_size = (size_t)0;
    441 		Free_AUTOEF(&root_autoef_file);
    442 		return (NULL);
    443 	}
    444 
    445 }
    446 
    447 void CorrectATEFO(_auto_ef_t rtp) {
    448 	/* Remove downward compatibility if the other encoding is included */
    449 	_auto_ef_t p;
    450 	int codeid;
    451 	int ascii = FALSE, iso2022 = FALSE, cp949_gbk = FALSE;
    452 	int gb18030 = FALSE, hkscs = FALSE, others = FALSE;
    453 	int iso2022_krcn = FALSE;
    454 	int cp874 = FALSE;
    455 
    456 	for (p = rtp; p != NULL; p = p->_next_autoef) {
    457 		switch (SuperSetOr2022(p->_encoding)) {
    458 		case 0:
    459 			ascii = TRUE;
    460 			break;
    461 		case 1:
    462 			iso2022 = TRUE;
    463 			break;
    464 		case 2:
    465 			iso2022 = TRUE;
    466 			iso2022_krcn = TRUE;
    467 			break;
    468 		case 4:
    469 			cp949_gbk = TRUE;
    470 			break;
    471 		case 5:
    472 			gb18030 = TRUE;
    473 			break;
    474 		case 6:
    475 			hkscs = TRUE;
    476 			break;
    477 		case 7:
    478 			cp874 = TRUE;
    479 			break;
    480 		default:
    481 			others = TRUE;
    482 		}
    483 	}
    484 
    485 	if ((ascii == TRUE) && (iso2022 == TRUE || cp949_gbk == TRUE ||
    486 		gb18030 == TRUE || hkscs == TRUE || others == TRUE)) {
    487 
    488 		/* remove ascii */
    489 		remove_encoding(rtp, ASCII);
    490 	}
    491 
    492 	if (iso2022 == TRUE) {
    493 		if (iso2022_krcn == TRUE) {
    494 			remove_encoding(rtp, ISOJP);
    495 		}
    496 	}
    497 
    498 	if (cp949_gbk == TRUE) {
    499 		/* remove euc */
    500 		remove_encoding(rtp, EUCJP);
    501 		remove_encoding(rtp, EUCKR);
    502 		remove_encoding(rtp, EUCCN);
    503 		remove_encoding(rtp, EUCTW);
    504 	}
    505 
    506 	if (gb18030 == TRUE) {
    507 		/* remove euc */
    508 		remove_encoding(rtp, EUCJP);
    509 		remove_encoding(rtp, EUCKR);
    510 		remove_encoding(rtp, EUCCN);
    511 		remove_encoding(rtp, EUCTW);
    512 	}
    513 
    514 	if (hkscs == TRUE) {
    515 		/* remove big5 */
    516 		remove_encoding(rtp, BIG5);
    517 	}
    518 
    519 	if (cp874 == TRUE) {
    520 		/* remove TIS620.2533 */
    521 		remove_encoding(rtp, TIS620);
    522 	}
    523 }
    524 
    525 void remove_encoding(_auto_ef_t rtp, char *a) {
    526 	_auto_ef_t p;
    527 
    528 	for (p = rtp; p != NULL; p = p->_next_autoef) {
    529 		if (strcmp(p->_encoding, a) == 0) {
    530 			p->_score = 0;
    531 			break;
    532 		}
    533 	}
    534 }
    535 
    536 int buflength_file(char *buf) {
    537 	int i;
    538 
    539 	for (i = 0; ; i++) {
    540 		if (buf[i] == '\0') break;
    541 	}
    542 
    543 	return (i);
    544 }
    545 
    546 void CalcAutoefFile(auto_ef_t *autoefp, _auto_ef_t *root_autoef_file) {
    547 	auto_ef_t *p;
    548 	int flag;
    549 
    550 	for (p = autoefp; *p != NULL; p++) {
    551 		if ((AutoefFindKeyWord(auto_ef_get_encoding(*p), root_autoef_file)) == TRUE) {
    552 			AutoefAddScore(auto_ef_get_encoding(*p),
    553 				auto_ef_get_score(*p), root_autoef_file);
    554 		} else {
    555 			/*
    556 			 * This Regist_AUTOEF can be void
    557 			 */
    558 			(void) Regist_AUTOEF(auto_ef_get_encoding(*p),
    559 				auto_ef_get_score(*p),
    560 				M_FromCodeToLang(auto_ef_get_encoding(*p)),
    561 				root_autoef_file);
    562 		}
    563 	}
    564 }
    565 
    566 int SuperSetOr2022(char *code) {
    567 
    568 	if (strcmp(code, ASCII)		== 0)
    569 		return (eASCII);
    570 	if (strcmp(code, ISOJP)		== 0)
    571 		return (eISO_JP);
    572 	if (strcmp(code, ISOKR)		== 0)
    573 		return (eISO_CNorKR);
    574 	if (strcmp(code, ISOCN)	== 0)
    575 		return (eISO_CNorKR);
    576 	if (strcmp(code, CP949)		== 0)
    577 		return (eCP949);
    578 	if (strcmp(code, GB18030)	== 0)
    579 		return (eGB18030);
    580 	if (strcmp(code, HKSCS)		== 0)
    581 		return (eHKSCS);
    582 	if (strcmp(code, CP874)		== 0)
    583 		return (eCP874);
    584 	return (-1);
    585 }
    586 
    587 void ConvScore_file(_auto_ef_t *rtp) {
    588 	_auto_ef_t p, q, np;
    589 	double percent = 0.0;
    590 	double sum = 0.0;
    591 	int j = 0;
    592 	int times_flag = 0;
    593 
    594 	for (p = *rtp; p != NULL; p = p->_next_autoef) {
    595 		if (p->_score > 0) {
    596 			sum += p->_score;
    597 			j++;
    598 			if (fmod(p->_score, 100.0) != 0.0)
    599 				times_flag = -1;
    600 		}
    601 	}
    602 
    603 	if (times_flag >= 0 && j > 0) {
    604 		(*rtp)->_score = 100.0;
    605 		if ((*rtp)->_next_autoef != NULL) {
    606 			Free_AUTOEF(&((*rtp)->_next_autoef));
    607 		}
    608 		return;
    609 	}
    610 
    611 
    612 	for (p = *rtp, q = *rtp; p != NULL;
    613 			q = p, p = p->_next_autoef) {
    614 		if (p->_score != 100.0) {
    615 			percent = (int)((p->_score / sum) * 1000.0) / 10;
    616 			p->_score = percent;
    617 		}
    618 		if (p->_score <= 0.0) {
    619 			Free_AUTOEF(&p);
    620 			q->_next_autoef = NULL;
    621 			break;
    622 		}
    623 	}
    624 }
    625 
    626 void AutoefAddScore(char *a, double addscore, _auto_ef_t *root_autoef_file) {
    627         _auto_ef_t p;
    628 
    629         for (p = *root_autoef_file; p != NULL; p = p->_next_autoef) {
    630                 if (strcmp(p->_encoding, a) == 0) {
    631                         p->_score = p->_score + addscore;
    632                 }
    633         }
    634 }
    635 
    636 int AutoefFindKeyWord(char *a, _auto_ef_t *root_autoef_file) {
    637         _auto_ef_t p;
    638 
    639         for (p = *root_autoef_file; p != NULL; p = p->_next_autoef) {
    640                 if (strcmp(p->_encoding, a) == 0)
    641                         return (TRUE);
    642         }
    643 
    644         return (FALSE);
    645 }
    646 
    647