1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, by Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 26 #ident "@(#)auto_ef_file.c 1.18 07/04/12 SMI" 27 28 #include <ctype.h> 29 #include <fcntl.h> 30 #include <memory.h> 31 #include <string.h> 32 #include <unistd.h> 33 #include <sys/types.h> 34 #include <sys/stat.h> 35 #include <errno.h> 36 37 #include "auto_ef_lib.h" 38 39 #define AUTOEF_BLOCKSIZE 8192 40 #define HASHSIZE 8192 41 #define AUTO_EF_LINE_MAX 65536 42 #define LEVEL0_LINE 1024 43 #define LEVEL1_LINE 256 44 #define LEVEL2_LINE 64 45 #define LEVEL3_LINE 0 46 47 auto_ef_t *execute_file(const char *, int, size_t *); 48 void CorrectATEFO(_auto_ef_t); 49 void remove_encoding(_auto_ef_t, char *); 50 int buflength_file(char *); 51 int SuperSetOr2022(char *); 52 void ConvScore_file(_auto_ef_t *); 53 void CalcAutoefFile(auto_ef_t *, _auto_ef_t *); 54 void AutoefAddScore(char *, double, _auto_ef_t *); 55 int AutoefFindKeyWord(char *, _auto_ef_t *); 56 57 enum eSuperSetOr2022{ eASCII, eISO_JP, eISO_CNorKR, eISO_Other, 58 eCP949, eGB18030, eHKSCS, eCP874 }; 59 60 size_t auto_ef_file(auto_ef_t **aef, const char *convert_file_name, 61 int auto_ef_flag) { 62 63 struct stat filetype; 64 65 size_t auto_ef_file_size = 0; 66 _auto_ef_t root_autoef = (_auto_ef_t) NULL; 67 68 /* 69 * Determine fine type 70 */ 71 72 if (convert_file_name != NULL) { 73 if (stat(convert_file_name, &filetype) == 0) { 74 if (! S_ISREG(filetype.st_mode)) { 75 errno = EACCES; 76 *aef = NULL; 77 return ((size_t)-1); 78 } 79 } else { 80 /* stat() failed, return -1 with errno from stat */ 81 *aef = NULL; 82 return ((size_t)-1); 83 } 84 } 85 86 *aef = execute_file(convert_file_name, auto_ef_flag, &auto_ef_file_size); 87 88 return (auto_ef_file_size); 89 } 90 91 auto_ef_t *execute_file(const char *file, 92 int auto_ef_flag, size_t *return_size) 93 { 94 char *lbuf, *p; 95 long count; 96 long offset = 0; 97 char *next_ptr = NULL; 98 long next_count = 0; 99 char auto_ef_block[AUTOEF_BLOCKSIZE] = "\0"; 100 int auto_ef_offset = 0, auto_ef_blocksize = 0; 101 char *tmpbuf; 102 int i; 103 int autoef_flag = 0; 104 int autoef_times = 0; 105 int level_flag = 0; 106 int level_max; 107 int auto_ef_count = 0; 108 109 int level = 0; 110 111 long long tln; 112 char *prntbuf = NULL; 113 long fw_lPrntBufLen = 0; 114 int temp; 115 char *linebuf; 116 long long lnum; 117 char *ptr, *ptrend; 118 int nlflag; 119 auto_ef_t *rootp; 120 _auto_ef_t root_autoef_file = NULL; 121 122 int auto_ef_overline = 0; 123 124 tln = 0; 125 126 fw_lPrntBufLen = BUFSIZ + 1; 127 if ((prntbuf = malloc(fw_lPrntBufLen)) == NULL) { 128 errno = ENOMEM; 129 *return_size = (size_t)-1; 130 return (NULL); 131 } 132 133 if ((linebuf = malloc(fw_lPrntBufLen)) == NULL) { 134 errno = ENOMEM; 135 *return_size = (size_t)-1; 136 free(prntbuf); 137 return NULL; 138 } 139 140 if (file == NULL) 141 temp = 0; 142 else if ((temp = open(file, O_RDONLY)) == -1) { 143 errno = EACCES; 144 *return_size = (size_t)-1; 145 free(prntbuf); 146 free(linebuf); 147 return NULL; 148 } 149 150 /* read in first block of bytes */ 151 if ((count = read(temp, prntbuf, BUFSIZ)) <= 0) { 152 (void) close(temp); 153 errno = ENOMEM; 154 *return_size = (size_t)-1; 155 free(prntbuf); 156 free(linebuf); 157 return NULL; 158 } 159 160 /* Mac file format */ 161 for (i = 0; i < count; i++) { 162 char a = prntbuf[i]; 163 char b = (i + 1) < count ? prntbuf[i + 1] : 0; 164 if (a == '\r' && b != '\n') 165 prntbuf[i] = '\n'; 166 } 167 168 lnum = 0; 169 ptr = prntbuf; 170 171 level = auto_ef_flag & 0x3; 172 173 switch (level) { 174 case 0: 175 level_max = LEVEL0_LINE; 176 break; 177 case 1: 178 level_max = LEVEL1_LINE; 179 break; 180 case 2: 181 level_max = LEVEL2_LINE; 182 break; 183 default: 184 level_max = LEVEL3_LINE; 185 break; 186 } 187 188 for (;;) { 189 if (level_max != 0) { 190 level_flag++; 191 } 192 /* look for next newline */ 193 if ((ptrend = memchr(ptr + offset, '\n', count)) == NULL) { 194 offset += count; 195 196 /* 197 * shift unused data to the beginning of the buffer 198 */ 199 200 if (ptr > prntbuf) { 201 (void) memmove(prntbuf, ptr, offset); 202 ptr = prntbuf; 203 } 204 205 /* 206 * re-allocate a larger buffer if this one is full 207 */ 208 209 if (fw_lPrntBufLen < AUTO_EF_LINE_MAX) { 210 211 if (offset + BUFSIZ > fw_lPrntBufLen) { 212 /* 213 * allocate a new buffer and preserve the 214 * contents... 215 */ 216 fw_lPrntBufLen += BUFSIZ; 217 218 if ((prntbuf = realloc(prntbuf, fw_lPrntBufLen)) 219 == NULL) { 220 221 errno = ENOMEM; 222 *return_size = (size_t)-1; 223 free(prntbuf); 224 free(linebuf); 225 return NULL; 226 } 227 /* 228 * set up a bigger linebuffer 229 * (this is only used 230 * for case insensitive 231 * operations). Contents do 232 * not have to be preserved. 233 */ 234 free(linebuf); 235 if ((linebuf = malloc(fw_lPrntBufLen)) 236 == NULL) { 237 238 errno = ENOMEM; 239 *return_size = (size_t)-1; 240 free(prntbuf); 241 free(linebuf); 242 return NULL; 243 } 244 245 ptr = prntbuf; 246 } 247 248 p = prntbuf + offset; 249 if ((count = read(temp, p, BUFSIZ)) > 0) { 250 /* Mac file format */ 251 for (i = 0; i < BUFSIZ; i++) { 252 char a = p[i]; 253 char b = (i + 1) < count ? p[i + 1] : 0; 254 if (a == '\r' && b != '\n') 255 p[i] = '\n'; 256 } 257 continue; 258 } 259 260 if (offset == 0) 261 /* end of file already reached */ 262 break; 263 264 /* last line of file has no newline */ 265 ptrend = ptr + offset; 266 nlflag = 0; 267 268 } else { 269 /* 270 char tmpbuf[BUFSIZ+1]; 271 */ 272 char *a; 273 274 a = prntbuf; 275 276 auto_ef_overline = 1; 277 if ((tmpbuf = (char *)malloc(fw_lPrntBufLen + 1)) == NULL) { 278 errno = ENOMEM; 279 *return_size = (size_t)-1; 280 free(prntbuf); 281 free(linebuf); 282 return NULL; 283 } 284 285 strlcpy(tmpbuf, ptr, fw_lPrntBufLen + 1); 286 287 for(;;){ 288 count = read(temp, a, count); 289 if ((ptrend = memchr(a, '\n', count)) != NULL) 290 break; 291 offset += count; 292 if (count == 0) break; 293 } 294 if (ptrend != NULL){ 295 next_ptr = ptrend + 1; 296 next_count = count - (next_ptr - ptr); 297 nlflag = 1; 298 } else { 299 next_count = 0; 300 nlflag = 0; 301 } 302 } 303 304 } else { 305 next_ptr = ptrend + 1; 306 next_count = offset + count - (next_ptr - ptr); 307 nlflag = 1; 308 } 309 lnum++; 310 if (ptrend != NULL) 311 *ptrend = '\0'; 312 313 if (auto_ef_overline){ 314 auto_ef_overline = 0; 315 316 } else { 317 318 if ((tmpbuf = (char *)malloc(fw_lPrntBufLen + 1)) == NULL) { 319 errno = ENOMEM; 320 *return_size = (size_t)-1; 321 free(prntbuf); 322 free(linebuf); 323 return NULL; 324 } 325 326 strlcpy(tmpbuf, ptr, fw_lPrntBufLen + 1); 327 } 328 329 auto_ef_offset = buflength_file(tmpbuf); 330 auto_ef_offset++; 331 auto_ef_blocksize = auto_ef_offset + auto_ef_blocksize; 332 333 if (auto_ef_blocksize > AUTOEF_BLOCKSIZE) { 334 size_t atefsize = 0; 335 if (auto_ef_offset < AUTOEF_BLOCKSIZE) { 336 337 /* 338 * In case of auto_ef_block is 339 * full and next get block is not 340 * larger than auto_ef_block 341 */ 342 if (level_max == 0 || 343 (autoef_times % level_max == 0) || 344 autoef_times == 0) { 345 346 atefsize = auto_ef_str(&rootp, 347 auto_ef_block, AUTOEF_BLOCKSIZE, 348 level); 349 if (atefsize == (size_t)0) { 350 auto_ef_free(rootp); 351 } else if (atefsize == -1) { 352 auto_ef_free(rootp); 353 } else { 354 CalcAutoefFile(rootp, &root_autoef_file); 355 auto_ef_free(rootp); 356 autoef_flag = 0; 357 autoef_times++; 358 } 359 } 360 strncpy(auto_ef_block, (const char *)tmpbuf, AUTOEF_BLOCKSIZE); 361 auto_ef_blocksize = auto_ef_offset; 362 } else { 363 /* 364 * In case of auto_ef_block 365 * is full and next get block 366 * is larger than auto_ef_block 367 */ 368 if (level_max == 0 || 369 (autoef_times % level_max == 0) || 370 autoef_times == 0) { 371 372 atefsize = auto_ef_str(&rootp, tmpbuf, 373 auto_ef_offset, level); 374 375 if (atefsize == (size_t)0) { 376 auto_ef_free(rootp); 377 } else if (atefsize == -1) { 378 auto_ef_free(rootp); 379 } else { 380 CalcAutoefFile(rootp, &root_autoef_file); 381 auto_ef_free(rootp); 382 auto_ef_blocksize = 0; 383 autoef_flag = 0; 384 autoef_times++; 385 } 386 } 387 auto_ef_block[0] = '\0'; 388 auto_ef_blocksize = 0; 389 } 390 } else { 391 strncat(auto_ef_block, (const char *)tmpbuf, AUTOEF_BLOCKSIZE); 392 autoef_flag = 1; 393 } 394 395 free(tmpbuf); 396 if (!nlflag) 397 break; 398 399 ptr = next_ptr; 400 count = next_count; 401 offset = 0; 402 if (fw_lPrntBufLen > AUTO_EF_LINE_MAX) 403 fw_lPrntBufLen = BUFSIZ + 1; 404 } 405 406 free(tmpbuf); 407 free(prntbuf); 408 free(linebuf); 409 410 (void) close(temp); 411 if (autoef_flag == 1) { 412 size_t atefsize = 0; 413 autoef_times++; 414 atefsize = auto_ef_str(&rootp, auto_ef_block, 415 AUTOEF_BLOCKSIZE, level); 416 if (atefsize == (size_t)0) { 417 auto_ef_free(rootp); 418 } else if (atefsize == -1) { 419 auto_ef_free(rootp); 420 } else { 421 CalcAutoefFile(rootp, &root_autoef_file); 422 auto_ef_free(rootp); 423 } 424 } 425 426 if (root_autoef_file != NULL) { 427 auto_ef_t *tmp = NULL; 428 size_t autoef_size; 429 _auto_ef_t tmpautoef; 430 431 CorrectATEFO(root_autoef_file); 432 tmpautoef = SortATEFO(root_autoef_file); 433 Free_AUTOEF(&root_autoef_file); 434 ConvScore_file(&tmpautoef); 435 tmp = ATEFO2AUTOEF(tmpautoef, &autoef_size); 436 Free_AUTOEF(&tmpautoef); 437 *return_size = autoef_size; 438 return (tmp); 439 } else { 440 *return_size = (size_t)0; 441 Free_AUTOEF(&root_autoef_file); 442 return (NULL); 443 } 444 445 } 446 447 void CorrectATEFO(_auto_ef_t rtp) { 448 /* Remove downward compatibility if the other encoding is included */ 449 _auto_ef_t p; 450 int codeid; 451 int ascii = FALSE, iso2022 = FALSE, cp949_gbk = FALSE; 452 int gb18030 = FALSE, hkscs = FALSE, others = FALSE; 453 int iso2022_krcn = FALSE; 454 int cp874 = FALSE; 455 456 for (p = rtp; p != NULL; p = p->_next_autoef) { 457 switch (SuperSetOr2022(p->_encoding)) { 458 case 0: 459 ascii = TRUE; 460 break; 461 case 1: 462 iso2022 = TRUE; 463 break; 464 case 2: 465 iso2022 = TRUE; 466 iso2022_krcn = TRUE; 467 break; 468 case 4: 469 cp949_gbk = TRUE; 470 break; 471 case 5: 472 gb18030 = TRUE; 473 break; 474 case 6: 475 hkscs = TRUE; 476 break; 477 case 7: 478 cp874 = TRUE; 479 break; 480 default: 481 others = TRUE; 482 } 483 } 484 485 if ((ascii == TRUE) && (iso2022 == TRUE || cp949_gbk == TRUE || 486 gb18030 == TRUE || hkscs == TRUE || others == TRUE)) { 487 488 /* remove ascii */ 489 remove_encoding(rtp, ASCII); 490 } 491 492 if (iso2022 == TRUE) { 493 if (iso2022_krcn == TRUE) { 494 remove_encoding(rtp, ISOJP); 495 } 496 } 497 498 if (cp949_gbk == TRUE) { 499 /* remove euc */ 500 remove_encoding(rtp, EUCJP); 501 remove_encoding(rtp, EUCKR); 502 remove_encoding(rtp, EUCCN); 503 remove_encoding(rtp, EUCTW); 504 } 505 506 if (gb18030 == TRUE) { 507 /* remove euc */ 508 remove_encoding(rtp, EUCJP); 509 remove_encoding(rtp, EUCKR); 510 remove_encoding(rtp, EUCCN); 511 remove_encoding(rtp, EUCTW); 512 } 513 514 if (hkscs == TRUE) { 515 /* remove big5 */ 516 remove_encoding(rtp, BIG5); 517 } 518 519 if (cp874 == TRUE) { 520 /* remove TIS620.2533 */ 521 remove_encoding(rtp, TIS620); 522 } 523 } 524 525 void remove_encoding(_auto_ef_t rtp, char *a) { 526 _auto_ef_t p; 527 528 for (p = rtp; p != NULL; p = p->_next_autoef) { 529 if (strcmp(p->_encoding, a) == 0) { 530 p->_score = 0; 531 break; 532 } 533 } 534 } 535 536 int buflength_file(char *buf) { 537 int i; 538 539 for (i = 0; ; i++) { 540 if (buf[i] == '\0') break; 541 } 542 543 return (i); 544 } 545 546 void CalcAutoefFile(auto_ef_t *autoefp, _auto_ef_t *root_autoef_file) { 547 auto_ef_t *p; 548 int flag; 549 550 for (p = autoefp; *p != NULL; p++) { 551 if ((AutoefFindKeyWord(auto_ef_get_encoding(*p), root_autoef_file)) == TRUE) { 552 AutoefAddScore(auto_ef_get_encoding(*p), 553 auto_ef_get_score(*p), root_autoef_file); 554 } else { 555 /* 556 * This Regist_AUTOEF can be void 557 */ 558 (void) Regist_AUTOEF(auto_ef_get_encoding(*p), 559 auto_ef_get_score(*p), 560 M_FromCodeToLang(auto_ef_get_encoding(*p)), 561 root_autoef_file); 562 } 563 } 564 } 565 566 int SuperSetOr2022(char *code) { 567 568 if (strcmp(code, ASCII) == 0) 569 return (eASCII); 570 if (strcmp(code, ISOJP) == 0) 571 return (eISO_JP); 572 if (strcmp(code, ISOKR) == 0) 573 return (eISO_CNorKR); 574 if (strcmp(code, ISOCN) == 0) 575 return (eISO_CNorKR); 576 if (strcmp(code, CP949) == 0) 577 return (eCP949); 578 if (strcmp(code, GB18030) == 0) 579 return (eGB18030); 580 if (strcmp(code, HKSCS) == 0) 581 return (eHKSCS); 582 if (strcmp(code, CP874) == 0) 583 return (eCP874); 584 return (-1); 585 } 586 587 void ConvScore_file(_auto_ef_t *rtp) { 588 _auto_ef_t p, q, np; 589 double percent = 0.0; 590 double sum = 0.0; 591 int j = 0; 592 int times_flag = 0; 593 594 for (p = *rtp; p != NULL; p = p->_next_autoef) { 595 if (p->_score > 0) { 596 sum += p->_score; 597 j++; 598 if (fmod(p->_score, 100.0) != 0.0) 599 times_flag = -1; 600 } 601 } 602 603 if (times_flag >= 0 && j > 0) { 604 (*rtp)->_score = 100.0; 605 if ((*rtp)->_next_autoef != NULL) { 606 Free_AUTOEF(&((*rtp)->_next_autoef)); 607 } 608 return; 609 } 610 611 612 for (p = *rtp, q = *rtp; p != NULL; 613 q = p, p = p->_next_autoef) { 614 if (p->_score != 100.0) { 615 percent = (int)((p->_score / sum) * 1000.0) / 10; 616 p->_score = percent; 617 } 618 if (p->_score <= 0.0) { 619 Free_AUTOEF(&p); 620 q->_next_autoef = NULL; 621 break; 622 } 623 } 624 } 625 626 void AutoefAddScore(char *a, double addscore, _auto_ef_t *root_autoef_file) { 627 _auto_ef_t p; 628 629 for (p = *root_autoef_file; p != NULL; p = p->_next_autoef) { 630 if (strcmp(p->_encoding, a) == 0) { 631 p->_score = p->_score + addscore; 632 } 633 } 634 } 635 636 int AutoefFindKeyWord(char *a, _auto_ef_t *root_autoef_file) { 637 _auto_ef_t p; 638 639 for (p = *root_autoef_file; p != NULL; p = p->_next_autoef) { 640 if (strcmp(p->_encoding, a) == 0) 641 return (TRUE); 642 } 643 644 return (FALSE); 645 } 646 647