1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #include <stdio.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <algorithm> 42 #include "syllable.h" 43 #include "pinyin_data.h" 44 45 static const char *initials[] = {"", "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w", }; 46 static const unsigned num_initials = sizeof(initials)/sizeof(*initials); 47 48 static const char *finals[] = {"", "a", "o", "e", "ai", "ei", "ao", "ou", "an", "en", "ang", "eng", "er", "i", "ia", "ie", "iao", "iu", "ian", "in", "iang", "ing", "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ong", "v", "ue", "iong", }; 49 static const unsigned num_finals = sizeof(finals)/sizeof(*finals); 50 51 static const char *fuzzy_finals[] = {"iao", "ian", "iang", "uai", "uan", "uang"}; 52 static const unsigned num_fuzzy_finals = sizeof(fuzzy_finals)/sizeof(*fuzzy_finals); 53 54 static const char * fuzzy_pairs[] = { 55 "z", "zh", 56 "c", "ch", 57 "s", "sh", 58 "an", "ang", 59 "on", "ong", 60 "en", "eng", 61 "in", "ing", 62 "eng", "ong", 63 "ian", "iang", 64 "uan", "uang", 65 "l", "n", 66 "f", "h", 67 "r", "l", 68 "k", "g", 69 }; 70 static const unsigned num_fuzzy_pairs = sizeof(fuzzy_pairs)/sizeof(*fuzzy_pairs)/2; 71 72 static const char * auto_correction_pairs[] = { 73 "ign", "ing", 74 "img", "ing", 75 "uei", "ui", 76 "uen", "un", 77 "iou", "iu", 78 }; 79 static const unsigned num_auto_correction_pairs = sizeof(auto_correction_pairs)/sizeof(*auto_correction_pairs)/2; 80 81 static const TPyTabEntry 82 pinyin_table[] = { 83 {"a", 0x00010}, 84 {"ai", 0x00040}, 85 {"an", 0x00080}, 86 {"ang", 0x000a0}, 87 {"ao", 0x00060}, 88 {"b", 0x01000}, 89 {"ba", 0x01010}, 90 {"bai", 0x01040}, 91 {"ban", 0x01080}, 92 {"bang", 0x010a0}, 93 {"bao", 0x01060}, 94 {"bei", 0x01050}, 95 {"ben", 0x01090}, 96 {"beng", 0x010b0}, 97 {"bi", 0x010d0}, 98 {"bian", 0x01120}, 99 {"biao", 0x01100}, 100 {"bie", 0x010f0}, 101 {"bin", 0x01130}, 102 {"bing", 0x01150}, 103 {"bo", 0x01020}, 104 {"bu", 0x01160}, 105 {"c", 0x14000}, 106 {"ca", 0x14010}, 107 {"cai", 0x14040}, 108 {"can", 0x14080}, 109 {"cang", 0x140a0}, 110 {"cao", 0x14060}, 111 {"ce", 0x14030}, 112 {"cei", 0x14050}, 113 {"cen", 0x14090}, 114 {"ceng", 0x140b0}, 115 {"ch", 0x10000}, 116 {"cha", 0x10010}, 117 {"chai", 0x10040}, 118 {"chan", 0x10080}, 119 {"chang", 0x100a0}, 120 {"chao", 0x10060}, 121 {"che", 0x10030}, 122 {"chen", 0x10090}, 123 {"cheng", 0x100b0}, 124 {"chi", 0x100d0}, 125 {"chong", 0x101e0}, 126 {"chou", 0x10070}, 127 {"chu", 0x10160}, 128 {"chua", 0x10170}, 129 {"chuai", 0x10190}, 130 {"chuan", 0x101b0}, 131 {"chuang", 0x101d0}, 132 {"chui", 0x101a0}, 133 {"chun", 0x101c0}, 134 {"chuo", 0x10180}, 135 {"ci", 0x140d0}, 136 {"cong", 0x141e0}, 137 {"cou", 0x14070}, 138 {"cu", 0x14160}, 139 {"cuan", 0x141b0}, 140 {"cui", 0x141a0}, 141 {"cun", 0x141c0}, 142 {"cuo", 0x14180}, 143 {"d", 0x05000}, 144 {"da", 0x05010}, 145 {"dai", 0x05040}, 146 {"dan", 0x05080}, 147 {"dang", 0x050a0}, 148 {"dao", 0x05060}, 149 {"de", 0x05030}, 150 {"dei", 0x05050}, 151 {"den", 0x05090}, 152 {"deng", 0x050b0}, 153 {"di", 0x050d0}, 154 {"dia", 0x050e0}, 155 {"dian", 0x05120}, 156 {"diao", 0x05100}, 157 {"die", 0x050f0}, 158 {"ding", 0x05150}, 159 {"diu", 0x05110}, 160 {"dong", 0x051e0}, 161 {"dou", 0x05070}, 162 {"du", 0x05160}, 163 {"duan", 0x051b0}, 164 {"dui", 0x051a0}, 165 {"dun", 0x051c0}, 166 {"duo", 0x05180}, 167 {"e", 0x00030}, 168 {"ei", 0x00050}, 169 {"en", 0x00090}, 170 {"eng", 0x000b0}, 171 {"er", 0x000c0}, 172 {"f", 0x04000}, 173 {"fa", 0x04010}, 174 {"fan", 0x04080}, 175 {"fang", 0x040a0}, 176 {"fei", 0x04050}, 177 {"fen", 0x04090}, 178 {"feng", 0x040b0}, 179 {"fiao", 0x04100}, 180 {"fo", 0x04020}, 181 {"fou", 0x04070}, 182 {"fu", 0x04160}, 183 {"g", 0x09000}, 184 {"ga", 0x09010}, 185 {"gai", 0x09040}, 186 {"gan", 0x09080}, 187 {"gang", 0x090a0}, 188 {"gao", 0x09060}, 189 {"ge", 0x09030}, 190 {"gei", 0x09050}, 191 {"gen", 0x09090}, 192 {"geng", 0x090b0}, 193 {"gong", 0x091e0}, 194 {"gou", 0x09070}, 195 {"gu", 0x09160}, 196 {"gua", 0x09170}, 197 {"guai", 0x09190}, 198 {"guan", 0x091b0}, 199 {"guang", 0x091d0}, 200 {"gui", 0x091a0}, 201 {"gun", 0x091c0}, 202 {"guo", 0x09180}, 203 {"h", 0x0b000}, 204 {"ha", 0x0b010}, 205 {"hai", 0x0b040}, 206 {"han", 0x0b080}, 207 {"hang", 0x0b0a0}, 208 {"hao", 0x0b060}, 209 {"he", 0x0b030}, 210 {"hei", 0x0b050}, 211 {"hen", 0x0b090}, 212 {"heng", 0x0b0b0}, 213 {"hong", 0x0b1e0}, 214 {"hou", 0x0b070}, 215 {"hu", 0x0b160}, 216 {"hua", 0x0b170}, 217 {"huai", 0x0b190}, 218 {"huan", 0x0b1b0}, 219 {"huang", 0x0b1d0}, 220 {"hui", 0x0b1a0}, 221 {"hun", 0x0b1c0}, 222 {"huo", 0x0b180}, 223 {"j", 0x0c000}, 224 {"ji", 0x0c0d0}, 225 {"jia", 0x0c0e0}, 226 {"jian", 0x0c120}, 227 {"jiang", 0x0c140}, 228 {"jiao", 0x0c100}, 229 {"jie", 0x0c0f0}, 230 {"jin", 0x0c130}, 231 {"jing", 0x0c150}, 232 {"jiong", 0x0c210}, 233 {"jiu", 0x0c110}, 234 {"ju", 0x0c160}, 235 {"juan", 0x0c1b0}, 236 {"jue", 0x0c200}, 237 {"jun", 0x0c1c0}, 238 {"k", 0x0a000}, 239 {"ka", 0x0a010}, 240 {"kai", 0x0a040}, 241 {"kan", 0x0a080}, 242 {"kang", 0x0a0a0}, 243 {"kao", 0x0a060}, 244 {"ke", 0x0a030}, 245 {"kei", 0x0a050}, 246 {"ken", 0x0a090}, 247 {"keng", 0x0a0b0}, 248 {"kong", 0x0a1e0}, 249 {"kou", 0x0a070}, 250 {"ku", 0x0a160}, 251 {"kua", 0x0a170}, 252 {"kuai", 0x0a190}, 253 {"kuan", 0x0a1b0}, 254 {"kuang", 0x0a1d0}, 255 {"kui", 0x0a1a0}, 256 {"kun", 0x0a1c0}, 257 {"kuo", 0x0a180}, 258 {"l", 0x08000}, 259 {"la", 0x08010}, 260 {"lai", 0x08040}, 261 {"lan", 0x08080}, 262 {"lang", 0x080a0}, 263 {"lao", 0x08060}, 264 {"le", 0x08030}, 265 {"lei", 0x08050}, 266 {"leng", 0x080b0}, 267 {"li", 0x080d0}, 268 {"lia", 0x080e0}, 269 {"lian", 0x08120}, 270 {"liang", 0x08140}, 271 {"liao", 0x08100}, 272 {"lie", 0x080f0}, 273 {"lin", 0x08130}, 274 {"ling", 0x08150}, 275 {"liu", 0x08110}, 276 {"long", 0x081e0}, 277 {"lou", 0x08070}, 278 {"lu", 0x08160}, 279 {"luan", 0x081b0}, 280 {"lue", 0x08200}, 281 {"lun", 0x081c0}, 282 {"luo", 0x08180}, 283 {"lv", 0x081f0}, 284 {"m", 0x03000}, 285 {"ma", 0x03010}, 286 {"mai", 0x03040}, 287 {"man", 0x03080}, 288 {"mang", 0x030a0}, 289 {"mao", 0x03060}, 290 {"me", 0x03030}, 291 {"mei", 0x03050}, 292 {"men", 0x03090}, 293 {"meng", 0x030b0}, 294 {"mi", 0x030d0}, 295 {"mian", 0x03120}, 296 {"miao", 0x03100}, 297 {"mie", 0x030f0}, 298 {"min", 0x03130}, 299 {"ming", 0x03150}, 300 {"miu", 0x03110}, 301 {"mo", 0x03020}, 302 {"mou", 0x03070}, 303 {"mu", 0x03160}, 304 {"n", 0x07000}, 305 {"na", 0x07010}, 306 {"nai", 0x07040}, 307 {"nan", 0x07080}, 308 {"nang", 0x070a0}, 309 {"nao", 0x07060}, 310 {"ne", 0x07030}, 311 {"nei", 0x07050}, 312 {"nen", 0x07090}, 313 {"neng", 0x070b0}, 314 {"ni", 0x070d0}, 315 {"nian", 0x07120}, 316 {"niang", 0x07140}, 317 {"niao", 0x07100}, 318 {"nie", 0x070f0}, 319 {"nin", 0x07130}, 320 {"ning", 0x07150}, 321 {"niu", 0x07110}, 322 {"nong", 0x071e0}, 323 {"nou", 0x07070}, 324 {"nu", 0x07160}, 325 {"nuan", 0x071b0}, 326 {"nue", 0x07200}, 327 {"nun", 0x071c0}, 328 {"nuo", 0x07180}, 329 {"nv", 0x071f0}, 330 {"o", 0x00020}, 331 {"ou", 0x00070}, 332 {"p", 0x02000}, 333 {"pa", 0x02010}, 334 {"pai", 0x02040}, 335 {"pan", 0x02080}, 336 {"pang", 0x020a0}, 337 {"pao", 0x02060}, 338 {"pei", 0x02050}, 339 {"pen", 0x02090}, 340 {"peng", 0x020b0}, 341 {"pi", 0x020d0}, 342 {"pian", 0x02120}, 343 {"piao", 0x02100}, 344 {"pie", 0x020f0}, 345 {"pin", 0x02130}, 346 {"ping", 0x02150}, 347 {"po", 0x02020}, 348 {"pou", 0x02070}, 349 {"pu", 0x02160}, 350 {"q", 0x0d000}, 351 {"qi", 0x0d0d0}, 352 {"qia", 0x0d0e0}, 353 {"qian", 0x0d120}, 354 {"qiang", 0x0d140}, 355 {"qiao", 0x0d100}, 356 {"qie", 0x0d0f0}, 357 {"qin", 0x0d130}, 358 {"qing", 0x0d150}, 359 {"qiong", 0x0d210}, 360 {"qiu", 0x0d110}, 361 {"qu", 0x0d160}, 362 {"quan", 0x0d1b0}, 363 {"que", 0x0d200}, 364 {"qun", 0x0d1c0}, 365 {"r", 0x12000}, 366 {"ran", 0x12080}, 367 {"rang", 0x120a0}, 368 {"rao", 0x12060}, 369 {"re", 0x12030}, 370 {"ren", 0x12090}, 371 {"reng", 0x120b0}, 372 {"ri", 0x120d0}, 373 {"rong", 0x121e0}, 374 {"rou", 0x12070}, 375 {"ru", 0x12160}, 376 {"ruan", 0x121b0}, 377 {"rui", 0x121a0}, 378 {"run", 0x121c0}, 379 {"ruo", 0x12180}, 380 {"s", 0x15000}, 381 {"sa", 0x15010}, 382 {"sai", 0x15040}, 383 {"san", 0x15080}, 384 {"sang", 0x150a0}, 385 {"sao", 0x15060}, 386 {"se", 0x15030}, 387 {"sen", 0x15090}, 388 {"seng", 0x150b0}, 389 {"sh", 0x11000}, 390 {"sha", 0x11010}, 391 {"shai", 0x11040}, 392 {"shan", 0x11080}, 393 {"shang", 0x110a0}, 394 {"shao", 0x11060}, 395 {"she", 0x11030}, 396 {"shei", 0x11050}, 397 {"shen", 0x11090}, 398 {"sheng", 0x110b0}, 399 {"shi", 0x110d0}, 400 {"shou", 0x11070}, 401 {"shu", 0x11160}, 402 {"shua", 0x11170}, 403 {"shuai", 0x11190}, 404 {"shuan", 0x111b0}, 405 {"shuang", 0x111d0}, 406 {"shui", 0x111a0}, 407 {"shun", 0x111c0}, 408 {"shuo", 0x11180}, 409 {"si", 0x150d0}, 410 {"song", 0x151e0}, 411 {"sou", 0x15070}, 412 {"su", 0x15160}, 413 {"suan", 0x151b0}, 414 {"sui", 0x151a0}, 415 {"sun", 0x151c0}, 416 {"suo", 0x15180}, 417 {"t", 0x06000}, 418 {"ta", 0x06010}, 419 {"tai", 0x06040}, 420 {"tan", 0x06080}, 421 {"tang", 0x060a0}, 422 {"tao", 0x06060}, 423 {"te", 0x06030}, 424 {"tei", 0x06050}, 425 {"teng", 0x060b0}, 426 {"ti", 0x060d0}, 427 {"tian", 0x06120}, 428 {"tiao", 0x06100}, 429 {"tie", 0x060f0}, 430 {"ting", 0x06150}, 431 {"tong", 0x061e0}, 432 {"tou", 0x06070}, 433 {"tu", 0x06160}, 434 {"tuan", 0x061b0}, 435 {"tui", 0x061a0}, 436 {"tun", 0x061c0}, 437 {"tuo", 0x06180}, 438 {"w", 0x17000}, 439 {"wa", 0x17010}, 440 {"wai", 0x17040}, 441 {"wan", 0x17080}, 442 {"wang", 0x170a0}, 443 {"wei", 0x17050}, 444 {"wen", 0x17090}, 445 {"weng", 0x170b0}, 446 {"wo", 0x17020}, 447 {"wu", 0x17160}, 448 {"x", 0x0e000}, 449 {"xi", 0x0e0d0}, 450 {"xia", 0x0e0e0}, 451 {"xian", 0x0e120}, 452 {"xiang", 0x0e140}, 453 {"xiao", 0x0e100}, 454 {"xie", 0x0e0f0}, 455 {"xin", 0x0e130}, 456 {"xing", 0x0e150}, 457 {"xiong", 0x0e210}, 458 {"xiu", 0x0e110}, 459 {"xu", 0x0e160}, 460 {"xuan", 0x0e1b0}, 461 {"xue", 0x0e200}, 462 {"xun", 0x0e1c0}, 463 {"y", 0x16000}, 464 {"ya", 0x16010}, 465 {"yai", 0x16040}, 466 {"yan", 0x16080}, 467 {"yang", 0x160a0}, 468 {"yao", 0x16060}, 469 {"ye", 0x16030}, 470 {"yi", 0x160d0}, 471 {"yin", 0x16130}, 472 {"ying", 0x16150}, 473 {"yo", 0x16020}, 474 {"yong", 0x161e0}, 475 {"you", 0x16070}, 476 {"yu", 0x16160}, 477 {"yuan", 0x161b0}, 478 {"yue", 0x16200}, 479 {"yun", 0x161c0}, 480 {"z", 0x13000}, 481 {"za", 0x13010}, 482 {"zai", 0x13040}, 483 {"zan", 0x13080}, 484 {"zang", 0x130a0}, 485 {"zao", 0x13060}, 486 {"ze", 0x13030}, 487 {"zei", 0x13050}, 488 {"zen", 0x13090}, 489 {"zeng", 0x130b0}, 490 {"zh", 0x0f000}, 491 {"zha", 0x0f010}, 492 {"zhai", 0x0f040}, 493 {"zhan", 0x0f080}, 494 {"zhang", 0x0f0a0}, 495 {"zhao", 0x0f060}, 496 {"zhe", 0x0f030}, 497 {"zhei", 0x0f050}, 498 {"zhen", 0x0f090}, 499 {"zheng", 0x0f0b0}, 500 {"zhi", 0x0f0d0}, 501 {"zhong", 0x0f1e0}, 502 {"zhou", 0x0f070}, 503 {"zhu", 0x0f160}, 504 {"zhua", 0x0f170}, 505 {"zhuai", 0x0f190}, 506 {"zhuan", 0x0f1b0}, 507 {"zhuang", 0x0f1d0}, 508 {"zhui", 0x0f1a0}, 509 {"zhun", 0x0f1c0}, 510 {"zhuo", 0x0f180}, 511 {"zi", 0x130d0}, 512 {"zong", 0x131e0}, 513 {"zou", 0x13070}, 514 {"zu", 0x13160}, 515 {"zuan", 0x131b0}, 516 {"zui", 0x131a0}, 517 {"zun", 0x131c0}, 518 {"zuo", 0x13180}, 519 }; 520 521 static int 522 pytab_entry_compare (const char *s, TPyTabEntry *v) 523 {return strcmp (s, v->pystr);} 524 525 TSyllable 526 CPinyinData::encodeSyllable (const char *pinyin) 527 { 528 typedef int (*bsearch_compare) (const void*, const void*); 529 TPyTabEntry *e = (TPyTabEntry*) bsearch (pinyin, pinyin_table, 530 sizeof(pinyin_table)/sizeof(pinyin_table[0]), 531 sizeof(pinyin_table[0]), 532 (bsearch_compare) pytab_entry_compare); 533 if (e) 534 return e->id; 535 536 return 0; 537 } 538 539 const char * 540 CPinyinData::decodeSyllable (TSyllable s, const char **i, const char **f) 541 { 542 if (i) *i = initials[s.initial]; 543 if (f) *f = finals[s.final]; 544 545 static char buf[128]; 546 snprintf (buf, sizeof(buf), "%s%s", initials[s.initial], finals[s.final]); 547 548 typedef int (*bsearch_compare) (const void*, const void*); 549 TPyTabEntry *e = (TPyTabEntry*) bsearch (buf, pinyin_table, 550 sizeof(pinyin_table)/sizeof(pinyin_table[0]), 551 sizeof(pinyin_table[0]), 552 (bsearch_compare) pytab_entry_compare); 553 554 if (e) 555 return e->pystr; 556 557 return NULL; 558 } 559 560 const char ** 561 CPinyinData::getAutoCorrectionPairs (unsigned &num) 562 { 563 num = num_auto_correction_pairs; 564 return auto_correction_pairs; 565 } 566 567 const char ** 568 CPinyinData::getFuzzyPairs (unsigned &num) 569 { 570 num = num_fuzzy_pairs; 571 return fuzzy_pairs; 572 } 573 574 const char ** 575 CPinyinData::getInitials (unsigned &num) 576 { 577 num = num_initials; 578 return initials; 579 } 580 581 const char ** 582 CPinyinData::getFinals (unsigned &num) 583 { 584 num = num_finals; 585 return finals; 586 } 587 588 const TPyTabEntry * 589 CPinyinData::getPinyinTable(unsigned &num) 590 { 591 num = sizeof(pinyin_table) / sizeof(TPyTabEntry); 592 return pinyin_table; 593 } 594
