1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef SUNPY_IMI_CONTEXT_H 39 #define SUNPY_IMI_CONTEXT_H 40 41 #include "portability.h" 42 43 #ifdef HAVE_CONFIG_H 44 #include <config.h> 45 #endif 46 47 #ifdef DEBUG 48 #ifdef HAVE_ASSERT_H 49 #include <assert.h> 50 #endif 51 #endif 52 53 #include <map> 54 #include <vector> 55 #include <list> 56 #include <math.h> 57 58 #include "imi_data.h" 59 #include "ic_history.h" 60 61 #define UNKNOWN_WORD_ID 0 62 #define OOV_WORD_ID 69 63 #define SENTENCE_BREAKER_ID 10 64 65 struct TLongExpFloat { 66 public: 67 TLongExpFloat(const TLongExpFloat& b) : m_base(b.m_base), m_exp(b.m_exp) { } 68 69 TLongExpFloat(int exp = 0, double base=0.0) : m_base(base), m_exp(exp) { } 70 71 TLongExpFloat(double d); 72 73 TLongExpFloat 74 operator* (const TLongExpFloat& b) const; 75 76 TLongExpFloat 77 operator/ (const TLongExpFloat& b) const; 78 79 bool 80 operator< (const TLongExpFloat& b) const; 81 82 bool 83 operator<=(const TLongExpFloat& b) const; 84 85 bool 86 operator==(const TLongExpFloat& b) const; 87 88 void 89 toString(std::string& str) const; 90 91 void 92 toString(char* buf) const 93 { if (buf) sprintf(buf, "%10lf*2^%d", m_base, m_exp); } 94 95 double 96 log2() const 97 { 98 #ifdef DEBUG 99 //assert(m_base > 0.0); 100 #endif 101 return ::log2(m_base)+m_exp; 102 } 103 104 private: 105 double m_base; 106 int m_exp; 107 }; 108 109 /** 110 * TSentenceScore is only used for whole sentence score, 111 * the score from language model still using double. 112 */ 113 #ifdef _USE_RAW_PROBABILITY 114 typedef TLongExpFloat TSentenceScore; 115 #else 116 typedef double TSentenceScore; 117 #endif 118 119 class CBone; 120 class CCandidate; 121 122 typedef std::list<CBone> CSkeleton; 123 typedef CSkeleton::iterator CSkeletonIter; 124 typedef std::vector<CCandidate> CCandidates; 125 typedef CCandidates::iterator CCandidatesIter; 126 127 class CIMIContext; 128 class CBoneInnerData; 129 130 union TCandiRank { 131 public: 132 bool 133 operator< (const TCandiRank& b) const 134 { return m_all < b.m_all; }; 135 136 TCandiRank() : m_all(0) { } 137 138 TCandiRank(bool user, bool best, unsigned int len, 139 bool fromLattice, TSentenceScore score); 140 141 TCandiRank(bool user, bool best, unsigned int len, 142 bool fromLattice, unsigned score); 143 144 protected: 145 unsigned int m_all; 146 #if !defined(WORDS_BIGENDIAN) 147 struct TAnony { 148 unsigned m_cost : 24; 149 unsigned m_lattice: 1; 150 unsigned m_best : 1; 151 unsigned m_len : 5; 152 unsigned m_user : 1; 153 } anony; 154 #else 155 struct TAnony { 156 unsigned m_user : 1; 157 unsigned m_len : 5; 158 unsigned m_best : 1; 159 unsigned m_lattice: 1; 160 unsigned m_cost : 24; 161 } anony; 162 #endif 163 164 }; 165 166 /** 167 * CCandidate represent basic information about a single candidate. 168 * Its start bone and finishing bone. It's content string. and its 169 * word id. 170 */ 171 class CCandidate { 172 public: 173 friend class CIMIContext; 174 public: 175 CSkeletonIter m_BoneStart; 176 CSkeletonIter m_BoneEnd; 177 const TWCHAR *m_String; 178 179 public: 180 CCandidate(const CCandidate& b) 181 : m_BoneStart(b.m_BoneStart), m_BoneEnd(b.m_BoneEnd), 182 m_String(b.m_String), m_WordId(b.m_WordId) { } 183 184 /** Give out the constructor for convinience */ 185 CCandidate(const TWCHAR* s = NULL, 186 CSkeletonIter h=CSkeletonIter(), 187 CSkeletonIter t=CSkeletonIter(), 188 unsigned int wid=0) 189 : m_BoneStart(h), m_BoneEnd(t), m_String(s), m_WordId(wid) { } 190 191 void 192 print(std::string& prefix); 193 194 protected: 195 unsigned int m_WordId; 196 }; // of CCandidate 197 198 199 /** 200 * Bone is the basic unit for CIMSessionDate to store a Syllable, ie 201 * a Pinyin string for only one Chinese character. Such as "zhang", 202 * or "zh" under non-complete pinyin. 203 */ 204 class CBone { 205 friend class CIMIContext; 206 public: 207 /** 208 * In case that use input is not Pinyin, such as under English input 209 * mode, A bone contains a string for the input. For English, all 210 * consecutive string grouped into one bone, such as "SUN Microsystem"; 211 * For Punc, each punctuaction become on bone, such as ": would be 212 * split into two bones. 213 * 214 * For Pinyin type, there are three different type definition, and the 215 * value for each syllable string depends on whether or not NonComplete 216 * Pinyin mode is enabled: 217 * 218 * NODE_PINYIN : Valid pinyin. Such as "zh" with NonComplet 219 * Pinyin enabled, or "zhang" \n 220 * NODE_INVALID_PINYIN : Invalid syllable string, such as "u"; or "afdasf"\n 221 * NODE_INCOMPLETE_PINYIN: incomplete syllable string (maybe complet further), 222 * such as "to"; or "zh" when NonComplete Pinyin disabled. 223 */ 224 enum NODE_TYPE { 225 NODE_TAIL = 0x0000, //pusedo tail node 226 227 CATE_PINYIN = 0x0100, 228 NODE_PINYIN = 0x0101, //pinyin 229 NODE_INVALID_PINYIN = 0x0102, //invalid syllable string 230 NODE_INCOMPLETE_PINYIN = 0x0103, //incomplete syllable string 231 232 233 CATE_OTHER = 0x0200, 234 NODE_ASCII = 0x0201, //english string 235 NODE_PUNC = 0x0202, //punctuation 236 NODE_SIMBOL = 0x0203, //other simbol 237 NODE_DIGITAL = 0x0204 //not implemeted here 238 239 }; // of NODE_TYPE 240 241 /** 242 * Boundary type indicate how the bone is seperated, by (1) Automatic 243 * Syllable segmentation, (2) different bone type or punc bone seperate 244 * rule, (3) user sepecified. 245 */ 246 enum BOUNDARY_TYPE { 247 AUTO_BOUNDARY, //automatic segmentation result 248 ABSOLUTE_BOUNDARY, //boundary without ambiguation 249 USER_BOUNDARY //user given boundary 250 }; // of BOUNDARY_TYPE 251 252 public: 253 int m_BoneType; 254 int m_BoundaryType; // original code for m_String[0] in non_pinyin node 255 wstring m_String; 256 257 public: 258 /** 259 * Never copy or allocate space for m_pInnerData; 260 */ 261 CBone(const CBone& b); 262 263 /** 264 * @param boundType: Boundary type. 265 * @param boneType: BoneType. 266 * No space is allocated for m_pInnerData 267 */ 268 CBone(int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL); 269 270 /** 271 * @param pwc can not be NULL 272 * @param boundType: Boundary type. 273 * @param boneType: BoneType. 274 * No space is allocated for m_pInnerData 275 */ 276 CBone(const TWCHAR* pwc, int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL); 277 278 /** 279 * @param pwc the string should be copied into this bone, not NULL 280 * @param len the string len 281 * @param boundType: Boundary type. 282 * @param boneType: BoneType. 283 * No space is allocated for m_pInnerData 284 */ 285 CBone(const TWCHAR* pwc, size_t len, int boundType, int boneType); 286 287 /** Free all space if necessary. */ 288 ~CBone(); 289 290 inline bool 291 isPinyinNode() const 292 { return ((m_BoneType & CATE_PINYIN)!= 0); } 293 294 inline bool 295 isValidPinyinNode() const 296 { return (m_BoneType == NODE_PINYIN || m_BoneType == NODE_INCOMPLETE_PINYIN); } 297 298 inline bool 299 isUserBoundary() const 300 { return m_BoundaryType == USER_BOUNDARY; } 301 302 inline bool 303 isAutoBoundary() const 304 { return m_BoundaryType == AUTO_BOUNDARY; } 305 306 inline bool 307 isTailNode() const 308 { return (m_BoneType == 0); } 309 310 bool 311 isUserSelectionStart(void); 312 313 void 314 print(std::string& prefix); 315 316 protected: 317 CBoneInnerData *m_pInnerData; 318 }; // of CBone 319 320 321 /** 322 * It is more suitable to call this as Input Context together with I 323 * MSessionView. These data record 324 * input history for a input session. Normally a seesion data would 325 * only contains history keys and the cursor position. It would be enough 326 * to find corresponding result from the history for most IME. 327 * 328 * The Session data class here take responsible for generating best sentence 329 * from Pinyin string. It also contains all core algorithm for this conversion 330 * progress. 331 * 332 * All Key processing job should be done by Session View class, and only several 333 * interface exist here for the SessionView to modify Input Context and get 334 * best sentence and ranked candidates. 335 * 336 * The other important function it provide is Automatic Pinyin segmentation. 337 * 338 * For our input method, from the aspect of effeciency, a internal 339 * search lattice should also be remained and only partial of the lattice 340 * would be rebuild or updated when user give a new input. The reason is 341 * that it is time consuming to construct a whole search lattice, especially 342 * for a long sentence and/or with Non-Complete syllables, which cause 343 * potential candidates number increase. Yet all search related data are 344 * hidden to outer usage. 345 */ 346 class CIMIContext { 347 public: 348 /*@{*/ 349 /** 350 * Constructor of CIMIContext. Set all the pointer to NULL. 351 * set Non-Complete Syllable to true, set Strict Left2Right Model 352 * to false. 353 * 354 * Note: At this time, CIMIContext could not be used to 355 * search directly. Only after setCoreData() and clear(), 356 * the internal search lattice are constructed and can 357 * be used. 358 */ 359 CIMIContext(); 360 361 /** 362 * free all resource/spaces 363 */ 364 virtual 365 ~CIMIContext() 366 { m_Skeleton.clear(); } 367 /*@}*/ 368 369 /*@{*/ 370 /** 371 * Copy language model ptr and Pinyin-Trie Ptr inside the IMCoreData 372 * into my own member. 373 * Also build Chinese Punctuation Map from Pinyin-Trie. 374 * 375 * @param pCoreData is the core resource data for the Input Method 376 */ 377 void 378 setCoreData(CIMIData *pCoreData); 379 380 /** 381 * clear all internal Input Context, after this call, the Session data 382 * or Input Context should same as they were just created. (Of cause, 383 * the values from core data and desktop data remains.) More specifically, 384 * it will clear skeleton, add a psuedo tail node, set internal candidate 385 * position to skeleton.end() (which means no candidates needed now). then 386 * it will construct a initial search lattice. 387 * 388 * Note: This function should be called only after setCoreData, because 389 * it will use the language model to construct a empty search 390 * lattice. 391 */ 392 void 393 clear(); 394 395 void 396 setHistoryMemory(CICHistory *phm); 397 398 CICHistory * 399 getHistoryMemory(); 400 401 /** return true if defined DEBUG */ 402 void 403 print_lattice(); 404 405 //memorize sentence in current text 406 void 407 memorize(void); 408 409 /** 410 * Return the bone list to let the view read them directly. 411 */ 412 CSkeleton & 413 getSkeleton(void) 414 { return m_Skeleton; } 415 416 /** 417 * To construct the lattice, algorithm need to append two 418 * psuedo tail bone at the end of the bone list. 419 * For the SessionView could operate the list as the two tailing 420 * bone were not there, SessionView should call this function 421 * to replace call like getSekelton()->end() when iterating the 422 * Skeleton (bone list). 423 * 424 * @return the first psuedo tailing node at the end of bone 425 * list. For SessionView usage. 426 */ 427 CSkeletonIter 428 getLastBone(void) 429 { return --(--(m_Skeleton.end())); } 430 431 /** 432 * To cooperate with the getLastBone. 433 * @return the first bone of the skeleton 434 */ 435 CSkeletonIter 436 getFirstBone(void) 437 { return m_Skeleton.begin(); } 438 439 bool 440 isEmpty(void) 441 { return m_Skeleton.size() == 2; } 442 /*@}*/ 443 444 445 // functions to set options. Options should be set when Session Data 446 // is clear, ie, just created or just after clear() is called 447 448 /*@{*/ 449 /** 450 * NonCompleteSyllable means we can give a candidates when PinYin is 451 * just partial a complete syllable string. Such as "sh" would give 452 * all characters with one of it PinYin starts from "sh", ex, "shi", 453 * "sheng" etc. 454 * In our definition, Non-Complete PinYin not only limitied on "SHENGMU", 455 * For examplet, "to" is not a valid full pinyin, yet it could lead 456 * "tong", "tou". So it would also gives out those corresponding 457 * characters. 458 * On the other hand, when a PinYin String is a valid Full Syllable 459 * PinYin, even it could lead other Full Syllable PinYin, it will 460 * not be treated as non-complete PinYin. For example, "da" will only 461 * give candidates characters pronounced "da", although it could lead 462 * "dan" "dao" "dang", etc. 463 */ 464 void 465 setNonCompleteSyllable(bool use = true) 466 { m_bNonCompleteSyllable = use; } 467 468 bool 469 canNonCompleteSyllable() 470 { return m_bNonCompleteSyllable; } 471 472 /** 473 * Left2RightSelection could improve the performance under 474 * the TwoLine view style. But could not be used with OneLine view 475 * style and ThreeLine view style. 476 */ 477 void 478 setLeft2RightSelection(bool use = true) 479 { m_bStrictLeft2Right = use; } 480 481 bool 482 isGBKEnabled() 483 { return m_bGBK; } 484 485 void 486 enableGBK(bool enable) 487 { m_bGBK = enable; } 488 489 void 490 setHistoryPower(int power) 491 { m_HistoryPower = ((power>= 0 && power <=10)?(power):(3)); } 492 493 int 494 getHistoryPower() 495 { return m_HistoryPower; } 496 497 void 498 enableContextRanking(bool enable) 499 { m_ContextRanking = enable; } 500 501 int 502 isContextRankingEnabled() 503 { return m_ContextRanking; } 504 /*@}*/ 505 506 /*@{*/ 507 bool 508 isValidSyllable(const wstring& sy) 509 { return isValidSyllable(sy.c_str()); } 510 511 bool 512 isValidSyllable(const TWCHAR* pstr); 513 514 /** 515 * segPinyinSimplest() do simplest segmentation for given PinYin. That, the 516 * last char is the just input by user (or just delete by user), and 517 * prefix string is at least non-complet syllable. This kind of segment 518 * suitable for OneLineView. Syllable Pinyin string can only be modified 519 * at the end. 520 * 521 * @param pinyin : a wide char string. Each char in the string should only 522 * be one of the following: [a-z]. Note, that prefix string (except the 523 * last char should already be complete or non-complet). 524 * 525 * @param result: After segmentation, result contains sequence of syllable 526 * Bones. The result may be one of the following status:\n 527 * (1) two bones, the first is complete, and 528 * the second is complete or non-complete.\n 529 * (2) two bones, the first is complete, and 530 * the second is invalid.\n 531 * (3) single complete or non-complet bone, with AUTO_BOUNDARY.\n 532 * (4) single invalid bone.\n 533 * Status (2) and (4) is invalid, program should reject the last 534 * input char. Status (1) program should commit the first bone and 535 * use the remaining string as prefix. For status (3), program 536 * should wait for further input. 537 * 538 * @return whether the segmentation is valid(or whether the last char 539 * is valid) 540 */ 541 bool 542 segPinyinSimplest(const wstring& pinyin, CSkeleton& result); 543 544 /** 545 * [head1, tail1) [head2, tail2) may from two link. and form an virtual link. 546 * generate the segmentation result into result where each node represent a 547 * syllable. Note that invalid_syllable is inevitable. 548 */ 549 void 550 segPinyin(CSkeletonIter head1, CSkeletonIter tail1, 551 CSkeletonIter head2, CSkeletonIter tail2, 552 CSkeleton& result); 553 /*@}*/ 554 555 /*@{*/ 556 /** 557 * modify skeleton. remove bones [boneStart, boneEnd). and insert all 558 * bones in the skel just before the boneEnd. Then update the 559 * search lattice and search for a new result. 560 * 561 * @param boneStart start bone iterator in the skeleton 562 * @param boneEnd ending bone iterator in the skeleton 563 * @param skel the new list of bones to be inserted. 564 * @param pItLeftmost the leftmost bone affected by the operation. 565 * for manual search 566 * 567 * @return whether or not the original gotted candidates called 568 * by getCandidates() would be affected by this modification. 569 * @retval true Affected. getCandidates() should be call again. 570 * @retval false Not affected. 571 */ 572 bool 573 modify(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel, 574 bool doSearch=true, CSkeletonIter* pItLeftmost=NULL); 575 576 /** 577 * modify skeleton. remove bones [boneStart, boneEnd). and insert all 578 * bones in the skel just before the boneEnd. Do Syllable segmentation 579 * again to make syllables legal. This may look left for 2/3 nodes. Also 580 * new cursor position are counted. 581 * Then update the search lattice and search for a new result if needed. 582 * 583 * @param boneStart start bone iterator in the skeleton 584 * @param boneEnd ending bone iterator in the skeleton 585 * @param skel the new list of bones to be inserted. 586 * @param cursor the cursor bone in the skel, after this function, 587 * it contains the corresponding cursor bone iterator 588 * in the IC's Skeleton. 589 * @param cursorIdx the cursor's idx inside the cursor bone. Also contain 590 * new position index after this function. 591 * @param candiStart The candidate list's starting position. Also the leftmost 592 * when looking left to prevent re-segment insufficient. 593 * Return value are set to its new position, because it would 594 * change onto the new list. 595 * @param stickLeft When the cursor position after segment is located at 596 * boundary. This give how it should be, @value true for 597 * the tail of the left bone, @value false for the head 598 * of the right bone. 599 * @param doSearch should be always true, otherwise you have to research the 600 * whole skeleton after this call, because currently we do 601 * not provide an interface to return the righmost bone to 602 * remember the minimized the research demand cause by this. 603 * @return whether or not the original gotted candidates called 604 * by getCandidates() would be affected by this modification. 605 * @retval true Affected. getCandidates() should be call again. 606 * @retval false Not affected. 607 */ 608 bool 609 modifyAndReseg(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel, 610 CSkeletonIter& cursor, int& cursorIdx, CSkeletonIter& candiStart, 611 bool stickLeft=true, bool doSearch=true); 612 613 614 /** 615 * Cancel original selection that including the bone 616 * 617 * @param bone Selection is a candidate, which have a bone range. 618 * If any selection's range contains the bone, it 619 * would be canceled. 620 * @param update "true" to update the search lattice. Normally, 621 * when called by SessionView, update should be set. For 622 * internal call of the function, one could let the update 623 * parameter to false, and do lattice searching later. 624 * 625 * @return the Bone on the left of param bone, which is the start bone 626 * of a user selection, and the selection include param bone. 627 * if no such a bone found, just return param bone itself. 628 * 629 * The algorithm working like following: 630 * - Find the left most bone whose PinYin lexicon state could arrived here. 631 * This could be done just read the first element of BoneInnerData's 632 * m_LexiconStates. 633 * - From current bone to the left-most bone, try to find the first meet 634 * bone whose BestWord candidates is valid. 635 * -# if the BestWord's right bone is at the left of target bone: 636 * ====>do nothing 637 * -# if the BestWord's right bone is at the right of the target bone: 638 * ====> Invalidate the BestWord. And if update is set, 639 * ====> searchFrom the BestWord's right bone. 640 */ 641 CSkeletonIter 642 cancelSelection(CSkeletonIter bone, bool update=true); 643 644 /** 645 * Cancel original selection that including the bone, but do 646 * not count User Selection right starting at bone. 647 * 648 * @param bone Selection is a candidate, which have a bone range. 649 * If any selection's range contains the bone, it 650 * would be canceled. 651 * @param update "true" to update the search lattice. Normally, 652 * when called by SessionView, update should be set. For 653 * internal call of the function, one could let the update 654 * parameter to false, and do lattice searching later. 655 * 656 * @return the Bone on the left of param bone, which is the start bone 657 * of a user selection, and the selection include param bone. 658 * if no such a bone found, just return param bone itself. 659 */ 660 CSkeletonIter 661 cancelSelectionCover(CSkeletonIter bone, bool update=true); 662 663 664 /** 665 * Tell me that user make a selection for a specific candidate. 666 * The lattice will be updated (only the neccessary part). And 667 * best sentence are searched. 668 * 669 * @param candi the user selection. 670 * @return whether or not it will affect the previous gotted 671 * candidate list. 672 * 673 * Algorithm-Description: 674 * - Put the candi to the candi's leftBone's bestWord, and validate it. 675 * -# If Pure Left to Right model enabled, then: 676 * ====> rebuild the lattice from bestWord's left bone. 677 * -# If Pure Left to Right model disabled, then 678 * ====> rebuild the lattice from bestWord's right bone. 679 * . 680 * . 681 * 682 * the search function will deal with the human selection, the processing 683 * of selection focused on two different things: 684 * - the word-set finding 685 * - the word scoring 686 * . 687 * More specificlly, the search routine will, normally, finding candidates 688 * words bone after bone. At each bone: 689 * -# It first check all finishing lexicon states, ie., get all possible 690 * words that will ending right before this bone, using them to construct 691 * lattice states of this bone. For each word, there is a inner cost 692 * associated with them. (For future word class usage) This cost is 693 * positive double. But for user selection, the cost should will be 694 * set to a suitable negtive double to make the best path alway go 695 * throught this word. The value of the negetive is affected by: 696 * maximum word length, N-gram's N, mini-backoff weight, mini-common-pr. 697 * (Due to float plus operation accuracy lost, the value should not be 698 * set to a very negitive, although the less the better.) But when 699 * "Pure Left to Right" is enabled, this trick will not be used, on 700 * the other hand, the following step would make sure there is only 701 * one possible transfer/edge on the lattice. 702 * -# Then, from all possible lexicon states (plus a root state), let them 703 * transfer on this bone's PINYIN string. Put the result states after 704 * transfer on the next bone as Lexicon states. But when 705 * "Pure Left to Right" model enabled, and a user selection is 706 * meet at the starting bone, then: 707 * - clear all states inside the bestWord's range. 708 * - jump directly to the bestWord's right bone, with only one word, 709 * ie. the bestWord, transfer enabled in this range. 710 * - then search from the right bone 711 * . 712 */ 713 bool 714 makeSelection(const CCandidate& candi); 715 /*@}*/ 716 717 /*@{*/ 718 /** 719 * Get candidate list for position at the bone. 720 * 721 * @param bone: iterator pointed to a unit in the bone list. 722 * @param result: the candidate list. Each candidate has a starting 723 * bone, ending bone, and a corresponding string. Currently 724 * all starting bone is the "bone" parameter. 725 * 726 * Currently, it will only give candidates that are starting at 727 * the parameter bone. Normally, the lattice wouldn't save all words' 728 * transfer cost information. But some good transfer edge could give 729 * us infomation. To achieve high efficiency, in this function, no 730 * probabilities are re-get from language model. But, we use the 731 * following information to rank the candidates: 732 * -# If some words starting at bone is selected by the user, or 733 * it is a best edge of the best sentence, it will be listed as 734 * first. 735 * -# The longer the word, the better 736 * -# if some words' transfer cost could be got from the lattice, 737 * they are better than those from lexicon only. 738 * -# Use information stored in lexicon (where words could ranked 739 * by unigram). 740 */ 741 void 742 getCandidates(CSkeletonIter bone, CCandidates& result); 743 744 /** 745 * Get the best sentence corresponding from boneStart to boneEnd 746 * @param boneStart 747 * @param boneEnd 748 * @param result the result string 749 * @return: the number of words converted in the best sentence. ie. not 750 * count the non-pinyin node or invalid pinyin node 751 */ 752 int 753 getBestSentence(wstring& result, CSkeletonIter boneStart, CSkeletonIter boneEnd, bool original_format=false); 754 /*@}*/ 755 756 void 757 searchFrom(CSkeletonIter boneStart); 758 759 protected: 760 CSkeleton m_Skeleton; 761 CSkeletonIter m_EffectiveCandiBoneStart; 762 CSkeletonIter m_EffectiveCandiBoneEnd; 763 764 bool m_bNonCompleteSyllable; 765 bool m_bStrictLeft2Right; 766 bool m_bGBK; 767 bool m_bGB18030; 768 int m_HistoryPower; 769 bool m_ContextRanking; 770 771 CThreadSlm *m_pModel; 772 CPinyinTrie *m_pPinyinTrie; 773 774 CICHistory *m_pHistory; 775 776 777 private: 778 void 779 transferBetween(CSkeletonIter h, CSkeletonIter t, unsigned int id, double ic); 780 781 void 782 buildLatticeStates(CSkeletonIter bone); 783 784 CSkeletonIter 785 forwardOnePinyinBone(CSkeletonIter bone); 786 787 CSkeletonIter 788 forwardPuncBone(CSkeletonIter bone); 789 790 CSkeletonIter 791 forwardPinyinBone(CSkeletonIter bone); 792 793 CSkeletonIter 794 forwardNonPinyinBone(CSkeletonIter bone); 795 796 CSkeletonIter 797 forwardInvalidBone(CSkeletonIter bone); 798 799 CSkeletonIter 800 forwardTailBone(CSkeletonIter bone); 801 802 }; // of CIMIContext 803 804 #endif 805