1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include <config.h> 40 #endif 41 42 #include <algorithm> 43 #include "imi_context.h" 44 #include "lattice_states.h" 45 46 47 class TSkelCursor { 48 public: 49 struct TPos { 50 CSkeletonIter m_bone; 51 int m_idx; 52 TPos(CSkeletonIter bone=CSkeletonIter(), int idx=0) 53 : m_bone(bone), m_idx(idx) { } 54 }; 55 56 public: 57 TSkelCursor(CSkeletonIter h1, CSkeletonIter t1, CSkeletonIter h2, CSkeletonIter t2, bool asis=false) 58 : m_h1(h1), m_t1(t1), m_h2(h2), m_t2(t2), m_bone(h1), m_idx(0), m_iLink(0) 59 { if (!asis) ensureCursor(); } 60 61 inline bool 62 isPinyin() const 63 { return (m_bone != m_t2 && m_bone->isPinyinNode()); } 64 65 inline bool 66 isBreakAfter() const 67 { return (m_bone == m_t2 || m_bone == m_t1 || m_idx >= m_bone->m_String.size()-1); } 68 69 inline bool 70 isBreakAfter(TPos & pos) const 71 { return (pos.m_bone == m_t2 || pos.m_bone == m_t1 || pos.m_idx >= pos.m_bone->m_String.size()-1); } 72 73 inline bool 74 isUserBreakAfter() const 75 { return (m_bone == m_t2 || m_bone == m_t1 || 76 (m_idx == m_bone->m_String.size()-1 && m_bone->isPinyinNode() && 77 m_bone->m_BoundaryType == CBone::USER_BOUNDARY)); } 78 79 inline bool 80 isUserBreakAfter(TPos & pos) const 81 { return (pos.m_bone == m_t2 || pos.m_bone == m_t1 || 82 (pos.m_idx == pos.m_bone->m_String.size()-1 && pos.m_bone->isPinyinNode() && 83 pos.m_bone->m_BoundaryType == CBone::USER_BOUNDARY)); } 84 85 inline TWCHAR 86 getChar() 87 { 88 ensureCursor(); 89 return (m_bone != m_t2)?(m_bone->m_String[m_idx]):0; 90 } 91 92 inline TWCHAR 93 getChar(TPos& pos) const // the pos should be ensured 94 { return (pos.m_bone != m_t2)?(pos.m_bone->m_String[pos.m_idx]):0; } 95 96 void 97 next(bool asis=false) 98 { 99 ensureCursor(); 100 if (m_bone != m_t2) { 101 ++m_idx; 102 if (!asis) ensureCursor(); 103 } 104 } 105 106 void 107 nextBone() 108 { 109 ensureCursor(); 110 if (m_bone != m_t2) { 111 ++m_bone; 112 m_idx=0; 113 } 114 ensureCursor(); 115 } 116 117 inline bool 118 hasNext() 119 { 120 ensureCursor(); 121 return m_bone != m_t2; 122 } 123 124 inline bool 125 atFirstLink() const 126 { return m_iLink == 0; } 127 128 inline TPos 129 getPosition() const 130 { 131 return TPos(m_bone, m_idx); 132 } 133 134 /** The parameters must be retrieved from save object before */ 135 inline void 136 setPosition(const TPos& pos) 137 { m_bone = pos.m_bone; m_idx = pos.m_idx; } 138 139 bool 140 ensureCursor(TPos& curCompare); 141 142 protected: 143 CSkeletonIter m_h1, m_h2, m_t1, m_t2; 144 CSkeletonIter m_bone; 145 int m_iLink, m_idx; 146 147 protected: 148 void 149 ensureCursor(); 150 }; 151 152 void 153 TSkelCursor::ensureCursor() 154 { 155 while (m_bone != m_t1 && m_bone != m_t2 && m_idx >= m_bone->m_String.size()){ 156 m_idx = 0; 157 ++m_bone; 158 } 159 if (m_bone == m_t1) { 160 ++m_iLink; 161 m_bone = m_h2; 162 m_idx = 0; 163 while (m_bone != m_t2 && m_idx >= m_bone->m_String.size()){ 164 m_idx = 0; 165 ++m_bone; 166 } 167 } 168 } 169 170 bool 171 TSkelCursor::ensureCursor(TPos& curCompare) 172 { 173 bool same = false; 174 same = (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx); 175 while (m_bone != m_t1 && m_bone != m_t2 && m_idx >= m_bone->m_String.size()){ 176 m_idx = 0; 177 ++m_bone; 178 same = same || (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx); 179 } 180 if (m_bone == m_t1) { 181 ++m_iLink; 182 m_bone = m_h2; 183 m_idx = 0; 184 same = same || (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx); 185 while (m_bone != m_t2 && m_idx >= m_bone->m_String.size()){ 186 m_idx = 0; 187 ++m_bone; 188 same = same || (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx); 189 } 190 } 191 return same; 192 } 193 194 /** 195 * Determine whether or not the target iterator's position on list of head 196 * is located before the iterator first. More precisely, it return whether 197 * or not target in [head, first) 198 * @param target is the target iterator whose position to be decided 199 * @param head is the head iterator of the container (list or vector) 200 * @param first iterator to be compared with target 201 * @return whether or not the target iterator's position on list of head 202 * is located before the iterator first 203 */ 204 template<class forwardIt> 205 bool 206 isLocatedBefore(forwardIt target, forwardIt head, forwardIt first) 207 { 208 for (; head != first; ++head) { 209 if (target == head) return true; 210 } 211 return false; 212 } 213 214 CIMIContext::CIMIContext() 215 : m_bNonCompleteSyllable(false), m_bStrictLeft2Right(false), 216 m_bGBK(true), m_bGB18030(false), m_HistoryPower(3), m_ContextRanking(true), 217 m_pModel(NULL), m_pPinyinTrie(NULL), m_Skeleton(), 218 m_EffectiveCandiBoneStart(), m_EffectiveCandiBoneEnd() 219 { 220 } 221 222 void 223 CIMIContext::setCoreData(CIMIData *pCoreData) 224 { 225 m_pModel = pCoreData->getSlm(); 226 m_pPinyinTrie = pCoreData->getPinyinTrie(); 227 } 228 229 void 230 CIMIContext::clear() 231 { 232 m_Skeleton.clear(); 233 m_Skeleton.push_back(CBone()); 234 m_Skeleton.push_back(CBone()); 235 m_EffectiveCandiBoneStart = m_EffectiveCandiBoneEnd = getLastBone(); 236 237 // allocate bone's inner data when it is inserted into the skeleton 238 CSkeletonIter itEnd = m_Skeleton.end(); 239 for (CSkeletonIter bone = m_Skeleton.begin(); bone != itEnd; ++bone) { 240 if (bone->m_pInnerData == NULL) 241 bone->m_pInnerData = new CBoneInnerData(); 242 } 243 244 searchFrom(m_Skeleton.begin()); 245 } 246 247 static bool 248 isYuanYinChar(TWCHAR wc) 249 { 250 return (wc == L'a' || wc == L'o' || wc == L'e' || 251 wc == L'i' || wc == L'u' || wc == L'v'); 252 } 253 254 CSkeletonIter 255 CIMIContext::cancelSelection(CSkeletonIter bone, bool update) 256 { 257 bool found = false; 258 CSkeletonIter it = bone; 259 for (CSkeletonIter first=m_Skeleton.begin(); it->m_BoneType == CBone::NODE_PINYIN; --it) { 260 // BestWrod is conjunctive, so no need to check position if user selection 261 // like isLocatedBefore(bone, it, it->m_pInnerData->m_BestWord.m_BoneEnd)) { 262 if (it->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord) { 263 it->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere; 264 found = true; 265 break; 266 } else if (it->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) { 267 break; 268 } 269 if (it == first) 270 break; 271 } 272 if (found && update) 273 searchFrom(it); 274 return (found)?(it):(bone); 275 } 276 277 CSkeletonIter 278 CIMIContext::cancelSelectionCover(CSkeletonIter bone, bool update) 279 { 280 bool found = false; 281 if (bone->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) { 282 return bone; 283 } 284 CSkeletonIter it = bone; 285 for (CSkeletonIter first=m_Skeleton.begin(); it != first; ) { 286 --it; 287 if (it->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) { 288 // BestWord is conjunctive, so no need to check position if user selection 289 // like isLocatedBefore(bone, it, it->m_pInnerData->m_BestWord.m_BoneEnd)) { 290 if (it->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord) { 291 it->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere; 292 found = true; 293 } 294 break; 295 } 296 } 297 if (found && update) 298 searchFrom(it); 299 return (found)?(it):(bone); 300 } 301 302 bool 303 CIMIContext::makeSelection(const CCandidate& candi) 304 { 305 CSkeletonIter boneLeft = cancelSelection(candi.m_BoneStart, false); 306 307 /* 308 candi.m_BoneStart->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere; 309 310 for (CSkeletonIter bone = candi.m_BoneStart; bone != candi.m_BoneEnd; ++bone) 311 bone->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere; 312 */ 313 314 candi.m_BoneStart->m_pInnerData->m_BestWord = candi; 315 candi.m_BoneStart->m_pInnerData->m_BWType = CBoneInnerData::UserSelectedBestWord; 316 searchFrom(boneLeft); 317 318 return true; 319 } 320 321 /** 322 * it is illegal if boneStart == boneEnd and skel.size() == 0 323 */ 324 bool 325 CIMIContext::modify(CSkeletonIter boneStart, 326 CSkeletonIter boneEnd, 327 CSkeleton& skel, 328 bool doSearch, 329 CSkeletonIter* pItLeftmost) 330 { 331 // No change needed, happen on OneLineView, call out a PINYIN i 332 // but return it back withou modification. 333 // FIXME, maybe this should be put back to OnreLineView's code, not here 334 if ((skel.size() == 1) && (boneEnd == ++CSkeletonIter(boneStart)) && 335 (skel.begin()->m_BoneType == boneStart->m_BoneType) && 336 (skel.begin()->m_String == boneStart->m_String)) { 337 if (pItLeftmost) *pItLeftmost = getLastBone(); 338 boneStart->m_BoundaryType = skel.begin()->m_BoundaryType; 339 return false; 340 } 341 342 // check whether or not the modification would affect the candidates 343 // retrieved by the previous getCandidates() call 344 CSkeletonIter first = boneStart; 345 if (first->m_pInnerData->m_LexiconStates.size() > 0) 346 first = first->m_pInnerData->m_LexiconStates[0].m_BoneStart; 347 bool affectCandidates = 348 !isLocatedBefore(m_EffectiveCandiBoneEnd, m_Skeleton.begin(), first); 349 350 // We must check the user selection which may cover this node 351 // if there is such a user selection, we should do search from there 352 // starting bone of such a selection. 353 // The check should only be done when boneStart to be removed 354 CSkeletonIter lefter = cancelSelectionCover(boneStart, false); 355 bool bSearchLefter = (lefter != boneStart); 356 357 // Another case is that previous UserSelection just ending at boneStart, 358 // which is the first bone to be deleted. In this case, new search will 359 // start with the newly inserted bone, and when searching, the User 360 // selection word will be check backward and got a wrong range. So 361 // we must make it change the UserSelection's ending Bone to the first 362 // bone after insertion. 363 bool bLeftUS = false; 364 CSkeletonIter leftUserBone = lefter; 365 if (skel.size() > 0 && !bSearchLefter && leftUserBone != m_Skeleton.begin()) { 366 do { 367 --leftUserBone; 368 int bwType = leftUserBone->m_pInnerData->m_BWType; 369 if (bwType != CBoneInnerData::NoBestWordStartHere) { 370 bLeftUS = (bwType ==CBoneInnerData::UserSelectedBestWord && 371 leftUserBone->m_pInnerData->m_BestWord.m_BoneEnd == boneStart); 372 break; 373 } 374 } while (leftUserBone != m_Skeleton.begin()); 375 } 376 377 CBoneInnerData *pid = NULL; 378 // remove the old range 379 if (boneStart != boneEnd) { 380 // before remove the bone, get the first's bone's innerData 381 // reserve it for the first bone to be inserted. (ie. just 382 // attach it to the first bone after deletion/insertion 383 pid = boneStart->m_pInnerData; 384 boneStart->m_pInnerData = NULL; 385 386 m_Skeleton.erase(boneStart, boneEnd); 387 } 388 389 // insert new list before boneEnd 390 first = boneEnd; 391 CSkeleton::iterator it1 = skel.begin(), h = skel.begin(); 392 CSkeleton::iterator it2 = skel.end(); 393 for (; it1 != it2; ++it1) { 394 CSkeletonIter tmp = m_Skeleton.insert(boneEnd, *it1); 395 if (it1 == h) 396 first = tmp; 397 else 398 tmp->m_pInnerData = new CBoneInnerData(); 399 } 400 401 if (first->m_pInnerData != NULL) { 402 // nothing inserted, must deleted something, ie pid != NULL 403 pid->m_BWType = first->m_pInnerData->m_BWType; 404 pid->m_BestWord = first->m_pInnerData->m_BestWord; 405 delete first->m_pInnerData; 406 first->m_pInnerData = pid; 407 } else if (pid == NULL) { 408 // nothing deleted, just inserting something 409 first->m_pInnerData = boneEnd->m_pInnerData; 410 boneEnd->m_pInnerData = new CBoneInnerData(); 411 boneEnd->m_pInnerData->m_BWType = first->m_pInnerData->m_BWType; 412 boneEnd->m_pInnerData->m_BestWord = first->m_pInnerData->m_BestWord; 413 first->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere; 414 } else { 415 //something deleted, something inserted 416 first->m_pInnerData = pid; 417 pid->m_BWType = CBoneInnerData::NoBestWordStartHere; 418 } 419 420 // change the left user selection bone's best word's ending bone to first 421 if (bLeftUS) 422 leftUserBone->m_pInnerData->m_BestWord.m_BoneEnd = first; 423 424 // rebuild the search lattice from the newly inserted list 425 // using the just copied lattice states (innerData) 426 if (pItLeftmost) 427 *pItLeftmost = (bSearchLefter)?(lefter):(first); 428 if (doSearch) 429 searchFrom((bSearchLefter)?(lefter):(first)); 430 431 return affectCandidates; 432 } 433 434 bool 435 CIMIContext::isValidSyllable(const TWCHAR* pstr) 436 { 437 const CPinyinTrie::TNode* pyn = m_pPinyinTrie->transfer(pstr); 438 return m_pPinyinTrie->isValid(pyn, m_bNonCompleteSyllable, m_bGBK); 439 } 440 441 bool 442 CIMIContext::segPinyinSimplest(const wstring& pinyin, CSkeleton& result) 443 { 444 #ifdef DEBUG 445 printf("SegPinyin:"); 446 print_wide(pinyin.c_str()); 447 printf("-->"); 448 #endif 449 450 //"zhuang" is longest syllable, 16 is enought 451 bool validSyllable[16]; 452 const CPinyinTrie::TNode* pathNodes[16]; 453 const TWCHAR* str = pinyin.c_str(); 454 const CPinyinTrie::TNode* pyn = m_pPinyinTrie->getRootNode(); 455 456 result.clear(); 457 458 //Find out the longest valid PINYIN prefix, save to lastValid 459 int idx, lastValid = - 1; 460 for (idx = 0; str[idx] != 0; ++idx) { 461 pyn = m_pPinyinTrie->transfer(pyn, (unsigned char)(str[idx])); 462 pathNodes[idx] = pyn; 463 if (validSyllable[idx] = m_pPinyinTrie->isValid(pyn, m_bNonCompleteSyllable, m_bGBK)) 464 lastValid = idx; 465 if (pyn == NULL) 466 break; 467 } 468 469 /********************************************************************* 470 Note, when NULL pyn arrived, the char should also be the last one. 471 Try to split it into two nodes if possible: 472 (1) [0..idx-2], [0..idx-1] is both complete syllable 473 [idx-1] is FuYin, [idx] is Yuanyin, 474 [idx-1...] is non-complete or complete (not NULL) 475 ====> split into [0..idx-2] [idx-1, idx] 476 (2) lastValid >= 0 477 ====> split into [0..lastValid] [lastValid+1..] 478 if [lastValid+1...] is not valid, return false 479 (3) lastValid = -1 480 ====> give a invalid PINYIN bone [0..] 481 **********************************************************************/ 482 if (pyn == NULL && idx >= 2 && 483 pathNodes[idx-1]->m_bFullSyllableTransfer == 1 && 484 pathNodes[idx-2]->m_bFullSyllableTransfer == 1 && 485 !isYuanYinChar(str[idx-1]) && isYuanYinChar(str[idx]) && 486 (pathNodes[idx] = m_pPinyinTrie->transfer(str+idx-1)) != NULL) { 487 488 result.push_back(CBone(str, idx-1, CBone::AUTO_BOUNDARY, CBone::NODE_PINYIN)); 489 490 #ifdef DEBUG 491 print_wide(wstring(str, idx-1).c_str()); 492 printf("'"); 493 #endif 494 495 int bt = CBone::NODE_INCOMPLETE_PINYIN; 496 if (pathNodes[idx]->m_bFullSyllableTransfer == 1) 497 bt = CBone::NODE_PINYIN; 498 499 result.push_back( CBone(str+idx-1, CBone::AUTO_BOUNDARY, bt) ); 500 501 #ifdef DEBUG 502 print_wide(str+idx-1); 503 fflush(stdout); 504 #endif 505 506 return true; 507 } 508 509 if (pyn == NULL && lastValid >= 0) { 510 result.push_back(CBone(str, lastValid+1, CBone::AUTO_BOUNDARY, CBone::NODE_PINYIN)); 511 512 #ifdef DEBUG 513 print_wide(wstring(str, lastValid+1).c_str()); 514 printf("'"); 515 #endif 516 517 int bt = CBone::NODE_INCOMPLETE_PINYIN; 518 519 pathNodes[idx] = m_pPinyinTrie->transfer(str+lastValid+1); 520 if (pathNodes[idx] == NULL) 521 bt = CBone::NODE_INVALID_PINYIN; 522 else if (m_pPinyinTrie->isValid(pathNodes[idx], m_bNonCompleteSyllable, m_bGBK)) 523 bt = CBone::NODE_PINYIN; 524 else 525 bt = CBone::NODE_INCOMPLETE_PINYIN; 526 result.push_back(CBone(str+lastValid+1, CBone::AUTO_BOUNDARY, bt)); 527 528 #ifdef DEBUG 529 print_wide(str+lastValid+1); 530 if (bt == CBone::NODE_INVALID_PINYIN) 531 printf("(X)"); 532 fflush(stdout); 533 #endif 534 535 return (bt != CBone::NODE_INVALID_PINYIN); 536 } 537 538 if (pyn == NULL) { 539 result.push_back(CBone(str, CBone::AUTO_BOUNDARY, CBone::NODE_INVALID_PINYIN)); 540 541 #ifdef DEBUG 542 print_wide(str); 543 printf("(X)"); 544 fflush(stdout); 545 #endif 546 547 return false; 548 } 549 550 /******************************************************************** 551 Now, pyn is not NULL, str[idx] should be 0, 552 [0..idx-1] is valid (non-complete or complete) 553 *********************************************************************/ 554 int bt = (validSyllable[idx-1])?(CBone::NODE_PINYIN):(CBone::NODE_INCOMPLETE_PINYIN); 555 result.push_back(CBone(str, CBone::AUTO_BOUNDARY, bt)); 556 557 #ifdef DEBUG 558 print_wide(str); 559 fflush(stdout); 560 #endif 561 562 return true; 563 } 564 565 TCandiRank::TCandiRank(bool user, bool best, unsigned int len, 566 bool fromLattice, TSentenceScore score) 567 { 568 anony.m_user = (user)?0:1; 569 anony.m_best = (best)?0:1; 570 anony.m_len = (len > 31)?(0):(31-len); 571 anony.m_lattice = (fromLattice)?0:1; 572 573 #ifdef DEBUG 574 //assert(fromLattice); 575 //assert(TSentenceScore(+0.0) < score); 576 #endif 577 578 double ds = -score.log2(); 579 580 //make it 24-bit 581 if (ds > 32767.0) 582 ds = 32767.0; 583 else if (ds < -32768.0) 584 ds = -32768.0; 585 unsigned cost = unsigned((ds+32768.0)*256.0); 586 anony.m_cost = cost; 587 } 588 589 TCandiRank::TCandiRank(bool user, bool best, unsigned int len, 590 bool fromLattice, unsigned rank) 591 { 592 anony.m_user = (user)?0:1; 593 anony.m_best = (best)?0:1; 594 anony.m_len = (len > 31)?(0):(31-len); 595 anony.m_lattice = (fromLattice)?0:1; 596 anony.m_cost = rank; 597 } 598 599 struct TCandiPair { 600 CCandidate m_Candi; 601 TCandiRank m_Rank; 602 603 TCandiPair() : m_Candi(), m_Rank() { } 604 }; 605 606 struct TCandiPairPtr { 607 TCandiPair* m_Ptr; 608 609 TCandiPairPtr(TCandiPair* p=NULL) : m_Ptr(p) 610 { } 611 612 bool 613 operator< (const TCandiPairPtr& b) const 614 { return m_Ptr->m_Rank < b.m_Ptr->m_Rank; } 615 }; 616 617 // FIXME, this procedure could be modified largely. 618 void 619 CIMIContext::getCandidates(CSkeletonIter bone, CCandidates& result) 620 { 621 TCandiPair cp; 622 static std::map<unsigned int, TCandiPair> map; 623 std::map<unsigned int, TCandiPair>::iterator it_map; 624 625 map.clear(); 626 result.clear(); 627 m_EffectiveCandiBoneStart = m_EffectiveCandiBoneEnd = bone; 628 629 if (bone->isTailNode()) 630 return; 631 if (!bone->isValidPinyinNode()) { 632 result.push_back(CCandidate(bone->m_String.c_str(), bone, ++CSkeletonIter(bone))); 633 return; 634 } 635 636 // if user selection or best word starting at bone 637 if (bone->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) { 638 cp.m_Candi = bone->m_pInnerData->m_BestWord; 639 cp.m_Rank = 640 TCandiRank(bone->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord, 641 bone->m_pInnerData->m_BWType == CBoneInnerData::BestWordStartHere, 642 0, false, 0); 643 map[cp.m_Candi.m_WordId] = cp; 644 } 645 646 //collecting all candidates, from both lattice and lexicon 647 int len = 1; 648 cp.m_Candi.m_BoneStart = bone; 649 CSkeletonIter b = ++CSkeletonIter(bone); 650 while (b != (--m_Skeleton.end())) { 651 cp.m_Candi.m_BoneEnd = b; 652 653 bool found = false; 654 CLexiconStates::iterator itlex = b->m_pInnerData->m_LexiconStates.begin(); 655 CLexiconStates::iterator itlexe = b->m_pInnerData->m_LexiconStates.end(); 656 for (; itlex != itlexe; ++itlex) { 657 if (itlex->m_BoneStart == bone) { 658 found = true; 659 if (itlex->m_bPinyin) { 660 if (itlex->m_pPYNode && itlex->m_pPYNode->m_nWordId > 0) { 661 unsigned sz = itlex->m_pPYNode->m_nWordId; 662 const CPinyinTrie::TWordIdInfo* p = itlex->m_pPYNode->getWordIdPtr(); 663 for (unsigned int i = 0; i < sz; ++i, ++p) { 664 if (m_bGBK || p->m_bGBK == 0) { 665 cp.m_Candi.m_WordId = p->m_id; 666 cp.m_Candi.m_String = (*m_pPinyinTrie)[cp.m_Candi.m_WordId]; 667 668 //sorting according to the order in PinYinTire 669 cp.m_Rank = TCandiRank(false, false, len, false, i); 670 it_map = map.find(cp.m_Candi.m_WordId); 671 if (it_map == map.end() || cp.m_Rank < it_map->second.m_Rank) 672 map[cp.m_Candi.m_WordId] = cp; 673 } 674 } 675 } 676 } else { 677 cp.m_Candi.m_WordId = itlex->m_WordId; 678 cp.m_Candi.m_String = bone->m_String.c_str(); 679 cp.m_Rank = TCandiRank(false, false, len, false, 0); 680 it_map = map.find(cp.m_Candi.m_WordId); 681 if (it_map == map.end() || cp.m_Rank < it_map->second.m_Rank) 682 map[cp.m_Candi.m_WordId] = cp; 683 } 684 } 685 } 686 687 if (!found) break; 688 689 CLatticeStates::iterator its = b->m_pInnerData->m_LatticeNodes.begin(); 690 CLatticeStates::iterator ite = b->m_pInnerData->m_LatticeNodes.end(); 691 for (; its != ite; ++its) { 692 if (its->m_pBackTraceNode && its->m_pBackTraceNode->m_BoneAfter == bone) { 693 cp.m_Candi.m_WordId = its->m_BackTraceWordId; 694 cp.m_Candi.m_String = (*m_pPinyinTrie)[cp.m_Candi.m_WordId]; 695 if (cp.m_Candi.m_String == NULL) 696 cp.m_Candi.m_String = bone->m_String.c_str(); 697 #ifdef _USE_RAW_PROBABILITY 698 699 #ifdef DEBUG 700 //assert(its->m_pBackTraceNode->m_Score < 0.0 && its->m_Score < 0.0); 701 #endif 702 703 cp.m_Rank = TCandiRank(false, false, len, true, its->m_Score / its->m_pBackTraceNode->m_Score); 704 #else 705 cp.m_Rank = TCandiRank(false, false, len, true, its->m_Score - its->m_pBackTraceNode->m_Score); 706 #endif 707 it_map = map.find(cp.m_Candi.m_WordId); 708 if (it_map == map.end() || cp.m_Rank < it_map->second.m_Rank) 709 map[cp.m_Candi.m_WordId] = cp; 710 } 711 } 712 713 m_EffectiveCandiBoneEnd = b; 714 ++b; 715 ++len; 716 } 717 718 std::vector<TCandiPairPtr> vec; 719 720 vec.reserve(map.size()); 721 std::map<unsigned int, TCandiPair>::iterator it_mapE = map.end(); 722 for (it_map = map.begin(); it_map != it_mapE; ++it_map) 723 vec.push_back(TCandiPairPtr(&(it_map->second))); 724 std::make_heap(vec.begin(), vec.end()); 725 std::sort_heap(vec.begin(), vec.end()); 726 727 for (int i=0, sz=vec.size(); i < sz; ++i) 728 result.push_back(vec[i].m_Ptr->m_Candi); 729 } 730 731 int 732 CIMIContext::getBestSentence(wstring & result, CSkeletonIter boneStart, 733 CSkeletonIter boneEnd, bool original_format) 734 { 735 int nWordConverted = 0; 736 result.clear(); 737 738 // no need to check begin(), because firstBone must at least has some 739 // auto best word or user selection best word starting from, this rule 740 // must be followed in this call 741 int len, prefix = 0; 742 CSkeletonIter realStart = boneStart; 743 while (realStart->m_pInnerData->m_BWType == CBoneInnerData::NoBestWordStartHere) { 744 ++prefix; 745 --realStart; 746 } 747 748 while (true) { 749 #ifdef DEBUG 750 //assert(realStart->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere); 751 #endif 752 753 CSkeletonIter bone = boneStart; 754 CSkeletonIter rightBone = realStart->m_pInnerData->m_BestWord.m_BoneEnd; 755 if (realStart->m_BoneType != CBone::NODE_PINYIN && 756 realStart->m_BoneType != CBone::NODE_INCOMPLETE_PINYIN) { 757 for (; bone != rightBone && bone != boneEnd; ++bone) { 758 if (!original_format) 759 result.push_back(bone->m_String[0]); 760 else 761 result.push_back((unsigned)bone->m_BoundaryType); 762 } 763 } else { 764 ++nWordConverted; 765 // get the length from boneStart to current best word tail or end of range 766 for (len=0; bone != rightBone && bone != boneEnd; ++bone) 767 ++len; 768 result.append(realStart->m_pInnerData->m_BestWord.m_String+prefix, len); 769 } 770 if (bone == boneEnd) 771 break; 772 boneStart = realStart = bone; 773 prefix = 0; 774 } 775 776 return nWordConverted; 777 } 778 779 /** 780 * Search from the bone to the tail. the bone can not beyond first psuedo tail. 781 * Before search, all BoneInnerData should be set. The states of the bones 782 * who's ahead of the bone would not be affected by this function. Yet, states 783 * of the bones beyond this bone will be updated or refreshed. 784 * 785 * After lattice search, only one best path are backtraced and each best word 786 * will be attached to corresponding bone. 787 */ 788 void 789 CIMIContext::searchFrom(CSkeletonIter boneStart) 790 { 791 // iterate every bone from boneStart to the second psuedo tail 792 CSkeletonIter itEnd = ++getLastBone(); 793 CSkeletonIter bone = boneStart; 794 CSkeletonIter boneFirst = m_Skeleton.begin(); 795 for (; bone != itEnd; ) { 796 if (bone == boneFirst) { 797 // do not clear USER_SELECTION_BEST_WORD !! 798 bone->m_pInnerData->m_LexiconStates.clear(); 799 bone->m_pInnerData->m_LatticeNodes.clear(); 800 #ifdef _USE_RAW_PROBABILITY 801 bone->m_pInnerData->m_LatticeNodes.push_back(TLatticeState(-1.0, bone)); 802 #else 803 bone->m_pInnerData->m_LatticeNodes.push_back(TLatticeState(0.0, bone)); 804 #endif 805 } else { 806 buildLatticeStates(bone); 807 } 808 switch (bone->m_BoneType) { 809 case CBone::NODE_TAIL: 810 bone = forwardTailBone(bone); 811 break; 812 case CBone::NODE_PINYIN: 813 bone = forwardPinyinBone(bone); 814 break; 815 case CBone::NODE_INCOMPLETE_PINYIN: 816 case CBone::NODE_INVALID_PINYIN: 817 bone = forwardInvalidBone(bone); 818 break; 819 case CBone::NODE_PUNC: 820 bone = forwardPuncBone(bone); 821 break; 822 case CBone::NODE_ASCII: 823 case CBone::NODE_SIMBOL: 824 case CBone::NODE_DIGITAL: 825 bone = forwardNonPinyinBone(bone); 826 break; 827 }; 828 } 829 830 //Build the last bone's lattice states 831 buildLatticeStates(itEnd); 832 833 #ifdef DEBUG 834 //assert(itEnd->m_pInnerData->m_LatticeNodes.size() == 1); 835 #endif 836 837 // clear all non-user selection 838 for (bone=boneFirst; bone != itEnd; ++bone) { 839 if (bone->m_pInnerData->m_BWType != CBoneInnerData::UserSelectedBestWord) 840 bone->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere; 841 } 842 843 // back tracing, find the best path 844 TLatticeState* bs = &(*(itEnd->m_pInnerData->m_LatticeNodes.begin())); 845 while (bs->m_BoneAfter != boneFirst) { 846 TLatticeState* fs = bs->m_pBackTraceNode; 847 CSkeletonIter fb = fs->m_BoneAfter; 848 if (fb->m_pInnerData->m_BWType != CBoneInnerData::UserSelectedBestWord) { 849 fb->m_pInnerData->m_BWType = CBoneInnerData::BestWordStartHere; 850 } 851 fb->m_pInnerData->m_BestWord.m_BoneStart = fb; 852 fb->m_pInnerData->m_BestWord.m_BoneEnd = bs->m_BoneAfter; 853 fb->m_pInnerData->m_BestWord.m_WordId = bs->m_BackTraceWordId; 854 fb->m_pInnerData->m_BestWord.m_String = (*m_pPinyinTrie)[bs->m_BackTraceWordId]; 855 if (fb->m_pInnerData->m_BestWord.m_String == NULL) 856 fb->m_pInnerData->m_BestWord.m_String = fb->m_String.c_str(); 857 bs = fs; 858 } 859 } 860 861 #ifdef DEBUG 862 static double min_ts = 1.0; 863 #endif 864 865 static double s_history_distribution[11] = { 866 0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50 867 }; 868 869 void 870 CIMIContext::transferBetween(CSkeletonIter h, CSkeletonIter t, unsigned int id, double ic) 871 { 872 CLatticeStates& latss1 = h->m_pInnerData->m_LatticeNodes; 873 CLatticeStates& latss2 = t->m_pInnerData->m_LatticeNodes; 874 CLatticeStates::iterator it1 = latss1.begin(); 875 CLatticeStates::iterator ite = latss1.end(); 876 877 #ifdef _USE_RAW_PROBABILITY 878 TLatticeState node(-1.0, t); 879 TSentenceScore efic(1.0); 880 #else 881 TLatticeState node(0.0, t); 882 TSentenceScore efic(0.0); 883 #endif 884 885 if (h->m_pInnerData->m_BestWord.m_WordId == id && 886 h->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord) { 887 #ifdef _USE_RAW_PROBABILITY 888 efic = efic * TSentenceScore(30000, 1.0); 889 #else 890 efic = ic - 30000.0; 891 #endif 892 } 893 894 double weight_h = s_history_distribution[m_HistoryPower]; 895 double weight_s = 1.0 - weight_h; 896 897 for (; it1 != ite; ++it1) { 898 node.m_pBackTraceNode = &(*it1); 899 node.m_BackTraceWordId = id; 900 #ifdef _USE_RAW_PROBABILITY 901 // the fact is that we could only use bigram cache 902 // and all first level node in the language model are non-empty 903 double ts = m_pModel->transfer(it1->m_State, id, node.m_State); 904 m_pModel->historify(node.m_State); 905 // we do not want to shrink the history state if it could be found in cache 906 if (node.m_State.getLevel() == 0 && m_pHistory->seenBefore(id)) { 907 node.m_State.setIdx(id); // an psuedo unigram node state 908 } 909 910 #ifdef DEBUG 911 assert(it1->m_Score < TSentenceScore(0.0)); 912 #endif 913 914 double cost = ts; 915 if (m_pHistory) { 916 unsigned history[2] = {m_pModel->lastWordId(it1->m_State), id}; 917 double hpr = m_pHistory->pr(history, history+2); 918 cost = weight_s * ts + weight_h*hpr; 919 } 920 node.m_Score = it1->m_Score * efic * TSentenceScore(cost); 921 922 #ifdef DEBUG 923 if (!(node.m_Score < TSentenceScore(-0.0))) { 924 static char strangeValue[256]; 925 926 node.m_Score.toString(strangeValue); 927 printf("\n***node.m_Score invalid %s ***\n", strangeValue); 928 929 it1->m_Score.toString(strangeValue); 930 printf("***it1->m_Score is %s ***\n", strangeValue); 931 932 efic.toString(strangeValue); 933 printf("***efic=%s, ic=(%lf)***\n", strangeValue, ic); 934 935 TSentenceScore(cost).toString(strangeValue); 936 printf("***cost=%s(%lf), ts=(%16lf)***\n", strangeValue, cost, ts); 937 938 fflush(stdout); 939 assert(false); 940 } 941 #endif 942 943 #else 944 double ts = m_pModel->transferNegLog(it1->m_State, id, node.m_State); 945 m_pModel->historify(node.m_State); 946 node.m_Score = it1->m_Score + ts + ic; 947 #endif 948 949 latss2.push_back(node); 950 } 951 } 952 953 void 954 CIMIContext::buildLatticeStates(CSkeletonIter bone) 955 { 956 bool bSingleSyllable, bSingleShort; 957 unsigned i, sz; 958 CSkeletonIter bonePrev = bone; 959 960 961 --bonePrev; 962 CBoneInnerData & innerData = *(bone->m_pInnerData); 963 CLexiconStates::iterator itLexState = innerData.m_LexiconStates.begin(); 964 CLexiconStates::iterator itLexStateE = innerData.m_LexiconStates.end(); 965 innerData.m_LatticeNodes.clear(); 966 for (; itLexState != itLexStateE; ++itLexState) { 967 CLexiconState& ls = *itLexState; 968 969 // the user selected word may be cut in first pruning process below, 970 // So, just let it go first, when it ends here 971 CBoneInnerData* pbid = ls.m_BoneStart->m_pInnerData; 972 if (pbid->m_BWType == CBoneInnerData::UserSelectedBestWord && 973 pbid->m_BestWord.m_BoneEnd == bone) { 974 #ifdef _USE_RAW_PROBABILITY 975 transferBetween(ls.m_BoneStart, bone, pbid->m_BestWord.m_WordId, 1.0); 976 #else 977 transferBetween(ls.m_BoneStart, bone, pbid->m_BestWord.m_WordId, 0.0); 978 #endif 979 } 980 981 if (!ls.m_bPinyin) { 982 #ifdef _USE_RAW_PROBABILITY 983 transferBetween(ls.m_BoneStart, bone, ls.m_WordId, 1.0); 984 #else 985 transferBetween(ls.m_BoneStart, bone, ls.m_WordId, 0.0); 986 #endif 987 } else { 988 // Cutting words with little unigram possibilities 989 // at least 2, at most 32 of the words would be tried 990 // if unseed word starting from some position in the first 32 991 // candidates(ranked according to unigram pr in lexicon), do not 992 // let them be checked. 993 bSingleShort = bSingleSyllable = (ls.m_BoneStart == bonePrev); 994 if (bSingleSyllable) { 995 register unsigned char uc = ls.m_BoneStart->m_String[0]; 996 bSingleShort = ((ls.m_BoneStart->m_String.size() == 1 && (uc != 'a' && uc != 'o' && uc !='e')) || 997 (ls.m_BoneStart->m_String.size() == 2 && (ls.m_BoneStart->m_String[1] == 'h'))); 998 } 999 //bSingleShort = (bSingleSyllable && !(m_pPinyinTrie->isValid(ls.m_pPYNode, false))); 1000 1001 const CPinyinTrie::TNode* pn = ls.m_pPYNode; 1002 const CPinyinTrie::TWordIdInfo* pwidinfo = pn->getWordIdPtr(); 1003 sz=pn->m_nWordId; 1004 if (bSingleShort) 1005 sz = 12; 1006 else if (sz > 26) 1007 sz = 26; 1008 1009 int count = 0; 1010 for (i=0; count < sz && i < sz && (pwidinfo[i].m_bSeen == 1 || count < 2); ++i) { 1011 if (m_bGBK || pwidinfo[i].m_bGBK == 0) { 1012 #ifdef _USE_RAW_PROBABILITY 1013 transferBetween(ls.m_BoneStart, bone, pwidinfo[i].m_id, 1.0); 1014 #else 1015 transferBetween(ls.m_BoneStart, bone, pwidinfo[i].m_id, 0.0); 1016 #endif 1017 ++count; 1018 } 1019 } 1020 #ifdef _USE_RAW_PROBABILITY 1021 // try cached words 1022 if (m_pHistory) { 1023 for (sz = pn->m_nWordId; i < sz; ++i) { 1024 if (m_bGBK || pwidinfo[i].m_bGBK == 0) { 1025 if (m_pHistory->seenBefore(pwidinfo[i].m_id)) { 1026 transferBetween(ls.m_BoneStart, bone, pwidinfo[i].m_id, 1.0); 1027 } 1028 } 1029 } 1030 } 1031 #endif 1032 } 1033 } 1034 } 1035 1036 1037 /** 1038 * Fussy Pinyin: 1039 * 1040 */ 1041 CSkeletonIter 1042 CIMIContext::forwardOnePinyinBone(CSkeletonIter bone) 1043 { 1044 const CPinyinTrie::TNode *pn = NULL; 1045 1046 //clear next bone's lexicon states 1047 CSkeletonIter boneNext = ++CSkeletonIter(bone); 1048 CLexiconStates& lexss2 = boneNext->m_pInnerData->m_LexiconStates; 1049 lexss2.clear(); 1050 1051 // insert the root PinYin Lexicon node 1052 CLexiconStates& lexss1 = bone->m_pInnerData->m_LexiconStates; 1053 CLexiconStates::iterator it1 = lexss1.begin(); 1054 CLexiconStates::iterator ite = lexss1.end(); 1055 for (; it1 != ite; ++it1) { 1056 if (it1->m_bPinyin) { 1057 pn = m_pPinyinTrie->transfer(it1->m_pPYNode, bone->m_String.c_str()); 1058 if (pn != NULL && (pn = m_pPinyinTrie->transfer(pn, TWCHAR('\''))) != NULL) { 1059 lexss2.push_back(CLexiconState(it1->m_BoneStart, pn)); 1060 } 1061 } 1062 } 1063 1064 //try transfer from root state of the lexicon 1065 pn = m_pPinyinTrie->transfer(bone->m_String.c_str()); 1066 if (pn != NULL && (pn = m_pPinyinTrie->transfer(pn, TWCHAR('\''))) != NULL) { 1067 lexss2.push_back(CLexiconState(bone, pn)); 1068 } 1069 1070 return boneNext; 1071 } 1072 1073 CSkeletonIter 1074 CIMIContext::forwardPinyinBone(CSkeletonIter bone) 1075 { 1076 if (bone->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord && m_bStrictLeft2Right) { 1077 CSkeletonIter boneLeft = bone; 1078 CSkeletonIter boneRight = bone->m_pInnerData->m_BestWord.m_BoneEnd; 1079 for (; bone != boneRight; ++bone) 1080 (++CSkeletonIter(bone))->m_pInnerData->clear(); 1081 boneRight->m_pInnerData->m_LexiconStates.push_back( 1082 CLexiconState(boneLeft, boneLeft->m_pInnerData->m_BestWord.m_WordId) 1083 ); 1084 return boneRight; 1085 } else { 1086 return forwardOnePinyinBone(bone); 1087 } 1088 } 1089 1090 CSkeletonIter 1091 CIMIContext::forwardInvalidBone(CSkeletonIter bone) 1092 { 1093 CSkeletonIter boneNext = ++CSkeletonIter(bone); 1094 CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates; 1095 lss.clear(); 1096 lss.push_back(CLexiconState(bone, (unsigned int)UNKNOWN_WORD_ID)); 1097 1098 return boneNext; 1099 } 1100 1101 CSkeletonIter 1102 CIMIContext::forwardPuncBone(CSkeletonIter bone) 1103 { 1104 unsigned int wid = m_pPinyinTrie->getSimbolId(bone->m_String); 1105 1106 CSkeletonIter boneNext = ++CSkeletonIter(bone); 1107 CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates; 1108 lss.clear(); 1109 lss.push_back(CLexiconState(bone, wid)); 1110 1111 return boneNext; 1112 } 1113 1114 CSkeletonIter 1115 CIMIContext::forwardNonPinyinBone(CSkeletonIter bone) 1116 { 1117 CSkeletonIter boneNext = ++CSkeletonIter(bone); 1118 CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates; 1119 lss.clear(); 1120 lss.push_back(CLexiconState(bone, (unsigned int)UNKNOWN_WORD_ID)); 1121 1122 return boneNext; 1123 } 1124 1125 1126 CSkeletonIter 1127 CIMIContext::forwardTailBone(CSkeletonIter bone) 1128 { 1129 CSkeletonIter boneNext = ++CSkeletonIter(bone); 1130 CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates; 1131 lss.clear(); 1132 lss.push_back(CLexiconState(bone, OOV_WORD_ID)); 1133 1134 return boneNext; 1135 } 1136 1137 CBone::CBone(const CBone& b) 1138 : m_BoneType(b.m_BoneType), m_BoundaryType(b.m_BoundaryType), 1139 m_String(b.m_String), m_pInnerData(NULL) 1140 { 1141 } 1142 1143 CBone::CBone(int boundType, int boneType) 1144 : m_BoneType(boneType), m_BoundaryType(boundType), 1145 m_String(), m_pInnerData(NULL) 1146 { 1147 } 1148 1149 CBone::CBone(const TWCHAR* pwc, int boundType, int boneType) 1150 : m_BoneType(boneType), m_BoundaryType(boundType), 1151 m_String(pwc), m_pInnerData(NULL) 1152 { 1153 } 1154 1155 CBone::CBone(const TWCHAR* pwc, size_t len, int boundType, int boneType) 1156 : m_BoneType(boneType), m_BoundaryType(boundType), 1157 m_String(pwc, len), m_pInnerData(NULL) 1158 { 1159 } 1160 1161 CBone::~CBone() 1162 { 1163 delete m_pInnerData; 1164 m_pInnerData = NULL; 1165 } 1166 1167 bool 1168 CBone::isUserSelectionStart(void) 1169 { 1170 return (m_pInnerData != NULL && 1171 m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord); 1172 } 1173 1174 int 1175 cursorMapping(CSkeletonIter head1, CSkeletonIter tail1, 1176 CSkeletonIter head2, CSkeletonIter tail2, 1177 CSkeleton& result, 1178 CSkeletonIter& cursor, int& cursorIdx, bool stickLeft = false) 1179 { 1180 TSkelCursor sc(head1, tail1, head2, tail2, true); 1181 TSkelCursor::TPos cp(cursor, cursorIdx); 1182 1183 int len = 0; 1184 bool found =false; 1185 1186 while (true) { 1187 found = sc.ensureCursor(cp); 1188 if (found) break; 1189 if (!sc.hasNext()) break; 1190 sc.next(true); 1191 ++len; 1192 } 1193 1194 if (found) { 1195 TSkelCursor::TPos nc = sc.getPosition(); 1196 1197 int cmplen = cursorIdx = 0, nNode = 0; 1198 for (cursor = result.begin(); cursor != result.end(); ++cmplen) { 1199 if (cmplen == len) break; 1200 ++cursorIdx; 1201 if (cursorIdx >= cursor->m_String.size()) { 1202 ++cursor; 1203 ++nNode; 1204 cursorIdx = 0; 1205 } 1206 } 1207 if (cmplen == len) { // now we found that 1208 if (stickLeft && cursor != result.begin() && cursorIdx == 0) { 1209 --cursor; 1210 --nNode; 1211 cursorIdx = cursor->m_String.size(); 1212 } 1213 } 1214 return nNode; 1215 } 1216 return -1; 1217 } 1218 1219 /** 1220 * it is illegal if boneStart == boneEnd and skel.size() == 0 1221 * 1222 * 1. from current position, seeking left for 3 bones without HumanBoundary, or bones 1223 * of non-pinyin type between it. --> its, also can not beyond m_CandiBone. 1224 * 2. in [its, bonStart) from left to right, find the first bone would cause different 1225 * segmentation result. --> itd, other wise itds <-- boneStart 1226 * 3. from [itd, boneEnd), do automatic segment 1227 * 4. after boneEnd, util user boundary or non-pinyin bone 1228 * or segment result equals to original. --> itd2. 1229 * 5. [itd, itd2) re-segment, all resulting bone goes into a new skeleton --> newskel. 1230 * while convert old cursor position into new position. 1231 * 6. erase old or seg-affected nodes and splice in the newskel. 1232 * do search if needed 1233 * 1234 * The key part of this would seeking a solution for finding the automatic sentence 1235 * segmentation result. 1236 */ 1237 bool 1238 CIMIContext::modifyAndReseg(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel, 1239 CSkeletonIter& cursor, int& cursorIdx, CSkeletonIter& candiStart, 1240 bool stickLeft, bool doSearch) 1241 { 1242 CSkeleton newskel; 1243 1244 // Try to look_left to prevent potential segmentation insufficiency 1245 CSkeletonIter nit, oit, its = boneStart; 1246 int look_left = 0; 1247 for (; look_left < 3 && its != getFirstBone() && its != candiStart; ++look_left) { 1248 --its; 1249 if (!its->isPinyinNode() || its->m_BoundaryType == CBone::USER_BOUNDARY) { 1250 ++its; 1251 break; 1252 } 1253 skel.push_front(*its); 1254 } 1255 1256 // do Syllable segment on the virtual new list 1257 segPinyin(skel.begin(), skel.end(), boneEnd, getLastBone(), newskel); 1258 1259 // Remapping the new cursor 1260 int ncIdx = cursorMapping(skel.begin(), skel.end(), boneEnd, getLastBone(), newskel, cursor, cursorIdx, stickLeft); 1261 1262 // Skip previous look-left nodes that are same with original 1263 int first_diff = 0; 1264 CSkeletonIter dif_oits=skel.begin(); 1265 for (oit=skel.begin(), nit=newskel.begin(); first_diff < look_left; ++first_diff) { 1266 if (nit->m_String.size() != oit->m_String.size()) { 1267 dif_oits = oit; 1268 break; 1269 } 1270 if (ncIdx == 0) cursor = its; 1271 --ncIdx; 1272 ++its; 1273 ++nit; 1274 ++oit; 1275 newskel.pop_front(); 1276 } 1277 1278 // prepare to restore the CandiStart 1279 bool candiStartPositionReset = (its == candiStart); 1280 1281 // Prepare for cursor reposition to restore after modify 1282 CSkeletonIter leftIt; 1283 bool leftItIsHead = (its == getFirstBone()); 1284 if (!leftItIsHead) { 1285 leftIt = its; 1286 --leftIt; 1287 } 1288 1289 // modify original node list 1290 bool affectCandidates = modify(its, getLastBone(), newskel, doSearch); 1291 1292 // Reposition cursor 1293 if (ncIdx >= 0) { 1294 cursor = (leftItIsHead)?(getFirstBone()):(++CSkeletonIter(leftIt)); 1295 for (int i=0; i < ncIdx; ++i) 1296 ++cursor; 1297 } 1298 1299 // Reposition candiStart 1300 if (candiStartPositionReset) { 1301 candiStart = (leftItIsHead)?(getFirstBone()):(++CSkeletonIter(leftIt)); 1302 affectCandidates = true; 1303 } 1304 1305 return affectCandidates; 1306 } 1307 1308 void 1309 CIMIContext::segPinyin(CSkeletonIter head1, CSkeletonIter tail1, 1310 CSkeletonIter head2, CSkeletonIter tail2, 1311 CSkeleton& result) 1312 { 1313 #ifdef DEBUG 1314 printf("SegPinyin:"); 1315 #endif 1316 1317 const CPinyinTrie::TNode* pathNodes[16]; 1318 TSkelCursor::TPos positions[16]; 1319 1320 #ifdef DEBUG 1321 TWCHAR dbg_msg[2] = {0, 0}; 1322 { 1323 TSkelCursor dsc(head1, tail1, head2, tail2); 1324 while (dsc.hasNext()) { 1325 if (dsc.isPinyin()) { 1326 dbg_msg[0] = dsc.getChar(); 1327 print_wide(dbg_msg); 1328 if (dsc.isUserBreakAfter()) { 1329 printf("'"); 1330 } 1331 } else { 1332 printf("_"); 1333 dbg_msg[0] = dsc.getChar(); 1334 print_wide(dbg_msg); 1335 } 1336 dsc.next(); 1337 } 1338 } 1339 #endif 1340 1341 result.clear(); 1342 TSkelCursor sc(head1, tail1, head2, tail2); 1343 while (sc.hasNext()) { 1344 if (sc.isPinyin()) { 1345 int lastValid = 0; 1346 pathNodes[0] = m_pPinyinTrie->getRootNode(); 1347 positions[0] = sc.getPosition(); 1348 for (int idx=1; sc.isPinyin() && pathNodes[idx-1] != NULL; ++idx) { 1349 pathNodes[idx] = m_pPinyinTrie->transfer(pathNodes[idx-1], sc.getChar()); 1350 sc.next(); 1351 positions[idx] = sc.getPosition(); 1352 if (m_pPinyinTrie->isValid(pathNodes[idx], m_bNonCompleteSyllable, m_bGBK)) 1353 lastValid = idx; 1354 if (sc.isUserBreakAfter(positions[idx-1])) 1355 break; 1356 } 1357 bool invalid = false; 1358 if (lastValid == 0) { 1359 invalid = true; 1360 lastValid = 1; 1361 } 1362 if (lastValid >= 2 && pathNodes[lastValid]->m_bFullSyllableTransfer && pathNodes[lastValid-1]->m_bFullSyllableTransfer) { 1363 TWCHAR w1 = sc.getChar(positions[lastValid-1]); 1364 TWCHAR w2 = sc.getChar(positions[lastValid]); 1365 if (!isYuanYinChar(w1) && isYuanYinChar(w2)){ 1366 const CPinyinTrie::TNode* pytmp = NULL; 1367 pytmp = m_pPinyinTrie->transfer(m_pPinyinTrie->getRootNode(), w1); 1368 if (pytmp) pytmp = m_pPinyinTrie->transfer(pytmp, w2); 1369 if (pytmp != NULL) --lastValid; 1370 } 1371 } 1372 CBone bnint(CBone::AUTO_BOUNDARY, (invalid)?(CBone::NODE_INVALID_PINYIN):(CBone::NODE_PINYIN)); 1373 if (sc.isUserBreakAfter(positions[lastValid-1])) 1374 bnint.m_BoundaryType = CBone::USER_BOUNDARY; 1375 for (int idx=0; idx < lastValid; ++idx) 1376 bnint.m_String += sc.getChar(positions[idx]); 1377 result.push_back(bnint); 1378 sc.setPosition(positions[lastValid]); 1379 } else { 1380 result.push_back(*(sc.getPosition().m_bone)); 1381 sc.nextBone(); 1382 } 1383 } 1384 1385 #ifdef DEBUG 1386 { 1387 printf(" ==> "); 1388 TSkelCursor dsc(result.begin(), result.end(), result.end(), result.end()); 1389 while (dsc.hasNext()) { 1390 if (dsc.isPinyin()) { 1391 dbg_msg[0] = dsc.getChar(); 1392 print_wide(dbg_msg); 1393 if (dsc.isBreakAfter()) { 1394 printf("'"); 1395 } 1396 } else { 1397 printf("_"); 1398 dbg_msg[0] = dsc.getChar(); 1399 print_wide(dbg_msg); 1400 } 1401 dsc.next(); 1402 } 1403 } 1404 fflush(stdout); 1405 #endif 1406 return; 1407 } 1408 1409 void 1410 CIMIContext::setHistoryMemory(CICHistory *phm) 1411 { 1412 m_pHistory = phm; 1413 } 1414 1415 CICHistory * 1416 CIMIContext::getHistoryMemory() 1417 { 1418 return m_pHistory; 1419 } 1420 1421 void CIMIContext::memorize(void) 1422 { 1423 if (m_pHistory != NULL) { 1424 std::vector<unsigned int> result; 1425 CSkeletonIter boneStart = getFirstBone(); 1426 CSkeletonIter boneEnd = getLastBone(); 1427 1428 while (boneStart != boneEnd) { 1429 #ifdef DEBUG 1430 //assert(boneStart->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere); 1431 #endif 1432 1433 CSkeletonIter bone = boneStart; 1434 CSkeletonIter rightBone = boneStart->m_pInnerData->m_BestWord.m_BoneEnd; 1435 if (boneStart->m_BoneType != CBone::NODE_PINYIN && boneStart->m_BoneType != CBone::NODE_INCOMPLETE_PINYIN) { 1436 while (bone != rightBone && bone != boneEnd) 1437 ++bone; 1438 result.push_back(0); 1439 } else { 1440 while (bone != rightBone && bone != boneEnd) 1441 ++bone; 1442 result.push_back(boneStart->m_pInnerData->m_BestWord.m_WordId); 1443 } 1444 1445 boneStart = bone; 1446 } 1447 if (result.size() > 0) 1448 m_pHistory->memorize(&(result[0]), (&(result[0])) + result.size()); 1449 } 1450 } 1451 1452 void 1453 CIMIContext::print_lattice() 1454 { 1455 printf("\n"); 1456 std::string prefix; 1457 CSkeletonIter bone = getFirstBone(); 1458 CSkeletonIter boneEnd = getLastBone(); 1459 for (;bone != boneEnd; ++bone) 1460 bone->print(prefix); 1461 boneEnd->print(prefix); 1462 (++boneEnd)->print(prefix); 1463 fflush(stdout); 1464 } 1465 1466 void 1467 CBone::print(std::string& prefix) 1468 { 1469 printf(prefix.c_str()); 1470 printf("{Bone@%X:", this); 1471 print_wide(m_String.c_str()); 1472 printf("}"); 1473 prefix += " "; 1474 if (m_pInnerData) 1475 m_pInnerData->print(prefix); 1476 prefix.resize(prefix.size() - 4); 1477 fflush(stdout); 1478 } 1479 1480 void 1481 CCandidate::print(std::string& prefix) 1482 { 1483 printf(prefix.c_str()); 1484 printf("<Candidate @%X:", this); 1485 print_wide(m_String); 1486 printf("-- %d}", m_WordId); 1487 fflush(stdout); 1488 } 1489 1490 1491 TLongExpFloat::TLongExpFloat(double d) 1492 { 1493 if (d != 0.0 && d != -0.0) { 1494 TDoubleAnatomy da(d); 1495 m_exp = da.getExp(); 1496 da.clearExp(); 1497 m_base = da.getValue(); 1498 } else { 1499 m_base = d; 1500 m_exp = 0; 1501 } 1502 } 1503 1504 TLongExpFloat 1505 TLongExpFloat::operator* (const TLongExpFloat& b) const 1506 { 1507 double d = this->m_base * b.m_base; 1508 TLongExpFloat reda(d); 1509 reda.m_exp += this->m_exp + b.m_exp; 1510 return reda; 1511 } 1512 1513 TLongExpFloat 1514 TLongExpFloat::operator/ (const TLongExpFloat& b) const 1515 { 1516 double d = this->m_base / b.m_base; 1517 TLongExpFloat reda(d); 1518 reda.m_exp += (this->m_exp - b.m_exp); 1519 return reda; 1520 } 1521 1522 bool 1523 TLongExpFloat::operator< (const TLongExpFloat& b) const 1524 { 1525 if (m_base >= 0.0 && b.m_base >= 0.0) { 1526 return (m_exp < b.m_exp || (m_exp == b.m_exp && m_base < b.m_base)); 1527 } else if (m_base < 0.0 && b.m_base < 0.0) { 1528 return (m_exp > b.m_exp || (m_exp == b.m_exp && m_base < b.m_base)); 1529 } else if (m_base < 0.0 && b.m_base >= 0.0) 1530 return true; 1531 else 1532 return false; 1533 } 1534 1535 bool 1536 TLongExpFloat::operator<=(const TLongExpFloat& b) const 1537 { 1538 if (m_base >= 0.0 && b.m_base >= 0.0) { 1539 return (m_exp < b.m_exp || (m_exp == b.m_exp && m_base <= b.m_base)); 1540 } else if (m_base < 0.0 && b.m_base < 0.0) { 1541 return (m_exp > b.m_exp || (m_exp == b.m_exp && m_base <= b.m_base)); 1542 } else if (m_base < 0.0 && b.m_base >= 0.0) 1543 return true; 1544 else 1545 return false; 1546 } 1547 1548 bool 1549 TLongExpFloat::operator==(const TLongExpFloat& b) const 1550 { 1551 return (m_base == b.m_base && m_exp == b.m_exp); 1552 } 1553 1554 void 1555 TLongExpFloat::toString(std::string& str) const 1556 { 1557 char buf[256]; 1558 toString(buf); 1559 str = buf; 1560 } 1561
