Home | History | Annotate | Download | only in src
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifdef HAVE_CONFIG_H
     39 #include <config.h>
     40 #endif
     41 
     42 #include <algorithm>
     43 #include "imi_context.h"
     44 #include "lattice_states.h"
     45 
     46 
     47 class TSkelCursor {
     48 public:
     49     struct TPos {
     50         CSkeletonIter m_bone;
     51         int           m_idx;
     52         TPos(CSkeletonIter bone=CSkeletonIter(), int idx=0)
     53             : m_bone(bone), m_idx(idx) { }
     54     };
     55 
     56 public:
     57     TSkelCursor(CSkeletonIter h1, CSkeletonIter t1, CSkeletonIter h2, CSkeletonIter t2, bool asis=false)
     58             : m_h1(h1), m_t1(t1), m_h2(h2), m_t2(t2), m_bone(h1), m_idx(0), m_iLink(0)
     59         { if (!asis) ensureCursor(); }
     60 
     61     inline bool
     62     isPinyin() const
     63         { return (m_bone != m_t2 && m_bone->isPinyinNode()); }
     64 
     65     inline bool
     66     isBreakAfter() const
     67         { return (m_bone == m_t2 || m_bone == m_t1 || m_idx >= m_bone->m_String.size()-1); }
     68 
     69     inline bool
     70     isBreakAfter(TPos & pos) const
     71         { return (pos.m_bone == m_t2 || pos.m_bone == m_t1 || pos.m_idx >= pos.m_bone->m_String.size()-1); }
     72 
     73     inline bool
     74     isUserBreakAfter() const
     75         { return (m_bone == m_t2 || m_bone == m_t1 ||
     76                     (m_idx == m_bone->m_String.size()-1 && m_bone->isPinyinNode() &&
     77                      m_bone->m_BoundaryType == CBone::USER_BOUNDARY)); }
     78 
     79     inline bool
     80     isUserBreakAfter(TPos & pos) const
     81         { return (pos.m_bone == m_t2 || pos.m_bone == m_t1 ||
     82                       (pos.m_idx == pos.m_bone->m_String.size()-1 && pos.m_bone->isPinyinNode() &&
     83                        pos.m_bone->m_BoundaryType == CBone::USER_BOUNDARY)); }
     84 
     85     inline TWCHAR
     86     getChar()
     87         {
     88             ensureCursor();
     89             return (m_bone != m_t2)?(m_bone->m_String[m_idx]):0;
     90         }
     91 
     92     inline TWCHAR
     93     getChar(TPos& pos) const // the pos should be ensured
     94         { return (pos.m_bone != m_t2)?(pos.m_bone->m_String[pos.m_idx]):0; }
     95 
     96     void
     97     next(bool asis=false)
     98         {
     99             ensureCursor();
    100             if (m_bone != m_t2) {
    101                 ++m_idx;
    102                 if (!asis) ensureCursor();
    103             }
    104         }
    105 
    106     void
    107     nextBone()
    108         {
    109             ensureCursor();
    110             if (m_bone != m_t2) {
    111                 ++m_bone;
    112                 m_idx=0;
    113             }
    114             ensureCursor();
    115         }
    116 
    117     inline bool
    118     hasNext()
    119         {
    120             ensureCursor();
    121             return m_bone != m_t2;
    122         }
    123 
    124     inline bool
    125     atFirstLink() const
    126         { return m_iLink == 0; }
    127 
    128     inline TPos
    129     getPosition() const
    130         {
    131             return TPos(m_bone, m_idx);
    132         }
    133 
    134     /** The parameters must be retrieved from save object before */
    135     inline void
    136     setPosition(const TPos& pos)
    137         { m_bone = pos.m_bone; m_idx = pos.m_idx; }
    138 
    139     bool
    140     ensureCursor(TPos& curCompare);
    141 
    142 protected:
    143     CSkeletonIter   m_h1, m_h2, m_t1, m_t2;
    144     CSkeletonIter   m_bone;
    145     int             m_iLink, m_idx;
    146 
    147 protected:
    148     void
    149     ensureCursor();
    150 };
    151 
    152 void
    153 TSkelCursor::ensureCursor()
    154 {
    155     while (m_bone != m_t1 && m_bone != m_t2 && m_idx >= m_bone->m_String.size()){
    156         m_idx = 0;
    157         ++m_bone;
    158     }
    159     if (m_bone == m_t1) {
    160         ++m_iLink;
    161         m_bone = m_h2;
    162         m_idx = 0;
    163         while (m_bone != m_t2 && m_idx >= m_bone->m_String.size()){
    164             m_idx = 0;
    165             ++m_bone;
    166         }
    167     }
    168 }
    169 
    170 bool
    171 TSkelCursor::ensureCursor(TPos& curCompare)
    172 {
    173     bool same = false;
    174     same = (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx);
    175     while (m_bone != m_t1 && m_bone != m_t2 && m_idx >= m_bone->m_String.size()){
    176         m_idx = 0;
    177         ++m_bone;
    178         same = same || (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx);
    179     }
    180     if (m_bone == m_t1) {
    181         ++m_iLink;
    182         m_bone = m_h2;
    183         m_idx = 0;
    184         same = same || (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx);
    185         while (m_bone != m_t2 && m_idx >= m_bone->m_String.size()){
    186             m_idx = 0;
    187             ++m_bone;
    188             same = same || (m_bone == curCompare.m_bone && m_idx == curCompare.m_idx);
    189         }
    190     }
    191     return same;
    192 }
    193 
    194 /**
    195  * Determine whether or not the target iterator's position on list of head
    196  * is located before the iterator first. More precisely, it return whether
    197  * or not target in [head, first)
    198  * @param target is the target iterator whose position to be decided
    199  * @param head is the head iterator of the container (list or vector)
    200  * @param first iterator to be compared with target
    201  * @return whether or not the target iterator's position on list of head
    202  * is located before the iterator first
    203  */
    204 template<class forwardIt>
    205 bool
    206 isLocatedBefore(forwardIt target, forwardIt head, forwardIt first)
    207 {
    208     for (; head != first; ++head) {
    209         if (target == head) return true;
    210     }
    211     return false;
    212 }
    213 
    214 CIMIContext::CIMIContext()
    215     : m_bNonCompleteSyllable(false), m_bStrictLeft2Right(false),
    216       m_bGBK(true), m_bGB18030(false), m_HistoryPower(3), m_ContextRanking(true),
    217       m_pModel(NULL), m_pPinyinTrie(NULL), m_Skeleton(),
    218       m_EffectiveCandiBoneStart(), m_EffectiveCandiBoneEnd()
    219 {
    220 }
    221 
    222 void
    223 CIMIContext::setCoreData(CIMIData *pCoreData)
    224 {
    225     m_pModel = pCoreData->getSlm();
    226     m_pPinyinTrie = pCoreData->getPinyinTrie();
    227 }
    228 
    229 void
    230 CIMIContext::clear()
    231 {
    232     m_Skeleton.clear();
    233     m_Skeleton.push_back(CBone());
    234     m_Skeleton.push_back(CBone());
    235     m_EffectiveCandiBoneStart = m_EffectiveCandiBoneEnd = getLastBone();
    236 
    237     // allocate bone's inner data when it is inserted into the skeleton
    238     CSkeletonIter itEnd = m_Skeleton.end();
    239     for (CSkeletonIter bone = m_Skeleton.begin(); bone != itEnd; ++bone) {
    240         if (bone->m_pInnerData == NULL)
    241             bone->m_pInnerData = new CBoneInnerData();
    242     }
    243 
    244     searchFrom(m_Skeleton.begin());
    245 }
    246 
    247 static bool
    248 isYuanYinChar(TWCHAR wc)
    249 {
    250     return (wc == L'a' || wc == L'o' || wc == L'e' ||
    251             wc == L'i' || wc == L'u' || wc == L'v');
    252 }
    253 
    254 CSkeletonIter
    255 CIMIContext::cancelSelection(CSkeletonIter bone, bool update)
    256 {
    257     bool found = false;
    258     CSkeletonIter it = bone;
    259     for (CSkeletonIter first=m_Skeleton.begin(); it->m_BoneType == CBone::NODE_PINYIN; --it) {
    260         // BestWrod is conjunctive, so no need to check position if user selection
    261         // like isLocatedBefore(bone, it, it->m_pInnerData->m_BestWord.m_BoneEnd)) {
    262         if (it->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord) {
    263             it->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere;
    264             found = true;
    265             break;
    266         } else if (it->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) {
    267             break;
    268         }
    269         if (it == first)
    270             break;
    271     }
    272     if (found && update)
    273         searchFrom(it);
    274     return (found)?(it):(bone);
    275 }
    276 
    277 CSkeletonIter
    278 CIMIContext::cancelSelectionCover(CSkeletonIter bone, bool update)
    279 {
    280     bool found = false;
    281     if (bone->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) {
    282         return bone;
    283     }
    284     CSkeletonIter it = bone;
    285     for (CSkeletonIter first=m_Skeleton.begin(); it != first; ) {
    286         --it;
    287         if (it->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) {
    288             // BestWord is conjunctive, so no need to check position if user selection
    289             // like isLocatedBefore(bone, it, it->m_pInnerData->m_BestWord.m_BoneEnd)) {
    290             if (it->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord) {
    291                 it->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere;
    292                 found = true;
    293             }
    294             break;
    295         }
    296     }
    297     if (found && update)
    298         searchFrom(it);
    299     return (found)?(it):(bone);
    300 }
    301 
    302 bool
    303 CIMIContext::makeSelection(const CCandidate& candi)
    304 {
    305     CSkeletonIter boneLeft = cancelSelection(candi.m_BoneStart, false);
    306 
    307     /*
    308     candi.m_BoneStart->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere;
    309 
    310     for (CSkeletonIter bone = candi.m_BoneStart; bone != candi.m_BoneEnd; ++bone)
    311         bone->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere;
    312     */
    313 
    314     candi.m_BoneStart->m_pInnerData->m_BestWord = candi;
    315     candi.m_BoneStart->m_pInnerData->m_BWType = CBoneInnerData::UserSelectedBestWord;
    316     searchFrom(boneLeft);
    317 
    318     return true;
    319 }
    320 
    321 /**
    322 * it is illegal if boneStart == boneEnd and skel.size() == 0
    323 */
    324 bool
    325 CIMIContext::modify(CSkeletonIter boneStart,
    326                     CSkeletonIter boneEnd,
    327                     CSkeleton& skel,
    328                     bool doSearch,
    329                     CSkeletonIter* pItLeftmost)
    330 {
    331     // No change needed, happen on OneLineView, call out a PINYIN i
    332     // but return it back withou modification.
    333     // FIXME, maybe this should be put back to OnreLineView's code, not here
    334     if ((skel.size() == 1) && (boneEnd == ++CSkeletonIter(boneStart)) &&
    335             (skel.begin()->m_BoneType == boneStart->m_BoneType) &&
    336             (skel.begin()->m_String == boneStart->m_String)) {
    337         if (pItLeftmost) *pItLeftmost = getLastBone();
    338         boneStart->m_BoundaryType = skel.begin()->m_BoundaryType;
    339         return false;
    340     }
    341 
    342     // check whether or not the modification would affect the candidates
    343     // retrieved by the previous getCandidates() call
    344     CSkeletonIter first = boneStart;
    345     if (first->m_pInnerData->m_LexiconStates.size() > 0)
    346         first = first->m_pInnerData->m_LexiconStates[0].m_BoneStart;
    347     bool affectCandidates =
    348         !isLocatedBefore(m_EffectiveCandiBoneEnd, m_Skeleton.begin(), first);
    349 
    350     // We must check the user selection which may cover this node
    351     // if there is such a user selection, we should do search from there
    352     // starting bone of such a selection.
    353     // The check should only be done when boneStart to be removed
    354     CSkeletonIter lefter = cancelSelectionCover(boneStart, false);
    355     bool bSearchLefter = (lefter != boneStart);
    356 
    357     // Another case is that previous UserSelection just ending at boneStart,
    358     // which is the first bone to be deleted. In this case, new search will
    359     // start with the newly inserted bone, and when searching, the User
    360     // selection word will be check backward and got a wrong range. So
    361     // we must make it change the UserSelection's ending Bone to the first
    362     // bone after insertion.
    363     bool bLeftUS = false;
    364     CSkeletonIter leftUserBone = lefter;
    365     if (skel.size() > 0 && !bSearchLefter && leftUserBone != m_Skeleton.begin()) {
    366         do {
    367             --leftUserBone;
    368             int bwType = leftUserBone->m_pInnerData->m_BWType;
    369             if (bwType != CBoneInnerData::NoBestWordStartHere) {
    370                 bLeftUS = (bwType ==CBoneInnerData::UserSelectedBestWord &&
    371                            leftUserBone->m_pInnerData->m_BestWord.m_BoneEnd == boneStart);
    372                 break;
    373             }
    374         } while (leftUserBone != m_Skeleton.begin());
    375     }
    376 
    377     CBoneInnerData *pid = NULL;
    378     // remove the old range
    379     if (boneStart != boneEnd) {
    380         // before remove the bone, get the first's bone's innerData
    381         // reserve it for the first bone to be inserted. (ie. just
    382         // attach it to the first bone after deletion/insertion
    383         pid = boneStart->m_pInnerData;
    384         boneStart->m_pInnerData = NULL;
    385 
    386         m_Skeleton.erase(boneStart, boneEnd);
    387     }
    388 
    389     // insert new list before boneEnd
    390     first = boneEnd;
    391     CSkeleton::iterator it1 = skel.begin(), h = skel.begin();
    392     CSkeleton::iterator it2 = skel.end();
    393     for (; it1 != it2; ++it1) {
    394         CSkeletonIter tmp = m_Skeleton.insert(boneEnd, *it1);
    395         if (it1 == h)
    396             first = tmp;
    397         else
    398             tmp->m_pInnerData = new CBoneInnerData();
    399     }
    400 
    401     if (first->m_pInnerData != NULL) {
    402         // nothing inserted, must deleted something, ie pid != NULL
    403         pid->m_BWType = first->m_pInnerData->m_BWType;
    404         pid->m_BestWord = first->m_pInnerData->m_BestWord;
    405         delete first->m_pInnerData;
    406         first->m_pInnerData = pid;
    407     } else if (pid == NULL) {
    408         // nothing deleted, just inserting something
    409         first->m_pInnerData = boneEnd->m_pInnerData;
    410         boneEnd->m_pInnerData = new CBoneInnerData();
    411         boneEnd->m_pInnerData->m_BWType = first->m_pInnerData->m_BWType;
    412         boneEnd->m_pInnerData->m_BestWord = first->m_pInnerData->m_BestWord;
    413         first->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere;
    414     } else {
    415         //something deleted, something inserted
    416         first->m_pInnerData = pid;
    417         pid->m_BWType = CBoneInnerData::NoBestWordStartHere;
    418     }
    419 
    420     // change the left user selection bone's best word's ending bone to first
    421     if (bLeftUS)
    422         leftUserBone->m_pInnerData->m_BestWord.m_BoneEnd = first;
    423 
    424     // rebuild the search lattice from the newly inserted list
    425     // using the just copied lattice states (innerData)
    426     if (pItLeftmost)
    427         *pItLeftmost = (bSearchLefter)?(lefter):(first);
    428     if (doSearch)
    429         searchFrom((bSearchLefter)?(lefter):(first));
    430 
    431     return affectCandidates;
    432 }
    433 
    434 bool
    435 CIMIContext::isValidSyllable(const TWCHAR* pstr)
    436 {
    437     const CPinyinTrie::TNode* pyn = m_pPinyinTrie->transfer(pstr);
    438     return m_pPinyinTrie->isValid(pyn, m_bNonCompleteSyllable, m_bGBK);
    439 }
    440 
    441 bool
    442 CIMIContext::segPinyinSimplest(const wstring& pinyin, CSkeleton& result)
    443 {
    444     #ifdef DEBUG
    445         printf("SegPinyin:");
    446         print_wide(pinyin.c_str());
    447         printf("-->");
    448     #endif
    449 
    450     //"zhuang" is longest syllable, 16 is enought
    451     bool  validSyllable[16];
    452     const CPinyinTrie::TNode* pathNodes[16];
    453     const TWCHAR* str = pinyin.c_str();
    454     const CPinyinTrie::TNode* pyn = m_pPinyinTrie->getRootNode();
    455 
    456     result.clear();
    457 
    458     //Find out the longest valid PINYIN prefix, save to lastValid
    459     int idx, lastValid = - 1;
    460     for (idx = 0; str[idx] != 0; ++idx) {
    461         pyn = m_pPinyinTrie->transfer(pyn, (unsigned char)(str[idx]));
    462         pathNodes[idx] = pyn;
    463         if (validSyllable[idx] = m_pPinyinTrie->isValid(pyn, m_bNonCompleteSyllable, m_bGBK))
    464             lastValid = idx;
    465         if (pyn == NULL)
    466             break;
    467     }
    468 
    469     /*********************************************************************
    470     Note, when NULL pyn arrived, the char should also be the last one.
    471     Try to split it into two nodes if possible:
    472         (1) [0..idx-2], [0..idx-1] is both complete syllable
    473             [idx-1] is FuYin, [idx] is Yuanyin,
    474             [idx-1...] is non-complete or complete (not NULL)
    475             ====> split into [0..idx-2] [idx-1, idx]
    476         (2) lastValid >= 0
    477             ====> split into [0..lastValid] [lastValid+1..]
    478                   if [lastValid+1...] is not valid, return false
    479         (3) lastValid = -1
    480             ====> give a invalid PINYIN bone [0..]
    481     **********************************************************************/
    482     if (pyn == NULL && idx >= 2 &&
    483         pathNodes[idx-1]->m_bFullSyllableTransfer == 1 &&
    484         pathNodes[idx-2]->m_bFullSyllableTransfer == 1 &&
    485         !isYuanYinChar(str[idx-1]) && isYuanYinChar(str[idx]) &&
    486         (pathNodes[idx] = m_pPinyinTrie->transfer(str+idx-1)) != NULL) {
    487 
    488         result.push_back(CBone(str, idx-1, CBone::AUTO_BOUNDARY, CBone::NODE_PINYIN));
    489 
    490         #ifdef DEBUG
    491             print_wide(wstring(str, idx-1).c_str());
    492             printf("'");
    493         #endif
    494 
    495         int bt = CBone::NODE_INCOMPLETE_PINYIN;
    496         if (pathNodes[idx]->m_bFullSyllableTransfer == 1)
    497             bt = CBone::NODE_PINYIN;
    498 
    499         result.push_back( CBone(str+idx-1, CBone::AUTO_BOUNDARY, bt) );
    500 
    501         #ifdef DEBUG
    502             print_wide(str+idx-1);
    503             fflush(stdout);
    504         #endif
    505 
    506         return true;
    507     }
    508 
    509     if (pyn == NULL && lastValid >= 0) {
    510         result.push_back(CBone(str, lastValid+1, CBone::AUTO_BOUNDARY, CBone::NODE_PINYIN));
    511 
    512         #ifdef DEBUG
    513             print_wide(wstring(str, lastValid+1).c_str());
    514             printf("'");
    515         #endif
    516 
    517         int bt = CBone::NODE_INCOMPLETE_PINYIN;
    518 
    519         pathNodes[idx] = m_pPinyinTrie->transfer(str+lastValid+1);
    520         if (pathNodes[idx] == NULL)
    521             bt = CBone::NODE_INVALID_PINYIN;
    522         else if (m_pPinyinTrie->isValid(pathNodes[idx], m_bNonCompleteSyllable, m_bGBK))
    523             bt = CBone::NODE_PINYIN;
    524         else
    525             bt = CBone::NODE_INCOMPLETE_PINYIN;
    526         result.push_back(CBone(str+lastValid+1, CBone::AUTO_BOUNDARY, bt));
    527 
    528         #ifdef DEBUG
    529             print_wide(str+lastValid+1);
    530             if (bt == CBone::NODE_INVALID_PINYIN)
    531                 printf("(X)");
    532             fflush(stdout);
    533         #endif
    534 
    535         return (bt != CBone::NODE_INVALID_PINYIN);
    536     }
    537 
    538     if (pyn == NULL) {
    539         result.push_back(CBone(str, CBone::AUTO_BOUNDARY, CBone::NODE_INVALID_PINYIN));
    540 
    541         #ifdef DEBUG
    542             print_wide(str);
    543             printf("(X)");
    544             fflush(stdout);
    545         #endif
    546 
    547         return false;
    548     }
    549 
    550     /********************************************************************
    551     Now, pyn is not NULL, str[idx] should be 0,
    552         [0..idx-1] is valid (non-complete or complete)
    553     *********************************************************************/
    554     int bt = (validSyllable[idx-1])?(CBone::NODE_PINYIN):(CBone::NODE_INCOMPLETE_PINYIN);
    555     result.push_back(CBone(str, CBone::AUTO_BOUNDARY, bt));
    556 
    557     #ifdef DEBUG
    558         print_wide(str);
    559         fflush(stdout);
    560     #endif
    561 
    562     return true;
    563 }
    564 
    565 TCandiRank::TCandiRank(bool user, bool best, unsigned int len,
    566                        bool fromLattice, TSentenceScore score)
    567 {
    568     anony.m_user = (user)?0:1;
    569     anony.m_best = (best)?0:1;
    570     anony.m_len = (len > 31)?(0):(31-len);
    571     anony.m_lattice = (fromLattice)?0:1;
    572 
    573     #ifdef DEBUG
    574         //assert(fromLattice);
    575         //assert(TSentenceScore(+0.0) < score);
    576     #endif
    577 
    578     double ds = -score.log2();
    579 
    580     //make it 24-bit
    581     if (ds > 32767.0)
    582         ds = 32767.0;
    583     else if (ds < -32768.0)
    584         ds = -32768.0;
    585     unsigned cost = unsigned((ds+32768.0)*256.0);
    586     anony.m_cost = cost;
    587 }
    588 
    589 TCandiRank::TCandiRank(bool user, bool best, unsigned int len,
    590                        bool fromLattice, unsigned rank)
    591 {
    592     anony.m_user = (user)?0:1;
    593     anony.m_best = (best)?0:1;
    594     anony.m_len = (len > 31)?(0):(31-len);
    595     anony.m_lattice = (fromLattice)?0:1;
    596     anony.m_cost = rank;
    597 }
    598 
    599 struct TCandiPair {
    600     CCandidate                      m_Candi;
    601     TCandiRank                      m_Rank;
    602 
    603     TCandiPair() : m_Candi(), m_Rank() { }
    604 };
    605 
    606 struct TCandiPairPtr {
    607     TCandiPair*                     m_Ptr;
    608 
    609     TCandiPairPtr(TCandiPair* p=NULL) : m_Ptr(p)
    610     { }
    611 
    612     bool
    613     operator< (const TCandiPairPtr& b) const
    614     { return m_Ptr->m_Rank < b.m_Ptr->m_Rank; }
    615 };
    616 
    617 // FIXME, this procedure could be modified largely.
    618 void
    619 CIMIContext::getCandidates(CSkeletonIter bone, CCandidates& result)
    620 {
    621     TCandiPair cp;
    622     static std::map<unsigned int, TCandiPair> map;
    623     std::map<unsigned int, TCandiPair>::iterator it_map;
    624 
    625     map.clear();
    626     result.clear();
    627     m_EffectiveCandiBoneStart = m_EffectiveCandiBoneEnd = bone;
    628 
    629     if (bone->isTailNode())
    630         return;
    631     if (!bone->isValidPinyinNode()) {
    632         result.push_back(CCandidate(bone->m_String.c_str(), bone, ++CSkeletonIter(bone)));
    633         return;
    634     }
    635 
    636     // if user selection or best word starting at bone
    637     if (bone->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere) {
    638         cp.m_Candi = bone->m_pInnerData->m_BestWord;
    639         cp.m_Rank =
    640             TCandiRank(bone->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord,
    641                        bone->m_pInnerData->m_BWType == CBoneInnerData::BestWordStartHere,
    642                        0, false, 0);
    643         map[cp.m_Candi.m_WordId] = cp;
    644     }
    645 
    646     //collecting all candidates, from both lattice and lexicon
    647     int len = 1;
    648     cp.m_Candi.m_BoneStart = bone;
    649     CSkeletonIter b = ++CSkeletonIter(bone);
    650     while (b != (--m_Skeleton.end())) {
    651         cp.m_Candi.m_BoneEnd = b;
    652 
    653         bool found = false;
    654         CLexiconStates::iterator itlex = b->m_pInnerData->m_LexiconStates.begin();
    655         CLexiconStates::iterator itlexe = b->m_pInnerData->m_LexiconStates.end();
    656         for (; itlex != itlexe; ++itlex) {
    657             if (itlex->m_BoneStart == bone) {
    658                 found = true;
    659                 if (itlex->m_bPinyin) {
    660                     if (itlex->m_pPYNode && itlex->m_pPYNode->m_nWordId > 0) {
    661                         unsigned sz = itlex->m_pPYNode->m_nWordId;
    662                         const CPinyinTrie::TWordIdInfo* p = itlex->m_pPYNode->getWordIdPtr();
    663                         for (unsigned int i = 0; i < sz; ++i, ++p) {
    664                             if (m_bGBK || p->m_bGBK == 0) {
    665                                 cp.m_Candi.m_WordId = p->m_id;
    666                                 cp.m_Candi.m_String = (*m_pPinyinTrie)[cp.m_Candi.m_WordId];
    667 
    668                                 //sorting according to the order in PinYinTire
    669                                 cp.m_Rank = TCandiRank(false, false, len, false, i);
    670                                 it_map = map.find(cp.m_Candi.m_WordId);
    671                                 if (it_map == map.end() || cp.m_Rank < it_map->second.m_Rank)
    672                                     map[cp.m_Candi.m_WordId] = cp;
    673                             }
    674                         }
    675                     }
    676                 } else {
    677                     cp.m_Candi.m_WordId = itlex->m_WordId;
    678                     cp.m_Candi.m_String = bone->m_String.c_str();
    679                     cp.m_Rank = TCandiRank(false, false, len, false, 0);
    680                     it_map = map.find(cp.m_Candi.m_WordId);
    681                     if (it_map == map.end() || cp.m_Rank < it_map->second.m_Rank)
    682                         map[cp.m_Candi.m_WordId] = cp;
    683                 }
    684             }
    685         }
    686 
    687         if (!found) break;
    688 
    689         CLatticeStates::iterator its = b->m_pInnerData->m_LatticeNodes.begin();
    690         CLatticeStates::iterator ite = b->m_pInnerData->m_LatticeNodes.end();
    691         for (;  its != ite; ++its) {
    692             if (its->m_pBackTraceNode && its->m_pBackTraceNode->m_BoneAfter == bone) {
    693                 cp.m_Candi.m_WordId = its->m_BackTraceWordId;
    694                 cp.m_Candi.m_String = (*m_pPinyinTrie)[cp.m_Candi.m_WordId];
    695                 if (cp.m_Candi.m_String == NULL)
    696                     cp.m_Candi.m_String = bone->m_String.c_str();
    697                 #ifdef _USE_RAW_PROBABILITY
    698 
    699                     #ifdef DEBUG
    700                         //assert(its->m_pBackTraceNode->m_Score < 0.0 && its->m_Score < 0.0);
    701                     #endif
    702 
    703                     cp.m_Rank = TCandiRank(false, false, len, true, its->m_Score / its->m_pBackTraceNode->m_Score);
    704                 #else
    705                     cp.m_Rank = TCandiRank(false, false, len, true, its->m_Score - its->m_pBackTraceNode->m_Score);
    706                 #endif
    707                 it_map = map.find(cp.m_Candi.m_WordId);
    708                 if (it_map == map.end() || cp.m_Rank < it_map->second.m_Rank)
    709                     map[cp.m_Candi.m_WordId] = cp;
    710             }
    711         }
    712 
    713         m_EffectiveCandiBoneEnd = b;
    714         ++b;
    715         ++len;
    716     }
    717 
    718     std::vector<TCandiPairPtr> vec;
    719 
    720     vec.reserve(map.size());
    721     std::map<unsigned int, TCandiPair>::iterator it_mapE = map.end();
    722     for (it_map = map.begin(); it_map != it_mapE; ++it_map)
    723         vec.push_back(TCandiPairPtr(&(it_map->second)));
    724     std::make_heap(vec.begin(), vec.end());
    725     std::sort_heap(vec.begin(), vec.end());
    726 
    727     for (int i=0, sz=vec.size(); i < sz; ++i)
    728         result.push_back(vec[i].m_Ptr->m_Candi);
    729 }
    730 
    731 int
    732 CIMIContext::getBestSentence(wstring & result, CSkeletonIter boneStart,
    733                              CSkeletonIter boneEnd, bool original_format)
    734 {
    735     int nWordConverted = 0;
    736     result.clear();
    737 
    738     // no need to check begin(), because firstBone must at least has some
    739     // auto best word or user selection best word starting from, this rule
    740     // must be followed in this call
    741     int len, prefix = 0;
    742     CSkeletonIter realStart = boneStart;
    743     while (realStart->m_pInnerData->m_BWType == CBoneInnerData::NoBestWordStartHere) {
    744         ++prefix;
    745         --realStart;
    746     }
    747 
    748     while (true) {
    749         #ifdef DEBUG
    750             //assert(realStart->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere);
    751         #endif
    752 
    753         CSkeletonIter bone = boneStart;
    754         CSkeletonIter rightBone = realStart->m_pInnerData->m_BestWord.m_BoneEnd;
    755         if (realStart->m_BoneType != CBone::NODE_PINYIN &&
    756                   realStart->m_BoneType != CBone::NODE_INCOMPLETE_PINYIN) {
    757             for (; bone != rightBone && bone != boneEnd; ++bone) {
    758                  if (!original_format)
    759                      result.push_back(bone->m_String[0]);
    760                  else
    761                      result.push_back((unsigned)bone->m_BoundaryType);
    762             }
    763         } else {
    764             ++nWordConverted;
    765             // get the length from boneStart to current best word tail or end of range
    766             for (len=0; bone != rightBone && bone != boneEnd; ++bone)
    767                  ++len;
    768             result.append(realStart->m_pInnerData->m_BestWord.m_String+prefix, len);
    769         }
    770         if (bone == boneEnd)
    771             break;
    772         boneStart = realStart = bone;
    773         prefix = 0;
    774     }
    775 
    776     return nWordConverted;
    777 }
    778 
    779 /**
    780  * Search from the bone to the tail. the bone can not beyond first psuedo tail.
    781  * Before search, all BoneInnerData should be set. The states of the bones
    782  * who's ahead of the bone would not be affected by this function. Yet, states
    783  * of the bones beyond this bone will be updated or refreshed.
    784  *
    785  * After lattice search, only one best path are backtraced and each best word
    786  * will be attached to corresponding bone.
    787  */
    788 void
    789 CIMIContext::searchFrom(CSkeletonIter boneStart)
    790 {
    791     // iterate every bone from boneStart to the second psuedo tail
    792     CSkeletonIter itEnd = ++getLastBone();
    793     CSkeletonIter bone = boneStart;
    794     CSkeletonIter boneFirst = m_Skeleton.begin();
    795     for (; bone != itEnd; ) {
    796         if (bone == boneFirst) {
    797             // do not clear USER_SELECTION_BEST_WORD !!
    798             bone->m_pInnerData->m_LexiconStates.clear();
    799             bone->m_pInnerData->m_LatticeNodes.clear();
    800             #ifdef _USE_RAW_PROBABILITY
    801                 bone->m_pInnerData->m_LatticeNodes.push_back(TLatticeState(-1.0, bone));
    802             #else
    803                 bone->m_pInnerData->m_LatticeNodes.push_back(TLatticeState(0.0, bone));
    804             #endif
    805         } else {
    806             buildLatticeStates(bone);
    807         }
    808         switch (bone->m_BoneType) {
    809         case CBone::NODE_TAIL:
    810             bone = forwardTailBone(bone);
    811             break;
    812         case CBone::NODE_PINYIN:
    813             bone = forwardPinyinBone(bone);
    814             break;
    815         case CBone::NODE_INCOMPLETE_PINYIN:
    816         case CBone::NODE_INVALID_PINYIN:
    817             bone = forwardInvalidBone(bone);
    818             break;
    819         case CBone::NODE_PUNC:
    820             bone = forwardPuncBone(bone);
    821             break;
    822         case CBone::NODE_ASCII:
    823         case CBone::NODE_SIMBOL:
    824         case CBone::NODE_DIGITAL:
    825             bone = forwardNonPinyinBone(bone);
    826             break;
    827         };
    828     }
    829 
    830     //Build the last bone's lattice states
    831     buildLatticeStates(itEnd);
    832 
    833     #ifdef DEBUG
    834         //assert(itEnd->m_pInnerData->m_LatticeNodes.size() == 1);
    835     #endif
    836 
    837     // clear all non-user selection
    838     for (bone=boneFirst; bone != itEnd; ++bone) {
    839         if (bone->m_pInnerData->m_BWType != CBoneInnerData::UserSelectedBestWord)
    840             bone->m_pInnerData->m_BWType = CBoneInnerData::NoBestWordStartHere;
    841     }
    842 
    843     // back tracing, find the best path
    844     TLatticeState* bs = &(*(itEnd->m_pInnerData->m_LatticeNodes.begin()));
    845     while (bs->m_BoneAfter != boneFirst) {
    846         TLatticeState* fs = bs->m_pBackTraceNode;
    847         CSkeletonIter  fb = fs->m_BoneAfter;
    848         if (fb->m_pInnerData->m_BWType != CBoneInnerData::UserSelectedBestWord) {
    849             fb->m_pInnerData->m_BWType = CBoneInnerData::BestWordStartHere;
    850         }
    851         fb->m_pInnerData->m_BestWord.m_BoneStart = fb;
    852         fb->m_pInnerData->m_BestWord.m_BoneEnd = bs->m_BoneAfter;
    853         fb->m_pInnerData->m_BestWord.m_WordId = bs->m_BackTraceWordId;
    854         fb->m_pInnerData->m_BestWord.m_String = (*m_pPinyinTrie)[bs->m_BackTraceWordId];
    855         if (fb->m_pInnerData->m_BestWord.m_String == NULL)
    856             fb->m_pInnerData->m_BestWord.m_String = fb->m_String.c_str();
    857         bs = fs;
    858     }
    859 }
    860 
    861 #ifdef DEBUG
    862 static double min_ts = 1.0;
    863 #endif
    864 
    865 static double s_history_distribution[11] = {
    866     0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
    867 };
    868 
    869 void
    870 CIMIContext::transferBetween(CSkeletonIter h, CSkeletonIter t, unsigned int id, double ic)
    871 {
    872     CLatticeStates& latss1 = h->m_pInnerData->m_LatticeNodes;
    873     CLatticeStates& latss2 = t->m_pInnerData->m_LatticeNodes;
    874     CLatticeStates::iterator it1 = latss1.begin();
    875     CLatticeStates::iterator ite = latss1.end();
    876 
    877     #ifdef _USE_RAW_PROBABILITY
    878         TLatticeState node(-1.0, t);
    879         TSentenceScore efic(1.0);
    880     #else
    881         TLatticeState node(0.0, t);
    882         TSentenceScore efic(0.0);
    883     #endif
    884 
    885     if (h->m_pInnerData->m_BestWord.m_WordId == id &&
    886             h->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord) {
    887         #ifdef _USE_RAW_PROBABILITY
    888             efic = efic * TSentenceScore(30000, 1.0);
    889         #else
    890             efic = ic - 30000.0;
    891         #endif
    892     }
    893 
    894     double weight_h = s_history_distribution[m_HistoryPower];
    895     double weight_s = 1.0 - weight_h;
    896 
    897     for (; it1 != ite; ++it1) {
    898         node.m_pBackTraceNode = &(*it1);
    899         node.m_BackTraceWordId = id;
    900         #ifdef _USE_RAW_PROBABILITY
    901             // the fact is that we could only use bigram cache
    902             // and all first level node in the language model are non-empty
    903             double ts = m_pModel->transfer(it1->m_State, id, node.m_State);
    904             m_pModel->historify(node.m_State);
    905             // we do not want to shrink the history state if it could be found in cache
    906             if (node.m_State.getLevel() == 0 && m_pHistory->seenBefore(id)) {
    907                 node.m_State.setIdx(id);  // an psuedo unigram node state
    908             }
    909 
    910             #ifdef DEBUG
    911                 assert(it1->m_Score < TSentenceScore(0.0));
    912             #endif
    913 
    914             double cost = ts;
    915             if (m_pHistory) {
    916                 unsigned history[2] = {m_pModel->lastWordId(it1->m_State), id};
    917                 double hpr = m_pHistory->pr(history, history+2);
    918                 cost = weight_s * ts + weight_h*hpr;
    919             }
    920             node.m_Score = it1->m_Score * efic * TSentenceScore(cost);
    921 
    922             #ifdef DEBUG
    923                 if (!(node.m_Score < TSentenceScore(-0.0))) {
    924                     static char strangeValue[256];
    925 
    926                     node.m_Score.toString(strangeValue);
    927                     printf("\n***node.m_Score invalid %s ***\n", strangeValue);
    928 
    929                     it1->m_Score.toString(strangeValue);
    930                     printf("***it1->m_Score is %s ***\n", strangeValue);
    931 
    932                     efic.toString(strangeValue);
    933                     printf("***efic=%s, ic=(%lf)***\n", strangeValue, ic);
    934 
    935                     TSentenceScore(cost).toString(strangeValue);
    936                     printf("***cost=%s(%lf), ts=(%16lf)***\n", strangeValue, cost, ts);
    937 
    938                     fflush(stdout);
    939                     assert(false);
    940                 }
    941             #endif
    942 
    943         #else
    944             double ts = m_pModel->transferNegLog(it1->m_State, id, node.m_State);
    945             m_pModel->historify(node.m_State);
    946             node.m_Score = it1->m_Score + ts + ic;
    947         #endif
    948 
    949         latss2.push_back(node);
    950     }
    951 }
    952 
    953 void
    954 CIMIContext::buildLatticeStates(CSkeletonIter bone)
    955 {
    956     bool bSingleSyllable, bSingleShort;
    957     unsigned i, sz;
    958     CSkeletonIter bonePrev = bone;
    959 
    960 
    961     --bonePrev;
    962     CBoneInnerData & innerData = *(bone->m_pInnerData);
    963     CLexiconStates::iterator itLexState = innerData.m_LexiconStates.begin();
    964     CLexiconStates::iterator itLexStateE = innerData.m_LexiconStates.end();
    965     innerData.m_LatticeNodes.clear();
    966     for (; itLexState != itLexStateE; ++itLexState) {
    967         CLexiconState& ls = *itLexState;
    968 
    969         // the user selected word may be cut in first pruning process below,
    970         // So, just let it go first, when it ends here
    971         CBoneInnerData* pbid = ls.m_BoneStart->m_pInnerData;
    972         if (pbid->m_BWType == CBoneInnerData::UserSelectedBestWord &&
    973                     pbid->m_BestWord.m_BoneEnd == bone) {
    974             #ifdef _USE_RAW_PROBABILITY
    975                 transferBetween(ls.m_BoneStart, bone, pbid->m_BestWord.m_WordId, 1.0);
    976             #else
    977                 transferBetween(ls.m_BoneStart, bone, pbid->m_BestWord.m_WordId, 0.0);
    978             #endif
    979         }
    980 
    981         if (!ls.m_bPinyin) {
    982             #ifdef _USE_RAW_PROBABILITY
    983                 transferBetween(ls.m_BoneStart, bone, ls.m_WordId, 1.0);
    984             #else
    985                 transferBetween(ls.m_BoneStart, bone, ls.m_WordId, 0.0);
    986             #endif
    987         } else {
    988             // Cutting words with little unigram possibilities
    989             // at least 2, at most 32 of the words would be tried
    990             // if unseed word starting from some position in the first 32
    991             // candidates(ranked according to unigram pr in lexicon), do not
    992             // let them be checked.
    993             bSingleShort = bSingleSyllable = (ls.m_BoneStart == bonePrev);
    994             if (bSingleSyllable) {
    995                 register unsigned char uc = ls.m_BoneStart->m_String[0];
    996                 bSingleShort = ((ls.m_BoneStart->m_String.size() == 1 && (uc != 'a' && uc != 'o' && uc !='e')) ||
    997                                 (ls.m_BoneStart->m_String.size() == 2 && (ls.m_BoneStart->m_String[1] == 'h')));
    998             }
    999             //bSingleShort = (bSingleSyllable && !(m_pPinyinTrie->isValid(ls.m_pPYNode, false)));
   1000 
   1001             const CPinyinTrie::TNode* pn = ls.m_pPYNode;
   1002             const CPinyinTrie::TWordIdInfo* pwidinfo = pn->getWordIdPtr();
   1003             sz=pn->m_nWordId;
   1004             if (bSingleShort)
   1005                 sz = 12;
   1006             else if (sz > 26)
   1007                 sz = 26;
   1008 
   1009             int count = 0;
   1010             for (i=0; count < sz && i < sz && (pwidinfo[i].m_bSeen == 1 || count < 2); ++i) {
   1011                 if (m_bGBK || pwidinfo[i].m_bGBK == 0) {
   1012                     #ifdef _USE_RAW_PROBABILITY
   1013                         transferBetween(ls.m_BoneStart, bone, pwidinfo[i].m_id, 1.0);
   1014                     #else
   1015                         transferBetween(ls.m_BoneStart, bone, pwidinfo[i].m_id, 0.0);
   1016                     #endif
   1017                     ++count;
   1018                 }
   1019             }
   1020             #ifdef _USE_RAW_PROBABILITY
   1021                 // try cached words
   1022                 if (m_pHistory) {
   1023                     for (sz = pn->m_nWordId; i < sz; ++i) {
   1024                         if (m_bGBK || pwidinfo[i].m_bGBK == 0) {
   1025                             if (m_pHistory->seenBefore(pwidinfo[i].m_id)) {
   1026                                 transferBetween(ls.m_BoneStart, bone, pwidinfo[i].m_id, 1.0);
   1027                             }
   1028                         }
   1029                     }
   1030                 }
   1031             #endif
   1032         }
   1033     }
   1034 }
   1035 
   1036 
   1037 /**
   1038 * Fussy Pinyin:
   1039 *
   1040 */
   1041 CSkeletonIter
   1042 CIMIContext::forwardOnePinyinBone(CSkeletonIter bone)
   1043 {
   1044     const CPinyinTrie::TNode *pn = NULL;
   1045 
   1046     //clear next bone's lexicon states
   1047     CSkeletonIter boneNext = ++CSkeletonIter(bone);
   1048     CLexiconStates& lexss2 = boneNext->m_pInnerData->m_LexiconStates;
   1049     lexss2.clear();
   1050 
   1051     // insert the root PinYin Lexicon node
   1052     CLexiconStates& lexss1 = bone->m_pInnerData->m_LexiconStates;
   1053     CLexiconStates::iterator it1 = lexss1.begin();
   1054     CLexiconStates::iterator ite = lexss1.end();
   1055     for (; it1 != ite; ++it1) {
   1056         if (it1->m_bPinyin) {
   1057             pn = m_pPinyinTrie->transfer(it1->m_pPYNode, bone->m_String.c_str());
   1058             if (pn != NULL && (pn = m_pPinyinTrie->transfer(pn, TWCHAR('\''))) != NULL) {
   1059                 lexss2.push_back(CLexiconState(it1->m_BoneStart, pn));
   1060             }
   1061         }
   1062     }
   1063 
   1064     //try transfer from root state of the lexicon
   1065     pn = m_pPinyinTrie->transfer(bone->m_String.c_str());
   1066     if (pn != NULL && (pn = m_pPinyinTrie->transfer(pn, TWCHAR('\''))) != NULL) {
   1067         lexss2.push_back(CLexiconState(bone, pn));
   1068     }
   1069 
   1070     return boneNext;
   1071 }
   1072 
   1073 CSkeletonIter
   1074 CIMIContext::forwardPinyinBone(CSkeletonIter bone)
   1075 {
   1076     if (bone->m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord && m_bStrictLeft2Right) {
   1077         CSkeletonIter boneLeft = bone;
   1078         CSkeletonIter boneRight = bone->m_pInnerData->m_BestWord.m_BoneEnd;
   1079         for (; bone != boneRight; ++bone)
   1080             (++CSkeletonIter(bone))->m_pInnerData->clear();
   1081         boneRight->m_pInnerData->m_LexiconStates.push_back(
   1082             CLexiconState(boneLeft, boneLeft->m_pInnerData->m_BestWord.m_WordId)
   1083         );
   1084         return boneRight;
   1085     } else {
   1086         return forwardOnePinyinBone(bone);
   1087     }
   1088 }
   1089 
   1090 CSkeletonIter
   1091 CIMIContext::forwardInvalidBone(CSkeletonIter bone)
   1092 {
   1093     CSkeletonIter boneNext = ++CSkeletonIter(bone);
   1094     CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates;
   1095     lss.clear();
   1096     lss.push_back(CLexiconState(bone, (unsigned int)UNKNOWN_WORD_ID));
   1097 
   1098     return boneNext;
   1099 }
   1100 
   1101 CSkeletonIter
   1102 CIMIContext::forwardPuncBone(CSkeletonIter bone)
   1103 {
   1104     unsigned int wid = m_pPinyinTrie->getSimbolId(bone->m_String);
   1105 
   1106     CSkeletonIter boneNext = ++CSkeletonIter(bone);
   1107     CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates;
   1108     lss.clear();
   1109     lss.push_back(CLexiconState(bone, wid));
   1110 
   1111     return boneNext;
   1112 }
   1113 
   1114 CSkeletonIter
   1115 CIMIContext::forwardNonPinyinBone(CSkeletonIter bone)
   1116 {
   1117     CSkeletonIter boneNext = ++CSkeletonIter(bone);
   1118     CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates;
   1119     lss.clear();
   1120     lss.push_back(CLexiconState(bone, (unsigned int)UNKNOWN_WORD_ID));
   1121 
   1122     return boneNext;
   1123 }
   1124 
   1125 
   1126 CSkeletonIter
   1127 CIMIContext::forwardTailBone(CSkeletonIter bone)
   1128 {
   1129     CSkeletonIter boneNext = ++CSkeletonIter(bone);
   1130     CLexiconStates & lss = boneNext->m_pInnerData->m_LexiconStates;
   1131     lss.clear();
   1132     lss.push_back(CLexiconState(bone, OOV_WORD_ID));
   1133 
   1134     return boneNext;
   1135 }
   1136 
   1137 CBone::CBone(const CBone& b)
   1138     : m_BoneType(b.m_BoneType), m_BoundaryType(b.m_BoundaryType),
   1139       m_String(b.m_String), m_pInnerData(NULL)
   1140 {
   1141 }
   1142 
   1143 CBone::CBone(int boundType, int boneType)
   1144     : m_BoneType(boneType), m_BoundaryType(boundType),
   1145       m_String(), m_pInnerData(NULL)
   1146 {
   1147 }
   1148 
   1149 CBone::CBone(const TWCHAR* pwc, int boundType, int boneType)
   1150     : m_BoneType(boneType), m_BoundaryType(boundType),
   1151       m_String(pwc), m_pInnerData(NULL)
   1152 {
   1153 }
   1154 
   1155 CBone::CBone(const TWCHAR* pwc,  size_t len, int boundType, int boneType)
   1156     : m_BoneType(boneType), m_BoundaryType(boundType),
   1157       m_String(pwc, len), m_pInnerData(NULL)
   1158 {
   1159 }
   1160 
   1161 CBone::~CBone()
   1162 {
   1163     delete m_pInnerData;
   1164     m_pInnerData = NULL;
   1165 }
   1166 
   1167 bool
   1168 CBone::isUserSelectionStart(void)
   1169 {
   1170     return (m_pInnerData != NULL &&
   1171               m_pInnerData->m_BWType == CBoneInnerData::UserSelectedBestWord);
   1172 }
   1173 
   1174 int
   1175 cursorMapping(CSkeletonIter head1, CSkeletonIter tail1,
   1176               CSkeletonIter head2, CSkeletonIter tail2,
   1177               CSkeleton& result,
   1178               CSkeletonIter& cursor, int& cursorIdx, bool stickLeft = false)
   1179 {
   1180     TSkelCursor sc(head1, tail1, head2, tail2, true);
   1181     TSkelCursor::TPos cp(cursor, cursorIdx);
   1182 
   1183     int  len = 0;
   1184     bool found =false;
   1185 
   1186     while (true) {
   1187         found = sc.ensureCursor(cp);
   1188         if (found) break;
   1189         if (!sc.hasNext()) break;
   1190         sc.next(true);
   1191         ++len;
   1192     }
   1193 
   1194     if (found) {
   1195         TSkelCursor::TPos nc = sc.getPosition();
   1196 
   1197         int cmplen = cursorIdx = 0,  nNode = 0;
   1198         for (cursor = result.begin(); cursor != result.end(); ++cmplen) {
   1199             if (cmplen == len) break;
   1200             ++cursorIdx;
   1201             if (cursorIdx >= cursor->m_String.size()) {
   1202                 ++cursor;
   1203                 ++nNode;
   1204                 cursorIdx = 0;
   1205             }
   1206         }
   1207         if (cmplen == len) { // now we found that
   1208             if (stickLeft && cursor != result.begin() && cursorIdx == 0) {
   1209                 --cursor;
   1210                 --nNode;
   1211                 cursorIdx = cursor->m_String.size();
   1212             }
   1213         }
   1214         return nNode;
   1215     }
   1216     return -1;
   1217 }
   1218 
   1219 /**
   1220 * it is illegal if boneStart == boneEnd and skel.size() == 0
   1221 *
   1222 * 1. from current position, seeking left for 3 bones without HumanBoundary, or bones
   1223 *    of non-pinyin type between it. --> its, also can not beyond m_CandiBone.
   1224 * 2. in [its, bonStart) from left to right, find the first bone would cause different
   1225 *    segmentation result.  --> itd, other wise itds <-- boneStart
   1226 * 3. from [itd, boneEnd), do automatic segment
   1227 * 4. after boneEnd, util user boundary or non-pinyin bone
   1228 *    or segment result equals to original. --> itd2.
   1229 * 5. [itd, itd2) re-segment, all resulting bone goes into a new skeleton --> newskel.
   1230 *    while convert old cursor position into new position.
   1231 * 6. erase old or seg-affected nodes and splice in the newskel.
   1232 *    do search if needed
   1233 *
   1234 * The key part of this would seeking a solution for finding the automatic sentence
   1235 * segmentation result.
   1236 */
   1237 bool
   1238 CIMIContext::modifyAndReseg(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel,
   1239                             CSkeletonIter& cursor, int& cursorIdx, CSkeletonIter& candiStart,
   1240                             bool stickLeft, bool doSearch)
   1241 {
   1242     CSkeleton newskel;
   1243 
   1244     // Try to look_left to prevent potential segmentation insufficiency
   1245     CSkeletonIter nit, oit, its = boneStart;
   1246     int look_left = 0;
   1247     for (; look_left < 3 && its != getFirstBone() && its != candiStart; ++look_left) {
   1248         --its;
   1249         if (!its->isPinyinNode() || its->m_BoundaryType == CBone::USER_BOUNDARY) {
   1250             ++its;
   1251             break;
   1252         }
   1253         skel.push_front(*its);
   1254     }
   1255 
   1256     // do Syllable segment on the virtual new list
   1257     segPinyin(skel.begin(), skel.end(), boneEnd, getLastBone(), newskel);
   1258 
   1259     // Remapping the new cursor
   1260     int  ncIdx = cursorMapping(skel.begin(), skel.end(), boneEnd, getLastBone(), newskel, cursor, cursorIdx, stickLeft);
   1261 
   1262     // Skip previous look-left nodes that are same with original
   1263     int first_diff = 0;
   1264     CSkeletonIter dif_oits=skel.begin();
   1265     for (oit=skel.begin(), nit=newskel.begin(); first_diff < look_left; ++first_diff) {
   1266         if (nit->m_String.size() != oit->m_String.size()) {
   1267             dif_oits = oit;
   1268             break;
   1269         }
   1270         if (ncIdx == 0) cursor = its;
   1271         --ncIdx;
   1272         ++its;
   1273         ++nit;
   1274         ++oit;
   1275         newskel.pop_front();
   1276     }
   1277 
   1278     // prepare to restore the CandiStart
   1279     bool candiStartPositionReset = (its == candiStart);
   1280 
   1281     // Prepare for cursor reposition to restore after modify
   1282     CSkeletonIter leftIt;
   1283     bool leftItIsHead = (its == getFirstBone());
   1284     if (!leftItIsHead) {
   1285         leftIt = its;
   1286         --leftIt;
   1287     }
   1288 
   1289     // modify original node list
   1290     bool affectCandidates = modify(its, getLastBone(), newskel, doSearch);
   1291 
   1292     // Reposition cursor
   1293     if (ncIdx >= 0) {
   1294         cursor = (leftItIsHead)?(getFirstBone()):(++CSkeletonIter(leftIt));
   1295         for (int i=0; i < ncIdx; ++i)
   1296             ++cursor;
   1297     }
   1298 
   1299     // Reposition candiStart
   1300     if (candiStartPositionReset) {
   1301         candiStart = (leftItIsHead)?(getFirstBone()):(++CSkeletonIter(leftIt));
   1302         affectCandidates = true;
   1303     }
   1304 
   1305     return affectCandidates;
   1306 }
   1307 
   1308 void
   1309 CIMIContext::segPinyin(CSkeletonIter head1, CSkeletonIter tail1,
   1310                        CSkeletonIter head2, CSkeletonIter tail2,
   1311                        CSkeleton& result)
   1312 {
   1313     #ifdef DEBUG
   1314         printf("SegPinyin:");
   1315     #endif
   1316 
   1317     const CPinyinTrie::TNode* pathNodes[16];
   1318     TSkelCursor::TPos         positions[16];
   1319 
   1320     #ifdef  DEBUG
   1321         TWCHAR dbg_msg[2] = {0, 0};
   1322         {
   1323             TSkelCursor dsc(head1, tail1, head2, tail2);
   1324             while (dsc.hasNext()) {
   1325                 if (dsc.isPinyin()) {
   1326                     dbg_msg[0] = dsc.getChar();
   1327                     print_wide(dbg_msg);
   1328                     if (dsc.isUserBreakAfter()) {
   1329                         printf("'");
   1330                     }
   1331                 } else {
   1332                     printf("_");
   1333                     dbg_msg[0] = dsc.getChar();
   1334                     print_wide(dbg_msg);
   1335                 }
   1336                 dsc.next();
   1337             }
   1338         }
   1339     #endif
   1340 
   1341     result.clear();
   1342     TSkelCursor sc(head1, tail1, head2, tail2);
   1343     while (sc.hasNext()) {
   1344         if (sc.isPinyin()) {
   1345             int lastValid = 0;
   1346             pathNodes[0] = m_pPinyinTrie->getRootNode();
   1347             positions[0] = sc.getPosition();
   1348             for (int idx=1; sc.isPinyin() && pathNodes[idx-1] != NULL; ++idx) {
   1349                 pathNodes[idx] = m_pPinyinTrie->transfer(pathNodes[idx-1], sc.getChar());
   1350                 sc.next();
   1351                 positions[idx] = sc.getPosition();
   1352                 if (m_pPinyinTrie->isValid(pathNodes[idx], m_bNonCompleteSyllable, m_bGBK))
   1353                     lastValid = idx;
   1354                 if (sc.isUserBreakAfter(positions[idx-1]))
   1355                     break;
   1356             }
   1357             bool invalid = false;
   1358             if (lastValid == 0) {
   1359                 invalid = true;
   1360                 lastValid = 1;
   1361             }
   1362             if (lastValid >= 2 && pathNodes[lastValid]->m_bFullSyllableTransfer && pathNodes[lastValid-1]->m_bFullSyllableTransfer) {
   1363                 TWCHAR w1 = sc.getChar(positions[lastValid-1]);
   1364                 TWCHAR w2 = sc.getChar(positions[lastValid]);
   1365                 if (!isYuanYinChar(w1) && isYuanYinChar(w2)){
   1366                     const CPinyinTrie::TNode* pytmp = NULL;
   1367                     pytmp = m_pPinyinTrie->transfer(m_pPinyinTrie->getRootNode(), w1);
   1368                     if (pytmp) pytmp = m_pPinyinTrie->transfer(pytmp, w2);
   1369                     if (pytmp != NULL) --lastValid;
   1370                 }
   1371             }
   1372             CBone bnint(CBone::AUTO_BOUNDARY, (invalid)?(CBone::NODE_INVALID_PINYIN):(CBone::NODE_PINYIN));
   1373             if (sc.isUserBreakAfter(positions[lastValid-1]))
   1374                 bnint.m_BoundaryType = CBone::USER_BOUNDARY;
   1375             for (int idx=0; idx < lastValid; ++idx)
   1376                 bnint.m_String += sc.getChar(positions[idx]);
   1377             result.push_back(bnint);
   1378             sc.setPosition(positions[lastValid]);
   1379         } else {
   1380             result.push_back(*(sc.getPosition().m_bone));
   1381             sc.nextBone();
   1382         }
   1383     }
   1384 
   1385     #ifdef  DEBUG
   1386         {
   1387             printf(" ==> ");
   1388             TSkelCursor dsc(result.begin(), result.end(), result.end(), result.end());
   1389             while (dsc.hasNext()) {
   1390                 if (dsc.isPinyin()) {
   1391                     dbg_msg[0] = dsc.getChar();
   1392                     print_wide(dbg_msg);
   1393                     if (dsc.isBreakAfter()) {
   1394                         printf("'");
   1395                     }
   1396                 } else {
   1397                     printf("_");
   1398                     dbg_msg[0] = dsc.getChar();
   1399                     print_wide(dbg_msg);
   1400                 }
   1401                 dsc.next();
   1402             }
   1403         }
   1404         fflush(stdout);
   1405     #endif
   1406     return;
   1407 }
   1408 
   1409 void
   1410 CIMIContext::setHistoryMemory(CICHistory *phm)
   1411 {
   1412     m_pHistory = phm;
   1413 }
   1414 
   1415 CICHistory *
   1416 CIMIContext::getHistoryMemory()
   1417 {
   1418     return m_pHistory;
   1419 }
   1420 
   1421 void CIMIContext::memorize(void)
   1422 {
   1423     if (m_pHistory != NULL) {
   1424         std::vector<unsigned int> result;
   1425         CSkeletonIter boneStart = getFirstBone();
   1426         CSkeletonIter boneEnd = getLastBone();
   1427 
   1428         while (boneStart != boneEnd) {
   1429             #ifdef DEBUG
   1430                 //assert(boneStart->m_pInnerData->m_BWType != CBoneInnerData::NoBestWordStartHere);
   1431             #endif
   1432 
   1433             CSkeletonIter bone = boneStart;
   1434             CSkeletonIter rightBone = boneStart->m_pInnerData->m_BestWord.m_BoneEnd;
   1435             if (boneStart->m_BoneType != CBone::NODE_PINYIN && boneStart->m_BoneType != CBone::NODE_INCOMPLETE_PINYIN) {
   1436                 while (bone != rightBone && bone != boneEnd)
   1437                     ++bone;
   1438                 result.push_back(0);
   1439             } else {
   1440                 while (bone != rightBone && bone != boneEnd)
   1441                      ++bone;
   1442                 result.push_back(boneStart->m_pInnerData->m_BestWord.m_WordId);
   1443             }
   1444 
   1445             boneStart = bone;
   1446         }
   1447         if (result.size() > 0)
   1448             m_pHistory->memorize(&(result[0]), (&(result[0])) + result.size());
   1449     }
   1450 }
   1451 
   1452 void
   1453 CIMIContext::print_lattice()
   1454 {
   1455     printf("\n");
   1456     std::string prefix;
   1457     CSkeletonIter bone = getFirstBone();
   1458     CSkeletonIter boneEnd = getLastBone();
   1459     for (;bone != boneEnd; ++bone)
   1460         bone->print(prefix);
   1461     boneEnd->print(prefix);
   1462     (++boneEnd)->print(prefix);
   1463     fflush(stdout);
   1464 }
   1465 
   1466 void
   1467 CBone::print(std::string& prefix)
   1468 {
   1469     printf(prefix.c_str());
   1470     printf("{Bone@%X:", this);
   1471     print_wide(m_String.c_str());
   1472     printf("}");
   1473     prefix += "    ";
   1474     if (m_pInnerData)
   1475         m_pInnerData->print(prefix);
   1476     prefix.resize(prefix.size() - 4);
   1477     fflush(stdout);
   1478 }
   1479 
   1480 void
   1481 CCandidate::print(std::string& prefix)
   1482 {
   1483     printf(prefix.c_str());
   1484     printf("<Candidate @%X:", this);
   1485     print_wide(m_String);
   1486     printf("-- %d}", m_WordId);
   1487     fflush(stdout);
   1488 }
   1489 
   1490 
   1491 TLongExpFloat::TLongExpFloat(double d)
   1492 {
   1493     if (d != 0.0 && d != -0.0) {
   1494         TDoubleAnatomy da(d);
   1495         m_exp = da.getExp();
   1496         da.clearExp();
   1497         m_base = da.getValue();
   1498     } else {
   1499         m_base = d;
   1500         m_exp = 0;
   1501     }
   1502 }
   1503 
   1504 TLongExpFloat
   1505 TLongExpFloat::operator* (const TLongExpFloat& b) const
   1506 {
   1507     double d = this->m_base * b.m_base;
   1508     TLongExpFloat reda(d);
   1509     reda.m_exp += this->m_exp + b.m_exp;
   1510     return reda;
   1511 }
   1512 
   1513 TLongExpFloat
   1514 TLongExpFloat::operator/ (const TLongExpFloat& b) const
   1515 {
   1516     double d = this->m_base / b.m_base;
   1517     TLongExpFloat reda(d);
   1518     reda.m_exp += (this->m_exp - b.m_exp);
   1519     return reda;
   1520 }
   1521 
   1522 bool
   1523 TLongExpFloat::operator< (const TLongExpFloat& b) const
   1524 {
   1525     if (m_base >= 0.0 && b.m_base >= 0.0) {
   1526         return (m_exp < b.m_exp || (m_exp == b.m_exp && m_base < b.m_base));
   1527     } else if (m_base < 0.0 && b.m_base < 0.0) {
   1528         return (m_exp > b.m_exp || (m_exp == b.m_exp && m_base < b.m_base));
   1529     } else if (m_base < 0.0 && b.m_base >= 0.0)
   1530         return true;
   1531     else
   1532         return false;
   1533 }
   1534 
   1535 bool
   1536 TLongExpFloat::operator<=(const TLongExpFloat& b) const
   1537 {
   1538     if (m_base >= 0.0 && b.m_base >= 0.0) {
   1539         return (m_exp < b.m_exp || (m_exp == b.m_exp && m_base <= b.m_base));
   1540     } else if (m_base < 0.0 && b.m_base < 0.0) {
   1541         return (m_exp > b.m_exp || (m_exp == b.m_exp && m_base <= b.m_base));
   1542     } else if (m_base < 0.0 && b.m_base >= 0.0)
   1543         return true;
   1544     else
   1545         return false;
   1546 }
   1547 
   1548 bool
   1549 TLongExpFloat::operator==(const TLongExpFloat& b) const
   1550 {
   1551     return (m_base == b.m_base && m_exp == b.m_exp);
   1552 }
   1553 
   1554 void
   1555 TLongExpFloat::toString(std::string& str) const
   1556 {
   1557     char buf[256];
   1558     toString(buf);
   1559     str = buf;
   1560 }
   1561