Home | History | Annotate | Download | only in src
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifndef SUNPY_IMI_CONTEXT_H
     39 #define SUNPY_IMI_CONTEXT_H
     40 
#include "portability.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef DEBUG
    #ifdef HAVE_ASSERT_H
    #include <assert.h>
    #endif
#endif

#include <math.h>
#include <stdio.h>

#include <list>
#include <map>
#include <string>
#include <vector>

#include "imi_data.h"
#include "ic_history.h"
     59 #include "ic_history.h"
     60 
     61 #define UNKNOWN_WORD_ID             0
     62 #define OOV_WORD_ID                 69
     63 #define SENTENCE_BREAKER_ID         10
     64 
     65 struct TLongExpFloat {
     66 public:
     67     TLongExpFloat(const TLongExpFloat& b) : m_base(b.m_base), m_exp(b.m_exp) { }
     68 
     69     TLongExpFloat(int exp = 0, double base=0.0) : m_base(base), m_exp(exp) { }
     70 
     71     TLongExpFloat(double d);
     72 
     73     TLongExpFloat
     74     operator* (const TLongExpFloat& b) const;
     75 
     76     TLongExpFloat
     77     operator/ (const TLongExpFloat& b) const;
     78 
     79     bool
     80     operator< (const TLongExpFloat& b) const;
     81 
     82     bool
     83     operator<=(const TLongExpFloat& b) const;
     84 
     85     bool
     86     operator==(const TLongExpFloat& b) const;
     87 
     88     void
     89     toString(std::string& str) const;
     90 
     91     void
     92     toString(char* buf) const
     93     { if (buf) sprintf(buf, "%10lf*2^%d", m_base, m_exp); }
     94 
     95     double
     96     log2() const
     97     {
     98         #ifdef DEBUG
     99             //assert(m_base > 0.0);
    100         #endif
    101         return ::log2(m_base)+m_exp;
    102     }
    103 
    104 private:
    105     double   m_base;
    106     int      m_exp;
    107 };
    108 
    109 /**
    110 * TSentenceScore is only used for whole sentence score,
    111 * the score from language model still using double.
    112 */
    113 #ifdef _USE_RAW_PROBABILITY
    114     typedef TLongExpFloat           TSentenceScore;
    115 #else
    116     typedef double                  TSentenceScore;
    117 #endif
    118 
    119 class CBone;
    120 class CCandidate;
    121 
    122 typedef std::list<CBone>            CSkeleton;
    123 typedef CSkeleton::iterator         CSkeletonIter;
    124 typedef std::vector<CCandidate>     CCandidates;
    125 typedef CCandidates::iterator       CCandidatesIter;
    126 
    127 class CIMIContext;
    128 class CBoneInnerData;
    129 
    130 union TCandiRank {
    131 public:
    132     bool
    133     operator< (const TCandiRank& b) const
    134         { return m_all < b.m_all; };
    135 
    136     TCandiRank() : m_all(0) { }
    137 
    138     TCandiRank(bool user, bool best, unsigned int len,
    139                bool fromLattice, TSentenceScore score);
    140 
    141     TCandiRank(bool user, bool best, unsigned int len,
    142               bool fromLattice, unsigned score);
    143 
    144 protected:
    145     unsigned  int               m_all;
    146     #if !defined(WORDS_BIGENDIAN)
    147     struct TAnony {
    148         unsigned                m_cost   : 24;
    149         unsigned                m_lattice: 1;
    150         unsigned                m_best   : 1;
    151         unsigned                m_len    : 5;
    152         unsigned                m_user   : 1;
    153     } anony;
    154     #else
    155     struct TAnony {
    156         unsigned                m_user   : 1;
    157         unsigned                m_len    : 5;
    158         unsigned                m_best   : 1;
    159         unsigned                m_lattice: 1;
    160         unsigned                m_cost   : 24;
    161     } anony;
    162     #endif
    163 
    164 };
    165 
    166 /**
    167  * CCandidate represent basic information about a single candidate.
    168  * Its start bone and finishing bone. It's content string. and its
    169  * word id.
    170  */
    171 class CCandidate {
    172 public:
    173     friend class CIMIContext;
    174 public:
    175     CSkeletonIter                   m_BoneStart;
    176     CSkeletonIter                   m_BoneEnd;
    177     const TWCHAR                   *m_String;
    178 
    179 public:
    180     CCandidate(const CCandidate& b)
    181         : m_BoneStart(b.m_BoneStart), m_BoneEnd(b.m_BoneEnd),
    182           m_String(b.m_String), m_WordId(b.m_WordId) { }
    183 
    184     /** Give out the constructor for convinience */
    185     CCandidate(const TWCHAR* s = NULL,
    186                CSkeletonIter h=CSkeletonIter(),
    187                CSkeletonIter t=CSkeletonIter(),
    188                unsigned int wid=0)
    189         : m_BoneStart(h), m_BoneEnd(t), m_String(s), m_WordId(wid) { }
    190 
    191     void
    192     print(std::string& prefix);
    193 
    194 protected:
    195     unsigned int                      m_WordId;
    196 }; // of CCandidate
    197 
    198 
    199 /**
    200  * Bone is the basic unit for CIMSessionDate to store a Syllable, ie
    201  * a Pinyin string for only one Chinese character. Such as "zhang",
    202  * or "zh" under non-complete pinyin.
    203  */
    204 class CBone {
    205     friend class CIMIContext;
    206 public:
    207     /**
    208      * In case that use input is not Pinyin, such as under English input
    209      * mode, A bone contains a string for the input. For English, all
    210      * consecutive string grouped into one bone, such as "SUN Microsystem";
    211      * For Punc, each punctuaction become on bone, such as ": would be
    212      * split into two bones.
    213      *
    214      * For Pinyin type, there are three different type definition, and the
    215      * value for each syllable string depends on whether or not NonComplete
    216      * Pinyin mode is enabled:
    217      *
    218      * NODE_PINYIN           : Valid pinyin. Such as "zh" with NonComplet
    219      *                         Pinyin enabled, or "zhang" \n
    220      * NODE_INVALID_PINYIN   : Invalid syllable string, such as "u"; or "afdasf"\n
    221      * NODE_INCOMPLETE_PINYIN: incomplete syllable string (maybe complet further),
    222      *                         such as "to"; or "zh" when NonComplete Pinyin disabled.
    223      */
    224     enum NODE_TYPE {
    225        NODE_TAIL                = 0x0000,      //pusedo tail node
    226 
    227        CATE_PINYIN              = 0x0100,
    228        NODE_PINYIN              = 0x0101,      //pinyin
    229        NODE_INVALID_PINYIN      = 0x0102,      //invalid syllable string
    230        NODE_INCOMPLETE_PINYIN   = 0x0103,      //incomplete syllable string
    231 
    232 
    233        CATE_OTHER               = 0x0200,
    234        NODE_ASCII               = 0x0201,      //english string
    235        NODE_PUNC                = 0x0202,      //punctuation
    236        NODE_SIMBOL              = 0x0203,      //other simbol
    237        NODE_DIGITAL             = 0x0204       //not implemeted here
    238 
    239     }; // of NODE_TYPE
    240 
    241     /**
    242      * Boundary type indicate how the bone is seperated, by (1) Automatic
    243      * Syllable segmentation, (2) different bone type or punc bone seperate
    244      * rule, (3) user sepecified.
    245      */
    246     enum BOUNDARY_TYPE {
    247        AUTO_BOUNDARY,               //automatic segmentation result
    248        ABSOLUTE_BOUNDARY,           //boundary without ambiguation
    249        USER_BOUNDARY               //user given boundary
    250     }; // of BOUNDARY_TYPE
    251 
    252 public:
    253     int                             m_BoneType;
    254     int                             m_BoundaryType;   // original code for m_String[0] in non_pinyin node
    255     wstring                         m_String;
    256 
    257 public:
    258     /**
    259      * Never copy or allocate space for m_pInnerData;
    260      */
    261     CBone(const CBone& b);
    262 
    263     /**
    264      * @param boundType: Boundary type.
    265      * @param boneType: BoneType.
    266      * No space is allocated for m_pInnerData
    267      */
    268     CBone(int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL);
    269 
    270     /**
    271      * @param pwc  can not be NULL
    272      * @param boundType: Boundary type.
    273      * @param boneType: BoneType.
    274      * No space is allocated for m_pInnerData
    275      */
    276     CBone(const TWCHAR* pwc, int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL);
    277 
    278     /**
    279      * @param pwc  the string should be copied into this bone, not NULL
    280      * @param len  the string len
    281      * @param boundType: Boundary type.
    282      * @param boneType: BoneType.
    283      * No space is allocated for m_pInnerData
    284      */
    285     CBone(const TWCHAR* pwc, size_t len, int boundType, int boneType);
    286 
    287     /** Free all space if necessary. */
    288     ~CBone();
    289 
    290     inline bool
    291     isPinyinNode() const
    292         { return ((m_BoneType & CATE_PINYIN)!= 0); }
    293 
    294     inline bool
    295     isValidPinyinNode() const
    296         { return (m_BoneType == NODE_PINYIN || m_BoneType == NODE_INCOMPLETE_PINYIN); }
    297 
    298     inline bool
    299     isUserBoundary() const
    300         { return m_BoundaryType == USER_BOUNDARY; }
    301 
    302     inline bool
    303     isAutoBoundary() const
    304         { return m_BoundaryType == AUTO_BOUNDARY; }
    305 
    306     inline bool
    307     isTailNode() const
    308         { return (m_BoneType == 0); }
    309 
    310     bool
    311     isUserSelectionStart(void);
    312 
    313     void
    314     print(std::string& prefix);
    315 
    316 protected:
    317     CBoneInnerData                 *m_pInnerData;
    318 }; // of CBone
    319 
    320 
    321 /**
    322  * It is more suitable to call this as Input Context together with I
    323  * MSessionView. These data record
    324  * input history for a input session. Normally a seesion data would
    325  * only contains history keys and the cursor position. It would be enough
    326  * to find corresponding result from the history for most IME.
    327  *
    328  * The Session data class here take responsible for generating best sentence
    329  * from Pinyin string. It also contains all core algorithm for this conversion
    330  * progress.
    331  *
    332  * All Key processing job should be done by Session View class, and only several
    333  * interface exist here for the SessionView to modify Input Context and get
    334  * best sentence and ranked candidates.
    335  *
    336  * The other important function it provide is Automatic Pinyin segmentation.
    337  *
    338  * For our input method, from the aspect of effeciency, a internal
    339  * search lattice should also be remained and only partial of the lattice
    340  * would be rebuild or updated when user give a new input. The reason is
    341  * that it is time consuming to construct a whole search lattice, especially
    342  * for a long sentence and/or with Non-Complete syllables, which cause
    343  * potential candidates number increase. Yet all search related data are
    344  * hidden to outer usage.
    345  */
    346 class CIMIContext {
    347 public:
    348     /*@{*/
    349     /**
    350      * Constructor of CIMIContext. Set all the pointer to NULL.
    351      * set Non-Complete Syllable to true, set Strict Left2Right Model
    352      * to false.
    353      *
    354      * Note: At this time, CIMIContext could not be used to
    355      *       search directly. Only after setCoreData() and clear(),
    356      *       the internal search lattice are constructed and can
    357      *       be used.
    358      */
    359     CIMIContext();
    360 
    361     /**
    362      * free all resource/spaces
    363      */
    364     virtual
    365     ~CIMIContext()
    366         { m_Skeleton.clear(); }
    367     /*@}*/
    368 
    369     /*@{*/
    370     /**
    371      * Copy language model ptr and Pinyin-Trie Ptr inside the IMCoreData
    372      * into my own member.
    373      * Also build Chinese Punctuation Map from Pinyin-Trie.
    374      *
    375      * @param pCoreData is the core resource data for the Input Method
    376      */
    377     void
    378     setCoreData(CIMIData *pCoreData);
    379 
    380     /**
    381      * clear all internal Input Context, after this call, the Session data
    382      * or Input Context should same as they were just created. (Of cause,
    383      * the values from core data and desktop data remains.) More specifically,
    384      * it will clear skeleton, add a psuedo tail node, set internal candidate
    385      * position to skeleton.end() (which means no candidates needed now). then
    386      * it will construct a initial search lattice.
    387      *
    388      * Note: This function should be called only after setCoreData, because
    389      *       it will use the language model to construct a empty search
    390      *       lattice.
    391      */
    392     void
    393     clear();
    394 
    395     void
    396     setHistoryMemory(CICHistory *phm);
    397 
    398     CICHistory *
    399     getHistoryMemory();
    400 
    401     /** return true if defined DEBUG */
    402     void
    403     print_lattice();
    404 
    405     //memorize sentence in current text
    406     void
    407     memorize(void);
    408 
    409     /**
    410      * Return the bone list to let the view read them directly.
    411      */
    412     CSkeleton &
    413     getSkeleton(void)
    414         { return m_Skeleton; }
    415 
    416     /**
    417      * To construct the lattice, algorithm need to append two
    418      * psuedo tail bone at the end of the bone list.
    419      * For the SessionView could operate the list as the two tailing
    420      * bone were not there, SessionView should call this function
    421      * to replace call like getSekelton()->end() when iterating the
    422      * Skeleton (bone list).
    423      *
    424      * @return the first psuedo tailing node at the end of bone
    425      *     list. For SessionView usage.
    426      */
    427     CSkeletonIter
    428     getLastBone(void)
    429         { return --(--(m_Skeleton.end())); }
    430 
    431     /**
    432     * To cooperate with the getLastBone.
    433     * @return the first bone of the skeleton
    434     */
    435     CSkeletonIter
    436     getFirstBone(void)
    437         { return m_Skeleton.begin(); }
    438 
    439     bool
    440     isEmpty(void)
    441         { return m_Skeleton.size() == 2; }
    442     /*@}*/
    443 
    444 
    445     // functions to set options. Options should be set when Session Data
    446     // is clear, ie, just created or just after clear() is called
    447 
    448     /*@{*/
    449     /**
    450      * NonCompleteSyllable means we can give a candidates when PinYin is
    451      * just partial a complete syllable string. Such as "sh" would give
    452      * all characters with one of it PinYin starts from "sh", ex, "shi",
    453      * "sheng" etc.
    454      * In our definition, Non-Complete PinYin not only limitied on "SHENGMU",
    455      * For examplet, "to" is not a valid full pinyin, yet it could lead
    456      * "tong", "tou". So it would also gives out those corresponding
    457      * characters.
    458      * On the other hand, when a PinYin String is a valid Full Syllable
    459      * PinYin, even it could lead other Full Syllable PinYin, it will
    460      * not be treated as non-complete PinYin. For example, "da" will only
    461      * give candidates characters pronounced "da", although it could lead
    462      * "dan" "dao" "dang", etc.
    463      */
    464     void
    465     setNonCompleteSyllable(bool use = true)
    466         { m_bNonCompleteSyllable = use; }
    467 
    468     bool
    469     canNonCompleteSyllable()
    470         { return m_bNonCompleteSyllable; }
    471 
    472     /**
    473      * Left2RightSelection could improve the performance under
    474      * the TwoLine view style. But could not be used with OneLine view
    475      * style and ThreeLine view style.
    476      */
    477     void
    478     setLeft2RightSelection(bool use = true)
    479         { m_bStrictLeft2Right = use; }
    480 
    481     bool
    482     isGBKEnabled()
    483         { return m_bGBK; }
    484 
    485     void
    486     enableGBK(bool enable)
    487         { m_bGBK = enable; }
    488 
    489     void
    490     setHistoryPower(int power)
    491         { m_HistoryPower = ((power>= 0 && power <=10)?(power):(3)); }
    492 
    493     int
    494     getHistoryPower()
    495         { return m_HistoryPower; }
    496 
    497     void
    498     enableContextRanking(bool enable)
    499         { m_ContextRanking = enable; }
    500 
    501     int
    502     isContextRankingEnabled()
    503         { return m_ContextRanking; }
    504     /*@}*/
    505 
    506     /*@{*/
    507     bool
    508     isValidSyllable(const wstring& sy)
    509         { return isValidSyllable(sy.c_str()); }
    510 
    511     bool
    512     isValidSyllable(const TWCHAR* pstr);
    513 
    514     /**
    515      * segPinyinSimplest() do simplest segmentation for given PinYin. That, the
    516      * last char is the just input by user (or just delete by user), and
    517      * prefix string is at least non-complet syllable. This kind of segment
    518      * suitable for OneLineView. Syllable Pinyin string can only be modified
    519      * at the end.
    520      *
    521      * @param pinyin : a wide char string. Each char in the string should only
    522      * be one of the following: [a-z]. Note, that prefix string (except the
    523      * last char should already be complete or non-complet).
    524      *
    525      * @param result: After segmentation, result contains sequence of syllable
    526      * Bones. The result may be one of the following status:\n
    527      *   (1) two bones, the first is complete, and
    528      *                  the second is complete or non-complete.\n
    529      *   (2) two bones, the first is complete, and
    530      *                  the second is invalid.\n
    531      *   (3) single complete or non-complet bone, with AUTO_BOUNDARY.\n
    532      *   (4) single invalid bone.\n
    533      * Status (2) and (4) is invalid, program should reject the last
    534      * input char. Status (1)  program should commit the first bone and
    535      * use the remaining string as prefix. For status (3), program
    536      * should wait for further input.
    537      *
    538      * @return whether the segmentation is valid(or whether the last char
    539      * is valid)
    540      */
    541     bool
    542     segPinyinSimplest(const wstring& pinyin, CSkeleton& result);
    543 
    544     /**
    545     * [head1, tail1) [head2, tail2) may from two link. and form an virtual link.
    546     * generate the segmentation result into result where each node represent a
    547     * syllable. Note that invalid_syllable is inevitable.
    548     */
    549     void
    550     segPinyin(CSkeletonIter head1, CSkeletonIter tail1,
    551               CSkeletonIter head2, CSkeletonIter tail2,
    552               CSkeleton& result);
    553     /*@}*/
    554 
    555     /*@{*/
    556     /**
    557      * modify skeleton. remove bones [boneStart, boneEnd). and insert all
    558      * bones in the skel just before the boneEnd. Then update the
    559      * search lattice and search for a new result.
    560      *
    561      * @param boneStart    start bone iterator in the skeleton
    562      * @param boneEnd      ending bone iterator in the skeleton
    563      * @param skel         the new list of bones to be inserted.
    564      * @param pItLeftmost  the leftmost bone affected by the operation.
    565      *                     for manual search
    566      *
    567      * @return  whether or not the original gotted candidates called
    568      *          by getCandidates() would be affected by this modification.
    569      * @retval true   Affected.  getCandidates() should be call again.
    570      * @retval false  Not affected.
    571      */
    572     bool
    573     modify(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel,
    574            bool doSearch=true, CSkeletonIter* pItLeftmost=NULL);
    575 
    576     /**
    577      * modify skeleton. remove bones [boneStart, boneEnd). and insert all
    578      * bones in the skel just before the boneEnd. Do Syllable segmentation
    579      * again to make syllables legal. This may look left for 2/3 nodes. Also
    580      * new cursor position are counted.
    581      * Then update the search lattice and search for a new result if needed.
    582      *
    583      * @param boneStart    start bone iterator in the skeleton
    584      * @param boneEnd      ending bone iterator in the skeleton
    585      * @param skel         the new list of bones to be inserted.
    586      * @param cursor       the cursor bone in the skel, after this function,
    587      *                     it contains the corresponding cursor bone iterator
    588      *                     in the IC's Skeleton.
    589      * @param cursorIdx    the cursor's idx inside the cursor bone. Also contain
    590      *                     new position index after this function.
    591      * @param candiStart   The candidate list's starting position. Also the leftmost
    592      *                     when looking left to prevent re-segment insufficient.
    593      *                     Return value are set to its new position, because it would
    594      *                     change onto the new list.
    595      * @param stickLeft    When the cursor position after segment is located at
    596      *                     boundary. This give how it should be, @value true for
    597      *                     the tail of the left bone, @value false for the head
    598      *                     of the right bone.
    599      * @param doSearch     should be always true, otherwise you have to research the
    600      *                     whole skeleton after this call, because currently we do
    601      *                     not provide an interface to return the righmost bone to
    602      *                     remember the minimized the research demand cause by this.
    603      * @return  whether or not the original gotted candidates called
    604      *          by getCandidates() would be affected by this modification.
    605      * @retval true   Affected.  getCandidates() should be call again.
    606      * @retval false  Not affected.
    607      */
    608     bool
    609     modifyAndReseg(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel,
    610                    CSkeletonIter& cursor, int& cursorIdx, CSkeletonIter& candiStart,
    611                    bool stickLeft=true, bool doSearch=true);
    612 
    613 
    614     /**
    615      * Cancel original selection that including the bone
    616      *
    617      * @param bone  Selection is a candidate, which have a bone range.
    618      *              If any selection's range contains the bone, it
    619      *              would be canceled.
    620      * @param update "true" to update the search lattice. Normally,
    621      *        when called by SessionView, update should be set. For
    622      *        internal call of the function, one could let the update
    623      *        parameter to false, and do lattice searching later.
    624      *
    625      * @return the Bone on the left of param bone, which is the start bone
    626      *         of a user selection, and the selection include param bone.
    627      *         if no such a bone found, just return param bone itself.
    628      *
    629      * The algorithm working like following:
    630      * - Find the left most bone whose PinYin lexicon state could arrived here.
    631      *   This could be done just read the first element of BoneInnerData's
    632      *   m_LexiconStates.
    633      * - From current bone to the left-most bone, try to find the first meet
    634      *   bone whose BestWord candidates is valid.
    635      *     -# if the BestWord's right bone is at the left of target bone:
    636      *        ====>do nothing
    637      *     -# if the BestWord's right bone is at the right of the target bone:
    638      *        ====> Invalidate the BestWord. And if update is set,
    639      *              ====> searchFrom the BestWord's right bone.
    640      */
    641     CSkeletonIter
    642     cancelSelection(CSkeletonIter bone, bool update=true);
    643 
    644     /**
    645      * Cancel original selection that including the bone, but do
    646      * not count User Selection right starting at bone.
    647      *
    648      * @param bone  Selection is a candidate, which have a bone range.
    649      *              If any selection's range contains the bone, it
    650      *              would be canceled.
    651      * @param update "true" to update the search lattice. Normally,
    652      *        when called by SessionView, update should be set. For
    653      *        internal call of the function, one could let the update
    654      *        parameter to false, and do lattice searching later.
    655      *
    656      * @return the Bone on the left of param bone, which is the start bone
    657      *         of a user selection, and the selection include param bone.
    658      *         if no such a bone found, just return param bone itself.
    659      */
    660     CSkeletonIter
    661     cancelSelectionCover(CSkeletonIter bone, bool update=true);
    662 
    663 
    664     /**
    665      * Tell me that user make a selection for a specific candidate.
    666      * The lattice will be updated (only the neccessary part). And
    667      * best sentence are searched.
    668      *
    669      * @param candi   the user selection.
    670      * @return whether or not it will affect the previous gotted
    671      *         candidate list.
    672      *
    673      * Algorithm-Description:
    674      *    - Put the candi to the candi's leftBone's bestWord, and validate it.
    675      *        -# If Pure Left to Right model enabled, then:
    676      *        ====> rebuild the lattice from bestWord's left bone.
    677      *        -# If Pure Left to Right model disabled, then
    678      *        ====> rebuild the lattice from bestWord's right bone.
    679      *        .
    680      *    .
    681      *
    682      *   the search function will deal with the human selection, the processing
    683      *   of selection focused on two different things:
    684      *    - the word-set finding
    685      *    - the word scoring
    686      *    .
    687      *    More specificlly, the search routine will, normally, finding candidates
    688      *    words bone after bone. At each bone:
    689      *    -# It first check all finishing lexicon states, ie., get all possible
    690      *       words that will ending right before this bone, using them to construct
    691      *       lattice states of this bone. For each word, there is a inner cost
    692      *       associated with them. (For future word class usage) This cost is
    693      *       positive double. But for user selection, the cost should will be
    694      *       set to a suitable negtive double to make the best path alway go
    695      *       throught this word. The value of the negetive is affected by:
    696      *       maximum word length, N-gram's N, mini-backoff weight, mini-common-pr.
    697      *       (Due to float plus operation accuracy lost, the value should not be
    698      *        set to a very negitive, although the less the better.) But when
    699      *       "Pure Left to Right" is enabled, this trick will not be used, on
    700      *       the other hand, the following step would make sure there is only
    701      *       one possible transfer/edge on the lattice.
    702      *    -# Then, from all possible lexicon states (plus a root state), let them
    703      *       transfer on this bone's PINYIN string. Put the result states after
    704      *       transfer on the next bone as Lexicon states. But when
    705      *       "Pure Left to Right" model enabled, and a user selection is
    706      *       meet at the starting bone, then:
    707      *        - clear all states inside the bestWord's range.
    708      *        - jump directly to the bestWord's right bone, with only one word,
    709      *          ie. the bestWord, transfer enabled in this range.
    710      *        - then search from the right bone
    711      *    .
    712      */
    713     bool
    714     makeSelection(const CCandidate& candi);
    715     /*@}*/
    716 
    717     /*@{*/
    718     /**
    719      * Get candidate list for position at the bone.
    720      *
    721      * @param bone:   iterator pointed to a unit in the bone list.
    722      * @param result: the candidate list. Each candidate has a starting
    723      *     bone, ending bone, and a corresponding string. Currently
    724      *     all starting bone is the "bone" parameter.
    725      *
    726      * Currently, it will only give candidates that are starting at
    727      * the parameter bone. Normally, the lattice wouldn't save all words'
    728      * transfer cost information. But some good transfer edge could give
    729      * us infomation. To achieve high efficiency, in this function, no
    730      * probabilities are re-get from language model. But, we use the
    731      * following information to rank the candidates:
    732      *    -# If some words starting at bone is selected by the user, or
    733      *       it is a best edge of the best sentence, it will be listed as
    734      *       first.
    735      *    -# The longer the word, the better
    736      *    -# if some words' transfer cost could be got from the lattice,
    737      *       they are better than those from lexicon only.
    738      *    -# Use information stored in lexicon (where words could ranked
    739      *       by unigram).
    740      */
    741     void
    742     getCandidates(CSkeletonIter bone, CCandidates& result);
    743 
    744     /**
    745      * Get the best sentence corresponding from boneStart to boneEnd
    746      * @param boneStart
    747      * @param boneEnd
    748      * @param result  the result string
    749      * @return: the number of words converted in the best sentence. ie. not
    750      *          count the non-pinyin node or invalid pinyin node
    751      */
    752     int
    753     getBestSentence(wstring& result, CSkeletonIter boneStart, CSkeletonIter boneEnd, bool original_format=false);
    754     /*@}*/
    755 
    756     void
    757     searchFrom(CSkeletonIter boneStart);
    758 
    759 protected:
    760     CSkeleton                       m_Skeleton;
    761     CSkeletonIter                   m_EffectiveCandiBoneStart;
    762     CSkeletonIter                   m_EffectiveCandiBoneEnd;
    763 
    764     bool                            m_bNonCompleteSyllable;
    765     bool                            m_bStrictLeft2Right;
    766     bool                            m_bGBK;
    767     bool                            m_bGB18030;
    768     int                             m_HistoryPower;
    769     bool                            m_ContextRanking;
    770 
    771     CThreadSlm                     *m_pModel;
    772     CPinyinTrie                    *m_pPinyinTrie;
    773 
    774     CICHistory                     *m_pHistory;
    775 
    776 
    777 private:
    778     void
    779     transferBetween(CSkeletonIter h, CSkeletonIter t, unsigned int id, double ic);
    780 
    781     void
    782     buildLatticeStates(CSkeletonIter bone);
    783 
    784     CSkeletonIter
    785     forwardOnePinyinBone(CSkeletonIter bone);
    786 
    787     CSkeletonIter
    788     forwardPuncBone(CSkeletonIter bone);
    789 
    790     CSkeletonIter
    791     forwardPinyinBone(CSkeletonIter bone);
    792 
    793     CSkeletonIter
    794     forwardNonPinyinBone(CSkeletonIter bone);
    795 
    796     CSkeletonIter
    797     forwardInvalidBone(CSkeletonIter bone);
    798 
    799     CSkeletonIter
    800     forwardTailBone(CSkeletonIter bone);
    801 
    802 }; // of CIMIContext
    803 
    804 #endif
    805