Home | History | Annotate | Download | only in src
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifndef SUNPY_IMI_CONTEXT_H
     39    0   yongsun #define SUNPY_IMI_CONTEXT_H
     40    0   yongsun 
     41    0   yongsun #include "portability.h"
     42    0   yongsun 
     43    0   yongsun #ifdef HAVE_CONFIG_H
     44    0   yongsun #include <config.h>
     45    0   yongsun #endif
     46    0   yongsun 
     47    0   yongsun #ifdef DEBUG
     48    0   yongsun     #ifdef HAVE_ASSERT_H
     49    0   yongsun     #include <assert.h>
     50    0   yongsun     #endif
     51    0   yongsun #endif
     52    0   yongsun 
     53    0   yongsun #include <map>
     54    0   yongsun #include <vector>
     55    0   yongsun #include <list>
     56    0   yongsun #include <math.h>
     57    0   yongsun 
     58    0   yongsun #include "imi_data.h"
     59    0   yongsun #include "ic_history.h"
     60    0   yongsun 
     61    0   yongsun #define UNKNOWN_WORD_ID             0
     62    0   yongsun #define OOV_WORD_ID                 69
     63    0   yongsun #define SENTENCE_BREAKER_ID         10
     64    0   yongsun 
     65    0   yongsun struct TLongExpFloat {
     66    0   yongsun public:
     67    0   yongsun     TLongExpFloat(const TLongExpFloat& b) : m_base(b.m_base), m_exp(b.m_exp) { }
     68    0   yongsun 
     69    0   yongsun     TLongExpFloat(int exp = 0, double base=0.0) : m_base(base), m_exp(exp) { }
     70    0   yongsun 
     71    0   yongsun     TLongExpFloat(double d);
     72    0   yongsun 
     73    0   yongsun     TLongExpFloat
     74    0   yongsun     operator* (const TLongExpFloat& b) const;
     75    0   yongsun 
     76    0   yongsun     TLongExpFloat
     77    0   yongsun     operator/ (const TLongExpFloat& b) const;
     78    0   yongsun 
     79    0   yongsun     bool
     80    0   yongsun     operator< (const TLongExpFloat& b) const;
     81    0   yongsun 
     82    0   yongsun     bool
     83    0   yongsun     operator<=(const TLongExpFloat& b) const;
     84    0   yongsun 
     85    0   yongsun     bool
     86    0   yongsun     operator==(const TLongExpFloat& b) const;
     87    0   yongsun 
     88    0   yongsun     void
     89    0   yongsun     toString(std::string& str) const;
     90    0   yongsun 
     91    0   yongsun     void
     92    0   yongsun     toString(char* buf) const
     93    0   yongsun     { if (buf) sprintf(buf, "%10lf*2^%d", m_base, m_exp); }
     94    0   yongsun 
     95    0   yongsun     double
     96    0   yongsun     log2() const
     97    0   yongsun     {
     98    0   yongsun         #ifdef DEBUG
     99    0   yongsun             //assert(m_base > 0.0);
    100    0   yongsun         #endif
    101    0   yongsun         return ::log2(m_base)+m_exp;
    102    0   yongsun     }
    103    0   yongsun 
    104    0   yongsun private:
    105    0   yongsun     double   m_base;
    106    0   yongsun     int      m_exp;
    107    0   yongsun };
    108    0   yongsun 
/**
 * TSentenceScore is only used for the whole-sentence score;
 * scores coming from the language model still use double.
 */
    113    0   yongsun #ifdef _USE_RAW_PROBABILITY
    114    0   yongsun     typedef TLongExpFloat           TSentenceScore;
    115    0   yongsun #else
    116    0   yongsun     typedef double                  TSentenceScore;
    117    0   yongsun #endif
    118    0   yongsun 
    119    0   yongsun class CBone;
    120    0   yongsun class CCandidate;
    121    0   yongsun 
    122    0   yongsun typedef std::list<CBone>            CSkeleton;
    123    0   yongsun typedef CSkeleton::iterator         CSkeletonIter;
    124    0   yongsun typedef std::vector<CCandidate>     CCandidates;
    125    0   yongsun typedef CCandidates::iterator       CCandidatesIter;
    126    0   yongsun 
    127    0   yongsun class CIMIContext;
    128    0   yongsun class CBoneInnerData;
    129    0   yongsun 
    130    0   yongsun union TCandiRank {
    131    0   yongsun public:
    132    0   yongsun     bool
    133    0   yongsun     operator< (const TCandiRank& b) const
    134    0   yongsun         { return m_all < b.m_all; };
    135    0   yongsun 
    136    0   yongsun     TCandiRank() : m_all(0) { }
    137    0   yongsun 
    138    0   yongsun     TCandiRank(bool user, bool best, unsigned int len,
    139    0   yongsun                bool fromLattice, TSentenceScore score);
    140    0   yongsun 
    141    0   yongsun     TCandiRank(bool user, bool best, unsigned int len,
    142    0   yongsun               bool fromLattice, unsigned score);
    143    0   yongsun 
    144    0   yongsun protected:
    145    0   yongsun     unsigned  int               m_all;
    146  198  tchaikov     #if !defined(WORDS_BIGENDIAN)
    147    0   yongsun     struct TAnony {
    148    0   yongsun         unsigned                m_cost   : 24;
    149    0   yongsun         unsigned                m_lattice: 1;
    150  184   yongsun         unsigned                m_best   : 1;
    151    0   yongsun         unsigned                m_len    : 5;
    152    0   yongsun         unsigned                m_user   : 1;
    153    0   yongsun     } anony;
    154  198  tchaikov     #else
    155    0   yongsun     struct TAnony {
    156    0   yongsun         unsigned                m_user   : 1;
    157  184   yongsun         unsigned                m_len    : 5;
    158    0   yongsun         unsigned                m_best   : 1;
    159    0   yongsun         unsigned                m_lattice: 1;
    160    0   yongsun         unsigned                m_cost   : 24;
    161    0   yongsun     } anony;
    162    0   yongsun     #endif
    163    0   yongsun 
    164    0   yongsun };
    165    0   yongsun 
/**
 * CCandidate represents the basic information about a single candidate:
 * its start bone and finishing bone, its content string, and its
 * word id.
 */
    171    0   yongsun class CCandidate {
    172    0   yongsun public:
    173    0   yongsun     friend class CIMIContext;
    174    0   yongsun public:
    175    0   yongsun     CSkeletonIter                   m_BoneStart;
    176    0   yongsun     CSkeletonIter                   m_BoneEnd;
    177    0   yongsun     const TWCHAR                   *m_String;
    178    0   yongsun 
    179    0   yongsun public:
    180    0   yongsun     CCandidate(const CCandidate& b)
    181    0   yongsun         : m_BoneStart(b.m_BoneStart), m_BoneEnd(b.m_BoneEnd),
    182    0   yongsun           m_String(b.m_String), m_WordId(b.m_WordId) { }
    183    0   yongsun 
    184    0   yongsun     /** Give out the constructor for convinience */
    185    0   yongsun     CCandidate(const TWCHAR* s = NULL,
    186    0   yongsun                CSkeletonIter h=CSkeletonIter(),
    187    0   yongsun                CSkeletonIter t=CSkeletonIter(),
    188    0   yongsun                unsigned int wid=0)
    189   18       Kov         : m_BoneStart(h), m_BoneEnd(t), m_String(s), m_WordId(wid) { }
    190    0   yongsun 
    191    0   yongsun     void
    192    0   yongsun     print(std::string& prefix);
    193    0   yongsun 
    194    0   yongsun protected:
    195    0   yongsun     unsigned int                      m_WordId;
    196    0   yongsun }; // of CCandidate
    197    0   yongsun 
    198    0   yongsun 
/**
 * Bone is the basic unit for CIMSessionDate to store a syllable, i.e.
 * a Pinyin string for only one Chinese character, such as "zhang",
 * or "zh" under non-complete pinyin.
 */
    204    0   yongsun class CBone {
    205    0   yongsun     friend class CIMIContext;
    206    0   yongsun public:
    207    0   yongsun     /**
    208    0   yongsun      * In case that use input is not Pinyin, such as under English input
    209    0   yongsun      * mode, A bone contains a string for the input. For English, all
    210    0   yongsun      * consecutive string grouped into one bone, such as "SUN Microsystem";
    211    0   yongsun      * For Punc, each punctuaction become on bone, such as ": would be
    212    0   yongsun      * split into two bones.
    213    0   yongsun      *
    214    0   yongsun      * For Pinyin type, there are three different type definition, and the
    215    0   yongsun      * value for each syllable string depends on whether or not NonComplete
    216    0   yongsun      * Pinyin mode is enabled:
    217    0   yongsun      *
    218    0   yongsun      * NODE_PINYIN           : Valid pinyin. Such as "zh" with NonComplet
    219    0   yongsun      *                         Pinyin enabled, or "zhang" \n
    220    0   yongsun      * NODE_INVALID_PINYIN   : Invalid syllable string, such as "u"; or "afdasf"\n
    221    0   yongsun      * NODE_INCOMPLETE_PINYIN: incomplete syllable string (maybe complet further),
    222    0   yongsun      *                         such as "to"; or "zh" when NonComplete Pinyin disabled.
    223    0   yongsun      */
    224    0   yongsun     enum NODE_TYPE {
    225    0   yongsun        NODE_TAIL                = 0x0000,      //pusedo tail node
    226    0   yongsun 
    227    0   yongsun        CATE_PINYIN              = 0x0100,
    228    0   yongsun        NODE_PINYIN              = 0x0101,      //pinyin
    229    0   yongsun        NODE_INVALID_PINYIN      = 0x0102,      //invalid syllable string
    230    0   yongsun        NODE_INCOMPLETE_PINYIN   = 0x0103,      //incomplete syllable string
    231    0   yongsun 
    232    0   yongsun 
    233    0   yongsun        CATE_OTHER               = 0x0200,
    234    0   yongsun        NODE_ASCII               = 0x0201,      //english string
    235    0   yongsun        NODE_PUNC                = 0x0202,      //punctuation
    236    0   yongsun        NODE_SIMBOL              = 0x0203,      //other simbol
    237    0   yongsun        NODE_DIGITAL             = 0x0204       //not implemeted here
    238    0   yongsun 
    239    0   yongsun     }; // of NODE_TYPE
    240    0   yongsun 
    241    0   yongsun     /**
    242    0   yongsun      * Boundary type indicate how the bone is seperated, by (1) Automatic
    243    0   yongsun      * Syllable segmentation, (2) different bone type or punc bone seperate
    244    0   yongsun      * rule, (3) user sepecified.
    245    0   yongsun      */
    246    0   yongsun     enum BOUNDARY_TYPE {
    247    0   yongsun        AUTO_BOUNDARY,               //automatic segmentation result
    248    0   yongsun        ABSOLUTE_BOUNDARY,           //boundary without ambiguation
    249    0   yongsun        USER_BOUNDARY               //user given boundary
    250    0   yongsun     }; // of BOUNDARY_TYPE
    251    0   yongsun 
    252    0   yongsun public:
    253    0   yongsun     int                             m_BoneType;
    254    0   yongsun     int                             m_BoundaryType;   // original code for m_String[0] in non_pinyin node
    255    0   yongsun     wstring                         m_String;
    256    0   yongsun 
    257    0   yongsun public:
    258    0   yongsun     /**
    259    0   yongsun      * Never copy or allocate space for m_pInnerData;
    260    0   yongsun      */
    261    0   yongsun     CBone(const CBone& b);
    262    0   yongsun 
    263    0   yongsun     /**
    264    0   yongsun      * @param boundType: Boundary type.
    265    0   yongsun      * @param boneType: BoneType.
    266    0   yongsun      * No space is allocated for m_pInnerData
    267    0   yongsun      */
    268    0   yongsun     CBone(int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL);
    269    0   yongsun 
    270    0   yongsun     /**
    271    0   yongsun      * @param pwc  can not be NULL
    272    0   yongsun      * @param boundType: Boundary type.
    273    0   yongsun      * @param boneType: BoneType.
    274    0   yongsun      * No space is allocated for m_pInnerData
    275    0   yongsun      */
    276    0   yongsun     CBone(const TWCHAR* pwc, int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL);
    277    0   yongsun 
    278    0   yongsun     /**
    279    0   yongsun      * @param pwc  the string should be copied into this bone, not NULL
    280    0   yongsun      * @param len  the string len
    281    0   yongsun      * @param boundType: Boundary type.
    282    0   yongsun      * @param boneType: BoneType.
    283    0   yongsun      * No space is allocated for m_pInnerData
    284    0   yongsun      */
    285    0   yongsun     CBone(const TWCHAR* pwc, size_t len, int boundType, int boneType);
    286    0   yongsun 
    287    0   yongsun     /** Free all space if necessary. */
    288    0   yongsun     ~CBone();
    289    0   yongsun 
    290    0   yongsun     inline bool
    291    0   yongsun     isPinyinNode() const
    292    0   yongsun         { return ((m_BoneType & CATE_PINYIN)!= 0); }
    293    0   yongsun 
    294    0   yongsun     inline bool
    295    0   yongsun     isValidPinyinNode() const
    296    0   yongsun         { return (m_BoneType == NODE_PINYIN || m_BoneType == NODE_INCOMPLETE_PINYIN); }
    297    0   yongsun 
    298    0   yongsun     inline bool
    299    0   yongsun     isUserBoundary() const
    300    0   yongsun         { return m_BoundaryType == USER_BOUNDARY; }
    301    0   yongsun 
    302    0   yongsun     inline bool
    303    0   yongsun     isAutoBoundary() const
    304    0   yongsun         { return m_BoundaryType == AUTO_BOUNDARY; }
    305    0   yongsun 
    306    0   yongsun     inline bool
    307    0   yongsun     isTailNode() const
    308    0   yongsun         { return (m_BoneType == 0); }
    309    0   yongsun 
    310    0   yongsun     bool
    311    0   yongsun     isUserSelectionStart(void);
    312    0   yongsun 
    313    0   yongsun     void
    314    0   yongsun     print(std::string& prefix);
    315    0   yongsun 
    316    0   yongsun protected:
    317    0   yongsun     CBoneInnerData                 *m_pInnerData;
    318    0   yongsun }; // of CBone
    319    0   yongsun 
    320    0   yongsun 
/**
 * It is more suitable to call this the Input Context, used together with
 * IMSessionView. These data record the input history for an input session.
 * Normally session data would only contain the history keys and the cursor
 * position; that would be enough to find the corresponding result from the
 * history for most IMEs.
 *
 * The session data class here is responsible for generating the best sentence
 * from a Pinyin string. It also contains all core algorithms for this
 * conversion process.
 *
 * All key processing should be done by the Session View class; only a few
 * interfaces exist here for the SessionView to modify the Input Context and
 * get the best sentence and ranked candidates.
 *
 * The other important function it provides is automatic Pinyin segmentation.
 *
 * For our input method, for the sake of efficiency, an internal search
 * lattice is also retained, and only part of the lattice is rebuilt or
 * updated when the user gives new input. The reason is that it is time
 * consuming to construct a whole search lattice, especially for a long
 * sentence and/or with non-complete syllables, which increase the number of
 * potential candidates. All search-related data are hidden from outside
 * users.
 */
    346    0   yongsun class CIMIContext {
    347    0   yongsun public:
    348    0   yongsun     /*@{*/
    349    0   yongsun     /**
    350    0   yongsun      * Constructor of CIMIContext. Set all the pointer to NULL.
    351    0   yongsun      * set Non-Complete Syllable to true, set Strict Left2Right Model
    352    0   yongsun      * to false.
    353    0   yongsun      *
    354    0   yongsun      * Note: At this time, CIMIContext could not be used to
    355    0   yongsun      *       search directly. Only after setCoreData() and clear(),
    356    0   yongsun      *       the internal search lattice are constructed and can
    357    0   yongsun      *       be used.
    358    0   yongsun      */
    359    0   yongsun     CIMIContext();
    360    0   yongsun 
    361    0   yongsun     /**
    362    0   yongsun      * free all resource/spaces
    363    0   yongsun      */
    364    0   yongsun     virtual
    365    0   yongsun     ~CIMIContext()
    366    0   yongsun         { m_Skeleton.clear(); }
    367    0   yongsun     /*@}*/
    368    0   yongsun 
    369    0   yongsun     /*@{*/
    370    0   yongsun     /**
    371    0   yongsun      * Copy language model ptr and Pinyin-Trie Ptr inside the IMCoreData
    372    0   yongsun      * into my own member.
    373    0   yongsun      * Also build Chinese Punctuation Map from Pinyin-Trie.
    374    0   yongsun      *
    375    0   yongsun      * @param pCoreData is the core resource data for the Input Method
    376    0   yongsun      */
    377    0   yongsun     void
    378    0   yongsun     setCoreData(CIMIData *pCoreData);
    379    0   yongsun 
    /**
     * Clear the whole internal Input Context. After this call, the session
     * data or Input Context should be the same as when it was just created.
     * (Of course, the values from core data and desktop data remain.) More
     * specifically, it clears the skeleton, adds a pseudo tail node, and sets
     * the internal candidate position to skeleton.end() (which means no
     * candidates are needed now). Then it constructs an initial search
     * lattice.
     *
     * Note: This function should be called only after setCoreData(), because
     *       it uses the language model to construct an empty search
     *       lattice.
     */
    392    0   yongsun     void
    393    0   yongsun     clear();
    394    0   yongsun 
    395    0   yongsun     void
    396    0   yongsun     setHistoryMemory(CICHistory *phm);
    397    0   yongsun 
    398    0   yongsun     CICHistory *
    399    0   yongsun     getHistoryMemory();
    400    0   yongsun 
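    /**
     * Print the search lattice.
     * NOTE(review): this comment used to read "return true if defined DEBUG",
     * but the function returns void -- presumably it only produces output in
     * DEBUG builds; confirm against the implementation.
     */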
    402    0   yongsun     void
    403    0   yongsun     print_lattice();
    404    0   yongsun 
    405    0   yongsun     //memorize sentence in current text
    406    0   yongsun     void
    407    0   yongsun     memorize(void);
    408    0   yongsun 
    409    0   yongsun     /**
    410    0   yongsun      * Return the bone list to let the view read them directly.
    411    0   yongsun      */
    412    0   yongsun     CSkeleton &
    413    0   yongsun     getSkeleton(void)
    414    0   yongsun         { return m_Skeleton; }
    415    0   yongsun 
    /**
     * To construct the lattice, the algorithm needs to append two
     * pseudo tail bones at the end of the bone list.
     * So that the SessionView can operate on the list as if the two trailing
     * bones were not there, SessionView should call this function
     * instead of something like getSkeleton().end() when iterating the
     * skeleton (bone list).
     *
     * @return the first pseudo tail node at the end of the bone
     *     list. For SessionView usage.
     */
    427    0   yongsun     CSkeletonIter
    428    0   yongsun     getLastBone(void)
    429    0   yongsun         { return --(--(m_Skeleton.end())); }
    430    0   yongsun 
    431    0   yongsun     /**
    432    0   yongsun     * To cooperate with the getLastBone.
    433    0   yongsun     * @return the first bone of the skeleton
    434    0   yongsun     */
    435    0   yongsun     CSkeletonIter
    436    0   yongsun     getFirstBone(void)
    437    0   yongsun         { return m_Skeleton.begin(); }
    438    0   yongsun 
    439    0   yongsun     bool
    440    0   yongsun     isEmpty(void)
    441    0   yongsun         { return m_Skeleton.size() == 2; }
    442    0   yongsun     /*@}*/
    443    0   yongsun 
    444    0   yongsun 
    445    0   yongsun     // functions to set options. Options should be set when Session Data
    446    0   yongsun     // is clear, ie, just created or just after clear() is called
    447    0   yongsun 
    448    0   yongsun     /*@{*/
    /**
     * NonCompleteSyllable means we can give candidates when the PinYin is
     * only part of a complete syllable string. For example "sh" would give
     * all characters one of whose PinYin starts with "sh", e.g. "shi",
     * "sheng" etc.
     * In our definition, Non-Complete PinYin is not limited to "SHENGMU".
     * For example, "to" is not a valid full pinyin, yet it could lead to
     * "tong" or "tou", so it also gives out those corresponding
     * characters.
     * On the other hand, when a PinYin string is a valid full-syllable
     * PinYin, even if it could lead to other full-syllable PinYin, it will
     * not be treated as non-complete PinYin. For example, "da" will only
     * give candidate characters pronounced "da", although it could lead to
     * "dan", "dao", "dang", etc.
     */
    464    0   yongsun     void
    465    0   yongsun     setNonCompleteSyllable(bool use = true)
    466    0   yongsun         { m_bNonCompleteSyllable = use; }
    467    0   yongsun 
    468    0   yongsun     bool
    469    0   yongsun     canNonCompleteSyllable()
    470    0   yongsun         { return m_bNonCompleteSyllable; }
    471    0   yongsun 
    472    0   yongsun     /**
    473    0   yongsun      * Left2RightSelection could improve the performance under
    474    0   yongsun      * the TwoLine view style. But could not be used with OneLine view
    475    0   yongsun      * style and ThreeLine view style.
    476    0   yongsun      */
    477    0   yongsun     void
    478    0   yongsun     setLeft2RightSelection(bool use = true)
    479    0   yongsun         { m_bStrictLeft2Right = use; }
    480    0   yongsun 
    481    0   yongsun     bool
    482    0   yongsun     isGBKEnabled()
    483    0   yongsun         { return m_bGBK; }
    484    0   yongsun 
    485    0   yongsun     void
    486    0   yongsun     enableGBK(bool enable)
    487    0   yongsun         { m_bGBK = enable; }
    488    0   yongsun 
    489    0   yongsun     void
    490    0   yongsun     setHistoryPower(int power)
    491    0   yongsun         { m_HistoryPower = ((power>= 0 && power <=10)?(power):(3)); }
    492    0   yongsun 
    493    0   yongsun     int
    494    0   yongsun     getHistoryPower()
    495    0   yongsun         { return m_HistoryPower; }
    496    0   yongsun 
    497    0   yongsun     void
    498    0   yongsun     enableContextRanking(bool enable)
    499    0   yongsun         { m_ContextRanking = enable; }
    500    0   yongsun 
    501    0   yongsun     int
    502    0   yongsun     isContextRankingEnabled()
    503    0   yongsun         { return m_ContextRanking; }
    504    0   yongsun     /*@}*/
    505    0   yongsun 
    506    0   yongsun     /*@{*/
    507    0   yongsun     bool
    508    0   yongsun     isValidSyllable(const wstring& sy)
    509    0   yongsun         { return isValidSyllable(sy.c_str()); }
    510    0   yongsun 
    511    0   yongsun     bool
    512    0   yongsun     isValidSyllable(const TWCHAR* pstr);
    513    0   yongsun 
    /**
     * segPinyinSimplest() does the simplest segmentation for the given
     * PinYin. That is, the last char is the one just input by the user (or
     * just deleted by the user), and the prefix string is at least a
     * non-complete syllable. This kind of segmentation is suitable for
     * OneLineView. The syllable Pinyin string can only be modified at the
     * end.
     *
     * @param pinyin : a wide char string. Each char in the string should only
     * be one of the following: [a-z]. Note that the prefix string (everything
     * except the last char) should already be complete or non-complete.
     *
     * @param result: After segmentation, result contains a sequence of
     * syllable bones. The result may be in one of the following states:\n
     *   (1) two bones, the first is complete, and
     *                  the second is complete or non-complete.\n
     *   (2) two bones, the first is complete, and
     *                  the second is invalid.\n
     *   (3) a single complete or non-complete bone, with AUTO_BOUNDARY.\n
     *   (4) a single invalid bone.\n
     * States (2) and (4) are invalid; the program should reject the last
     * input char. In state (1) the program should commit the first bone and
     * use the remaining string as the prefix. In state (3) the program
     * should wait for further input.
     *
     * @return whether the segmentation is valid (or whether the last char
     * is valid)
     */
    541    0   yongsun     bool
    542    0   yongsun     segPinyinSimplest(const wstring& pinyin, CSkeleton& result);
    543    0   yongsun 
    544    0   yongsun     /**
    /**
    * [head1, tail1) and [head2, tail2) may come from two links and together
    * form a virtual link. The segmentation is written into result, where
    * each node represents a syllable. Note that invalid syllables are
    * inevitable.
    */
    549    0   yongsun     void
    550    0   yongsun     segPinyin(CSkeletonIter head1, CSkeletonIter tail1,
    551    0   yongsun               CSkeletonIter head2, CSkeletonIter tail2,
    552    0   yongsun               CSkeleton& result);
    553    0   yongsun     /*@}*/
    554    0   yongsun 
    555    0   yongsun     /*@{*/
    /**
     * Modify the skeleton: remove the bones in [boneStart, boneEnd) and
     * insert all bones of skel just before boneEnd. Then update the
     * search lattice and search for a new result.
     *
     * @param boneStart    start bone iterator in the skeleton
     * @param boneEnd      ending bone iterator in the skeleton
     * @param skel         the new list of bones to be inserted.
     * @param doSearch     NOTE(review): undocumented in the original;
     *                     presumably whether to run the search after the
     *                     update -- confirm against the implementation.
     * @param pItLeftmost  the leftmost bone affected by the operation,
     *                     for manual search
     *
     * @return  whether or not the candidates previously obtained from
     *          getCandidates() are affected by this modification.
     * @retval true   Affected. getCandidates() should be called again.
     * @retval false  Not affected.
     */
    572    0   yongsun     bool
    573    0   yongsun     modify(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel,
    574    0   yongsun            bool doSearch=true, CSkeletonIter* pItLeftmost=NULL);
    575    0   yongsun 
    576    0   yongsun     /**
    577    0   yongsun      * Modify the skeleton: remove the bones in [boneStart, boneEnd) and insert
    578    0   yongsun      * all bones of skel just before boneEnd. Syllable segmentation is then
    579    0   yongsun      * redone to make the syllables legal; this may look left for 2/3 nodes. The
    580    0   yongsun      * new cursor position is computed as well.
    581    0   yongsun      * Then update the search lattice and search for a new result if needed.
    582    0   yongsun      *
    583    0   yongsun      * @param boneStart    start bone iterator in the skeleton
    584    0   yongsun      * @param boneEnd      ending bone iterator in the skeleton
    585    0   yongsun      * @param skel         the new list of bones to be inserted.
    586    0   yongsun      * @param cursor       on entry, the cursor bone in skel; after this function
    587    0   yongsun      *                     it contains the corresponding cursor bone iterator
    588    0   yongsun      *                     in the IC's skeleton.
    589    0   yongsun      * @param cursorIdx    the cursor's index inside the cursor bone. Also contains
    590    0   yongsun      *                     the new position index after this function.
    591    0   yongsun      * @param candiStart   the candidate list's starting position. Also the leftmost
    592    0   yongsun      *                     limit when looking left (to prevent insufficient
    593    0   yongsun      *                     re-segmentation). On return it is set to its new
    594    0   yongsun      *                     position, because it moves onto the new list.
    595    0   yongsun      * @param stickLeft    when the cursor position after segmentation falls on a
    596    0   yongsun      *                     bone boundary, this selects where it goes: true for
    597    0   yongsun      *                     the tail of the left bone, false for the head
    598    0   yongsun      *                     of the right bone.
    599    0   yongsun      * @param doSearch     should always be true; otherwise you have to re-search
    600    0   yongsun      *                     the whole skeleton after this call, because currently
    601    0   yongsun      *                     no interface returns the rightmost bone, which would
    602    0   yongsun      *                     be needed to minimize the re-search caused by this call.
    603    0   yongsun      * @return  whether the candidates previously obtained from
    604    0   yongsun      *          getCandidates() would be affected by this modification.
    605    0   yongsun      * @retval true   Affected.  getCandidates() should be called again.
    606    0   yongsun      * @retval false  Not affected.
    607    0   yongsun      */
    608    0   yongsun     bool
    609    0   yongsun     modifyAndReseg(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel,
    610    0   yongsun                    CSkeletonIter& cursor, int& cursorIdx, CSkeletonIter& candiStart,
    611    0   yongsun                    bool stickLeft=true, bool doSearch=true);
    612    0   yongsun 
    613    0   yongsun 
    614    0   yongsun     /**
    615    0   yongsun      * Cancel the original selection that includes the bone.
    616    0   yongsun      *
    617    0   yongsun      * @param bone  A selection is a candidate, which has a bone range.
    618    0   yongsun      *              If any selection's range contains the bone, that
    619    0   yongsun      *              selection is canceled.
    620    0   yongsun      * @param update "true" to update the search lattice. Normally,
    621    0   yongsun      *        when called by SessionView, update should be set. For
    622    0   yongsun      *        internal calls of the function, one may pass false for
    623    0   yongsun      *        update and do the lattice search later.
    624    0   yongsun      *
    625    0   yongsun      * @return the bone to the left of param bone that is the start bone
    626    0   yongsun      *         of a user selection which includes param bone.
    627    0   yongsun      *         If no such bone is found, param bone itself is returned.
    628    0   yongsun      *
    629    0   yongsun      * The algorithm works as follows:
    630    0   yongsun      * - Find the leftmost bone whose PinYin lexicon state could arrive here.
    631    0   yongsun      *   This can be done by just reading the first element of BoneInnerData's
    632    0   yongsun      *   m_LexiconStates.
    633    0   yongsun      * - From the current bone to the leftmost bone, try to find the first
    634    0   yongsun      *   bone encountered whose BestWord candidate is valid.
    635    0   yongsun      *     -# if the BestWord's right bone is at the left of the target bone:
    636    0   yongsun      *        ====> do nothing
    637    0   yongsun      *     -# if the BestWord's right bone is at the right of the target bone:
    638    0   yongsun      *        ====> Invalidate the BestWord. And if update is set,
    639    0   yongsun      *              ====> searchFrom the BestWord's right bone.
    640    0   yongsun      */
    641    0   yongsun     CSkeletonIter
    642    0   yongsun     cancelSelection(CSkeletonIter bone, bool update=true);
    643    0   yongsun 
    644    0   yongsun     /**
    645    0   yongsun      * Cancel the original selection that includes the bone, but do
    646    0   yongsun      * not count a user selection that starts right at the bone.
    647    0   yongsun      *
    648    0   yongsun      * @param bone  A selection is a candidate, which has a bone range.
    649    0   yongsun      *              If any selection's range contains the bone, that
    650    0   yongsun      *              selection is canceled.
    651    0   yongsun      * @param update "true" to update the search lattice. Normally,
    652    0   yongsun      *        when called by SessionView, update should be set. For
    653    0   yongsun      *        internal calls of the function, one may pass false for
    654    0   yongsun      *        update and do the lattice search later.
    655    0   yongsun      *
    656    0   yongsun      * @return the bone to the left of param bone that is the start bone
    657    0   yongsun      *         of a user selection which includes param bone.
    658    0   yongsun      *         If no such bone is found, param bone itself is returned.
    659    0   yongsun      */
    660    0   yongsun     CSkeletonIter
    661    0   yongsun     cancelSelectionCover(CSkeletonIter bone, bool update=true);
    662    0   yongsun 
    663    0   yongsun 
    664    0   yongsun     /**
    665    0   yongsun      * Notify the context that the user selected a specific candidate.
    666    0   yongsun      * The lattice will be updated (only the necessary part), and the
    667    0   yongsun      * best sentence is searched.
    668    0   yongsun      *
    669    0   yongsun      * @param candi   the user selection.
    670    0   yongsun      * @return whether or not it will affect the previously obtained
    671    0   yongsun      *         candidate list.
    672    0   yongsun      *
    673    0   yongsun      * Algorithm-Description:
    674    0   yongsun      *    - Put the candi to the candi's leftBone's bestWord, and validate it.
    675    0   yongsun      *        -# If Pure Left to Right model is enabled, then:
    676    0   yongsun      *        ====> rebuild the lattice from bestWord's left bone.
    677    0   yongsun      *        -# If Pure Left to Right model is disabled, then:
    678    0   yongsun      *        ====> rebuild the lattice from bestWord's right bone.
    679    0   yongsun      *        .
    680    0   yongsun      *    .
    681    0   yongsun      *
    682    0   yongsun      *   The search function deals with the human selection; the processing
    683    0   yongsun      *   of a selection focuses on two different things:
    684    0   yongsun      *    - the word-set finding
    685    0   yongsun      *    - the word scoring
    686    0   yongsun      *    .
    687    0   yongsun      *    More specifically, the search routine will, normally, find candidate
    688    0   yongsun      *    words bone after bone. At each bone:
    689    0   yongsun      *    -# It first checks all finishing lexicon states, i.e., gets all possible
    690    0   yongsun      *       words that end right before this bone, using them to construct the
    691    0   yongsun      *       lattice states of this bone. For each word, there is an inner cost
    692    0   yongsun      *       associated with it. (For future word class usage) This cost is a
    693    0   yongsun      *       positive double. But for a user selection, the cost will be
    694    0   yongsun      *       set to a suitable negative double to make the best path always go
    695    0   yongsun      *       through this word. The value of the negative cost is affected by:
    696    0   yongsun      *       maximum word length, N-gram's N, mini-backoff weight, mini-common-pr.
    697    0   yongsun      *       (Due to the accuracy lost in float addition, the value should not be
    698    0   yongsun      *        set very negative, although the less the better.) But when
    699    0   yongsun      *       "Pure Left to Right" is enabled, this trick will not be used; on
    700    0   yongsun      *       the other hand, the following step makes sure there is only
    701    0   yongsun      *       one possible transfer/edge on the lattice.
    702    0   yongsun      *    -# Then, from all possible lexicon states (plus a root state), let them
    703    0   yongsun      *       transfer on this bone's PINYIN string. Put the result states after
    704    0   yongsun      *       transfer on the next bone as Lexicon states. But when the
    705    0   yongsun      *       "Pure Left to Right" model is enabled, and a user selection is
    706    0   yongsun      *       met at the starting bone, then:
    707    0   yongsun      *        - clear all states inside the bestWord's range.
    708    0   yongsun      *        - jump directly to the bestWord's right bone, with only one word,
    709    0   yongsun      *          i.e. the bestWord, transfer enabled in this range.
    710    0   yongsun      *        - then search from the right bone
    711    0   yongsun      *    .
    712    0   yongsun      */
    713    0   yongsun     bool
    714    0   yongsun     makeSelection(const CCandidate& candi);
    715    0   yongsun     /*@}*/
    716    0   yongsun 
    717    0   yongsun     /*@{*/
    718    0   yongsun     /**
    719    0   yongsun      * Get the candidate list for the position at the bone.
    720    0   yongsun      *
    721    0   yongsun      * @param bone    iterator pointing to a unit in the bone list.
    722    0   yongsun      * @param result  the candidate list. Each candidate has a starting
    723    0   yongsun      *     bone, ending bone, and a corresponding string. Currently
    724    0   yongsun      *     every candidate's starting bone is the "bone" parameter.
    725    0   yongsun      *
    726    0   yongsun      * Currently, it will only give candidates that start at
    727    0   yongsun      * the parameter bone. Normally, the lattice wouldn't save all words'
    728    0   yongsun      * transfer cost information. But some good transfer edges could give
    729    0   yongsun      * us information. To achieve high efficiency, in this function, no
    730    0   yongsun      * probabilities are fetched again from the language model. But, we use
    731    0   yongsun      * the following information to rank the candidates:
    732    0   yongsun      *    -# If some word starting at bone is selected by the user, or
    733    0   yongsun      *       it is a best edge of the best sentence, it will be listed
    734    0   yongsun      *       first.
    735    0   yongsun      *    -# The longer the word, the better
    736    0   yongsun      *    -# if some words' transfer cost could be got from the lattice,
    737    0   yongsun      *       they are better than those from the lexicon only.
    738    0   yongsun      *    -# Use information stored in the lexicon (where words could be
    739    0   yongsun      *       ranked by unigram).
    740    0   yongsun      */
    741    0   yongsun     void
    742    0   yongsun     getCandidates(CSkeletonIter bone, CCandidates& result);
    743    0   yongsun 
    744    0   yongsun     /**
    745    0   yongsun      * Get the best sentence corresponding to the bones from boneStart to boneEnd.
    746    0   yongsun      * @param result     the result string
    747    0   yongsun      * @param boneStart  start of the bone range
    748    0   yongsun      * @param boneEnd    end of the bone range
                            * @param original_format  defaults to false. NOTE(review): not documented
                            *                   in the original comment -- confirm its meaning.
    749    0   yongsun      * @return the number of words converted in the best sentence, i.e. not
    750    0   yongsun      *         counting the non-pinyin nodes or invalid pinyin nodes
    751    0   yongsun      */
    752    0   yongsun     int
    753    0   yongsun     getBestSentence(wstring& result, CSkeletonIter boneStart, CSkeletonIter boneEnd, bool original_format=false);
    754    0   yongsun     /*@}*/
    755    0   yongsun 
    756    0   yongsun     void  // NOTE(review): presumably (re)runs the lattice search starting at
    757    0   yongsun     searchFrom(CSkeletonIter boneStart);  // boneStart -- confirm against the implementation.
    758    0   yongsun 
    759    0   yongsun protected:
    760    0   yongsun     CSkeleton                       m_Skeleton;                 // the context's skeleton, i.e. its list of bones
    761    0   yongsun     CSkeletonIter                   m_EffectiveCandiBoneStart;  // NOTE(review): presumably start of the bone range the current candidates apply to -- confirm
    762    0   yongsun     CSkeletonIter                   m_EffectiveCandiBoneEnd;    // NOTE(review): presumably end of the bone range the current candidates apply to -- confirm
    763    0   yongsun 
    764    0   yongsun     bool                            m_bNonCompleteSyllable;     // NOTE(review): option flag; presumably allows incomplete syllables -- confirm
    765    0   yongsun     bool                            m_bStrictLeft2Right;        // NOTE(review): presumably the "Pure Left to Right" switch described for makeSelection() -- confirm
    766    0   yongsun     bool                            m_bGBK;                     // NOTE(review): presumably enables the GBK character set -- confirm
    767    0   yongsun     bool                            m_bGB18030;                 // NOTE(review): presumably enables the GB18030 character set -- confirm
    768    0   yongsun     int                             m_HistoryPower;             // NOTE(review): presumably the weight given to user history -- confirm
    769    0   yongsun     bool                            m_ContextRanking;           // NOTE(review): presumably toggles context-based candidate ranking -- confirm
    770    0   yongsun 
    771    0   yongsun     CThreadSlm                     *m_pModel;                   // NOTE(review): presumably the language model; ownership is not visible here
    772    0   yongsun     CPinyinTrie                    *m_pPinyinTrie;              // NOTE(review): presumably the pinyin lexicon trie; ownership is not visible here
    773    0   yongsun 
    774    0   yongsun     CICHistory                     *m_pHistory;                 // NOTE(review): presumably the user history; ownership is not visible here
    775    0   yongsun 
    776    0   yongsun 
    777    0   yongsun private:
    778    0   yongsun     void  // NOTE(review): presumably adds a lattice transfer between bones h and t for word `id`
    779    0   yongsun     transferBetween(CSkeletonIter h, CSkeletonIter t, unsigned int id, double ic);  // with inner cost `ic` -- confirm
    780    0   yongsun 
    781    0   yongsun     void  // NOTE(review): presumably constructs the lattice states of `bone` from the words
    782    0   yongsun     buildLatticeStates(CSkeletonIter bone);  // ending right before it (see the makeSelection() notes) -- confirm
    783    0   yongsun 
    784    0   yongsun     CSkeletonIter  // NOTE(review): presumably advances the search over a single pinyin bone;
    785    0   yongsun     forwardOnePinyinBone(CSkeletonIter bone);  // meaning of the returned iterator is not visible here -- confirm
    786    0   yongsun 
    787    0   yongsun     CSkeletonIter  // NOTE(review): presumably advances the search over a punctuation bone;
    788    0   yongsun     forwardPuncBone(CSkeletonIter bone);  // meaning of the returned iterator is not visible here -- confirm
    789    0   yongsun 
    790    0   yongsun     CSkeletonIter  // NOTE(review): presumably advances the search over pinyin bone(s);
    791    0   yongsun     forwardPinyinBone(CSkeletonIter bone);  // meaning of the returned iterator is not visible here -- confirm
    792    0   yongsun 
    793    0   yongsun     CSkeletonIter  // NOTE(review): presumably advances the search over a non-pinyin bone;
    794    0   yongsun     forwardNonPinyinBone(CSkeletonIter bone);  // meaning of the returned iterator is not visible here -- confirm
    795    0   yongsun 
    796    0   yongsun     CSkeletonIter  // NOTE(review): presumably advances the search over an invalid-pinyin bone;
    797    0   yongsun     forwardInvalidBone(CSkeletonIter bone);  // meaning of the returned iterator is not visible here -- confirm
    798    0   yongsun 
    799    0   yongsun     CSkeletonIter  // NOTE(review): presumably handles the tail (last) bone of the skeleton;
    800    0   yongsun     forwardTailBone(CSkeletonIter bone);  // meaning of the returned iterator is not visible here -- confirm
    801    0   yongsun 
    802    0   yongsun }; // of CIMIContext
    803    0   yongsun 
    804    0   yongsun #endif
    805