1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifndef SUNPY_IMI_CONTEXT_H 39 0 yongsun #define SUNPY_IMI_CONTEXT_H 40 0 yongsun 41 0 yongsun #include "portability.h" 42 0 yongsun 43 0 yongsun #ifdef HAVE_CONFIG_H 44 0 yongsun #include <config.h> 45 0 yongsun #endif 46 0 yongsun 47 0 yongsun #ifdef DEBUG 48 0 yongsun #ifdef HAVE_ASSERT_H 49 0 yongsun #include <assert.h> 50 0 yongsun #endif 51 0 yongsun #endif 52 0 yongsun 53 0 yongsun #include <map> 54 0 yongsun #include <vector> 55 0 yongsun #include <list> 56 0 yongsun #include <math.h> 57 0 yongsun 58 0 yongsun #include "imi_data.h" 59 0 yongsun #include "ic_history.h" 60 0 yongsun 61 0 yongsun #define UNKNOWN_WORD_ID 0 62 0 yongsun #define OOV_WORD_ID 69 63 0 yongsun #define SENTENCE_BREAKER_ID 10 64 0 yongsun 65 0 yongsun struct TLongExpFloat { 66 0 yongsun public: 67 0 yongsun TLongExpFloat(const TLongExpFloat& b) : m_base(b.m_base), m_exp(b.m_exp) { } 68 0 yongsun 69 0 yongsun TLongExpFloat(int exp = 0, double base=0.0) : m_base(base), m_exp(exp) { } 70 0 yongsun 71 0 yongsun TLongExpFloat(double d); 72 0 yongsun 73 0 yongsun TLongExpFloat 74 0 yongsun operator* (const TLongExpFloat& b) const; 75 0 yongsun 76 0 yongsun TLongExpFloat 77 0 yongsun operator/ (const TLongExpFloat& b) const; 78 0 yongsun 79 0 yongsun bool 80 0 yongsun operator< (const TLongExpFloat& b) const; 81 0 yongsun 82 0 yongsun bool 83 0 yongsun operator<=(const TLongExpFloat& b) const; 84 0 yongsun 85 0 yongsun bool 86 0 yongsun operator==(const TLongExpFloat& b) const; 87 0 yongsun 88 0 yongsun void 89 0 yongsun toString(std::string& str) const; 90 0 yongsun 91 0 yongsun void 92 0 yongsun toString(char* buf) const 93 0 yongsun { if (buf) sprintf(buf, "%10lf*2^%d", m_base, m_exp); } 94 0 yongsun 95 0 yongsun double 96 0 yongsun log2() const 97 0 yongsun { 98 0 yongsun #ifdef DEBUG 99 0 yongsun //assert(m_base > 0.0); 100 0 yongsun #endif 101 0 yongsun return ::log2(m_base)+m_exp; 102 0 yongsun } 103 0 yongsun 104 0 yongsun private: 105 0 yongsun double m_base; 106 0 yongsun int m_exp; 107 0 yongsun }; 108 0 yongsun 109 0 yongsun /** 110 0 yongsun * TSentenceScore is only used for whole sentence score, 111 0 yongsun * the score from language model still using double. 112 0 yongsun */ 113 0 yongsun #ifdef _USE_RAW_PROBABILITY 114 0 yongsun typedef TLongExpFloat TSentenceScore; 115 0 yongsun #else 116 0 yongsun typedef double TSentenceScore; 117 0 yongsun #endif 118 0 yongsun 119 0 yongsun class CBone; 120 0 yongsun class CCandidate; 121 0 yongsun 122 0 yongsun typedef std::list<CBone> CSkeleton; 123 0 yongsun typedef CSkeleton::iterator CSkeletonIter; 124 0 yongsun typedef std::vector<CCandidate> CCandidates; 125 0 yongsun typedef CCandidates::iterator CCandidatesIter; 126 0 yongsun 127 0 yongsun class CIMIContext; 128 0 yongsun class CBoneInnerData; 129 0 yongsun 130 0 yongsun union TCandiRank { 131 0 yongsun public: 132 0 yongsun bool 133 0 yongsun operator< (const TCandiRank& b) const 134 0 yongsun { return m_all < b.m_all; }; 135 0 yongsun 136 0 yongsun TCandiRank() : m_all(0) { } 137 0 yongsun 138 0 yongsun TCandiRank(bool user, bool best, unsigned int len, 139 0 yongsun bool fromLattice, TSentenceScore score); 140 0 yongsun 141 0 yongsun TCandiRank(bool user, bool best, unsigned int len, 142 0 yongsun bool fromLattice, unsigned score); 143 0 yongsun 144 0 yongsun protected: 145 0 yongsun unsigned int m_all; 146 198 tchaikov #if !defined(WORDS_BIGENDIAN) 147 0 yongsun struct TAnony { 148 0 yongsun unsigned m_cost : 24; 149 0 yongsun unsigned m_lattice: 1; 150 184 yongsun unsigned m_best : 1; 151 0 yongsun unsigned m_len : 5; 152 0 yongsun unsigned m_user : 1; 153 0 yongsun } anony; 154 198 tchaikov #else 155 0 yongsun struct TAnony { 156 0 yongsun unsigned m_user : 1; 157 184 yongsun unsigned m_len : 5; 158 0 yongsun unsigned m_best : 1; 159 0 yongsun unsigned m_lattice: 1; 160 0 yongsun unsigned m_cost : 24; 161 0 yongsun } anony; 162 0 yongsun #endif 163 0 yongsun 164 0 yongsun }; 165 0 yongsun 166 0 yongsun /** 167 0 yongsun * CCandidate represent basic information about a single candidate. 168 0 yongsun * Its start bone and finishing bone. It's content string. and its 169 0 yongsun * word id. 170 0 yongsun */ 171 0 yongsun class CCandidate { 172 0 yongsun public: 173 0 yongsun friend class CIMIContext; 174 0 yongsun public: 175 0 yongsun CSkeletonIter m_BoneStart; 176 0 yongsun CSkeletonIter m_BoneEnd; 177 0 yongsun const TWCHAR *m_String; 178 0 yongsun 179 0 yongsun public: 180 0 yongsun CCandidate(const CCandidate& b) 181 0 yongsun : m_BoneStart(b.m_BoneStart), m_BoneEnd(b.m_BoneEnd), 182 0 yongsun m_String(b.m_String), m_WordId(b.m_WordId) { } 183 0 yongsun 184 0 yongsun /** Give out the constructor for convinience */ 185 0 yongsun CCandidate(const TWCHAR* s = NULL, 186 0 yongsun CSkeletonIter h=CSkeletonIter(), 187 0 yongsun CSkeletonIter t=CSkeletonIter(), 188 0 yongsun unsigned int wid=0) 189 18 Kov : m_BoneStart(h), m_BoneEnd(t), m_String(s), m_WordId(wid) { } 190 0 yongsun 191 0 yongsun void 192 0 yongsun print(std::string& prefix); 193 0 yongsun 194 0 yongsun protected: 195 0 yongsun unsigned int m_WordId; 196 0 yongsun }; // of CCandidate 197 0 yongsun 198 0 yongsun 199 0 yongsun /** 200 0 yongsun * Bone is the basic unit for CIMSessionDate to store a Syllable, ie 201 0 yongsun * a Pinyin string for only one Chinese character. Such as "zhang", 202 0 yongsun * or "zh" under non-complete pinyin. 203 0 yongsun */ 204 0 yongsun class CBone { 205 0 yongsun friend class CIMIContext; 206 0 yongsun public: 207 0 yongsun /** 208 0 yongsun * In case that use input is not Pinyin, such as under English input 209 0 yongsun * mode, A bone contains a string for the input. For English, all 210 0 yongsun * consecutive string grouped into one bone, such as "SUN Microsystem"; 211 0 yongsun * For Punc, each punctuaction become on bone, such as ": would be 212 0 yongsun * split into two bones. 213 0 yongsun * 214 0 yongsun * For Pinyin type, there are three different type definition, and the 215 0 yongsun * value for each syllable string depends on whether or not NonComplete 216 0 yongsun * Pinyin mode is enabled: 217 0 yongsun * 218 0 yongsun * NODE_PINYIN : Valid pinyin. Such as "zh" with NonComplet 219 0 yongsun * Pinyin enabled, or "zhang" \n 220 0 yongsun * NODE_INVALID_PINYIN : Invalid syllable string, such as "u"; or "afdasf"\n 221 0 yongsun * NODE_INCOMPLETE_PINYIN: incomplete syllable string (maybe complet further), 222 0 yongsun * such as "to"; or "zh" when NonComplete Pinyin disabled. 223 0 yongsun */ 224 0 yongsun enum NODE_TYPE { 225 0 yongsun NODE_TAIL = 0x0000, //pusedo tail node 226 0 yongsun 227 0 yongsun CATE_PINYIN = 0x0100, 228 0 yongsun NODE_PINYIN = 0x0101, //pinyin 229 0 yongsun NODE_INVALID_PINYIN = 0x0102, //invalid syllable string 230 0 yongsun NODE_INCOMPLETE_PINYIN = 0x0103, //incomplete syllable string 231 0 yongsun 232 0 yongsun 233 0 yongsun CATE_OTHER = 0x0200, 234 0 yongsun NODE_ASCII = 0x0201, //english string 235 0 yongsun NODE_PUNC = 0x0202, //punctuation 236 0 yongsun NODE_SIMBOL = 0x0203, //other simbol 237 0 yongsun NODE_DIGITAL = 0x0204 //not implemeted here 238 0 yongsun 239 0 yongsun }; // of NODE_TYPE 240 0 yongsun 241 0 yongsun /** 242 0 yongsun * Boundary type indicate how the bone is seperated, by (1) Automatic 243 0 yongsun * Syllable segmentation, (2) different bone type or punc bone seperate 244 0 yongsun * rule, (3) user sepecified. 245 0 yongsun */ 246 0 yongsun enum BOUNDARY_TYPE { 247 0 yongsun AUTO_BOUNDARY, //automatic segmentation result 248 0 yongsun ABSOLUTE_BOUNDARY, //boundary without ambiguation 249 0 yongsun USER_BOUNDARY //user given boundary 250 0 yongsun }; // of BOUNDARY_TYPE 251 0 yongsun 252 0 yongsun public: 253 0 yongsun int m_BoneType; 254 0 yongsun int m_BoundaryType; // original code for m_String[0] in non_pinyin node 255 0 yongsun wstring m_String; 256 0 yongsun 257 0 yongsun public: 258 0 yongsun /** 259 0 yongsun * Never copy or allocate space for m_pInnerData; 260 0 yongsun */ 261 0 yongsun CBone(const CBone& b); 262 0 yongsun 263 0 yongsun /** 264 0 yongsun * @param boundType: Boundary type. 265 0 yongsun * @param boneType: BoneType. 266 0 yongsun * No space is allocated for m_pInnerData 267 0 yongsun */ 268 0 yongsun CBone(int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL); 269 0 yongsun 270 0 yongsun /** 271 0 yongsun * @param pwc can not be NULL 272 0 yongsun * @param boundType: Boundary type. 273 0 yongsun * @param boneType: BoneType. 274 0 yongsun * No space is allocated for m_pInnerData 275 0 yongsun */ 276 0 yongsun CBone(const TWCHAR* pwc, int boundType = AUTO_BOUNDARY, int boneType = NODE_TAIL); 277 0 yongsun 278 0 yongsun /** 279 0 yongsun * @param pwc the string should be copied into this bone, not NULL 280 0 yongsun * @param len the string len 281 0 yongsun * @param boundType: Boundary type. 282 0 yongsun * @param boneType: BoneType. 283 0 yongsun * No space is allocated for m_pInnerData 284 0 yongsun */ 285 0 yongsun CBone(const TWCHAR* pwc, size_t len, int boundType, int boneType); 286 0 yongsun 287 0 yongsun /** Free all space if necessary. */ 288 0 yongsun ~CBone(); 289 0 yongsun 290 0 yongsun inline bool 291 0 yongsun isPinyinNode() const 292 0 yongsun { return ((m_BoneType & CATE_PINYIN)!= 0); } 293 0 yongsun 294 0 yongsun inline bool 295 0 yongsun isValidPinyinNode() const 296 0 yongsun { return (m_BoneType == NODE_PINYIN || m_BoneType == NODE_INCOMPLETE_PINYIN); } 297 0 yongsun 298 0 yongsun inline bool 299 0 yongsun isUserBoundary() const 300 0 yongsun { return m_BoundaryType == USER_BOUNDARY; } 301 0 yongsun 302 0 yongsun inline bool 303 0 yongsun isAutoBoundary() const 304 0 yongsun { return m_BoundaryType == AUTO_BOUNDARY; } 305 0 yongsun 306 0 yongsun inline bool 307 0 yongsun isTailNode() const 308 0 yongsun { return (m_BoneType == 0); } 309 0 yongsun 310 0 yongsun bool 311 0 yongsun isUserSelectionStart(void); 312 0 yongsun 313 0 yongsun void 314 0 yongsun print(std::string& prefix); 315 0 yongsun 316 0 yongsun protected: 317 0 yongsun CBoneInnerData *m_pInnerData; 318 0 yongsun }; // of CBone 319 0 yongsun 320 0 yongsun 321 0 yongsun /** 322 0 yongsun * It is more suitable to call this as Input Context together with I 323 0 yongsun * MSessionView. These data record 324 0 yongsun * input history for a input session. Normally a seesion data would 325 0 yongsun * only contains history keys and the cursor position. It would be enough 326 0 yongsun * to find corresponding result from the history for most IME. 327 0 yongsun * 328 0 yongsun * The Session data class here take responsible for generating best sentence 329 0 yongsun * from Pinyin string. It also contains all core algorithm for this conversion 330 0 yongsun * progress. 331 0 yongsun * 332 0 yongsun * All Key processing job should be done by Session View class, and only several 333 0 yongsun * interface exist here for the SessionView to modify Input Context and get 334 0 yongsun * best sentence and ranked candidates. 335 0 yongsun * 336 0 yongsun * The other important function it provide is Automatic Pinyin segmentation. 337 0 yongsun * 338 0 yongsun * For our input method, from the aspect of effeciency, a internal 339 0 yongsun * search lattice should also be remained and only partial of the lattice 340 0 yongsun * would be rebuild or updated when user give a new input. The reason is 341 0 yongsun * that it is time consuming to construct a whole search lattice, especially 342 0 yongsun * for a long sentence and/or with Non-Complete syllables, which cause 343 0 yongsun * potential candidates number increase. Yet all search related data are 344 0 yongsun * hidden to outer usage. 345 0 yongsun */ 346 0 yongsun class CIMIContext { 347 0 yongsun public: 348 0 yongsun /*@{*/ 349 0 yongsun /** 350 0 yongsun * Constructor of CIMIContext. Set all the pointer to NULL. 351 0 yongsun * set Non-Complete Syllable to true, set Strict Left2Right Model 352 0 yongsun * to false. 353 0 yongsun * 354 0 yongsun * Note: At this time, CIMIContext could not be used to 355 0 yongsun * search directly. Only after setCoreData() and clear(), 356 0 yongsun * the internal search lattice are constructed and can 357 0 yongsun * be used. 358 0 yongsun */ 359 0 yongsun CIMIContext(); 360 0 yongsun 361 0 yongsun /** 362 0 yongsun * free all resource/spaces 363 0 yongsun */ 364 0 yongsun virtual 365 0 yongsun ~CIMIContext() 366 0 yongsun { m_Skeleton.clear(); } 367 0 yongsun /*@}*/ 368 0 yongsun 369 0 yongsun /*@{*/ 370 0 yongsun /** 371 0 yongsun * Copy language model ptr and Pinyin-Trie Ptr inside the IMCoreData 372 0 yongsun * into my own member. 373 0 yongsun * Also build Chinese Punctuation Map from Pinyin-Trie. 374 0 yongsun * 375 0 yongsun * @param pCoreData is the core resource data for the Input Method 376 0 yongsun */ 377 0 yongsun void 378 0 yongsun setCoreData(CIMIData *pCoreData); 379 0 yongsun 380 0 yongsun /** 381 0 yongsun * clear all internal Input Context, after this call, the Session data 382 0 yongsun * or Input Context should same as they were just created. (Of cause, 383 0 yongsun * the values from core data and desktop data remains.) More specifically, 384 0 yongsun * it will clear skeleton, add a psuedo tail node, set internal candidate 385 0 yongsun * position to skeleton.end() (which means no candidates needed now). then 386 0 yongsun * it will construct a initial search lattice. 387 0 yongsun * 388 0 yongsun * Note: This function should be called only after setCoreData, because 389 0 yongsun * it will use the language model to construct a empty search 390 0 yongsun * lattice. 391 0 yongsun */ 392 0 yongsun void 393 0 yongsun clear(); 394 0 yongsun 395 0 yongsun void 396 0 yongsun setHistoryMemory(CICHistory *phm); 397 0 yongsun 398 0 yongsun CICHistory * 399 0 yongsun getHistoryMemory(); 400 0 yongsun 401 0 yongsun /** return true if defined DEBUG */ 402 0 yongsun void 403 0 yongsun print_lattice(); 404 0 yongsun 405 0 yongsun //memorize sentence in current text 406 0 yongsun void 407 0 yongsun memorize(void); 408 0 yongsun 409 0 yongsun /** 410 0 yongsun * Return the bone list to let the view read them directly. 411 0 yongsun */ 412 0 yongsun CSkeleton & 413 0 yongsun getSkeleton(void) 414 0 yongsun { return m_Skeleton; } 415 0 yongsun 416 0 yongsun /** 417 0 yongsun * To construct the lattice, algorithm need to append two 418 0 yongsun * psuedo tail bone at the end of the bone list. 419 0 yongsun * For the SessionView could operate the list as the two tailing 420 0 yongsun * bone were not there, SessionView should call this function 421 0 yongsun * to replace call like getSekelton()->end() when iterating the 422 0 yongsun * Skeleton (bone list). 423 0 yongsun * 424 0 yongsun * @return the first psuedo tailing node at the end of bone 425 0 yongsun * list. For SessionView usage. 426 0 yongsun */ 427 0 yongsun CSkeletonIter 428 0 yongsun getLastBone(void) 429 0 yongsun { return --(--(m_Skeleton.end())); } 430 0 yongsun 431 0 yongsun /** 432 0 yongsun * To cooperate with the getLastBone. 433 0 yongsun * @return the first bone of the skeleton 434 0 yongsun */ 435 0 yongsun CSkeletonIter 436 0 yongsun getFirstBone(void) 437 0 yongsun { return m_Skeleton.begin(); } 438 0 yongsun 439 0 yongsun bool 440 0 yongsun isEmpty(void) 441 0 yongsun { return m_Skeleton.size() == 2; } 442 0 yongsun /*@}*/ 443 0 yongsun 444 0 yongsun 445 0 yongsun // functions to set options. Options should be set when Session Data 446 0 yongsun // is clear, ie, just created or just after clear() is called 447 0 yongsun 448 0 yongsun /*@{*/ 449 0 yongsun /** 450 0 yongsun * NonCompleteSyllable means we can give a candidates when PinYin is 451 0 yongsun * just partial a complete syllable string. Such as "sh" would give 452 0 yongsun * all characters with one of it PinYin starts from "sh", ex, "shi", 453 0 yongsun * "sheng" etc. 454 0 yongsun * In our definition, Non-Complete PinYin not only limitied on "SHENGMU", 455 0 yongsun * For examplet, "to" is not a valid full pinyin, yet it could lead 456 0 yongsun * "tong", "tou". So it would also gives out those corresponding 457 0 yongsun * characters. 458 0 yongsun * On the other hand, when a PinYin String is a valid Full Syllable 459 0 yongsun * PinYin, even it could lead other Full Syllable PinYin, it will 460 0 yongsun * not be treated as non-complete PinYin. For example, "da" will only 461 0 yongsun * give candidates characters pronounced "da", although it could lead 462 0 yongsun * "dan" "dao" "dang", etc. 463 0 yongsun */ 464 0 yongsun void 465 0 yongsun setNonCompleteSyllable(bool use = true) 466 0 yongsun { m_bNonCompleteSyllable = use; } 467 0 yongsun 468 0 yongsun bool 469 0 yongsun canNonCompleteSyllable() 470 0 yongsun { return m_bNonCompleteSyllable; } 471 0 yongsun 472 0 yongsun /** 473 0 yongsun * Left2RightSelection could improve the performance under 474 0 yongsun * the TwoLine view style. But could not be used with OneLine view 475 0 yongsun * style and ThreeLine view style. 476 0 yongsun */ 477 0 yongsun void 478 0 yongsun setLeft2RightSelection(bool use = true) 479 0 yongsun { m_bStrictLeft2Right = use; } 480 0 yongsun 481 0 yongsun bool 482 0 yongsun isGBKEnabled() 483 0 yongsun { return m_bGBK; } 484 0 yongsun 485 0 yongsun void 486 0 yongsun enableGBK(bool enable) 487 0 yongsun { m_bGBK = enable; } 488 0 yongsun 489 0 yongsun void 490 0 yongsun setHistoryPower(int power) 491 0 yongsun { m_HistoryPower = ((power>= 0 && power <=10)?(power):(3)); } 492 0 yongsun 493 0 yongsun int 494 0 yongsun getHistoryPower() 495 0 yongsun { return m_HistoryPower; } 496 0 yongsun 497 0 yongsun void 498 0 yongsun enableContextRanking(bool enable) 499 0 yongsun { m_ContextRanking = enable; } 500 0 yongsun 501 0 yongsun int 502 0 yongsun isContextRankingEnabled() 503 0 yongsun { return m_ContextRanking; } 504 0 yongsun /*@}*/ 505 0 yongsun 506 0 yongsun /*@{*/ 507 0 yongsun bool 508 0 yongsun isValidSyllable(const wstring& sy) 509 0 yongsun { return isValidSyllable(sy.c_str()); } 510 0 yongsun 511 0 yongsun bool 512 0 yongsun isValidSyllable(const TWCHAR* pstr); 513 0 yongsun 514 0 yongsun /** 515 0 yongsun * segPinyinSimplest() do simplest segmentation for given PinYin. That, the 516 0 yongsun * last char is the just input by user (or just delete by user), and 517 0 yongsun * prefix string is at least non-complet syllable. This kind of segment 518 0 yongsun * suitable for OneLineView. Syllable Pinyin string can only be modified 519 0 yongsun * at the end. 520 0 yongsun * 521 0 yongsun * @param pinyin : a wide char string. Each char in the string should only 522 0 yongsun * be one of the following: [a-z]. Note, that prefix string (except the 523 0 yongsun * last char should already be complete or non-complet). 524 0 yongsun * 525 0 yongsun * @param result: After segmentation, result contains sequence of syllable 526 0 yongsun * Bones. The result may be one of the following status:\n 527 0 yongsun * (1) two bones, the first is complete, and 528 0 yongsun * the second is complete or non-complete.\n 529 0 yongsun * (2) two bones, the first is complete, and 530 0 yongsun * the second is invalid.\n 531 0 yongsun * (3) single complete or non-complet bone, with AUTO_BOUNDARY.\n 532 0 yongsun * (4) single invalid bone.\n 533 0 yongsun * Status (2) and (4) is invalid, program should reject the last 534 0 yongsun * input char. Status (1) program should commit the first bone and 535 0 yongsun * use the remaining string as prefix. For status (3), program 536 0 yongsun * should wait for further input. 537 0 yongsun * 538 0 yongsun * @return whether the segmentation is valid(or whether the last char 539 0 yongsun * is valid) 540 0 yongsun */ 541 0 yongsun bool 542 0 yongsun segPinyinSimplest(const wstring& pinyin, CSkeleton& result); 543 0 yongsun 544 0 yongsun /** 545 0 yongsun * [head1, tail1) [head2, tail2) may from two link. and form an virtual link. 546 0 yongsun * generate the segmentation result into result where each node represent a 547 0 yongsun * syllable. Note that invalid_syllable is inevitable. 548 0 yongsun */ 549 0 yongsun void 550 0 yongsun segPinyin(CSkeletonIter head1, CSkeletonIter tail1, 551 0 yongsun CSkeletonIter head2, CSkeletonIter tail2, 552 0 yongsun CSkeleton& result); 553 0 yongsun /*@}*/ 554 0 yongsun 555 0 yongsun /*@{*/ 556 0 yongsun /** 557 0 yongsun * modify skeleton. remove bones [boneStart, boneEnd). and insert all 558 0 yongsun * bones in the skel just before the boneEnd. Then update the 559 0 yongsun * search lattice and search for a new result. 560 0 yongsun * 561 0 yongsun * @param boneStart start bone iterator in the skeleton 562 0 yongsun * @param boneEnd ending bone iterator in the skeleton 563 0 yongsun * @param skel the new list of bones to be inserted. 564 0 yongsun * @param pItLeftmost the leftmost bone affected by the operation. 565 0 yongsun * for manual search 566 0 yongsun * 567 0 yongsun * @return whether or not the original gotted candidates called 568 0 yongsun * by getCandidates() would be affected by this modification. 569 0 yongsun * @retval true Affected. getCandidates() should be call again. 570 0 yongsun * @retval false Not affected. 571 0 yongsun */ 572 0 yongsun bool 573 0 yongsun modify(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel, 574 0 yongsun bool doSearch=true, CSkeletonIter* pItLeftmost=NULL); 575 0 yongsun 576 0 yongsun /** 577 0 yongsun * modify skeleton. remove bones [boneStart, boneEnd). and insert all 578 0 yongsun * bones in the skel just before the boneEnd. Do Syllable segmentation 579 0 yongsun * again to make syllables legal. This may look left for 2/3 nodes. Also 580 0 yongsun * new cursor position are counted. 581 0 yongsun * Then update the search lattice and search for a new result if needed. 582 0 yongsun * 583 0 yongsun * @param boneStart start bone iterator in the skeleton 584 0 yongsun * @param boneEnd ending bone iterator in the skeleton 585 0 yongsun * @param skel the new list of bones to be inserted. 586 0 yongsun * @param cursor the cursor bone in the skel, after this function, 587 0 yongsun * it contains the corresponding cursor bone iterator 588 0 yongsun * in the IC's Skeleton. 589 0 yongsun * @param cursorIdx the cursor's idx inside the cursor bone. Also contain 590 0 yongsun * new position index after this function. 591 0 yongsun * @param candiStart The candidate list's starting position. Also the leftmost 592 0 yongsun * when looking left to prevent re-segment insufficient. 593 0 yongsun * Return value are set to its new position, because it would 594 0 yongsun * change onto the new list. 595 0 yongsun * @param stickLeft When the cursor position after segment is located at 596 0 yongsun * boundary. This give how it should be, @value true for 597 0 yongsun * the tail of the left bone, @value false for the head 598 0 yongsun * of the right bone. 599 0 yongsun * @param doSearch should be always true, otherwise you have to research the 600 0 yongsun * whole skeleton after this call, because currently we do 601 0 yongsun * not provide an interface to return the righmost bone to 602 0 yongsun * remember the minimized the research demand cause by this. 603 0 yongsun * @return whether or not the original gotted candidates called 604 0 yongsun * by getCandidates() would be affected by this modification. 605 0 yongsun * @retval true Affected. getCandidates() should be call again. 606 0 yongsun * @retval false Not affected. 607 0 yongsun */ 608 0 yongsun bool 609 0 yongsun modifyAndReseg(CSkeletonIter boneStart, CSkeletonIter boneEnd, CSkeleton& skel, 610 0 yongsun CSkeletonIter& cursor, int& cursorIdx, CSkeletonIter& candiStart, 611 0 yongsun bool stickLeft=true, bool doSearch=true); 612 0 yongsun 613 0 yongsun 614 0 yongsun /** 615 0 yongsun * Cancel original selection that including the bone 616 0 yongsun * 617 0 yongsun * @param bone Selection is a candidate, which have a bone range. 618 0 yongsun * If any selection's range contains the bone, it 619 0 yongsun * would be canceled. 620 0 yongsun * @param update "true" to update the search lattice. Normally, 621 0 yongsun * when called by SessionView, update should be set. For 622 0 yongsun * internal call of the function, one could let the update 623 0 yongsun * parameter to false, and do lattice searching later. 624 0 yongsun * 625 0 yongsun * @return the Bone on the left of param bone, which is the start bone 626 0 yongsun * of a user selection, and the selection include param bone. 627 0 yongsun * if no such a bone found, just return param bone itself. 628 0 yongsun * 629 0 yongsun * The algorithm working like following: 630 0 yongsun * - Find the left most bone whose PinYin lexicon state could arrived here. 631 0 yongsun * This could be done just read the first element of BoneInnerData's 632 0 yongsun * m_LexiconStates. 633 0 yongsun * - From current bone to the left-most bone, try to find the first meet 634 0 yongsun * bone whose BestWord candidates is valid. 635 0 yongsun * -# if the BestWord's right bone is at the left of target bone: 636 0 yongsun * ====>do nothing 637 0 yongsun * -# if the BestWord's right bone is at the right of the target bone: 638 0 yongsun * ====> Invalidate the BestWord. And if update is set, 639 0 yongsun * ====> searchFrom the BestWord's right bone. 640 0 yongsun */ 641 0 yongsun CSkeletonIter 642 0 yongsun cancelSelection(CSkeletonIter bone, bool update=true); 643 0 yongsun 644 0 yongsun /** 645 0 yongsun * Cancel original selection that including the bone, but do 646 0 yongsun * not count User Selection right starting at bone. 647 0 yongsun * 648 0 yongsun * @param bone Selection is a candidate, which have a bone range. 649 0 yongsun * If any selection's range contains the bone, it 650 0 yongsun * would be canceled. 651 0 yongsun * @param update "true" to update the search lattice. Normally, 652 0 yongsun * when called by SessionView, update should be set. For 653 0 yongsun * internal call of the function, one could let the update 654 0 yongsun * parameter to false, and do lattice searching later. 655 0 yongsun * 656 0 yongsun * @return the Bone on the left of param bone, which is the start bone 657 0 yongsun * of a user selection, and the selection include param bone. 658 0 yongsun * if no such a bone found, just return param bone itself. 659 0 yongsun */ 660 0 yongsun CSkeletonIter 661 0 yongsun cancelSelectionCover(CSkeletonIter bone, bool update=true); 662 0 yongsun 663 0 yongsun 664 0 yongsun /** 665 0 yongsun * Tell me that user make a selection for a specific candidate. 666 0 yongsun * The lattice will be updated (only the neccessary part). And 667 0 yongsun * best sentence are searched. 668 0 yongsun * 669 0 yongsun * @param candi the user selection. 670 0 yongsun * @return whether or not it will affect the previous gotted 671 0 yongsun * candidate list. 672 0 yongsun * 673 0 yongsun * Algorithm-Description: 674 0 yongsun * - Put the candi to the candi's leftBone's bestWord, and validate it. 675 0 yongsun * -# If Pure Left to Right model enabled, then: 676 0 yongsun * ====> rebuild the lattice from bestWord's left bone. 677 0 yongsun * -# If Pure Left to Right model disabled, then 678 0 yongsun * ====> rebuild the lattice from bestWord's right bone. 679 0 yongsun * . 680 0 yongsun * . 681 0 yongsun * 682 0 yongsun * the search function will deal with the human selection, the processing 683 0 yongsun * of selection focused on two different things: 684 0 yongsun * - the word-set finding 685 0 yongsun * - the word scoring 686 0 yongsun * . 687 0 yongsun * More specificlly, the search routine will, normally, finding candidates 688 0 yongsun * words bone after bone. At each bone: 689 0 yongsun * -# It first check all finishing lexicon states, ie., get all possible 690 0 yongsun * words that will ending right before this bone, using them to construct 691 0 yongsun * lattice states of this bone. For each word, there is a inner cost 692 0 yongsun * associated with them. (For future word class usage) This cost is 693 0 yongsun * positive double. But for user selection, the cost should will be 694 0 yongsun * set to a suitable negtive double to make the best path alway go 695 0 yongsun * throught this word. The value of the negetive is affected by: 696 0 yongsun * maximum word length, N-gram's N, mini-backoff weight, mini-common-pr. 697 0 yongsun * (Due to float plus operation accuracy lost, the value should not be 698 0 yongsun * set to a very negitive, although the less the better.) But when 699 0 yongsun * "Pure Left to Right" is enabled, this trick will not be used, on 700 0 yongsun * the other hand, the following step would make sure there is only 701 0 yongsun * one possible transfer/edge on the lattice. 702 0 yongsun * -# Then, from all possible lexicon states (plus a root state), let them 703 0 yongsun * transfer on this bone's PINYIN string. Put the result states after 704 0 yongsun * transfer on the next bone as Lexicon states. But when 705 0 yongsun * "Pure Left to Right" model enabled, and a user selection is 706 0 yongsun * meet at the starting bone, then: 707 0 yongsun * - clear all states inside the bestWord's range. 708 0 yongsun * - jump directly to the bestWord's right bone, with only one word, 709 0 yongsun * ie. the bestWord, transfer enabled in this range. 710 0 yongsun * - then search from the right bone 711 0 yongsun * . 712 0 yongsun */ 713 0 yongsun bool 714 0 yongsun makeSelection(const CCandidate& candi); 715 0 yongsun /*@}*/ 716 0 yongsun 717 0 yongsun /*@{*/ 718 0 yongsun /** 719 0 yongsun * Get candidate list for position at the bone. 720 0 yongsun * 721 0 yongsun * @param bone: iterator pointed to a unit in the bone list. 722 0 yongsun * @param result: the candidate list. Each candidate has a starting 723 0 yongsun * bone, ending bone, and a corresponding string. Currently 724 0 yongsun * all starting bone is the "bone" parameter. 725 0 yongsun * 726 0 yongsun * Currently, it will only give candidates that are starting at 727 0 yongsun * the parameter bone. Normally, the lattice wouldn't save all words' 728 0 yongsun * transfer cost information. But some good transfer edge could give 729 0 yongsun * us infomation. To achieve high efficiency, in this function, no 730 0 yongsun * probabilities are re-get from language model. But, we use the 731 0 yongsun * following information to rank the candidates: 732 0 yongsun * -# If some words starting at bone is selected by the user, or 733 0 yongsun * it is a best edge of the best sentence, it will be listed as 734 0 yongsun * first. 735 0 yongsun * -# The longer the word, the better 736 0 yongsun * -# if some words' transfer cost could be got from the lattice, 737 0 yongsun * they are better than those from lexicon only. 738 0 yongsun * -# Use information stored in lexicon (where words could ranked 739 0 yongsun * by unigram). 740 0 yongsun */ 741 0 yongsun void 742 0 yongsun getCandidates(CSkeletonIter bone, CCandidates& result); 743 0 yongsun 744 0 yongsun /** 745 0 yongsun * Get the best sentence corresponding from boneStart to boneEnd 746 0 yongsun * @param boneStart 747 0 yongsun * @param boneEnd 748 0 yongsun * @param result the result string 749 0 yongsun * @return: the number of words converted in the best sentence. ie. not 750 0 yongsun * count the non-pinyin node or invalid pinyin node 751 0 yongsun */ 752 0 yongsun int 753 0 yongsun getBestSentence(wstring& result, CSkeletonIter boneStart, CSkeletonIter boneEnd, bool original_format=false); 754 0 yongsun /*@}*/ 755 0 yongsun 756 0 yongsun void 757 0 yongsun searchFrom(CSkeletonIter boneStart); 758 0 yongsun 759 0 yongsun protected: 760 0 yongsun CSkeleton m_Skeleton; 761 0 yongsun CSkeletonIter m_EffectiveCandiBoneStart; 762 0 yongsun CSkeletonIter m_EffectiveCandiBoneEnd; 763 0 yongsun 764 0 yongsun bool m_bNonCompleteSyllable; 765 0 yongsun bool m_bStrictLeft2Right; 766 0 yongsun bool m_bGBK; 767 0 yongsun bool m_bGB18030; 768 0 yongsun int m_HistoryPower; 769 0 yongsun bool m_ContextRanking; 770 0 yongsun 771 0 yongsun CThreadSlm *m_pModel; 772 0 yongsun CPinyinTrie *m_pPinyinTrie; 773 0 yongsun 774 0 yongsun CICHistory *m_pHistory; 775 0 yongsun 776 0 yongsun 777 0 yongsun private: 778 0 yongsun void 779 0 yongsun transferBetween(CSkeletonIter h, CSkeletonIter t, unsigned int id, double ic); 780 0 yongsun 781 0 yongsun void 782 0 yongsun buildLatticeStates(CSkeletonIter bone); 783 0 yongsun 784 0 yongsun CSkeletonIter 785 0 yongsun forwardOnePinyinBone(CSkeletonIter bone); 786 0 yongsun 787 0 yongsun CSkeletonIter 788 0 yongsun forwardPuncBone(CSkeletonIter bone); 789 0 yongsun 790 0 yongsun CSkeletonIter 791 0 yongsun forwardPinyinBone(CSkeletonIter bone); 792 0 yongsun 793 0 yongsun CSkeletonIter 794 0 yongsun forwardNonPinyinBone(CSkeletonIter bone); 795 0 yongsun 796 0 yongsun CSkeletonIter 797 0 yongsun forwardInvalidBone(CSkeletonIter bone); 798 0 yongsun 799 0 yongsun CSkeletonIter 800 0 yongsun forwardTailBone(CSkeletonIter bone); 801 0 yongsun 802 0 yongsun }; // of CIMIContext 803 0 yongsun 804 0 yongsun #endif 805