1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef SUNPY_IMI_CONTEXT_H 39 #define SUNPY_IMI_CONTEXT_H 40 41 #include "portability.h" 42 43 #ifdef HAVE_CONFIG_H 44 #include <config.h> 45 #endif 46 47 #if defined(DEBUG) && defined (HAVE_ASSET_H) 48 #include <assert.h> 49 #endif 50 51 #include <climits> 52 #include <map> 53 #include <vector> 54 55 #include "pinyin_seg.h" 56 #include "imi_data.h" 57 #include "ic_history.h" 58 #include "userdict.h" 59 #include "lattice_states.h" 60 #include "imi_funcobjs.h" 61 62 /** 63 * TSentenceScore is only used for whole sentence score, 64 * the score from language model still using double. 65 */ 66 typedef TLongExpFloat TSentenceScore; 67 68 class CLatticeFrame; 69 class CCandidate; 70 class CIMIContext; 71 72 typedef std::vector<CLatticeFrame> CLattice; 73 typedef std::vector<CCandidate> CCandidates; 74 typedef CCandidates::iterator CCandidatesIter; 75 76 union TCandiRank { 77 public: 78 bool 79 operator< (const TCandiRank& b) const 80 { return m_all < b.m_all; }; 81 82 TCandiRank() : m_all(0) { } 83 84 TCandiRank(bool user, bool best, unsigned int len, 85 bool fromLattice, TSentenceScore score); 86 87 TCandiRank(bool user, bool best, unsigned int len, 88 bool fromLattice, unsigned score); 89 90 protected: 91 unsigned int m_all; 92 #if !defined(WORDS_BIGENDIAN) 93 struct TAnony { 94 unsigned m_cost : 24; 95 unsigned m_lattice: 1; 96 unsigned m_best : 1; 97 unsigned m_len : 5; 98 unsigned m_user : 1; 99 } anony; 100 #else 101 struct TAnony { 102 unsigned m_user : 1; 103 unsigned m_len : 5; 104 unsigned m_best : 1; 105 unsigned m_lattice: 1; 106 unsigned m_cost : 24; 107 } anony; 108 #endif 109 }; // TCandiRank 110 111 /** 112 * CCandidate represent basic information about a single candidate. 113 * Its start bone and finishing bone. It's content string. and its 114 * word id. 115 */ 116 class CCandidate { 117 friend class CIMIContext; 118 public: 119 unsigned m_start; 120 unsigned m_end; 121 const TWCHAR *m_cwstr; 122 123 public: 124 /** Give out the constructor for convinience */ 125 CCandidate(unsigned start=0, unsigned end=0, const TWCHAR* s = NULL, unsigned int wid=0) 126 : m_start(start), m_end(end), m_cwstr(s), m_wordId(wid) {} 127 128 protected: 129 unsigned int m_wordId; 130 }; // of CCandidate 131 132 class CLatticeFrame { 133 friend class CIMIContext; 134 public: 135 enum TYPE { 136 UNUSED = 0x0000, // unused frame 137 TAIL = 0x0001, // tail frame 138 139 CATE_SYLLABLE = 0x0100, 140 SYLLABLE = 0x0101, // pinyin 141 SYLLABLE_SEP = 0x0102, // pinyin 142 INCOMPLETE_SYLLABLE = 0x0104, // incomplete syllable string 143 144 CATE_OTHER = 0x0200, 145 ASCII = 0x0201, // english string 146 PUNC = 0x0202, // punctuation 147 SYMBOL = 0x0204, // other symbol 148 DIGITAL = 0x0208, // not implemeted here 149 }; // TYPE 150 151 enum BESTWORD_TYPE { 152 NO_BESTWORD = 1 << 0, 153 BESTWORD = 1 << 1, 154 USER_SELECTED = 1 << 2, 155 IGNORED = 1 << 3, 156 }; // BESTWORD_TYPE 157 158 unsigned m_type; 159 unsigned m_bwType; 160 wstring m_wstr; 161 162 CLatticeFrame () : m_type (UNUSED), m_bwType (NO_BESTWORD) {} 163 164 bool isUnusedFrame () const 165 {return m_type == 0;} 166 167 bool isSyllableFrame () const 168 {return (m_type & CATE_SYLLABLE);} 169 170 bool isSyllableSepFrame () const 171 {return ((m_type & SYLLABLE_SEP) > CATE_SYLLABLE);} 172 173 bool isTailFrame () const 174 {return (m_type == TAIL);} 175 176 void clear () 177 { 178 m_type = UNUSED; 179 m_bwType = NO_BESTWORD; 180 m_lexiconStates.clear(); 181 m_latticeStates.clear(); 182 m_wstr.clear (); 183 } 184 185 void print (std::string prefix); 186 187 protected: 188 CCandidate m_bestWord; 189 CLexiconStates m_lexiconStates; 190 CLatticeStates m_latticeStates; 191 }; // CLatticeFrame 192 193 class CIMIContext { 194 public: 195 CIMIContext (); 196 ~CIMIContext () {clear();} 197 198 void clear (); 199 200 void setCoreData (CIMIData *pCoreData); 201 void setUserDict (CUserDict *pUserDict) {m_pUserDict = pUserDict;} 202 203 void setHistoryMemory (CICHistory *phm) {m_pHistory = phm;} 204 CICHistory * getHistoryMemory () {return m_pHistory;} 205 206 void setHistoryPower (unsigned power) 207 {m_historyPower = power <=10? power: 3;} 208 209 int getHistoryPower () 210 {return m_historyPower;} 211 212 void setFullSymbolForwarding (bool value=true) {m_bFullSymbolForwarding = value;} 213 bool getFullSymbolForwarding () {return m_bFullSymbolForwarding;} 214 void setGetFullSymbolOp (CGetFullSymbolOp *op) {m_pGetFullSymbolOp = op;} 215 216 void setFullPunctForwarding (bool value=true) {m_bFullPunctForwarding = value;} 217 bool getFullPunctForwarding () {return m_bFullPunctForwarding;} 218 void setGetFullPunctOp (CGetFullPunctOp *op) {m_pGetFullPunctOp = op;} 219 220 void setNonCompleteSyllable(bool value=true) {m_bNonCompleteSyllable = value;} 221 bool getNonCompleteSyllable() {return m_bNonCompleteSyllable;} 222 223 void setCharsetLevel (unsigned l) {m_csLevel = l;} 224 unsigned getCharsetLevel () {return m_csLevel;} 225 226 void setDynamicCandidateOrder (bool value=true) {m_bDynaCandiOrder = value;} 227 bool getDynaCandiOrder () {return m_bDynaCandiOrder;} 228 229 CLattice& getLattice () {return m_lattice;} 230 bool buildLattice (IPySegmentor::TSegmentVec &segments, unsigned rebuildFrom=1, bool doSearch=true); 231 bool isEmpty () {return m_tailIdx <= 1;} 232 unsigned getLastFrIdx () {return m_tailIdx-1;} 233 234 bool searchFrom (unsigned from=1); 235 std::vector<unsigned>& getBestPath () {return m_bestPath;} 236 unsigned getBestSentence (wstring& result, unsigned start=0, unsigned end=UINT_MAX); 237 238 void getCandidates (unsigned frIdx, CCandidates& result); 239 unsigned cancelSelection (unsigned frIdx, bool doSearch=true); 240 void makeSelection (CCandidate &candi, bool doSearch=true); 241 242 void memorize (); 243 void printLattice (); 244 245 protected: 246 void _clearFrom (unsigned from); 247 248 inline void _forwardSyllables (unsigned i, unsigned j, std::vector<unsigned>& syllables); 249 inline void _forwardSingleSyllable (unsigned i, unsigned j, TSyllable syllable, bool isFuzzy=false); 250 inline void _forwardSyllableSep (unsigned i, unsigned j); 251 inline void _forwardString (unsigned i, unsigned j, std::vector<unsigned>& strbuf); 252 inline void _forwardPunctChar (unsigned i, unsigned j, unsigned ch); 253 inline void _forwardOrdinaryChar (unsigned i, unsigned j, unsigned ch); 254 inline void _forwardTail (unsigned i, unsigned j); 255 256 inline void _transferBetween (unsigned start, unsigned end, unsigned wid, double ic=1.0); 257 inline void _backTraceBestPath (); 258 inline void _clearBestPath (); 259 260 inline const TWCHAR *_getWstr (unsigned wid); 261 262 inline void _saveUserDict (); 263 inline void _saveHistoryCache (); 264 265 protected: 266 CLattice m_lattice; 267 unsigned m_tailIdx; 268 std::vector<unsigned> m_bestPath; 269 270 CThreadSlm *m_pModel; 271 CPinyinTrie *m_pPinyinTrie; 272 CUserDict *m_pUserDict; 273 CICHistory *m_pHistory; 274 unsigned m_historyPower; 275 276 unsigned m_csLevel; 277 278 bool m_bFullSymbolForwarding; 279 CGetFullSymbolOp *m_pGetFullSymbolOp; 280 281 bool m_bFullPunctForwarding; 282 CGetFullPunctOp *m_pGetFullPunctOp; 283 284 bool m_bNonCompleteSyllable; 285 bool m_bDynaCandiOrder; 286 287 unsigned m_candiStarts; 288 unsigned m_candiEnds; 289 }; // CIMIContext 290 291 #endif 292
