1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifndef _SIM_PYTRIE_H 39 0 yongsun #define _SIM_PYTRIE_H 40 0 yongsun 41 0 yongsun #include <stdio.h> 42 0 yongsun 43 0 yongsun #include "../portability.h" 44 0 yongsun 45 314 yongsun #include <stdint.h> 46 0 yongsun #include <string> 47 0 yongsun #include <vector> 48 0 yongsun #include <map> 49 0 yongsun #include <set> 50 0 yongsun #include <list> 51 0 yongsun 52 0 yongsun #define WORD_ID_WIDTH 18 53 0 yongsun 54 0 yongsun class CPinyinTrie { 55 0 yongsun public: 56 0 yongsun static const unsigned char SYLLABLE_BREAKER = '\''; 57 0 yongsun friend class CPinyinTrieMaker; 58 0 yongsun 59 0 yongsun class TTransUnit { 60 0 yongsun public: 61 198 tchaikov #if !defined(WORDS_BIGENDIAN) 62 0 yongsun unsigned m_Char : 8; 63 0 yongsun unsigned m_Offset: 24; 64 198 tchaikov #else 65 0 yongsun unsigned m_Offset: 24; 66 0 yongsun unsigned m_Char : 8; 67 0 yongsun #endif 68 0 yongsun }; 69 0 yongsun 70 0 yongsun class TWordIdInfo { 71 0 yongsun public: 72 198 tchaikov #if !defined(WORDS_BIGENDIAN) 73 0 yongsun unsigned m_id : WORD_ID_WIDTH; 74 0 yongsun unsigned m_bGBK : 1; 75 0 yongsun unsigned m_bGB18030 : 1; 76 0 yongsun unsigned m_len : 6; 77 0 yongsun unsigned m_cost : 5; 78 0 yongsun unsigned m_bSeen : 1; 79 198 tchaikov #else 80 0 yongsun unsigned m_bSeen : 1; 81 0 yongsun unsigned m_cost : 5; 82 0 yongsun unsigned m_len : 6; 83 0 yongsun unsigned m_bGB18030 : 1; 84 0 yongsun unsigned m_bGBK : 1; 85 0 yongsun unsigned m_id : WORD_ID_WIDTH; 86 0 yongsun #endif 87 0 yongsun 88 0 yongsun public: 89 0 yongsun TWordIdInfo() 90 0 yongsun { memset(this, 0, sizeof(TWordIdInfo)); } 91 0 yongsun 92 0 yongsun TWordIdInfo(unsigned id, unsigned len=0, unsigned seen=0, unsigned cost = 0, unsigned gbk = 0, unsigned gb18030 = 0) 93 34 tchaikov : m_id(id), m_bGBK(gbk), m_bGB18030(gb18030), m_len(len), m_cost(cost), m_bSeen(seen) { } 94 0 yongsun 95 0 yongsun operator 96 0 yongsun unsigned int() const { return m_id; } 97 0 yongsun }; 98 0 yongsun 99 0 yongsun class TNode { 100 0 yongsun public: 101 198 tchaikov #if !defined(WORDS_BIGENDIAN) 102 0 yongsun unsigned m_nWordId : 16; 103 0 yongsun unsigned m_nTransfer : 8; 104 0 yongsun unsigned m_bGBK : 1; 105 0 yongsun unsigned m_bGB18030 : 1; 106 0 yongsun unsigned m_bFullSyllableTransfer: 1; 107 0 yongsun unsigned m_bOther : 5; 108 0 yongsun 109 198 tchaikov #else 110 0 yongsun unsigned m_bOther : 5; 111 0 yongsun unsigned m_bFullSyllableTransfer: 1; 112 0 yongsun unsigned m_bGB18030 : 1; 113 0 yongsun unsigned m_bGBK : 1; 114 0 yongsun unsigned m_nTransfer : 8; 115 0 yongsun unsigned m_nWordId : 16; 116 0 yongsun 117 0 yongsun #endif 118 0 yongsun 119 0 yongsun public: 120 0 yongsun static unsigned int 121 0 yongsun size_for(unsigned int nTransfer, unsigned int nWordId) 122 0 yongsun { return sizeof(TNode) + sizeof(TTransUnit)*nTransfer + 123 0 yongsun sizeof(TWordIdInfo)*nWordId; } 124 0 yongsun 125 0 yongsun public: 126 198 tchaikov TNode() { *((uint32_t*)this) = 0; } 127 0 yongsun 128 0 yongsun bool 129 0 yongsun hasPinyinChild(void) const 130 0 yongsun { return (m_nTransfer > 1) || 131 0 yongsun (m_nTransfer ==1 && getTrans()->m_Char != SYLLABLE_BREAKER); } 132 0 yongsun 133 0 yongsun const TTransUnit* 134 0 yongsun getTrans() const 135 0 yongsun { return (TTransUnit*)(this+1); } 136 0 yongsun 137 0 yongsun const TWordIdInfo* 138 0 yongsun getWordIdPtr() const 139 0 yongsun { return (TWordIdInfo*)(((char*)(this+1))+sizeof(TTransUnit)*m_nTransfer); } 140 0 yongsun 141 0 yongsun unsigned int 142 0 yongsun transfer(unsigned char c) const; 143 0 yongsun 144 0 yongsun inline unsigned int 145 0 yongsun transfer(unsigned c) const 146 0 yongsun { return transfer((unsigned char)(c)); } 147 0 yongsun }; 148 0 yongsun 149 0 yongsun public: 150 34 tchaikov CPinyinTrie() : m_Size(0), m_mem(NULL), m_words(NULL) { } 151 0 yongsun 152 0 yongsun ~CPinyinTrie() { free(); } 153 0 yongsun 154 0 yongsun bool 155 0 yongsun load(const char* fileName); 156 0 yongsun 157 0 yongsun bool 158 0 yongsun isValid(const TNode* pnode, bool allowNonComplete, bool allowGBK=true); 159 0 yongsun 160 0 yongsun unsigned int 161 0 yongsun getRootOffset() const { return 3 * sizeof(unsigned int); } 162 0 yongsun 163 0 yongsun const TNode* 164 0 yongsun getRootNode() const { return (TNode*)(m_mem+getRootOffset()); } 165 0 yongsun 166 0 yongsun const TNode* 167 0 yongsun nodeFromOffset(unsigned int offset) const 168 0 yongsun { return (offset < getRootOffset())?NULL:((TNode*)(m_mem+offset)); } 169 0 yongsun 170 0 yongsun //@{ 171 0 yongsun /** transfer on an char or a string from a specific node*/ 172 0 yongsun //inline const TNode* 173 0 yongsun //transfer(const TNode* pnode, char c) const 174 0 yongsun // { return transfer(pnode, (unsigned char)c); } 175 0 yongsun 176 0 yongsun inline const TNode* 177 0 yongsun transfer(const TNode* pnode, unsigned char c) const 178 0 yongsun { return nodeFromOffset(pnode->transfer(c)); } 179 0 yongsun 180 0 yongsun inline const TNode* 181 0 yongsun transfer(const TNode* pnode, TWCHAR wc) const 182 0 yongsun { return nodeFromOffset(pnode->transfer(unsigned(wc))); } 183 0 yongsun 184 0 yongsun /* 185 0 yongsun inline const TNode* 186 0 yongsun transfer(const TNode* pnode, const char* str) const 187 0 yongsun { return transfer(pnode, (const unsigned char*)str); } 188 0 yongsun */ 189 0 yongsun 190 0 yongsun const TNode* 191 0 yongsun transfer(const TNode* pnode, const unsigned char* str) const; 192 0 yongsun 193 0 yongsun const TNode* 194 0 yongsun transfer(const TNode* pnode, const TWCHAR* wstr) const; 195 0 yongsun 196 0 yongsun /* 197 0 yongsun inline const TNode* 198 0 yongsun transfer(const TNode* pnode, const char* str, int nlen) const 199 0 yongsun { return transfer(pnode, (const unsigned char*)str, nlen); } 200 0 yongsun */ 201 0 yongsun 202 0 yongsun const TNode* 203 0 yongsun transfer(const TNode* pnode, const unsigned char* str, int nlen) const; 204 0 yongsun 205 0 yongsun const TNode* 206 0 yongsun transfer(const TNode* pnode, const TWCHAR* wstr, int nlen) const; 207 0 yongsun //@} 208 0 yongsun 209 0 yongsun //@{ 210 0 yongsun /** transfer on an char or a string from root node*/ 211 0 yongsun /* 212 0 yongsun inline const TNode* 213 0 yongsun transfer(const char* str) const 214 0 yongsun { return transfer(getRootNode(), str); } 215 0 yongsun */ 216 0 yongsun 217 0 yongsun inline const TNode* 218 0 yongsun transfer(const unsigned char* str) const 219 0 yongsun { return transfer(getRootNode(), str); } 220 0 yongsun 221 0 yongsun inline const TNode* 222 0 yongsun transfer(const TWCHAR* wstr) const 223 0 yongsun { return transfer(getRootNode(), wstr); } 224 0 yongsun 225 0 yongsun /* 226 0 yongsun inline const TNode* 227 0 yongsun transfer(const char* str, int nlen) const 228 0 yongsun { return transfer(getRootNode(), str, nlen); } 229 0 yongsun */ 230 0 yongsun 231 0 yongsun inline const TNode* 232 0 yongsun transfer(const unsigned char* str, int nlen) const 233 0 yongsun { return transfer(getRootNode(), str, nlen); } 234 0 yongsun 235 0 yongsun inline const TNode* 236 0 yongsun transfer(const TWCHAR* wstr, int nlen) const 237 0 yongsun { return transfer(getRootNode(), wstr, nlen); } 238 0 yongsun //@} 239 0 yongsun 240 0 yongsun unsigned int 241 0 yongsun getWordCount(void) const { return *(unsigned int*)m_mem; } 242 0 yongsun 243 0 yongsun unsigned int 244 0 yongsun getNodeCount(void) const { return *(unsigned int*)(m_mem+sizeof(unsigned int)); } 245 0 yongsun 246 0 yongsun unsigned int 247 0 yongsun getStringOffset(void) const { return *(unsigned int*)(m_mem+2*sizeof(unsigned int)); } 248 0 yongsun 249 0 yongsun unsigned int 250 0 yongsun getSimbolId(const TWCHAR* wstr); 251 0 yongsun 252 0 yongsun unsigned int 253 0 yongsun getSimbolId(const wstring & wstr); 254 0 yongsun 255 0 yongsun const TWCHAR* 256 0 yongsun operator[](unsigned int idx) const { return m_words[idx]; } 257 0 yongsun 258 0 yongsun int 259 0 yongsun lengthAt(unsigned int idx) const; 260 0 yongsun 261 0 yongsun void 262 0 yongsun free(void); 263 0 yongsun 264 0 yongsun void 265 0 yongsun print(FILE *fp) const; 266 0 yongsun 267 0 yongsun protected: 268 0 yongsun unsigned int m_Size; 269 0 yongsun char *m_mem; 270 0 yongsun TWCHAR **m_words; 271 0 yongsun 272 0 yongsun std::map<wstring, unsigned> m_SimbolMap; 273 0 yongsun 274 0 yongsun protected: 275 0 yongsun void 276 0 yongsun print(const TNode* pRoot, std::string& prefix, FILE *fp) const; 277 0 yongsun }; 278 0 yongsun 279 0 yongsun #endif 280