1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include "config.h" 40 0 yongsun #endif 41 0 yongsun 42 0 yongsun #ifdef HAVE_ASSERT_H 43 0 yongsun #include <assert.h> 44 0 yongsun #endif 45 0 yongsun 46 0 yongsun #include <algorithm> 47 90 tonylee 48 90 tonylee #ifdef HAVE_ICONV_H 49 0 yongsun #include <iconv.h> 50 90 tonylee #endif 51 0 yongsun 52 0 yongsun #include "pytrie_gen.h" 53 0 yongsun 54 0 yongsun static const char* 55 0 yongsun skipSpace(const char* p) 56 0 yongsun { 57 0 yongsun while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') 58 0 yongsun ++p; 59 0 yongsun return p; 60 0 yongsun } 61 0 yongsun 62 0 yongsun static const char* 63 0 yongsun skipNonSpace(const char* p) 64 0 yongsun { 65 0 yongsun while (*p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') 66 0 yongsun ++p; 67 0 yongsun return p; 68 0 yongsun } 69 0 yongsun 70 0 yongsun static void 71 0 yongsun insertWordId(CPinyinTrieMaker::CWordSet& idset, CPinyinTrieMaker::TWordId id) 72 0 yongsun { 73 0 yongsun CPinyinTrieMaker::CWordSet::iterator it = idset.find(id); 74 0 yongsun if (it == idset.end()) 75 0 yongsun idset.insert(id); 76 0 yongsun else { 77 0 yongsun const CPinyinTrieMaker::TWordId& a = *it; 78 0 yongsun if ((a.anony.m_bHide && !id.anony.m_bHide) || (a.anony.m_bHide == id.anony.m_bHide && a.anony.m_cost > id.anony.m_cost)) { 79 0 yongsun idset.erase(it); 80 0 yongsun idset.insert(id); 81 0 yongsun } 82 0 yongsun } 83 0 yongsun } 84 0 yongsun 85 0 yongsun struct TSyllableInfo { 86 0 yongsun std::string m_py; 87 0 yongsun int m_cost; 88 0 yongsun 89 0 yongsun TSyllableInfo(const char* py=NULL, int cost=0) : m_py(py), m_cost(cost) {} 90 0 yongsun bool operator< (const TSyllableInfo& b) const { return m_py < b.m_py; } 91 0 yongsun }; 92 0 yongsun 93 90 tonylee #ifdef HAVE_ICONV_H 94 0 yongsun bool isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric) 95 0 yongsun { 96 0 yongsun static char gbstr[256]; 97 0 yongsun static char utstr[256]; 98 0 yongsun 99 0 yongsun TIConvSrcPtr src = (TIConvSrcPtr)utf8; 100 0 yongsun size_t srclen = strlen((char*)src)+1; 101 0 yongsun char* dst = (char *)gbstr; 102 0 yongsun size_t dstlen = 256; 103 0 yongsun size_t res = iconv(ic, &src, &srclen, &dst, &dstlen); 104 0 yongsun 105 0 yongsun if (res != size_t(-1) && srclen == 0) { 106 0 yongsun // do revert convertion and compare them 107 0 yongsun src = (TIConvSrcPtr)gbstr; 108 0 yongsun srclen = strlen((char*)src)+1; 109 0 yongsun dst = (char *)utstr; 110 0 yongsun dstlen = 256; 111 0 yongsun res = iconv(ric, &src, &srclen, &dst, &dstlen); 112 0 yongsun if (res != size_t(-1) && srclen == 0) 113 0 yongsun return (strcmp(utf8, utstr) == 0); 114 0 yongsun } 115 0 yongsun return false; 116 0 yongsun } 117 0 yongsun 118 0 yongsun //return: bit 0x1: contains some gbk out of gb2312, bit 0x2: contains some gb18030 outof gbk 119 0 yongsun unsigned getPureGBEncoding(const char* utf8str) 120 0 yongsun { 121 0 yongsun static iconv_t ic_gb = iconv_open("GB2312", "UTF-8"); 122 0 yongsun static iconv_t ic_gbk = iconv_open("GBK", "UTF-8"); 123 0 yongsun static iconv_t ric_gb = iconv_open("UTF-8", "GB2312"); 124 0 yongsun static iconv_t ric_gbk = iconv_open("UTF-8", "GBK"); 125 0 yongsun 126 0 yongsun unsigned ret = 0; 127 0 yongsun 128 0 yongsun if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) { 129 0 yongsun ret = 1; // at least it is contains some GBK char 130 0 yongsun if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk)) 131 0 yongsun ret = 3; //contains some GB18030-only char 132 0 yongsun 133 0 yongsun #ifdef DEBUG 134 0 yongsun fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret); 135 0 yongsun fflush(stderr); 136 0 yongsun #endif 137 0 yongsun } 138 0 yongsun return ret; 139 0 yongsun } 140 90 tonylee #else // !HAVE_ICONV_H 141 90 tonylee unsigned getPureGBEncoding(const char* utf8str) 142 90 tonylee { 143 90 tonylee // FIXME 144 90 tonylee return 0x3; 145 90 tonylee } 146 90 tonylee #endif // HAVE_ICONV_H 147 0 yongsun 148 0 yongsun bool 149 0 yongsun parseLine(char* buf, char* word_buf, int& id, std::set<TSyllableInfo>& pyset) 150 0 yongsun { 151 0 yongsun pyset.clear(); 152 5 ys148558 153 5 ys148558 /* ignore the empty lines and comment lines */ 154 5 ys148558 if (*buf == '\n' || *buf == '#') 155 5 ys148558 return 0; 156 5 ys148558 157 0 yongsun char* word_start = word_buf; 158 0 yongsun char* p = (char*)skipSpace(buf); 159 0 yongsun char* t = (char*)skipNonSpace(p); 160 0 yongsun while(p < t) *word_buf++ = *p++; 161 0 yongsun *word_buf = 0; 162 0 yongsun 163 0 yongsun p = (char*)skipSpace(p); 164 0 yongsun t = (char*)skipNonSpace(p); 165 0 yongsun if (*t) 166 0 yongsun *t++ = 0; 167 0 yongsun id = atoi(p); 168 0 yongsun p = (char*)skipSpace(t); 169 0 yongsun while (*p) { 170 0 yongsun const char* s = p; 171 0 yongsun t = (char*)skipNonSpace(p); 172 0 yongsun if (*t) 173 0 yongsun *t++ = 0; 174 0 yongsun while ((*p >= 'a' && *p <= 'z') || (*p == CPinyinTrie::SYLLABLE_BREAKER)) 175 0 yongsun ++p; 176 0 yongsun if ((p > s) && ((*p == 0) || (*p == ':'))) { 177 0 yongsun int cost = 0; 178 0 yongsun if (*p == ':') { 179 0 yongsun *p++ = 0; 180 0 yongsun cost = atoi(p); 181 0 yongsun } 182 0 yongsun pyset.insert(TSyllableInfo(s, cost)); 183 0 yongsun } 184 0 yongsun p = (char*)skipSpace(t); 185 0 yongsun } 186 0 yongsun return pyset.size() > 0; 187 0 yongsun } 188 0 yongsun 189 0 yongsun 190 0 yongsun CPinyinTrieMaker::CPinyinTrieMaker() 191 0 yongsun : m_RootNode(), m_FullSyllables(), m_StateMap(), m_AllNodes() 192 0 yongsun { 193 0 yongsun m_AllNodes.push_back(&m_RootNode); 194 0 yongsun m_RootNode.m_bExpanded = true; 195 0 yongsun m_RootNode.m_PrimitiveNodes.insert(&m_RootNode); 196 0 yongsun m_StateMap[&(m_RootNode.m_PrimitiveNodes)] = &m_RootNode; 197 0 yongsun } 198 0 yongsun /********************************************************** 199 0 yongsun lexicon 200 0 yongsun TAB(1) 201 0 yongsun word id 202 0 yongsun ' 203 0 yongsun 4095; 204 0 yongsun **********************************************************/ 205 44 yongsun #define RARE_MULTI_PHONETIC_STARTING_ID 140000 /* FIXME */ 206 0 yongsun bool 207 0 yongsun CPinyinTrieMaker::constructFromLexicon(const char* fileName) 208 0 yongsun { 209 44 yongsun static int rmp_id = RARE_MULTI_PHONETIC_STARTING_ID; 210 0 yongsun static char buf[4096]; 211 0 yongsun static char word_buf[2048]; 212 0 yongsun 213 0 yongsun int id; 214 0 yongsun bool suc = true; 215 0 yongsun std::set<TSyllableInfo> pyset; 216 0 yongsun FILE *fp = fopen(fileName, "r"); 217 0 yongsun printf("Adding pinyin and corresponding words..."); fflush(stdout); 218 0 yongsun while (fgets(buf, 4096, fp) != NULL) { 219 0 yongsun if (!parseLine(buf, word_buf, id, pyset)) { 220 0 yongsun if (word_buf[0] != L'<' && word_buf[0] != 0) { 221 0 yongsun if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1); 222 0 yongsun m_Lexicon[id] = std::string(word_buf); 223 0 yongsun } 224 0 yongsun continue; 225 0 yongsun } 226 0 yongsun unsigned gbcategory = getPureGBEncoding(word_buf); 227 0 yongsun 228 0 yongsun std::set<TSyllableInfo>::iterator its = pyset.begin(); 229 0 yongsun std::set<TSyllableInfo>::iterator ite = pyset.end(); 230 0 yongsun for (; its != ite; ++its) { 231 0 yongsun const char *t = its->m_py.c_str(); 232 0 yongsun int cost = its->m_cost; 233 44 yongsun int myid = id; 234 44 yongsun 235 0 yongsun if (cost < 0) { 236 0 yongsun cost = 30 / (-cost); 237 44 yongsun myid = rmp_id ++; 238 44 yongsun } 239 44 yongsun 240 44 yongsun if (m_Lexicon.size() < myid+1) m_Lexicon.resize(myid+1); 241 44 yongsun m_Lexicon[myid] = std::string(word_buf); 242 44 yongsun 243 44 yongsun CPinyinTrieMaker::TWordId wid(myid, cost, its->m_cost < 0, gbcategory & 0x1, gbcategory & 0x2); 244 0 yongsun suc = insertFullPinyinPair(t, wid) && suc; 245 0 yongsun 246 0 yongsun while (*t) { 247 0 yongsun char *p = buf; 248 0 yongsun while (*t != 0 && *t != CPinyinTrie::SYLLABLE_BREAKER) 249 0 yongsun *p++ = *t++; 250 0 yongsun *p = 0; 251 0 yongsun registerFullSyllable(buf); 252 0 yongsun if (*t == CPinyinTrie::SYLLABLE_BREAKER) 253 0 yongsun ++t; 254 0 yongsun } 255 0 yongsun } 256 0 yongsun } 257 0 yongsun fclose(fp); 258 0 yongsun 259 0 yongsun std::string pyPrefix = ""; 260 0 yongsun 261 0 yongsun printf("\n %d primitive nodes", m_AllNodes.size()); fflush(stdout); 262 0 yongsun 263 0 yongsun /* 264 0 yongsun printf("\n Printing it to stderr..."); 265 0 yongsun print(stderr, &m_RootNode, pyPrefix); 266 0 yongsun */ 267 0 yongsun 268 0 yongsun printf("\nThreading non-complete pinyin..."); fflush(stdout); 269 0 yongsun suc = threadNonCompletePinyin() && suc; 270 0 yongsun printf("\n %d total nodes", m_AllNodes.size()); fflush(stdout); 271 0 yongsun 272 0 yongsun /* 273 0 yongsun printf("\n Printing it to stderr..."); 274 0 yongsun fprintf(stderr, "\n\n\n\n-----------------------------\n\n\n\n"); 275 0 yongsun print(stderr, &m_RootNode, pyPrefix); 276 0 yongsun */ 277 0 yongsun 278 0 yongsun printf("\n"); fflush(stdout); 279 0 yongsun 280 0 yongsun return suc; 281 0 yongsun } 282 0 yongsun 283 0 yongsun CPinyinTrieMaker::TNode::TNode() 284 0 yongsun : m_bFullSyllableTransfer(false), m_bExpanded(false), m_WordIdSet(), 285 0 yongsun m_Trans(), m_PrimitiveNodes(), m_SyllablePrefix() 286 0 yongsun { 287 0 yongsun } 288 0 yongsun 289 0 yongsun bool 290 0 yongsun CPinyinTrieMaker::PNodeSet::operator< (const PNodeSet& another) const 291 0 yongsun { 292 0 yongsun CNodeSet::const_iterator t1 = m_pns->begin(); 293 0 yongsun CNodeSet::const_iterator t2 = m_pns->end(); 294 0 yongsun CNodeSet::const_iterator a1 = another.m_pns->begin(); 295 0 yongsun CNodeSet::const_iterator a2 = another.m_pns->end(); 296 0 yongsun for (; t1 != t2 && a1 != a2; ++t1, ++a1) { 297 0 yongsun if (*t1 < *a1) return true; 298 0 yongsun if (*t1 > *a1) return false; 299 0 yongsun } 300 0 yongsun return (a1 != a2); 301 0 yongsun } 302 0 yongsun 303 0 yongsun bool 304 0 yongsun CPinyinTrieMaker::PNodeSet::operator==(const PNodeSet& another) const 305 0 yongsun { 306 0 yongsun CNodeSet::const_iterator t1 = m_pns->begin(); 307 0 yongsun CNodeSet::const_iterator t2 = m_pns->end(); 308 0 yongsun CNodeSet::const_iterator a1 = another.m_pns->begin(); 309 0 yongsun CNodeSet::const_iterator a2 = another.m_pns->end(); 310 0 yongsun for (; t1 != t2 && a1 != a2; ++t1, ++a1) { 311 0 yongsun if (*t1 != *a1) return false; 312 0 yongsun } 313 0 yongsun return (a1 == a2 && t1 != t2); 314 0 yongsun } 315 0 yongsun 316 0 yongsun 317 0 yongsun void 318 0 yongsun CPinyinTrieMaker::print(FILE* fp, TNode* root, std::string& pinyin) 319 0 yongsun { 320 0 yongsun if (root && root->m_WordIdSet.size() > 0) { 321 0 yongsun fprintf(fp, "%s", pinyin.c_str()); 322 0 yongsun CWordSet::iterator itId = root->m_WordIdSet.begin(); 323 0 yongsun CWordSet::iterator itIdLast = root->m_WordIdSet.end(); 324 0 yongsun for (; itId != itIdLast; ++itId) { 325 0 yongsun fprintf(fp, " %s", m_Lexicon[itId->anony.m_id].c_str()); 326 0 yongsun } 327 0 yongsun fprintf(fp, "\n"); 328 0 yongsun } 329 0 yongsun if (root) { 330 0 yongsun CTrans::iterator itTrans = root->m_Trans.begin(); 331 0 yongsun CTrans::iterator itTransLast = root->m_Trans.end(); 332 0 yongsun for (; itTrans != itTransLast; ++itTrans) { 333 0 yongsun pinyin += itTrans->first; 334 0 yongsun print(fp, itTrans->second, pinyin); 335 0 yongsun pinyin.resize(pinyin.size() -1); 336 0 yongsun } 337 0 yongsun } 338 0 yongsun } 339 0 yongsun 340 0 yongsun 341 0 yongsun /*********************************************************** 342 0 yongsun 343 0 yongsun pinyin trie 344 0 yongsun ***********************************************************/ 345 0 yongsun bool 346 0 yongsun CPinyinTrieMaker::registerFullSyllable(const char* pinyin) 347 0 yongsun { 348 0 yongsun CPinyinString str(pinyin); 349 0 yongsun m_FullSyllables.insert(str); 350 0 yongsun return true; 351 0 yongsun } 352 0 yongsun 353 0 yongsun 354 0 yongsun CPinyinTrieMaker::TNode* 355 0 yongsun CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned char c) 356 0 yongsun { 357 0 yongsun CTrans::iterator itt = pnode->m_Trans.find(c); 358 0 yongsun CTrans::iterator ite = pnode->m_Trans.end(); 359 0 yongsun if (itt == ite) { 360 0 yongsun TNode *p = new TNode(); 361 0 yongsun m_AllNodes.push_back(p); 362 0 yongsun pnode->m_Trans[c] = p; 363 0 yongsun if (c != CPinyinTrie::SYLLABLE_BREAKER) { 364 0 yongsun p->m_SyllablePrefix = pnode->m_SyllablePrefix; 365 0 yongsun p->m_SyllablePrefix += c; 366 0 yongsun } 367 0 yongsun return p; 368 0 yongsun } 369 0 yongsun return itt->second; 370 0 yongsun } 371 0 yongsun 372 0 yongsun /*********************************************************** 373 0 yongsun pinyin: != NULL, 26' 374 0 yongsun >0'' 375 0 yongsun wid : word id 376 0 yongsun ***********************************************************/ 377 0 yongsun bool 378 0 yongsun CPinyinTrieMaker::insertFullPinyinPair(const char* pinyin, TWordId wid) 379 0 yongsun { 380 0 yongsun const char* p = pinyin; 381 0 yongsun TNode *pnode = &m_RootNode; 382 0 yongsun for (; *p; ++p) { 383 0 yongsun if (*p == CPinyinTrie::SYLLABLE_BREAKER) 384 0 yongsun pnode->m_bFullSyllableTransfer = true; 385 0 yongsun pnode = insertTransfer(pnode, (unsigned char)*p); 386 0 yongsun pnode->m_bExpanded = true; 387 0 yongsun pnode->m_PrimitiveNodes.insert(pnode); 388 0 yongsun m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode; 389 0 yongsun } 390 0 yongsun if (*p-1 != CPinyinTrie::SYLLABLE_BREAKER) { 391 0 yongsun pnode->m_bFullSyllableTransfer = true; 392 0 yongsun pnode = insertTransfer(pnode, CPinyinTrie::SYLLABLE_BREAKER); 393 0 yongsun pnode->m_bExpanded = true; 394 0 yongsun pnode->m_PrimitiveNodes.insert(pnode); 395 0 yongsun m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode; 396 0 yongsun } 397 0 yongsun insertWordId(pnode->m_WordIdSet, wid); 398 0 yongsun return true; 399 0 yongsun } 400 0 yongsun 401 0 yongsun bool 402 0 yongsun CPinyinTrieMaker::threadNonCompletePinyin(void) 403 0 yongsun { 404 0 yongsun CNodeList::iterator itNode = m_AllNodes.begin(); 405 0 yongsun for (; itNode != m_AllNodes.end(); ++itNode) { 406 0 yongsun TNode* pnode = *itNode; 407 0 yongsun if (!pnode->m_bExpanded) 408 0 yongsun expandNode(pnode); 409 0 yongsun 410 0 yongsun if (pnode->m_SyllablePrefix.size() > 0 && 411 0 yongsun m_FullSyllables.find(pnode->m_SyllablePrefix) == m_FullSyllables.end() && 412 0 yongsun pnode->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) == pnode->m_Trans.end()) { 413 0 yongsun addNonCompleteSyllableTransfer(pnode); 414 0 yongsun } 415 0 yongsun } 416 0 yongsun return true; 417 0 yongsun } 418 0 yongsun 419 0 yongsun /** 420 0 yongsun * For those node which are added after fullComplete PINYIN string, 421 0 yongsun * give their succ nodes, for example, for "dian'ying dui'yu dian'hua", an 422 0 yongsun * new state "d'" would be expand to "d'y d'h" 423 0 yongsun */ 424 0 yongsun void 425 0 yongsun CPinyinTrieMaker::expandNode(TNode* pnode) 426 0 yongsun { 427 0 yongsun std::map<char, CNodeSet> combTrans; 428 0 yongsun 429 0 yongsun CNodeSet::iterator itNode = pnode->m_PrimitiveNodes.begin(); 430 0 yongsun CNodeSet::iterator itNodeLast = pnode->m_PrimitiveNodes.end(); 431 0 yongsun for (; itNode != itNodeLast; ++itNode) { 432 0 yongsun CTrans::iterator itTrans = (*itNode)->m_Trans.begin(); 433 0 yongsun CTrans::iterator itTransLast = (*itNode)->m_Trans.end(); 434 0 yongsun for (; itTrans != itTransLast; ++itTrans) { 435 0 yongsun if (itTrans->first == CPinyinTrie::SYLLABLE_BREAKER && 436 0 yongsun !(*itNode)->m_bFullSyllableTransfer) { 437 0 yongsun continue; 438 0 yongsun } 439 0 yongsun combTrans[itTrans->first].insert(itTrans->second); 440 0 yongsun } 441 0 yongsun } 442 0 yongsun 443 0 yongsun std::map<char, CNodeSet>::iterator itCombTrans = combTrans.begin(); 444 0 yongsun std::map<char, CNodeSet>::iterator itCombTransLast = combTrans.end(); 445 0 yongsun for (; itCombTrans != itCombTransLast; ++itCombTrans) { 446 0 yongsun //if a new state, ie new node set appear, create the new state --> ps 447 0 yongsun //esle let ps = the founded old state, let transfer(c) = state 448 0 yongsun unsigned char c = itCombTrans->first; 449 0 yongsun // if (c == CPinyinTrie::SYLLABLE_BREAKER && !pnode->m_bFullSyllableTransfer) { 450 0 yongsun // continue; 451 0 yongsun // } 452 0 yongsun TNode* pChildNode = NULL; 453 0 yongsun CStateMap::iterator itStateMap = m_StateMap.find(&itCombTrans->second); 454 0 yongsun if (itStateMap != m_StateMap.end()) { 455 0 yongsun pChildNode = itStateMap->second; 456 0 yongsun } else { 457 0 yongsun pChildNode = new TNode(); 458 0 yongsun m_AllNodes.push_back(pChildNode); 459 0 yongsun pChildNode->m_PrimitiveNodes = itCombTrans->second; 460 0 yongsun m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode; 461 0 yongsun if (c != CPinyinTrie::SYLLABLE_BREAKER) { 462 0 yongsun pChildNode->m_SyllablePrefix = pnode->m_SyllablePrefix; 463 0 yongsun pChildNode->m_SyllablePrefix += c; 464 0 yongsun } else { 465 0 yongsun CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin(); 466 0 yongsun CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end(); 467 0 yongsun for (; itps != itpse; ++itps) { 468 0 yongsun CWordSet::iterator ita = (*itps)->m_WordIdSet.begin(); 469 0 yongsun CWordSet::iterator itb = (*itps)->m_WordIdSet.end(); 470 0 yongsun for (; ita != itb; ++ita) 471 0 yongsun insertWordId(pChildNode->m_WordIdSet, *ita); 472 0 yongsun //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end()); 473 0 yongsun } 474 0 yongsun pnode->m_bFullSyllableTransfer = false; 475 0 yongsun } 476 0 yongsun } 477 0 yongsun pnode->m_Trans[c] = pChildNode; 478 0 yongsun } 479 0 yongsun pnode->m_bExpanded = true; 480 0 yongsun } 481 0 yongsun 482 0 yongsun void 483 0 yongsun CPinyinTrieMaker::addNonCompleteSyllableTransfer(TNode* pnode) 484 0 yongsun { 485 0 yongsun CNodeSet syChildren; 486 0 yongsun TNode* pChildNode = NULL; 487 0 yongsun 488 0 yongsun findSyllableChildren(pnode, syChildren); 489 0 yongsun if (syChildren.size() == 0) //z, c, s with only zh, ch, sh children 490 0 yongsun return; 491 0 yongsun CStateMap::iterator itStateMap = m_StateMap.find(&syChildren); 492 0 yongsun if (itStateMap != m_StateMap.end()) { 493 0 yongsun pChildNode = itStateMap->second; 494 0 yongsun } else { 495 0 yongsun pChildNode = new TNode(); 496 0 yongsun m_AllNodes.push_back(pChildNode); 497 0 yongsun pChildNode->m_PrimitiveNodes = syChildren; 498 0 yongsun m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode; 499 0 yongsun CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin(); 500 0 yongsun CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end(); 501 0 yongsun for (; itps != itpse; ++itps) { 502 0 yongsun CWordSet::iterator ita = (*itps)->m_WordIdSet.begin(); 503 0 yongsun CWordSet::iterator itb = (*itps)->m_WordIdSet.end(); 504 0 yongsun for (; ita != itb; ++ita) 505 0 yongsun insertWordId(pChildNode->m_WordIdSet, *ita); 506 0 yongsun //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end()); 507 0 yongsun } 508 0 yongsun } 509 0 yongsun pnode->m_Trans[CPinyinTrie::SYLLABLE_BREAKER] = pChildNode; 510 0 yongsun // this is default: pnode->m_bFullSyllableTransfer = false; 511 0 yongsun } 512 0 yongsun 513 0 yongsun int 514 0 yongsun CPinyinTrieMaker::findSyllableChildren(const TNode *pn, CNodeSet& children) 515 0 yongsun { 516 0 yongsun CNodeSet::iterator itNode = pn->m_PrimitiveNodes.begin(); 517 0 yongsun CNodeSet::iterator itNodeLast = pn->m_PrimitiveNodes.end(); 518 0 yongsun for (children.clear(); itNode != itNodeLast; ++itNode) { 519 0 yongsun findPrimitiveSyllableChildren(*itNode, children); 520 0 yongsun } 521 0 yongsun return children.size(); 522 0 yongsun } 523 0 yongsun 524 0 yongsun void 525 0 yongsun CPinyinTrieMaker::findPrimitiveSyllableChildren(const TNode *pn, CNodeSet& children) 526 0 yongsun { 527 0 yongsun CTrans::const_iterator it = pn->m_Trans.begin(); 528 0 yongsun CTrans::const_iterator ite= pn->m_Trans.end(); 529 0 yongsun for (; it != ite; ++it) { 530 0 yongsun if (it->first != CPinyinTrie::SYLLABLE_BREAKER) { 531 0 yongsun if (it->first == 'h' && 532 0 yongsun (pn->m_SyllablePrefix == "c" || pn->m_SyllablePrefix == "z" || 533 0 yongsun pn->m_SyllablePrefix == "s" ) ) { 534 0 yongsun continue; 535 0 yongsun } 536 0 yongsun findPrimitiveSyllableChildren(it->second, children); 537 0 yongsun } else { 538 0 yongsun if (pn->m_bFullSyllableTransfer) { 539 0 yongsun children.insert(it->second); 540 0 yongsun } 541 0 yongsun } 542 0 yongsun } 543 0 yongsun } 544 0 yongsun 545 0 yongsun bool 546 0 yongsun CPinyinTrieMaker::write(const char* fileName, CWordEvaluator* psrt) 547 0 yongsun { 548 0 yongsun bool suc = false; 549 0 yongsun FILE* fp = fopen(fileName, "wb"); 550 0 yongsun if (fp != NULL) { 551 0 yongsun suc = write(fp, psrt); 552 0 yongsun fclose(fp); 553 0 yongsun } 554 0 yongsun return suc; 555 0 yongsun } 556 0 yongsun 557 0 yongsun bool 558 0 yongsun CPinyinTrieMaker::write(FILE *fp, CWordEvaluator* psrt) 559 0 yongsun { 560 0 yongsun bool suc = true; 561 0 yongsun static TWCHAR wbuf[1024]; 562 0 yongsun 563 0 yongsun std::map<TNode*, unsigned int> nodeOffsetMap; 564 0 yongsun 565 0 yongsun /*the file started with m_nWord, the size itself do not included here*/ 566 0 yongsun unsigned int nWord = m_Lexicon.size(); 567 0 yongsun unsigned int nNode = m_AllNodes.size(); 568 0 yongsun unsigned int lexiconOffset; 569 0 yongsun unsigned int offset = sizeof(unsigned int) * 3; 570 0 yongsun 571 0 yongsun CNodeList::const_iterator itNode = m_AllNodes.begin(); 572 0 yongsun CNodeList::const_iterator itNodeLast = m_AllNodes.end(); 573 0 yongsun for (; itNode != itNodeLast; ++itNode) { 574 0 yongsun nodeOffsetMap[*itNode] = offset; 575 0 yongsun offset += CPinyinTrie::TNode::size_for((*itNode)->m_Trans.size(), 576 0 yongsun (*itNode)->m_WordIdSet.size()); 577 0 yongsun } 578 0 yongsun lexiconOffset = offset; 579 0 yongsun CLexicon::const_iterator itWordStr = m_Lexicon.begin(); 580 0 yongsun CLexicon::const_iterator itWordStrLast = m_Lexicon.end(); 581 0 yongsun for (; itWordStr != itWordStrLast; ++itWordStr) { 582 0 yongsun MBSTOWCS(wbuf, itWordStr->c_str(), 1024); 583 0 yongsun int sz = WCSLEN(wbuf); 584 0 yongsun offset += (sz+1)*sizeof(TWCHAR); 585 0 yongsun } 586 0 yongsun 587 0 yongsun suc = (fwrite(&offset, sizeof(unsigned int), 1, fp) == 1); 588 0 yongsun suc = (fwrite(&nWord, sizeof(unsigned int), 1, fp) == 1); 589 0 yongsun suc = (fwrite(&nNode, sizeof(unsigned int), 1, fp) == 1); 590 0 yongsun suc = (fwrite(&lexiconOffset, sizeof(unsigned int), 1, fp) == 1); 591 0 yongsun 592 0 yongsun itNode = m_AllNodes.begin(); 593 0 yongsun itNodeLast = m_AllNodes.end(); 594 0 yongsun for (; itNode != itNodeLast && suc; ++itNode) { 595 0 yongsun CPinyinTrie::TNode outNode; 596 0 yongsun outNode.m_nTransfer = (*itNode)->m_Trans.size(); 597 0 yongsun outNode.m_nWordId = (*itNode)->m_WordIdSet.size(); 598 0 yongsun outNode.m_bFullSyllableTransfer = (*itNode)->m_bFullSyllableTransfer; 599 0 yongsun 600 0 yongsun //determine this node's GB category, have some pure gb2312 words, or all GBK/GB18030 words 601 0 yongsun outNode.m_bGBK = 1; 602 0 yongsun outNode.m_bGB18030 = 1; 603 0 yongsun 604 0 yongsun TNode* itequ = *itNode; 605 0 yongsun if (outNode.m_nWordId == 0) { 606 0 yongsun if ((*itNode)->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) != (*itNode)->m_Trans.end()) { 607 0 yongsun itequ = (*itNode)->m_Trans[CPinyinTrie::SYLLABLE_BREAKER]; 608 0 yongsun if (itequ->m_WordIdSet.size() == 0) { 609 0 yongsun outNode.m_bGBK = 0; 610 0 yongsun outNode.m_bGB18030 = 0; 611 0 yongsun } 612 0 yongsun } else { 613 0 yongsun outNode.m_bGBK = 0; 614 0 yongsun outNode.m_bGB18030 = 0; 615 0 yongsun } 616 0 yongsun } 617 0 yongsun CWordSet::iterator itId = itequ->m_WordIdSet.begin(); 618 0 yongsun CWordSet::iterator itIdLast = itequ->m_WordIdSet.end(); 619 0 yongsun for (; itId != itIdLast && outNode.m_bGBK; ++itId) { 620 0 yongsun outNode.m_bGB18030 &= itId->anony.m_bGB18030; 621 0 yongsun outNode.m_bGBK &= itId->anony.m_bGBK; 622 0 yongsun } 623 0 yongsun #ifdef DEBUG 624 0 yongsun if (outNode.m_bGBK) { 625 0 yongsun CWordSet::iterator itId = (*itNode)->m_WordIdSet.begin(); 626 0 yongsun CWordSet::iterator itIdLast = (*itNode)->m_WordIdSet.end(); 627 0 yongsun fprintf(stderr, "========>("); 628 0 yongsun for (; itId != itIdLast; ++itId) { 629 0 yongsun fprintf(stderr, " %d-%1d", itId->anony.m_id, itId->anony.m_bGBK); 630 0 yongsun } 631 0 yongsun fprintf(stderr, " )\n\n"); 632 0 yongsun fflush(stderr); 633 0 yongsun } 634 0 yongsun #endif 635 0 yongsun suc = (fwrite(&outNode, sizeof(outNode), 1, fp) == 1); 636 0 yongsun 637 0 yongsun CTrans::iterator itTrans = (*itNode)->m_Trans.begin(); 638 0 yongsun CTrans::iterator itTransLast = (*itNode)->m_Trans.end(); 639 0 yongsun for (; itTrans != itTransLast && suc; ++itTrans) { 640 0 yongsun CPinyinTrie::TTransUnit tru; 641 0 yongsun tru.m_Char = itTrans->first; 642 0 yongsun tru.m_Offset = nodeOffsetMap[itTrans->second]; 643 0 yongsun assert(tru.m_Offset != 0); 644 0 yongsun suc = (fwrite(&tru, sizeof(tru), 1, fp) == 1); 645 0 yongsun } 646 0 yongsun 647 0 yongsun CWordVec vec; 648 0 yongsun itId = (*itNode)->m_WordIdSet.begin(); 649 0 yongsun itIdLast = (*itNode)->m_WordIdSet.end(); 650 0 yongsun for (; itId != itIdLast; ++itId) 651 0 yongsun vec.push_back(TWordInfo(*itId, psrt->getCost(*itId), psrt->isSeen(*itId))); 652 0 yongsun std::make_heap(vec.begin(), vec.end()); 653 0 yongsun std::sort_heap(vec.begin(), vec.end()); 654 0 yongsun 655 0 yongsun CWordVec::iterator itv = vec.begin(); 656 0 yongsun CWordVec::iterator itve = vec.end(); 657 0 yongsun for (; itv != itve && suc; ++itv) { 658 0 yongsun CPinyinTrie::TWordIdInfo wi; 659 0 yongsun wi.m_id = itv->m_id.anony.m_id; 660 0 yongsun wi.m_bGBK = itv->m_id.anony.m_bGBK; 661 0 yongsun wi.m_bGB18030 = itv->m_id.anony.m_bGB18030; 662 0 yongsun wi.m_len = m_Lexicon[itv->m_id.anony.m_id].size(); 663 0 yongsun wi.m_bSeen = ((itv->m_bSeen)?(1):(0)); 664 0 yongsun wi.m_cost = itv->m_id.anony.m_cost; 665 0 yongsun suc = (fwrite(&wi, sizeof(wi), 1, fp) == 1); 666 0 yongsun } 667 0 yongsun } 668 0 yongsun itWordStr = m_Lexicon.begin(); 669 0 yongsun itWordStrLast = m_Lexicon.end(); 670 0 yongsun for (; itWordStr != itWordStrLast && suc; ++itWordStr) { 671 0 yongsun MBSTOWCS(wbuf, itWordStr->c_str(), 1024); 672 0 yongsun int sz = WCSLEN(wbuf); 673 0 yongsun suc = (fwrite(wbuf, (sz+1)*sizeof(TWCHAR), 1, fp) == 1); 674 0 yongsun } 675 0 yongsun return suc; 676 0 yongsun } 677