1 #ifdef HAVE_CONFIG_H 2 #include "config.h" 3 #endif 4 5 #ifdef HAVE_ASSERT_H 6 #include <assert.h> 7 #endif 8 9 #include <algorithm> 10 11 #ifdef HAVE_ICONV_H 12 #include <iconv.h> 13 #endif 14 15 #include "pytrie_gen.h" 16 #include "pinyin_data.h" 17 18 static const char* 19 skipSpace(const char* p) 20 { 21 while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') 22 ++p; 23 return p; 24 } 25 26 static const char* 27 skipNonSpace(const char* p) 28 { 29 while (*p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') 30 ++p; 31 return p; 32 } 33 34 static void 35 insertWordId(CPinyinTrieMaker::CWordSet& idset, CPinyinTrieMaker::TWordId id) 36 { 37 CPinyinTrieMaker::CWordSet::const_iterator it = idset.find(id); 38 if (it == idset.end()) 39 idset.insert(id); 40 else { 41 const CPinyinTrieMaker::TWordId& a = *it; 42 if ((a.anony.m_bHide && !id.anony.m_bHide) || (a.anony.m_bHide == id.anony.m_bHide && a.anony.m_cost > id.anony.m_cost)) { 43 idset.erase(it); 44 idset.insert(id); 45 } 46 } 47 } 48 49 struct TSyllableInfo { 50 std::string m_py; 51 int m_cost; 52 53 TSyllableInfo(const char* py=NULL, int cost=0) : m_py(py), m_cost(cost) {} 54 bool operator< (const TSyllableInfo& b) const { return m_py < b.m_py; } 55 }; 56 57 #ifdef HAVE_ICONV_H 58 bool isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric) 59 { 60 static char gbstr[256]; 61 static char utstr[256]; 62 63 TIConvSrcPtr src = (TIConvSrcPtr)utf8; 64 size_t srclen = strlen((char*)src)+1; 65 char* dst = (char *)gbstr; 66 size_t dstlen = 256; 67 size_t res = iconv(ic, &src, &srclen, &dst, &dstlen); 68 69 if (res != size_t(-1) && srclen == 0) { 70 // do revert convertion and compare them 71 src = (TIConvSrcPtr)gbstr; 72 srclen = strlen((char*)src)+1; 73 dst = (char *)utstr; 74 dstlen = 256; 75 res = iconv(ric, &src, &srclen, &dst, &dstlen); 76 if (res != size_t(-1) && srclen == 0) 77 return (strcmp(utf8, utstr) == 0); 78 } 79 return false; 80 } 81 82 //return: bit 0x1: contains some gbk out of gb2312, bit 0x2: contains some gb18030 outof gbk 83 unsigned getPureGBEncoding(const char* utf8str) 84 { 85 static iconv_t ic_gb = iconv_open("GB2312", "UTF-8"); 86 static iconv_t ic_gbk = iconv_open("GBK", "UTF-8"); 87 static iconv_t ric_gb = iconv_open("UTF-8", "GB2312"); 88 static iconv_t ric_gbk = iconv_open("UTF-8", "GBK"); 89 90 unsigned ret = 0; 91 92 if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) { 93 ret = 1; // at least it is contains some GBK char 94 if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk)) 95 ret = 3; //contains some GB18030-only char 96 97 #ifdef DEBUG 98 fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret); 99 fflush(stderr); 100 #endif 101 } 102 return ret; 103 } 104 #else // !HAVE_ICONV_H 105 unsigned getPureGBEncoding(const char* utf8str) 106 { 107 // FIXME 108 return 0x3; 109 } 110 #endif // HAVE_ICONV_H 111 112 bool 113 parseLine(char* buf, char* word_buf, int& id, std::set<TSyllableInfo>& pyset) 114 { 115 pyset.clear(); 116 117 /* ignore the empty lines and comment lines */ 118 if (*buf == '\n' || *buf == '#') 119 return 0; 120 121 char* word_start = word_buf; 122 char* p = (char*)skipSpace(buf); 123 char* t = (char*)skipNonSpace(p); 124 while(p < t) *word_buf++ = *p++; 125 *word_buf = 0; 126 127 p = (char*)skipSpace(p); 128 t = (char*)skipNonSpace(p); 129 if (*t) 130 *t++ = 0; 131 id = atoi(p); 132 p = (char*)skipSpace(t); 133 while (*p) { 134 const char* s = p; 135 t = (char*)skipNonSpace(p); 136 if (*t) 137 *t++ = 0; 138 while ((*p >= 'a' && *p <= 'z') || (*p == '\'')) 139 ++p; 140 if ((p > s) && ((*p == 0) || (*p == ':'))) { 141 int cost = 0; 142 if (*p == ':') { 143 *p++ = 0; 144 cost = atoi(p); 145 } 146 pyset.insert(TSyllableInfo(s, cost)); 147 } 148 p = (char*)skipSpace(t); 149 } 150 return pyset.size() > 0; 151 } 152 153 154 CPinyinTrieMaker::CPinyinTrieMaker() 155 : m_RootNode(), m_StateMap() 156 { 157 m_RootNode.m_bExpanded = true; 158 } 159 /********************************************************** 160 lexicon��������������� 161 ���������������������������������������������������TAB(1������������)��� 162 ������������������ ������������������������������������������word id��� 163 ���������������������������������������������������������������'��������� 164 ���������������������������������������������������������4095; 165 **********************************************************/ 166 #define RARE_MULTI_PHONETIC_STARTING_ID 140000 /* FIXME */ 167 bool 168 CPinyinTrieMaker::constructFromLexicon(const char* fileName) 169 { 170 static int rmp_id = RARE_MULTI_PHONETIC_STARTING_ID; 171 static char buf[4096]; 172 static char word_buf[2048]; 173 174 int id; 175 bool suc = true; 176 std::set<TSyllableInfo> pyset; 177 FILE *fp = fopen(fileName, "r"); 178 printf("Adding pinyin and corresponding words..."); fflush(stdout); 179 while (fgets(buf, sizeof(buf), fp) != NULL) { 180 if (!parseLine(buf, word_buf, id, pyset)) { 181 if (word_buf[0] != L'<' && word_buf[0] != 0) { 182 if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1); 183 m_Lexicon[id] = std::string(word_buf); 184 } 185 continue; 186 } 187 unsigned gbcategory = getPureGBEncoding(word_buf); 188 189 std::set<TSyllableInfo>::const_iterator its = pyset.begin(); 190 std::set<TSyllableInfo>::const_iterator ite = pyset.end(); 191 for (; its != ite; ++its) { 192 const char *t = its->m_py.c_str(); 193 int cost = its->m_cost; 194 int myid = id; 195 196 if (cost < 0) { 197 cost = 30 / (-cost); 198 myid = rmp_id ++; 199 } 200 201 if (m_Lexicon.size() < myid+1) m_Lexicon.resize(myid+1); 202 m_Lexicon[myid] = std::string(word_buf); 203 204 CPinyinTrieMaker::TWordId wid(myid, cost, its->m_cost < 0, gbcategory); 205 suc = insertFullPinyinPair(t, wid) && suc; 206 } 207 } 208 fclose(fp); 209 210 printf("\n %zd primitive nodes", TNode::m_AllNodes.size()); fflush(stdout); 211 212 threadNonCompletePinyin(); 213 printf("\n %zd total nodes", TNode::m_AllNodes.size()); fflush(stdout); 214 215 std::string pyPrefix = ""; 216 //print(stderr, &m_RootNode, pyPrefix); 217 printf("\n"); fflush(stdout); 218 219 return suc; 220 } 221 222 CPinyinTrieMaker::CNodeList CPinyinTrieMaker::TNode::m_AllNodes; 223 CPinyinTrieMaker::TNode::TNode() 224 : m_bFullSyllableTransfer(false), m_bExpanded(false), m_WordIdSet(), 225 m_Trans(), m_cmbNodes() 226 { 227 m_AllNodes.push_back(this); 228 } 229 230 bool 231 CPinyinTrieMaker::PNodeSet::operator< (const PNodeSet& another) const 232 { 233 CNodeSet::const_iterator t1 = m_pns->begin(); 234 CNodeSet::const_iterator t2 = m_pns->end(); 235 CNodeSet::const_iterator a1 = another.m_pns->begin(); 236 CNodeSet::const_iterator a2 = another.m_pns->end(); 237 for (; t1 != t2 && a1 != a2; ++t1, ++a1) { 238 if (*t1 < *a1) return true; 239 if (*t1 > *a1) return false; 240 } 241 return (a1 != a2); 242 } 243 244 bool 245 CPinyinTrieMaker::PNodeSet::operator==(const PNodeSet& another) const 246 { 247 CNodeSet::const_iterator t1 = m_pns->begin(); 248 CNodeSet::const_iterator t2 = m_pns->end(); 249 CNodeSet::const_iterator a1 = another.m_pns->begin(); 250 CNodeSet::const_iterator a2 = another.m_pns->end(); 251 for (; t1 != t2 && a1 != a2; ++t1, ++a1) { 252 if (*t1 != *a1) return false; 253 } 254 return (a1 == a2 && t1 != t2); 255 } 256 257 static void 258 parseFullPinyin (const char *pinyin, std::vector<TSyllable> &ret) 259 { 260 char *buf = strdup (pinyin); 261 char *p = buf, *q = buf; 262 ret.clear(); 263 264 while (*p) { 265 if (*p == '\'') { 266 *p = '\0'; 267 unsigned s = CPinyinData::encodeSyllable(q); 268 ret.push_back (TSyllable(s)); 269 q = p+1; 270 } 271 p++; 272 } 273 274 if (*q) { 275 unsigned s = CPinyinData::encodeSyllable(q); 276 ret.push_back (TSyllable(s)); 277 } 278 279 free(buf); 280 } 281 282 void 283 CPinyinTrieMaker::print(FILE* fp, TNode* root, std::string& pinyin) 284 { 285 if (root && root->m_WordIdSet.size() > 0) { 286 fprintf(fp, "%s", pinyin.c_str()); 287 CWordSet::const_iterator itId = root->m_WordIdSet.begin(); 288 CWordSet::const_iterator itIdLast = root->m_WordIdSet.end(); 289 for (; itId != itIdLast; ++itId) { 290 fprintf(fp, " %s", m_Lexicon[itId->anony.m_id].c_str()); 291 } 292 fprintf(fp, "\n"); 293 } 294 if (root) { 295 CTrans::const_iterator itTrans = root->m_Trans.begin(); 296 CTrans::const_iterator itTransLast = root->m_Trans.end(); 297 for (; itTrans != itTransLast; ++itTrans) { 298 const char *str = CPinyinData::decodeSyllable(itTrans->first); 299 pinyin = pinyin + str + '\''; 300 print(fp, itTrans->second, pinyin); 301 pinyin.resize(pinyin.size()-strlen(str)-1); 302 } 303 } 304 } 305 306 CPinyinTrieMaker::TNode* 307 CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned s) 308 { 309 CTrans::const_iterator itt = pnode->m_Trans.find(s); 310 CTrans::const_iterator ite = pnode->m_Trans.end(); 311 if (itt == ite) { 312 TNode *p = new TNode(); 313 p->m_bFullSyllableTransfer = true; 314 p->m_bExpanded = true; 315 pnode->m_Trans[s] = p; 316 return p; 317 } 318 return itt->second; 319 } 320 321 bool 322 CPinyinTrieMaker::insertFullPinyinPair(const char* pinyin, TWordId wid) 323 { 324 TNode *pnode = &m_RootNode; 325 std::vector<TSyllable> syllables; 326 parseFullPinyin (pinyin, syllables); 327 328 std::vector<TSyllable>::const_iterator it = syllables.begin(); 329 std::vector<TSyllable>::const_iterator ite = syllables.end(); 330 331 for (; it != ite; ++it) 332 pnode = insertTransfer(pnode, *it); 333 334 insertWordId(pnode->m_WordIdSet, wid); 335 return true; 336 } 337 338 CPinyinTrieMaker::TNode* 339 CPinyinTrieMaker::addCombinedTransfers (TNode *pnode, unsigned s, const CNodeSet& nodes) 340 { 341 assert (!nodes.empty()); 342 343 TNode *p = NULL; 344 if (nodes.size() == 1) { 345 p = *(nodes.begin()); 346 } else { 347 p = new TNode(); 348 p->m_cmbNodes = nodes; 349 m_StateMap[&p->m_cmbNodes] = p; 350 351 CNodeSet::const_iterator it = nodes.begin(); 352 CNodeSet::const_iterator ite = nodes.end(); 353 for (; it != ite; ++it) 354 p->m_WordIdSet.insert ((*it)->m_WordIdSet.begin(), (*it)->m_WordIdSet.end()); 355 } 356 357 pnode->m_Trans[s] = p; 358 return p; 359 } 360 361 void 362 CPinyinTrieMaker::combineInitialTrans (TNode *pnode) 363 { 364 std::map<unsigned, CNodeSet> combTrans; 365 366 CTrans::const_iterator itTrans = pnode->m_Trans.begin(); 367 CTrans::const_iterator itTransLast = pnode->m_Trans.end(); 368 for (; itTrans != itTransLast; ++itTrans) { 369 TSyllable s = (TSyllable) itTrans->first; 370 if (s.initial) { 371 s.final = s.tone = 0; 372 combTrans[s].insert(itTrans->second); 373 } 374 } 375 376 std::map<unsigned, CNodeSet>::const_iterator itCombTrans = combTrans.begin(); 377 for (; itCombTrans != combTrans.end(); ++itCombTrans) 378 addCombinedTransfers (pnode, itCombTrans->first, itCombTrans->second); 379 } 380 381 void 382 CPinyinTrieMaker::expandCombinedNode (TNode *pnode) 383 { 384 assert (pnode->m_cmbNodes.size() >= 1); 385 386 std::map<unsigned, CNodeSet> combTrans; 387 CNodeSet::const_iterator itNode = pnode->m_cmbNodes.begin(); 388 CNodeSet::const_iterator itNodeLast = pnode->m_cmbNodes.end(); 389 for (; itNode != itNodeLast; ++itNode) { 390 CTrans::const_iterator itTrans = (*itNode)->m_Trans.begin(); 391 CTrans::const_iterator itTransLast = (*itNode)->m_Trans.end(); 392 for (; itTrans != itTransLast; ++itTrans) 393 combTrans[itTrans->first].insert(itTrans->second); 394 } 395 396 std::map<unsigned, CNodeSet>::const_iterator itCombTrans = combTrans.begin(); 397 for (; itCombTrans != combTrans.end(); ++itCombTrans) { 398 TNode* p = NULL; 399 unsigned s = itCombTrans->first; 400 CNodeSet nodes = itCombTrans->second; 401 402 CStateMap::const_iterator itStateMap = m_StateMap.find(&nodes); 403 if (itStateMap != m_StateMap.end()) 404 p = itStateMap->second; 405 else 406 p = addCombinedTransfers (pnode, s, nodes); 407 408 pnode->m_Trans[s] = p; 409 } 410 411 pnode->m_bExpanded = true; 412 } 413 414 bool 415 CPinyinTrieMaker::threadNonCompletePinyin(void) 416 { 417 CNodeList::const_iterator itNode = TNode::m_AllNodes.begin(); 418 for (; itNode != TNode::m_AllNodes.end(); ++itNode) { 419 TNode* pnode = *itNode; 420 if (pnode->m_bExpanded) 421 combineInitialTrans (pnode); 422 else 423 expandCombinedNode (pnode); 424 } 425 return true; 426 } 427 428 bool 429 CPinyinTrieMaker::write(const char* fileName, CWordEvaluator* psrt) 430 { 431 bool suc = false; 432 FILE* fp = fopen(fileName, "wb"); 433 if (fp != NULL) { 434 suc = write(fp, psrt); 435 fclose(fp); 436 } 437 return suc; 438 } 439 440 bool 441 CPinyinTrieMaker::write(FILE *fp, CWordEvaluator* psrt) 442 { 443 bool suc = true; 444 static TWCHAR wbuf[1024]; 445 446 std::map<TNode*, unsigned int> nodeOffsetMap; 447 448 unsigned int nWord = m_Lexicon.size(); 449 unsigned int nNode = TNode::m_AllNodes.size(); 450 unsigned int lexiconOffset; 451 unsigned int offset = sizeof(unsigned int) * 3; 452 453 CNodeList::const_iterator itNode = TNode::m_AllNodes.begin(); 454 CNodeList::const_iterator itNodeLast = TNode::m_AllNodes.end(); 455 for (; itNode != itNodeLast; ++itNode) { 456 nodeOffsetMap[*itNode] = offset; 457 offset += CPinyinTrie::TNode::size_for((*itNode)->m_Trans.size(), 458 (*itNode)->m_WordIdSet.size()); 459 } 460 lexiconOffset = offset; 461 CLexicon::const_iterator itWordStr = m_Lexicon.begin(); 462 CLexicon::const_iterator itWordStrLast = m_Lexicon.end(); 463 for (; itWordStr != itWordStrLast; ++itWordStr) { 464 MBSTOWCS(wbuf, itWordStr->c_str(), 1024); 465 int sz = WCSLEN(wbuf); 466 offset += (sz+1)*sizeof(TWCHAR); 467 } 468 469 suc = (fwrite(&nWord, sizeof(unsigned int), 1, fp) == 1); 470 suc = (fwrite(&nNode, sizeof(unsigned int), 1, fp) == 1); 471 suc = (fwrite(&lexiconOffset, sizeof(unsigned int), 1, fp) == 1); 472 473 itNode = TNode::m_AllNodes.begin(); 474 itNodeLast = TNode::m_AllNodes.end(); 475 for (; itNode != itNodeLast && suc; ++itNode) { 476 CPinyinTrie::TNode outNode; 477 TNode *pnode = *itNode; 478 479 outNode.m_nTransfer = pnode->m_Trans.size(); 480 outNode.m_nWordId = pnode->m_WordIdSet.size(); 481 outNode.m_bFullSyllableTransfer = pnode->m_bFullSyllableTransfer; 482 outNode.m_csLevel = 0; 483 484 CWordSet::const_iterator itId = pnode->m_WordIdSet.begin(); 485 CWordSet::const_iterator itIdLast = pnode->m_WordIdSet.end(); 486 for (; itId != itIdLast && outNode.m_csLevel<3; ++itId) { 487 if (outNode.m_csLevel < itId->anony.m_csLevel) 488 outNode.m_csLevel = itId->anony.m_csLevel; 489 } 490 491 suc = (fwrite(&outNode, sizeof(outNode), 1, fp) == 1); 492 493 CTrans::const_iterator itTrans = pnode->m_Trans.begin(); 494 CTrans::const_iterator itTransLast = pnode->m_Trans.end(); 495 for (; itTrans != itTransLast && suc; ++itTrans) { 496 CPinyinTrie::TTransUnit tru; 497 tru.m_Syllable = itTrans->first; 498 tru.m_Offset = nodeOffsetMap[itTrans->second]; 499 assert(tru.m_Offset != 0 && tru.m_Offset < lexiconOffset); 500 suc = (fwrite(&tru, sizeof(tru), 1, fp) == 1); 501 } 502 503 CWordVec vec; 504 itId = pnode->m_WordIdSet.begin(); 505 itIdLast = pnode->m_WordIdSet.end(); 506 for (; itId != itIdLast; ++itId) 507 vec.push_back(TWordInfo(*itId, psrt->getCost(*itId), psrt->isSeen(*itId))); 508 std::make_heap(vec.begin(), vec.end()); 509 std::sort_heap(vec.begin(), vec.end()); 510 511 CWordVec::const_iterator itv = vec.begin(); 512 CWordVec::const_iterator itve = vec.end(); 513 for (; itv != itve && suc; ++itv) { 514 CPinyinTrie::TWordIdInfo wi; 515 wi.m_id = itv->m_id.anony.m_id; 516 assert (wi.m_id < nWord); 517 wi.m_csLevel = itv->m_id.anony.m_csLevel; 518 wi.m_len = m_Lexicon[itv->m_id.anony.m_id].size(); 519 wi.m_bSeen = ((itv->m_bSeen)?(1):(0)); 520 wi.m_cost = itv->m_id.anony.m_cost; 521 suc = (fwrite(&wi, sizeof(wi), 1, fp) == 1); 522 } 523 } 524 525 itWordStr = m_Lexicon.begin(); 526 itWordStrLast = m_Lexicon.end(); 527 for (; itWordStr != itWordStrLast && suc; ++itWordStr) { 528 MBSTOWCS(wbuf, itWordStr->c_str(), 1024); 529 int sz = WCSLEN(wbuf); 530 suc = (fwrite(wbuf, (sz+1)*sizeof(TWCHAR), 1, fp) == 1); 531 } 532 return suc; 533 } 534
