1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #include <stdio.h> 39 208 tchaikov #include <stdlib.h> 40 0 yongsun 41 0 yongsun #include "sim_dict.h" 42 0 yongsun 43 0 yongsun 44 0 yongsun void CSIMDict::freeSubTree(CSIMDict::TState& root) 45 0 yongsun { 46 0 yongsun if (root.follow != NULL) { 47 0 yongsun Map_Type &map = *(root.follow); 48 0 yongsun for (Map_Type::iterator it=map.begin(), last=map.end(); it != last; ++it) 49 0 yongsun freeSubTree(it->second); 50 0 yongsun delete root.follow; 51 0 yongsun root.follow = NULL; 52 0 yongsun } 53 0 yongsun } 54 0 yongsun 55 0 yongsun const CSIMDict::TState* CSIMDict::step(const CSIMDict::TState* root, TWCHAR wch) 56 0 yongsun { 57 0 yongsun if ((root != NULL) && (root->follow != NULL) && wch != WCH_NULL) { 58 0 yongsun Map_Type::iterator it = root->follow->find(TSIMChar(wch)); 59 0 yongsun if (it != root->follow->end()) 60 0 yongsun return &(it->second); 61 0 yongsun } 62 0 yongsun return NULL; 63 0 yongsun } 64 0 yongsun 65 0 yongsun int CSIMDict::matchLongest(const CSIMDict::TState* root, CSIMDict::PState & result, const TWCHAR* str) 66 0 yongsun { 67 0 yongsun int lastWordLen = 0, len = 0; 68 0 yongsun result = root; 69 0 yongsun while (root != NULL) { 70 0 yongsun if (root->word_id != SIM_ID_NOT_WORD) { 71 0 yongsun result = root; 72 0 yongsun lastWordLen = len; 73 0 yongsun } 74 0 yongsun ++len; 75 0 yongsun root = step(root, *str++); 76 0 yongsun } 77 0 yongsun return lastWordLen; 78 0 yongsun } 79 0 yongsun 80 0 yongsun bool 81 0 yongsun CSIMDict::parseText(const char* filename) 82 0 yongsun { 83 0 yongsun FILE * fp = NULL; 84 0 yongsun static char buf[1024]; 85 0 yongsun static TWCHAR wword[sizeof(buf)]; 86 0 yongsun unsigned int id; 87 0 yongsun 88 0 yongsun try { 89 0 yongsun if ((fp = fopen(filename, "r")) == NULL) 90 0 yongsun return false; 91 0 yongsun while (fgets(buf, 1024, fp) != NULL) { 92 8 ys148558 if (*buf == '\n' || *buf == '#') 93 8 ys148558 continue; 94 0 yongsun 95 8 ys148558 char* p = buf; 96 8 ys148558 while (*p == ' ' || *p == '\t') 97 8 ys148558 ++p; 98 8 ys148558 char* pstart = p; 99 8 ys148558 while (*p != 0 && *p != ' ' && *p != '\t') 100 8 ys148558 ++p; 101 8 ys148558 if (*p == 0) 102 8 ys148558 continue; 103 8 ys148558 *p++ = 0; 104 8 ys148558 while (*p == ' ' || *p == '\t') 105 8 ys148558 ++p; 106 8 ys148558 if (!(*p >= '0' && *p <= '9')) continue; 107 8 ys148558 for (id=0; *p >= '0' && *p <= '9'; ++p) 108 8 ys148558 id = 10*id + (*p - '0'); 109 8 ys148558 110 8 ys148558 if (id < SIM_ID_REALWORD_START) 111 8 ys148558 continue; 112 8 ys148558 if (MBSTOWCS(wword, pstart, sizeof(buf)) != (size_t)-1) { 113 8 ys148558 insertWord(wword, TSIMWordId(id)); 114 8 ys148558 } else { 115 8 ys148558 fprintf(stderr, "mbs to wcs conversion error for : %s %d\n", buf, id); 116 8 ys148558 exit(100); 117 0 yongsun } 118 0 yongsun } 119 0 yongsun fclose(fp); 120 0 yongsun } catch (...) { 121 0 yongsun if (fp != NULL) 122 0 yongsun fclose(fp); 123 0 yongsun buf[sizeof(buf)-1] = 0; 124 0 yongsun fprintf(stderr, "Catch exception when loading dictionary at %s, existing...", buf); 125 0 yongsun exit(200); 126 0 yongsun } 127 0 yongsun return true; 128 0 yongsun } 129 0 yongsun 130 0 yongsun void CSIMDict::insertWord(const TWCHAR* wstr, TSIMWordId id) 131 0 yongsun { 132 0 yongsun TState* ps = &m_root; 133 0 yongsun while (*wstr) { 134 0 yongsun TSIMChar ch(*wstr++); 135 0 yongsun TSIMWordId nodeId = (*wstr)?SIM_ID_NOT_WORD:id; 136 0 yongsun if (ps->follow == NULL) { 137 0 yongsun ps->follow = new Map_Type(); 138 0 yongsun } 139 0 yongsun Map_Type & map = *(ps->follow); 140 0 yongsun Map_Type::iterator it = map.find(ch); 141 0 yongsun if (it != map.end() && nodeId != SIM_ID_NOT_WORD && 142 0 yongsun it->second.word_id != SIM_ID_NOT_WORD && it->second.word_id != nodeId) { 143 0 yongsun throw new int(100); 144 0 yongsun } 145 0 yongsun if (it != map.end()){ 146 0 yongsun if (nodeId != SIM_ID_NOT_WORD) 147 0 yongsun it->second.word_id = nodeId; 148 0 yongsun ps = &(it->second); 149 0 yongsun } else { 150 0 yongsun ps = &(map[ch] = TState(nodeId)); 151 0 yongsun } 152 0 yongsun } 153 0 yongsun } 154 0 yongsun 155 0 yongsun void CSIMDict::InnerPrint(FILE* fp, wstring & wstr, const TState* pnode) 156 0 yongsun { 157 0 yongsun if (pnode != NULL && pnode->word_id != SIM_ID_NOT_WORD) { 158 0 yongsun char* buf = new char[wstr.size()*2+2]; 159 0 yongsun WCSTOMBS(buf, wstr.c_str(), wstr.size()*2+2); 160 0 yongsun fprintf(fp, "%s %d\n", buf, unsigned(pnode->word_id)); 161 0 yongsun delete[] buf; 162 0 yongsun } 163 0 yongsun if (pnode != NULL && pnode->follow != NULL) { 164 0 yongsun Map_Type::iterator it, ite = pnode->follow->end(); 165 0 yongsun for (it = pnode->follow->begin(); it != ite; ++it) { 166 0 yongsun TWCHAR wch = TWCHAR(it->first); 167 0 yongsun wstr.push_back(wch); 168 0 yongsun InnerPrint(fp, wstr, &(it->second)); 169 0 yongsun wstr.erase(wstr.size()-1, 1); 170 0 yongsun } 171 0 yongsun } 172 0 yongsun } 173