1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include "config.h" 40 0 yongsun #endif 41 0 yongsun 42 0 yongsun #ifdef HAVE_ASSERT_H 43 0 yongsun #include <assert.h> 44 0 yongsun #endif 45 0 yongsun 46 0 yongsun #ifdef HAVE_GETOPT_H 47 0 yongsun #include <getopt.h> 48 0 yongsun #endif 49 0 yongsun 50 0 yongsun #include <stdio.h> 51 0 yongsun #include <unistd.h> 52 0 yongsun #include <locale.h> 53 0 yongsun 54 0 yongsun #include <vector> 55 0 yongsun #include <map> 56 208 tchaikov #include <algorithm> 57 0 yongsun 58 0 yongsun #include "../sim_dict.h" 59 0 yongsun #include "../sim_sen.h" 60 0 yongsun #include "../slm.h" 61 0 yongsun 62 0 yongsun static struct option long_options[] = 63 0 yongsun { 64 0 yongsun {"dict", 1, 0, 'd'}, 65 0 yongsun {"format", 1, 0, 'f'}, 66 0 yongsun {"show-id", 0, 0, 'i'}, 67 0 yongsun {"s-tok", 1, 0, 's'}, 68 0 yongsun {"model", 1, 0, 'm'}, 69 0 yongsun {0, 0, 0, 0} 70 0 yongsun }; 71 0 yongsun 72 0 yongsun static char* s_strDictFile = NULL; 73 0 yongsun static char* s_strSlmFile = NULL; 74 0 yongsun static bool s_bTextOut = false; 75 0 yongsun static bool s_bShowId = false; 76 0 yongsun static TSIMWordId s_iSTOKID = 10; 77 0 yongsun 78 0 yongsun static CSIMDict *s_dict = NULL; 79 0 yongsun static CThreadSlm *s_tslm = NULL; 80 0 yongsun 81 0 yongsun static void 82 0 yongsun ShowUsage() 83 0 yongsun { 84 0 yongsun fprintf(stderr, "\nUsage:\n"); 85 209 tchaikov fprintf(stderr, "slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n"); 86 0 yongsun fprintf(stderr, " -f --format:\n"); 87 0 yongsun fprintf(stderr, " Output Format, can be 'text' or 'bin'. default 'bin'\n"); 88 0 yongsun fprintf(stderr, " Normally, in text mode, word text are output, while in binary mode,\n"); 89 0 yongsun fprintf(stderr, " binary short integer of the word-ids are writed to stdout.\n"); 90 0 yongsun fprintf(stderr, " -s --stok:\n"); 91 0 yongsun fprintf(stderr, " Sentence token id. Default 10.\n"); 92 0 yongsun fprintf(stderr, " It will be write to output in binary mode after every sentence.\n"); 93 0 yongsun fprintf(stderr, " -i --show-id:\n"); 94 0 yongsun fprintf(stderr, " Show Id info. Under text output format mode, Attach id after known-words.\n"); 95 0 yongsun fprintf(stderr, " Under binary mode, print id in text.\n"); 96 0 yongsun fprintf(stderr, " -m --model:\n"); 97 0 yongsun fprintf(stderr, " Language model file name"); 98 0 yongsun fprintf(stderr, "\n"); 99 0 yongsun fprintf(stderr, "Notes:\n"); 100 0 yongsun fprintf(stderr, " Under binary mode, consecutive id of 0 are merged into one 0.\n"); 101 0 yongsun fprintf(stderr, " Under text mode, no space are insert between unknown-words. \n"); 102 0 yongsun fprintf(stderr, "\n"); 103 0 yongsun fprintf(stderr, "\n"); 104 0 yongsun exit(1000); 105 0 yongsun } 106 0 yongsun 107 0 yongsun static void 108 0 yongsun getParameters(int argc, char* argv[]) 109 0 yongsun { 110 0 yongsun int c; 111 0 yongsun while ((c=getopt_long(argc, argv, "d:if:s:m:", long_options, NULL)) != -1) 112 0 yongsun { 113 0 yongsun switch (c) { 114 0 yongsun case 'd': 115 0 yongsun s_strDictFile = strdup(optarg); 116 0 yongsun break; 117 0 yongsun case 'i': 118 0 yongsun s_bShowId = true; 119 0 yongsun break; 120 0 yongsun case 'f': 121 0 yongsun s_bTextOut = (strcmp(optarg, "text") == 0); 122 0 yongsun break; 123 0 yongsun case 's': 124 0 yongsun s_iSTOKID = atoi(optarg); 125 0 yongsun break; 126 0 yongsun case 'm': 127 0 yongsun s_strSlmFile = strdup(optarg); 128 0 yongsun break; 129 0 yongsun default: 130 0 yongsun ShowUsage(); 131 0 yongsun break; 132 0 yongsun } 133 0 yongsun } 134 0 yongsun if (s_strDictFile == NULL) 135 0 yongsun ShowUsage(); 136 0 yongsun } 137 0 yongsun 138 0 yongsun static void 139 0 yongsun output_stok(int& nWords) 140 0 yongsun { 141 0 yongsun if (s_bShowId) { 142 0 yongsun if (nWords > 0) 143 0 yongsun printf(" "); 144 0 yongsun printf("%d", unsigned(s_iSTOKID)); 145 0 yongsun } else { 146 0 yongsun fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout); 147 0 yongsun } 148 0 yongsun ++nWords; 149 0 yongsun } 150 0 yongsun 151 0 yongsun static void 152 0 yongsun output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords) 153 0 yongsun { 154 0 yongsun static char mbword[1024]; 155 0 yongsun static TWCHAR wcword[1024]; 156 0 yongsun 157 0 yongsun bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD); 158 0 yongsun if (s_bTextOut) { 159 0 yongsun for (int i=0; i < len; ++i, ++p) 160 0 yongsun wcword[i] = *p; 161 0 yongsun wcword[len] = 0; 162 0 yongsun WCSTOMBS(mbword, wcword, sizeof(mbword)); 163 0 yongsun if (bRealGap && idprev == SIM_ID_NOT_WORD) 164 0 yongsun printf("(%d)", unsigned(idprev)); 165 0 yongsun if (bRealGap && (nWords > 0)) 166 0 yongsun printf(" "); 167 0 yongsun printf("%s", mbword); 168 0 yongsun if (s_bShowId && idcur != SIM_ID_NOT_WORD) 169 0 yongsun printf("(%d)", unsigned(idcur)); 170 0 yongsun } else { 171 0 yongsun if (bRealGap) { 172 0 yongsun if (s_bShowId) { 173 0 yongsun if (nWords > 0) 174 0 yongsun printf(" "); 175 0 yongsun printf("%d", unsigned(idcur)); 176 0 yongsun } else 177 0 yongsun fwrite(&idcur, sizeof(TSIMWordId), 1, stdout); 178 0 yongsun } 179 0 yongsun } 180 0 yongsun if (bRealGap) 181 0 yongsun ++nWords; 182 0 yongsun } 183 0 yongsun 184 0 yongsun struct TLatticeWord { 185 0 yongsun int m_left; 186 0 yongsun int m_right; 187 0 yongsun int m_wordId; 188 0 yongsun 189 0 yongsun TLatticeWord(int left=0, int right=0, int wid=0) 190 0 yongsun : m_left(left), m_right(right), m_wordId(wid) { } 191 0 yongsun }; 192 0 yongsun 193 0 yongsun typedef std::vector<TLatticeWord> TLatticeWordVec; 194 0 yongsun 195 0 yongsun struct TLatticeStateValue { 196 0 yongsun double m_pr; 197 0 yongsun TLatticeWord* mp_btword; 198 0 yongsun CThreadSlm::TState m_btstate; 199 0 yongsun 200 0 yongsun TLatticeStateValue(double pr=0.0, TLatticeWord* btword=NULL, CThreadSlm::TState btstate = CThreadSlm::TState()) 201 0 yongsun : m_pr(pr), mp_btword(btword), m_btstate(btstate) { } 202 0 yongsun }; 203 0 yongsun 204 0 yongsun typedef std::map<CThreadSlm::TState, TLatticeStateValue> TLatticeColumnStates; 205 0 yongsun 206 0 yongsun struct TLatticeColumn { 207 0 yongsun TLatticeWordVec m_wordstarting; 208 0 yongsun TLatticeColumnStates m_states; 209 0 yongsun }; 210 0 yongsun 211 0 yongsun typedef std::vector<TLatticeColumn> CLattice; 212 0 yongsun 213 0 yongsun inline void insertLatticeWord(CLattice& lattice, TLatticeWord word) 214 0 yongsun { 215 0 yongsun lattice[word.m_left].m_wordstarting.push_back(word); 216 0 yongsun } 217 0 yongsun 218 0 yongsun int 219 0 yongsun getAmbiLen(const TWCHAR* p, int word_len) 220 0 yongsun { 221 0 yongsun const CSIMDict::TState* pstate; 222 0 yongsun 223 0 yongsun for (int i=1; (i<word_len) && *(p+i) != WCH_NULL; ++i) { 224 0 yongsun int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i); 225 0 yongsun if (word_len < i+len) 226 0 yongsun word_len = i+len; 227 0 yongsun } 228 0 yongsun 229 0 yongsun return word_len; 230 0 yongsun } 231 0 yongsun 232 0 yongsun void fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice) 233 0 yongsun { 234 0 yongsun for (int right=left+len; left < right; ++left) { 235 0 yongsun bool found = false; 236 0 yongsun 237 0 yongsun const TWCHAR* p = sntnc.c_str()+left; 238 0 yongsun const CSIMDict::TState* pds = s_dict->getRoot(); 239 0 yongsun for (len = 0; left+len < right; ++len) { 240 0 yongsun if ((pds = s_dict->step(pds, *p++)) == NULL) 241 0 yongsun break; 242 0 yongsun if (pds->word_id != SIM_ID_NOT_WORD) { 243 0 yongsun found = true; 244 0 yongsun insertLatticeWord(lattice, TLatticeWord(left, left+len+1, pds->word_id)); 245 0 yongsun } 246 0 yongsun } 247 0 yongsun if (!found) 248 0 yongsun insertLatticeWord(lattice, TLatticeWord(left, left+1, SIM_ID_NOT_WORD)); 249 0 yongsun } 250 0 yongsun } 251 0 yongsun 252 0 yongsun /** 253 0 yongsun * Lattice head should have one state, with its TState using slm's root. its 254 0 yongsun * pr = 0 and its mp_btword == NULL; 255 0 yongsun * Lattice tail must contain no word, and it previous node contain only one word 256 0 yongsun * with its right = left+1, right == tail. 257 0 yongsun * The lattice should ensure the lattice path existing 258 0 yongsun */ 259 0 yongsun void buildLattice(wstring &sntnc, CLattice& lattice) 260 0 yongsun { 261 0 yongsun lattice.clear(); 262 0 yongsun lattice.resize(sntnc.size()+2); 263 0 yongsun 264 0 yongsun unsigned int idcur = SIM_ID_NOT_WORD; 265 0 yongsun lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(0.0, NULL, CThreadSlm::TState()); 266 0 yongsun 267 0 yongsun for (int i=0, sz=sntnc.size(); i < sz; ) { 268 0 yongsun const CSIMDict::TState* pstate; 269 0 yongsun const TWCHAR* p = sntnc.c_str()+i; 270 0 yongsun int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p); 271 0 yongsun if (len <= 0) { 272 0 yongsun idcur = SIM_ID_NOT_WORD; 273 0 yongsun len = 1; 274 0 yongsun } else { 275 0 yongsun idcur = pstate->word_id; 276 0 yongsun } 277 0 yongsun int ambilen = getAmbiLen(p, len); 278 0 yongsun 279 0 yongsun if (ambilen <= len) { 280 0 yongsun insertLatticeWord(lattice, TLatticeWord(i, i+len, idcur)); 281 0 yongsun i += len; 282 0 yongsun } else { 283 0 yongsun fullSegBuildLattice(sntnc, i, ambilen, lattice); 284 0 yongsun i += ambilen; 285 0 yongsun } 286 0 yongsun } 287 0 yongsun lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(), sntnc.size()+1, s_iSTOKID)); 288 0 yongsun } 289 0 yongsun 290 0 yongsun void searchBest(CLattice& lattice) 291 0 yongsun { 292 0 yongsun for (int i=0, sz=lattice.size(); i < sz; ++i) { 293 0 yongsun TLatticeColumnStates & states = lattice[i].m_states; 294 0 yongsun TLatticeColumnStates::iterator itss = states.begin(); 295 0 yongsun TLatticeColumnStates::iterator itse = states.end(); 296 0 yongsun for (; itss != itse; ++itss) { 297 0 yongsun TLatticeWordVec::iterator itws = lattice[i].m_wordstarting.begin(); 298 0 yongsun TLatticeWordVec::iterator itwe = lattice[i].m_wordstarting.end(); 299 0 yongsun for (; itws != itwe; ++itws) { 300 0 yongsun CThreadSlm::TState his = itss->first; 301 0 yongsun double pr = itss->second.m_pr; 302 0 yongsun pr += s_tslm->transferNegLog(his, itws->m_wordId, his); 303 0 yongsun TLatticeColumnStates & rss = lattice[itws->m_right].m_states; 304 0 yongsun s_tslm->historify(his); 305 0 yongsun TLatticeColumnStates::iterator itn = rss.find(his); 306 0 yongsun if (itn == rss.end()) { 307 0 yongsun rss[his] = TLatticeStateValue(pr, &(*itws), itss->first); 308 0 yongsun } else { 309 0 yongsun if (itn->second.m_pr > pr) { 310 0 yongsun rss[his] = TLatticeStateValue(pr, &(*itws), itss->first); 311 0 yongsun } 312 0 yongsun } 313 0 yongsun } 314 0 yongsun } 315 0 yongsun } 316 0 yongsun } 317 0 yongsun 318 0 yongsun void getBestPath(CLattice& lattice, TLatticeWordVec& segResult) 319 0 yongsun { 320 0 yongsun TLatticeColumnStates & states = lattice.back().m_states; 321 0 yongsun TLatticeColumnStates::iterator its = states.begin(); 322 0 yongsun 323 0 yongsun TLatticeWord* pbtword = its->second.mp_btword; 324 0 yongsun CThreadSlm::TState btstate = its->second.m_btstate; 325 0 yongsun its = lattice[pbtword->m_left].m_states.find(btstate); 326 0 yongsun assert(its != lattice[pbtword->m_left].m_states.end()); 327 0 yongsun 328 0 yongsun segResult.clear(); 329 0 yongsun while (true) { 330 0 yongsun pbtword = its->second.mp_btword; 331 0 yongsun if (pbtword != NULL) { 332 90 tonylee #ifndef HOST_OS_GNUC_2 333 0 yongsun segResult.push_back(*pbtword); 334 90 tonylee #else // HOST_OS_GNUC_2 335 90 tonylee segResult.insert(segResult.begin(), *pbtword); 336 90 tonylee #endif // !HOST_OS_GNUC_2 337 0 yongsun btstate = its->second.m_btstate; 338 0 yongsun its = lattice[pbtword->m_left].m_states.find(btstate); 339 0 yongsun assert(its != lattice[pbtword->m_left].m_states.end()); 340 0 yongsun } else { 341 0 yongsun break; 342 0 yongsun } 343 0 yongsun } 344 90 tonylee #ifndef HOST_OS_GNUC_2 345 0 yongsun std::reverse(segResult.begin(), segResult.end()); 346 90 tonylee #endif // HOST_OS_GNUC_2 347 0 yongsun } 348 0 yongsun 349 0 yongsun static bool 350 0 yongsun processSingleFile(FILE* fp, int &nWords, int &nAmbis) 351 0 yongsun { 352 0 yongsun nWords = 0; 353 0 yongsun nAmbis = 0; 354 0 yongsun 355 0 yongsun wstring sntnc; 356 0 yongsun CSIMCharReader *pReader = new CSIMCharReader(fp); 357 0 yongsun CSIMCharReader::iterator iter = pReader->begin(); 358 0 yongsun TSIMWordId idcur, idprev = s_iSTOKID; 359 0 yongsun 360 0 yongsun if (!s_bTextOut) 361 0 yongsun output_stok(nWords); 362 0 yongsun 363 0 yongsun while (true){ 364 0 yongsun if (ReadSentence(sntnc, iter, false) == false) 365 0 yongsun break; 366 0 yongsun 367 0 yongsun CLattice lattice; 368 0 yongsun buildLattice(sntnc, lattice); 369 0 yongsun searchBest(lattice); 370 0 yongsun 371 0 yongsun TLatticeWordVec segResult; 372 0 yongsun getBestPath(lattice, segResult); 373 0 yongsun 374 0 yongsun for (int i=0, sz=segResult.size(); i < sz; ++i) { 375 0 yongsun const TWCHAR *p = sntnc.c_str()+segResult[i].m_left; 376 0 yongsun int len = segResult[i].m_right - segResult[i].m_left; 377 0 yongsun idcur = segResult[i].m_wordId; 378 0 yongsun 379 0 yongsun output(len, p, idprev, idcur, nWords); 380 0 yongsun idprev = idcur; 381 0 yongsun } 382 0 yongsun 383 0 yongsun if (!s_bTextOut) { 384 0 yongsun output_stok(nWords); 385 0 yongsun idprev = s_iSTOKID; 386 0 yongsun } 387 0 yongsun } 388 0 yongsun 389 0 yongsun fflush(stdout); 390 0 yongsun return true; 391 0 yongsun } 392 0 yongsun 393 0 yongsun int 394 0 yongsun main(int argc, char *argv[]) 395 0 yongsun { 396 0 yongsun int nWords, nAmbis; 397 0 yongsun 398 0 yongsun setlocale(LC_ALL, ""); 399 0 yongsun getParameters(argc, argv); 400 0 yongsun argc -= optind; 401 0 yongsun argv += optind; 402 0 yongsun 403 0 yongsun fprintf(stderr, "Loading lexicon..."); 404 0 yongsun fflush(stderr); 405 0 yongsun s_dict = new CSIMDict(); 406 0 yongsun s_tslm = new CThreadSlm(); 407 0 yongsun if (!s_dict->parseText(s_strDictFile)) { 408 0 yongsun fprintf(stderr, "fail to open Lexicon file!\n"); 409 0 yongsun fflush(stderr); 410 0 yongsun exit(11); 411 0 yongsun } 412 0 yongsun if (!s_tslm->load(s_strSlmFile, true)) { 413 0 yongsun fprintf(stderr, "fail to open slm file!\n"); 414 0 yongsun fflush(stderr); 415 0 yongsun exit(12); 416 0 yongsun } 417 0 yongsun fprintf(stderr, "done"); 418 0 yongsun fflush(stderr); 419 0 yongsun 420 0 yongsun if (argc == 0) { 421 0 yongsun fprintf(stderr, "\nProcessing from stdin..."); 422 0 yongsun fflush(stderr); 423 0 yongsun processSingleFile(stdin, nWords, nAmbis); 424 0 yongsun fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); 425 0 yongsun fflush(stderr); 426 0 yongsun } else { 427 0 yongsun for (int i=0; i < argc; ++i) { 428 0 yongsun fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr); 429 0 yongsun FILE *fp = fopen(argv[i], "r"); 430 0 yongsun if (fp != NULL) { 431 0 yongsun processSingleFile(fp, nWords, nAmbis); 432 0 yongsun fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n", 433 0 yongsun ftell(fp), nWords, nAmbis); 434 0 yongsun fflush(stderr); 435 0 yongsun } else { 436 0 yongsun fprintf(stderr, "Can not Open!!!!!!!\n"); 437 0 yongsun fflush(stderr); 438 0 yongsun } 439 0 yongsun fclose(fp); 440 0 yongsun } 441 0 yongsun } 442 0 yongsun 443 0 yongsun s_tslm->free(); 444 0 yongsun delete s_tslm; 445 0 yongsun s_tslm = NULL; 446 0 yongsun s_dict->close(); 447 0 yongsun delete s_dict; 448 0 yongsun s_dict = NULL; 449 0 yongsun return 0; 450 0 yongsun } 451