1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include "config.h" 40 0 yongsun #endif 41 0 yongsun 42 0 yongsun #ifdef HAVE_ASSERT_H 43 0 yongsun #include <assert.h> 44 0 yongsun #endif 45 0 yongsun 46 0 yongsun #include <stdlib.h> 47 0 yongsun #include <math.h> 48 0 yongsun #include <vector> 49 0 yongsun #include <algorithm> 50 0 yongsun 51 0 yongsun #include "sim_slmbuilder.h" 52 0 yongsun 53 0 yongsun void CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) 54 0 yongsun { 55 0 yongsun if (dis != NULL) 56 0 yongsun delete [] dis; 57 0 yongsun dis = new double[--n]; 58 0 yongsun if (thres > n) thres = n; 59 0 yongsun for (int freq=1; freq < n; ++freq) { 60 0 yongsun if (nr[freq] == 0 || nr[freq+1] == 0) 61 0 yongsun dis[freq] = 1.0; 62 0 yongsun else 63 0 yongsun dis[freq] = double(nr[freq+1])/nr[freq]; 64 0 yongsun printf("%lf ", dis[freq]); fflush(stdout); 65 0 yongsun } 66 0 yongsun } 67 0 yongsun 68 0 yongsun double CSlmGTDiscounter::discount(int freq) 69 0 yongsun { 70 0 yongsun double newfreq = freq * ((freq < thres)?dis[freq]:hd); 71 0 yongsun if (newfreq >= double(freq)) 72 0 yongsun newfreq = freq * hd; 73 0 yongsun return newfreq; 74 0 yongsun } 75 0 yongsun 76 0 yongsun void CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) 77 0 yongsun { 78 0 yongsun // normally, c should not greater than 1.0, yet when cut-off is used, it could be so. 79 0 yongsun if (c <= 0.0) { 80 0 yongsun c = double(nr[1]) / (nr[1] + 2.0*nr[2]); 81 0 yongsun printf("parameter c=%lf", c); fflush(stdout); 82 0 yongsun } else { 83 0 yongsun printf("Using given parameter c=%lf", c); fflush(stdout); 84 0 yongsun } 85 0 yongsun } 86 0 yongsun 87 0 yongsun double CSlmAbsoluteDiscounter::discount(int freq) 88 0 yongsun { 89 0 yongsun return (freq > 0)?(freq - c):(0.0); 90 0 yongsun } 91 0 yongsun 92 0 yongsun void CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) 93 0 yongsun { 94 0 yongsun if (dis <= 0.0 || dis >= 1.0) { 95 0 yongsun dis = 1.0 - double(nr[1])/nr[0]; 96 0 yongsun printf("parameter d=%lf", dis); fflush(stdout); 97 0 yongsun } else { 98 0 yongsun printf("Using given parameter d=%lf", dis); fflush(stdout); 99 0 yongsun } 100 0 yongsun } 101 0 yongsun 102 0 yongsun double CSlmLinearDiscounter::discount(int freq) 103 0 yongsun { 104 0 yongsun return freq * dis; 105 0 yongsun } 106 0 yongsun 107 0 yongsun // n=1 for unigram, n=2 for bigram; 108 0 yongsun // level[0] is for psuedo 0 gram, ... 109 0 yongsun void CSlmBuilder::Create(int n) 110 0 yongsun { 111 0 yongsun assert(n != 0); 112 0 yongsun nlevel = n; 113 0 yongsun level = new void * [n+1]; 114 0 yongsun for (int i=0; i<n; ++i) { 115 0 yongsun level[i] = new std::vector<TNode>; 116 0 yongsun if (i) ((TNodeLevel*)level[i])->reserve(1024); 117 0 yongsun } 118 0 yongsun //Add leaf level 119 0 yongsun level[n] = new std::vector<TLeaf>; 120 0 yongsun ((TLeafLevel*)level[n])->reserve(1024); 121 0 yongsun 122 0 yongsun //Add psuedo root node 123 0 yongsun ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0)); 124 87 yongsun 125 87 yongsun //Initialize the nr[n+1][SLM_MAX_R] 2-D array 126 0 yongsun nr = new FREQ_TYPE[n+1][SLM_MAX_R]; 127 87 yongsun for (int lvl=0; lvl<n+1; ++lvl) 128 87 yongsun for (int r=0; r<SLM_MAX_R; ++r) 129 87 yongsun nr[lvl][r] = 0; 130 0 yongsun } 131 0 yongsun 132 0 yongsun void CSlmBuilder::SetCut(FREQ_TYPE threshold[]) 133 0 yongsun { 134 0 yongsun if (cut != NULL) 135 0 yongsun delete []cut; 136 0 yongsun cut = new FREQ_TYPE[nlevel+1]; 137 0 yongsun for (int i=0; i<nlevel; ++i) 138 0 yongsun cut[i+1] = threshold[i]; 139 0 yongsun } 140 0 yongsun 141 0 yongsun void CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[]) 142 0 yongsun { 143 0 yongsun if (discounter != NULL) 144 0 yongsun delete []discounter; 145 0 yongsun discounter = new CSlmDiscounter* [nlevel+1]; 146 0 yongsun for (int i=0; i < nlevel; ++i) 147 0 yongsun discounter[i+1] = dis[i]; 148 0 yongsun } 149 0 yongsun 150 0 yongsun void CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[]) 151 0 yongsun { 152 0 yongsun breaker.clear(); 153 0 yongsun for (int i=0; i < nId; ++i) 154 0 yongsun breaker.push_back(brks[i]); 155 0 yongsun std::make_heap(breaker.begin(), breaker.end()); 156 0 yongsun std::sort_heap(breaker.begin(), breaker.end()); 157 0 yongsun } 158 0 yongsun 159 0 yongsun void CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[]) 160 0 yongsun { 161 0 yongsun m_excludes.clear(); 162 0 yongsun for (int i=0; i < nId; ++i) 163 0 yongsun m_excludes.push_back(excludes[i]); 164 0 yongsun std::make_heap(m_excludes.begin(), m_excludes.end()); 165 0 yongsun std::sort_heap(m_excludes.begin(), m_excludes.end()); 166 0 yongsun } 167 0 yongsun 168 0 yongsun bool CSlmBuilder::isBreakId(TSIMWordId id) 169 0 yongsun { 170 0 yongsun return std::binary_search(breaker.begin(), breaker.end(), id); 171 0 yongsun } 172 0 yongsun 173 0 yongsun bool CSlmBuilder::isExcludeId(TSIMWordId id) 174 0 yongsun { 175 0 yongsun return std::binary_search(m_excludes.begin(), m_excludes.end(), id); 176 0 yongsun } 177 0 yongsun 178 0 yongsun void CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr) 179 0 yongsun { 180 0 yongsun int ch; 181 0 yongsun bool brk = isExcludeId(*ngram); 182 0 yongsun 183 0 yongsun for (int i=1; i < nlevel; ++i) { 184 0 yongsun TNodeLevel* pnl = (TNodeLevel*)(level[i]); 185 0 yongsun if (pnl->capacity() == pnl->size()) { 186 0 yongsun size_t newsz = 2*pnl->capacity(); 187 0 yongsun if (pnl->capacity() > 1024*1024) 188 0 yongsun newsz = pnl->capacity() + 1024*1024; 189 0 yongsun pnl->reserve(newsz); 190 0 yongsun } 191 0 yongsun } 192 0 yongsun TLeafLevel* pll = (TLeafLevel*)(level[nlevel]); 193 0 yongsun if (pll->capacity() == pll->size()) { 194 0 yongsun size_t newsz = 2*pll->capacity(); 195 0 yongsun if (pll->capacity() > 1024*1024) 196 0 yongsun newsz = pll->capacity() + 1024*1024; 197 0 yongsun pll->reserve(newsz); 198 0 yongsun } 199 0 yongsun 200 0 yongsun if (!brk) 201 0 yongsun (*(TNodeLevel*)(level[0]))[0].freq += fr; 202 0 yongsun 203 0 yongsun bool branch = false; 204 0 yongsun for (int i=1; (!brk && i < nlevel); ++i) { 205 0 yongsun std::vector<TNode> & pv = *(TNodeLevel*)(level[i-1]); 206 0 yongsun std::vector<TNode> & v = *(TNodeLevel*)(level[i]); 207 0 yongsun branch = branch || (pv.back().child >= v.size()) || (v.back().id != ngram[i-1]) ; 208 0 yongsun if (branch) { 209 0 yongsun if (i == nlevel-1) 210 0 yongsun ch = ((TLeafLevel*)(level[i+1]))->size(); 211 0 yongsun else 212 0 yongsun ch = ((TNodeLevel*)(level[i+1]))->size(); 213 0 yongsun v.push_back(TNode(ngram[i-1], ch, fr)); 214 0 yongsun } else { 215 0 yongsun v.back().freq += fr; 216 0 yongsun } 217 0 yongsun brk = (i > 1 && isBreakId(ngram[i-1])) || isExcludeId(ngram[i]); 218 0 yongsun } 219 0 yongsun 220 0 yongsun // Insert to the leaf level 221 87 yongsun if (!brk) { 222 87 yongsun if (fr > cut[nlevel]) { 223 87 yongsun TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); 224 87 yongsun v.push_back(TLeaf(ngram[nlevel-1], fr)); 225 87 yongsun } else { 226 87 yongsun nr[nlevel][0] += fr; 227 87 yongsun nr[nlevel][fr] += fr; 228 87 yongsun } 229 0 yongsun } 230 0 yongsun } 231 0 yongsun 232 0 yongsun void CSlmBuilder::CountNr() 233 0 yongsun { 234 0 yongsun printf("\nCounting Nr..."); fflush(stdout); 235 0 yongsun for (int lvl=1; lvl<nlevel; ++lvl) { 236 0 yongsun TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 237 0 yongsun for (TNodeIterator it=v.begin(), ite=v.end(); it != ite; ++it) { 238 0 yongsun FREQ_TYPE freq = it->freq; 239 0 yongsun nr[lvl][0] += freq; 240 0 yongsun if (freq < SLM_MAX_R && freq > 0) 241 0 yongsun nr[lvl][freq] += freq; 242 0 yongsun } 243 0 yongsun } 244 0 yongsun TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); 245 0 yongsun for (TLeafIterator it=v.begin(), ite=v.end(); it != ite; ++it) { 246 0 yongsun FREQ_TYPE freq = it->freq; 247 0 yongsun nr[nlevel][0] += freq; 248 0 yongsun if (freq < SLM_MAX_R && freq > 0) 249 0 yongsun nr[nlevel][freq] += freq; 250 0 yongsun } 251 0 yongsun printf("\n"); fflush(stdout); 252 0 yongsun } 253 0 yongsun 254 0 yongsun int 255 0 yongsun CSlmBuilder::CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast, 256 0 yongsun TLeafIterator chfirst, TLeafIterator chlast, int thred) 257 0 yongsun { 258 0 yongsun int idxfirst, idxchk; 259 0 yongsun TLeafIterator chchk = chfirst; 260 0 yongsun for (idxfirst=idxchk=0; chchk != chlast; ++chchk, ++idxchk) { 261 0 yongsun //do not cut item whoese 1. freq > thred; 2. psuedo tail 262 0 yongsun if (chchk->freq > thred || (chchk+1) == chlast) { 263 0 yongsun if (idxfirst < idxchk) 264 0 yongsun *chfirst = *chchk; 265 0 yongsun for (;pfirst != plast && pfirst->child <= idxchk; ++pfirst) 266 0 yongsun pfirst->child = idxfirst; 267 0 yongsun ++idxfirst; 268 0 yongsun ++chfirst; 269 0 yongsun } 270 0 yongsun } 271 0 yongsun assert(pfirst == plast); 272 0 yongsun return idxfirst; 273 0 yongsun } 274 0 yongsun 275 0 yongsun int 276 0 yongsun CSlmBuilder::CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast, 277 0 yongsun TNodeIterator chfirst, TNodeIterator chlast, int thred) 278 0 yongsun { 279 0 yongsun int idxfirst, idxchk; 280 0 yongsun TNodeIterator chchk = chfirst; 281 0 yongsun for (idxfirst=idxchk=0; chchk != chlast; ++chchk, ++idxchk) { 282 0 yongsun //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children 283 0 yongsun TNodeIterator chnext = chchk + 1; 284 0 yongsun if (chchk->freq > thred || chnext == chlast || (chnext->child != chchk->child)) { 285 0 yongsun if (idxfirst < idxchk) 286 0 yongsun *chfirst = *chchk; 287 0 yongsun for (;pfirst != plast && pfirst->child <= idxchk; ++pfirst) 288 0 yongsun pfirst->child = idxfirst; 289 0 yongsun ++idxfirst; 290 0 yongsun ++chfirst; 291 0 yongsun } 292 0 yongsun } 293 0 yongsun assert(pfirst == plast); 294 0 yongsun return idxfirst; 295 0 yongsun } 296 0 yongsun 297 0 yongsun void CSlmBuilder::Cut() 298 0 yongsun { 299 0 yongsun printf("\nCuting according freq..."); fflush(stdout); 300 0 yongsun for (int lvl=nlevel; lvl>0; --lvl) { 301 0 yongsun printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]); fflush(stdout); 302 0 yongsun TNodeLevel& parent = *(TNodeLevel*)(level[lvl-1]); 303 0 yongsun if (lvl == nlevel) { 304 0 yongsun if (cut[lvl] > 0) { 305 0 yongsun TLeafLevel& v = *(TLeafLevel*)(level[lvl]); 306 0 yongsun int newsize = CutLeafLevel(parent.begin(), parent.end(), v.begin(), v.end(), cut[lvl]); 307 0 yongsun v.erase(v.begin()+newsize, v.end()); 308 0 yongsun } 309 0 yongsun } else { 310 0 yongsun if (cut[lvl] > 0) { 311 0 yongsun TNodeLevel& v= *(TNodeLevel*)(level[lvl]); 312 0 yongsun int newsize = CutNodeLevel(parent.begin(), parent.end(), v.begin(), v.end(), cut[lvl]); 313 0 yongsun v.erase(v.begin()+newsize, v.end()); 314 0 yongsun } 315 0 yongsun } 316 0 yongsun } 317 0 yongsun printf("\n"); fflush(stdout); 318 0 yongsun } 319 0 yongsun 320 0 yongsun void CSlmBuilder::AppendTails() 321 0 yongsun { 322 0 yongsun printf("\nAppending psuedo tail node for each level..."); fflush(stdout); 323 0 yongsun for (int lvl=0; lvl<nlevel; ++lvl) { 324 0 yongsun int child_size = 0; 325 0 yongsun if (lvl == nlevel - 1){ 326 0 yongsun child_size = ((TLeafLevel*)(level[lvl+1]))->size(); 327 0 yongsun } else { 328 0 yongsun child_size = ((TNodeLevel*)(level[lvl+1]))->size(); 329 0 yongsun } 330 0 yongsun TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 331 0 yongsun v.push_back(TNode(0x00FFFFFF, child_size, 1)); 332 0 yongsun } 333 0 yongsun //also make a psuedo tail node for the leaf level 334 0 yongsun ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1)); 335 0 yongsun printf("\n"); fflush(stdout); 336 0 yongsun } 337 0 yongsun 338 0 yongsun template<class TChildLevel> 339 0 yongsun void DiscountOneLevel(CSlmBuilder::TNodeLevel& v, TChildLevel& ch, CSlmDiscounter* disc, int bUseLogPr) 340 0 yongsun { 341 0 yongsun CSlmBuilder::TNodeIterator it = v.begin(); 342 0 yongsun CSlmBuilder::TNodeIterator ite= v.begin() + (v.size()-1); 343 0 yongsun for (; it != ite; ++it) { //do not calc the psuedo tail item 344 0 yongsun CSlmBuilder::TNodeIterator itnext = it + 1; 345 0 yongsun double root_freq = it->freq; 346 0 yongsun for (int h = it->child, t = itnext->child; h < t; ++h) { 347 0 yongsun double pr = disc->discount(ch[h].freq) / root_freq; 348 0 yongsun assert(pr > 0.0 && pr < 1.0); 349 0 yongsun if (bUseLogPr) { 350 0 yongsun ch[h].pr = CSlmBuilder::PR_TYPE( -log(pr) ); 351 0 yongsun } else { 352 0 yongsun ch[h].pr = CSlmBuilder::PR_TYPE( pr ); 353 0 yongsun } 354 0 yongsun } 355 0 yongsun } 356 0 yongsun } 357 0 yongsun 358 0 yongsun void CSlmBuilder::Discount() 359 0 yongsun { 360 0 yongsun printf("\nDiscounting..."); 361 0 yongsun for (int lvl=nlevel; lvl>0; --lvl){ 362 0 yongsun printf("\n Initializing level %d's %s discount method: ", lvl, discounter[lvl]->getName()); 363 0 yongsun discounter[lvl]->init(SLM_MAX_R, nr[lvl]); 364 0 yongsun } 365 0 yongsun printf("\n"); 366 0 yongsun for (int lvl = nlevel-1; lvl >= 0; --lvl) { 367 0 yongsun printf("\n Discounting level %d ...", lvl+1); fflush(stdout); 368 0 yongsun TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 369 0 yongsun if (lvl == nlevel - 1) { //its child is leaf 370 0 yongsun TLeafLevel& ch = *(TLeafLevel*)(level[lvl+1]); 371 0 yongsun DiscountOneLevel(v, ch, discounter[lvl+1], bUseLogPr); 372 0 yongsun } else { 373 0 yongsun TNodeLevel& ch = *(TNodeLevel*)(level[lvl+1]); 374 0 yongsun DiscountOneLevel(v, ch, discounter[lvl+1], bUseLogPr); 375 0 yongsun } 376 0 yongsun } 377 0 yongsun printf("\n Giving psuedo root level 0 a distribution..."); 378 0 yongsun //make the psuedo 0-gram a equal distribution 379 0 yongsun TNodeLevel& v0 = *(TNodeLevel*)(level[0]); 380 0 yongsun if (bUseLogPr) { 381 0 yongsun v0[0].pr = PR_TYPE(-log(double(1.0)/m_nWord)); 382 0 yongsun } else { 383 0 yongsun v0[0].pr = PR_TYPE(double(1.0)/m_nWord); 384 0 yongsun } 385 0 yongsun printf("\n"); fflush(stdout); 386 0 yongsun } 387 0 yongsun 388 0 yongsun template<class chIterator> 389 0 yongsun double CalcNodeBow(CSlmBuilder* builder, int lvl, TSIMWordId words[], chIterator chh, chIterator cht, int bUseLogPr) 390 0 yongsun { 391 0 yongsun if (chh == cht) return 1.0; 392 0 yongsun double sumnext = 0.0, sum=0.0; 393 0 yongsun for (; chh < cht; ++chh) { 394 0 yongsun if (bUseLogPr) { 395 0 yongsun sumnext += exp(-(chh->pr)); 396 0 yongsun } else { 397 0 yongsun sumnext += double(chh->pr); 398 0 yongsun } 399 0 yongsun words[lvl+1] = chh->id; 400 0 yongsun sum += builder->getPr(lvl, words+2); 401 0 yongsun } 402 0 yongsun assert(sumnext > 0.0 && sumnext < 1.05); 403 0 yongsun assert(sum < 1.05 && sum > 0.0); 404 0 yongsun // 405 0 yongsun if (sumnext >= 1.0 || sum >= 1.0) { 406 0 yongsun double bow = ((sumnext > sum)?sumnext:sum)+0.0001; 407 0 yongsun bow = (bow - sumnext) / (bow - sum); 408 0 yongsun printf("\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:", sumnext, sum, bow, lvl); 409 0 yongsun for (int i=1; i <= lvl; ++i) 410 0 yongsun printf("%d ", words[i]); 411 0 yongsun return bow; 412 0 yongsun } 413 0 yongsun return (1.0-sumnext)/(1.0-sum); 414 0 yongsun } 415 0 yongsun 416 0 yongsun void CSlmBuilder::CalcBOW() 417 0 yongsun { 418 0 yongsun printf("\nCalculating Back-Off Weight..."); 419 0 yongsun for (int lvl=0; lvl < nlevel; ++lvl) { 420 0 yongsun printf("\n Processing level %d ", lvl); fflush(stdout); 421 241 tchaikov TNode* base[16]; //it should be lvl+1, yet some compiler does not support it 422 241 tchaikov int idx[16]; //it should be lvl+1, yet some compiler does not support it 423 0 yongsun for (int i=0; i <= lvl; ++i) { 424 0 yongsun base[i] = &((*(TNodeLevel*)level[i])[0]); 425 0 yongsun idx[i] = 0; 426 0 yongsun } 427 0 yongsun TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it 428 0 yongsun int sz = ((TNodeLevel*)(level[lvl]))->size()-1; 429 0 yongsun printf("(%d items)...", sz+1); fflush(stdout); 430 0 yongsun for (; idx[lvl] < sz; ++idx[lvl]) { 431 0 yongsun words[lvl] = base[lvl][idx[lvl]].id; 432 0 yongsun for (int k=lvl-1; k >= 0; --k) { 433 0 yongsun while (base[k][idx[k]+1].child <= idx[k+1]) 434 0 yongsun ++idx[k]; 435 0 yongsun words[k] = base[k][idx[k]].id; 436 0 yongsun } 437 0 yongsun TNode & node = base[lvl][idx[lvl]]; 438 0 yongsun TNode & nodenext = *((&node)+1); 439 0 yongsun double bow; 440 0 yongsun if (lvl == nlevel-1) { 441 0 yongsun TLeaf * ch = &((*(TLeafLevel*)level[lvl+1])[0]); 442 0 yongsun bow = CalcNodeBow(this, lvl, words, ch+node.child, ch+nodenext.child, bUseLogPr); 443 0 yongsun } else { 444 0 yongsun TNode * ch = &((*(TNodeLevel*)level[lvl+1])[0]); 445 0 yongsun bow = CalcNodeBow(this, lvl, words, ch+node.child, ch+nodenext.child, bUseLogPr); 446 0 yongsun } 447 0 yongsun if (bUseLogPr) { 448 0 yongsun node.bow = PR_TYPE(-log(bow)); 449 0 yongsun } else { 450 0 yongsun node.bow = PR_TYPE(bow); 451 0 yongsun } 452 0 yongsun } 453 0 yongsun } 454 0 yongsun printf("\n"); fflush(stdout); 455 0 yongsun } 456 0 yongsun 457 0 yongsun double CSlmBuilder::getPr(int n, TSIMWordId *words) 458 0 yongsun { 459 0 yongsun int lvl; 460 0 yongsun double bow = 1.0; 461 0 yongsun void* pnode = &((*(TNodeLevel*)level[0])[0]); 462 0 yongsun 463 0 yongsun assert(n <= nlevel); 464 0 yongsun 465 0 yongsun if (n==0) { 466 0 yongsun if (bUseLogPr) { 467 0 yongsun return exp(-((TNode*)pnode)->pr); 468 0 yongsun } else { 469 0 yongsun return ((TNode*)pnode)->pr; 470 0 yongsun } 471 0 yongsun } 472 0 yongsun 473 0 yongsun for (lvl = 0; pnode != NULL && lvl < n; ++lvl) { 474 0 yongsun if (bUseLogPr) { 475 0 yongsun bow = exp(-((TNode*)pnode)->bow); 476 0 yongsun } else { 477 0 yongsun bow = ((TNode*)pnode)->bow; 478 0 yongsun } 479 0 yongsun pnode = FindChild(lvl, (TNode*)pnode, words[lvl]); 480 0 yongsun } 481 0 yongsun 482 0 yongsun if (pnode != NULL) { // find the whole string 483 0 yongsun if (bUseLogPr) { 484 0 yongsun return exp(-((TLeaf*)pnode)->pr); 485 0 yongsun } else { 486 0 yongsun return ((TLeaf*)pnode)->pr; 487 0 yongsun } 488 0 yongsun } else if (lvl == n-1) { // only find the history 489 0 yongsun return bow*getPr(n-1, words+1); 490 0 yongsun } else { //even not find the history 491 0 yongsun return getPr(n-1, words+1); 492 0 yongsun } 493 0 yongsun } 494 0 yongsun 495 0 yongsun void* CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id) 496 0 yongsun { 497 0 yongsun int chh = root->child, cht = (root+1)->child; 498 0 yongsun if (lvl == nlevel-1) { 499 0 yongsun TLeaf* pleaf = &((*(TLeafLevel*)level[lvl+1])[0]); 500 0 yongsun return (void*)binary_find(pleaf, chh, cht, TLeaf(id)); 501 0 yongsun } else { 502 0 yongsun TNode* pnode = &((*(TNodeLevel*)level[lvl+1])[0]); 503 0 yongsun return (void*)binary_find(pnode, chh, cht, TNode(id)); 504 0 yongsun } 505 0 yongsun } 506 0 yongsun 507 0 yongsun void CSlmBuilder::Build() 508 0 yongsun { 509 0 yongsun CountNr(); 510 0 yongsun AppendTails(); 511 0 yongsun Cut(); 512 0 yongsun Discount(); 513 0 yongsun CalcBOW(); 514 0 yongsun } 515 0 yongsun 516 0 yongsun void CSlmBuilder::Write(FILE *out) 517 0 yongsun { 518 0 yongsun fwrite(&nlevel, sizeof(nlevel), 1, out); 519 0 yongsun fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out); 520 0 yongsun for (int lvl=0; lvl <= nlevel; ++lvl) { 521 0 yongsun int sz = 0; 522 0 yongsun if (lvl == nlevel) 523 0 yongsun sz = ((TLeafLevel*)(level[lvl]))->size(); 524 0 yongsun else 525 0 yongsun sz = ((TNodeLevel*)(level[lvl]))->size(); 526 0 yongsun fwrite(&sz, sizeof(sz), 1, out); 527 0 yongsun } 528 0 yongsun for (int lvl=0; lvl < nlevel; ++lvl) { 529 0 yongsun TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 530 0 yongsun for (TNodeIterator it=v.begin(), ite=v.end(); it != ite; ++it) 531 0 yongsun fwrite(&(*it), sizeof(TNode), 1, out); 532 0 yongsun } 533 0 yongsun TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); 534 0 yongsun for (TLeafIterator it=v.begin(), ite=v.end(); it != ite; ++it) 535 0 yongsun fwrite(&(*it), sizeof(TLeaf), 1, out); 536 0 yongsun } 537 0 yongsun 538 0 yongsun void CSlmBuilder::Close(void) 539 0 yongsun { 540 0 yongsun if (level != NULL) { 541 0 yongsun for (int lvl=0; lvl <= nlevel; ++lvl) { 542 0 yongsun if (lvl == nlevel) 543 0 yongsun delete (TLeafLevel*)(level[lvl]); 544 0 yongsun else 545 0 yongsun delete (TNodeLevel*)(level[lvl]); 546 0 yongsun } 547 0 yongsun delete [] level; 548 0 yongsun level = NULL; 549 0 yongsun } 550 0 yongsun if (cut != NULL) { 551 0 yongsun delete [] cut; 552 0 yongsun cut = NULL; 553 0 yongsun } 554 0 yongsun if (discounter != NULL) { 555 0 yongsun for (int lvl = 1; lvl <= nlevel; ++lvl) { 556 0 yongsun delete discounter[lvl]; 557 0 yongsun } 558 0 yongsun delete [] discounter; 559 0 yongsun discounter = NULL; 560 0 yongsun } 561 0 yongsun if (nr != NULL) { 562 0 yongsun delete [] nr; 563 0 yongsun nr = NULL; 564 0 yongsun } 565 0 yongsun breaker.clear(); 566 0 yongsun m_nWord = 0; 567 0 yongsun nlevel = 0; 568 0 yongsun } 569 0 yongsun 570