1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifndef _SUN_AGC_SLM_H 39 0 yongsun #define _SUN_AGC_SLM_H 40 0 yongsun 41 0 yongsun #include "../portability.h" 42 0 yongsun 43 0 yongsun #include <stdio.h> 44 0 yongsun 45 0 yongsun /** 46 0 yongsun * Thread slm make the following modifications to simple back-off language model 47 0 yongsun * -# Word id are limited to 18 bits, about 240K word ids 48 0 yongsun * -# Compact all float value of -log(pr) into 65536 (16 bits) 49 0 yongsun * level and use a table to map the index to a float value; 50 0 yongsun * -# Compact all float value of -log(pr) into 16384 (14 bits) 51 0 yongsun * level and use a table to map the index to a float value; 52 0 yongsun * -# threading infomation embed into binary model file. Threading include 53 0 yongsun * - bol(back-off-level) from current level 54 0 yongsun * - bon(back-off-node)'s index in the bol level array 55 0 yongsun * . 56 0 yongsun * The thread could be used: 57 0 yongsun * - when leaf node are arrived, it could use (bol,bon) as history for 58 0 yongsun * history node. 59 0 yongsun * - when a word could not be found in current node (cl, cn)'s children, 60 0 yongsun * searching could be transfered to (bol, bon) directly and continue 61 0 yongsun * searching the target word 62 0 yongsun * -# Add a basic type TState in Language model, a state is pair of\n 63 0 yongsun * (level, array_idx_of_the level) 64 0 yongsun * -# change all get probability interface to\n 65 0 yongsun * double transfer(TState& history, unsigned int wid, TState& result); 66 0 yongsun */ 67 0 yongsun class CThreadSlm { 68 0 yongsun public: 69 0 yongsun enum { 70 0 yongsun BITS_BOW = 14, 71 0 yongsun BITS_PR = 16, 72 0 yongsun ID_NOT_WORD = 69 73 0 yongsun }; 74 0 yongsun 75 0 yongsun /** 76 0 yongsun * (level:idx) located a state in the language model very well 77 0 yongsun * Please note the psuedo unigram state, with level == 0, but idx > 0 78 0 yongsun * it's for used with bigram cache model 79 0 yongsun */ 80 0 yongsun union TState{ 81 0 yongsun TState(const TState& b) : m_all(b.m_all) { } 82 0 yongsun TState(unsigned level=0, unsigned idx=0) { anony.m_Level=level; anony.m_Idx=idx; } 83 0 yongsun 84 0 yongsun inline TState& operator++() { ++anony.m_Idx; return *this; } 85 0 yongsun 86 0 yongsun inline void setIdx(unsigned int idx) { anony.m_Idx = idx; } 87 0 yongsun inline void setLevel(unsigned int lvl) { anony.m_Level = lvl; } 88 0 yongsun 89 0 yongsun inline unsigned int getLevel() const { return anony.m_Level; } 90 0 yongsun inline unsigned int getIdx() const { return anony.m_Idx; } 91 0 yongsun inline operator unsigned() const { return m_all; } //((anony.m_Level << 24) + anony.m_Idx); } 92 0 yongsun inline bool operator==(const TState & b) const { return m_all == b.m_all; } 93 0 yongsun inline bool operator< (const TState & b) const { return unsigned(*this) < unsigned(b); } 94 0 yongsun 95 0 yongsun private: 96 0 yongsun unsigned int m_all; 97 198 tchaikov #ifndef WORDS_BIGENDIAN 98 0 yongsun struct TAnonymous { 99 0 yongsun unsigned m_Idx :24; 100 0 yongsun unsigned m_Level : 8; 101 0 yongsun } anony; 102 0 yongsun #else 103 0 yongsun struct TAnonymous { 104 0 yongsun unsigned m_Level : 8; 105 0 yongsun unsigned m_Idx :24; 106 0 yongsun } anony; 107 0 yongsun #endif 108 0 yongsun }; 109 0 yongsun 110 0 yongsun /** 111 0 yongsun * Machine dependent 112 0 yongsun */ 113 0 yongsun struct TNode { 114 0 yongsun public: 115 0 yongsun unsigned int wid() const 116 0 yongsun { 117 0 yongsun return m_wid; 118 0 yongsun } 119 0 yongsun 120 0 yongsun unsigned int bow() const 121 0 yongsun { 122 0 yongsun return m_bow; 123 0 yongsun } 124 0 yongsun 125 0 yongsun unsigned int pr() const 126 0 yongsun { 127 0 yongsun return m_pr; 128 0 yongsun } 129 0 yongsun 130 0 yongsun unsigned int bon() const 131 0 yongsun { 132 0 yongsun return m_bon; 133 0 yongsun } 134 0 yongsun 135 0 yongsun unsigned int bol() const 136 0 yongsun { 137 0 yongsun return m_bol; 138 0 yongsun } 139 0 yongsun 140 0 yongsun unsigned int ch() const 141 0 yongsun { 142 0 yongsun return ((m_ch_hi << 16) + m_ch_lo); 143 0 yongsun } 144 0 yongsun 145 0 yongsun void set_wid(unsigned int wid) 146 0 yongsun { 147 0 yongsun m_wid = wid; 148 0 yongsun } 149 0 yongsun 150 0 yongsun void set_bow(unsigned int bow) 151 0 yongsun { 152 0 yongsun m_bow = bow; 153 0 yongsun } 154 0 yongsun 155 0 yongsun void set_pr(unsigned int pr) 156 0 yongsun { 157 0 yongsun m_pr = pr; 158 0 yongsun } 159 0 yongsun 160 0 yongsun void set_bon(unsigned int bon) 161 0 yongsun { 162 0 yongsun m_bon = bon; 163 0 yongsun } 164 0 yongsun 165 0 yongsun void set_bol(unsigned int bol) 166 0 yongsun { 167 0 yongsun m_bol = bol; 168 0 yongsun } 169 0 yongsun 170 0 yongsun void set_ch(unsigned int ch) 171 0 yongsun { 172 0 yongsun m_ch_hi=((ch >> 16) & 0x7F); 173 0 yongsun m_ch_lo=(ch & 0xFFFF); 174 0 yongsun } 175 0 yongsun 176 0 yongsun protected: 177 198 tchaikov #ifndef WORDS_BIGENDIAN 178 0 yongsun unsigned m_wid :18; 179 0 yongsun unsigned m_bow :14; 180 0 yongsun unsigned m_pr :16; 181 0 yongsun unsigned m_ch_lo :16; 182 0 yongsun unsigned m_bon :23; 183 0 yongsun unsigned m_bol : 2; 184 0 yongsun unsigned m_ch_hi : 7; 185 0 yongsun #else 186 0 yongsun unsigned m_ch_hi : 7; 187 0 yongsun unsigned m_bol : 2; 188 0 yongsun unsigned m_bon :23; 189 0 yongsun unsigned m_ch_lo :16; 190 0 yongsun unsigned m_pr :16; 191 0 yongsun unsigned m_bow :14; 192 0 yongsun unsigned m_wid :18; 193 0 yongsun #endif 194 0 yongsun 195 0 yongsun private: 196 0 yongsun /** 197 0 yongsun * Machine dependent 198 0 yongsun union TChildIdx { 199 0 yongsun public: 200 0 yongsun inline TChildIdx(unsigned val) : m_all(val) { } 201 0 yongsun inline TChildIdx(const TChildIdx& b) : m_all(b.m_all) { } 202 0 yongsun inline TChildIdx(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi = hi; anony.m_lo = lo; } 203 0 yongsun 204 0 yongsun inline unsigned int lo() { return anony.m_lo; } 205 0 yongsun inline unsigned int hi() { return anony.m_hi; } 206 0 yongsun inline unsigned int all(){ return m_all; } 207 0 yongsun 208 0 yongsun inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); } 209 0 yongsun inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); } 210 0 yongsun inline unsigned int set_all(unsigned int all) { return (m_all = all); } 211 0 yongsun 212 0 yongsun private: 213 0 yongsun unsigned int m_all; 214 198 tchaikov #ifndef WORDS_BIGENDIAN 215 0 yongsun struct TAnony { 216 0 yongsun unsigned m_lo :16; 217 0 yongsun unsigned m_hi : 7; 218 0 yongsun unsigned NOUSE: 9; 219 0 yongsun } anony; 220 0 yongsun #else 221 0 yongsun struct TAnony { 222 0 yongsun unsigned NOUSE: 9; 223 0 yongsun unsigned m_hi : 7; 224 0 yongsun unsigned m_lo :16; 225 0 yongsun } anony; 226 0 yongsun #endif 227 0 yongsun }; 228 0 yongsun */ 229 0 yongsun }; 230 0 yongsun 231 0 yongsun /** 232 0 yongsun * Machine dependent 233 0 yongsun */ 234 0 yongsun struct TLeaf { 235 0 yongsun public: 236 0 yongsun inline unsigned int wid() const { return m_wid; } 237 0 yongsun inline unsigned int bon() const { return m_bon; } 238 0 yongsun inline unsigned int bol() const { return m_bol; } 239 0 yongsun inline unsigned int pr() const { return ((m_pr_hi << 14) + m_pr_lo); } 240 0 yongsun 241 0 yongsun inline void set_wid(unsigned int wid) { m_wid = wid; } 242 0 yongsun inline void set_bon(unsigned int bon) { m_bon = bon; } 243 0 yongsun inline void set_bol(unsigned int bol) { m_bol = bol; } 244 0 yongsun inline void set_pr(unsigned int pr) { m_pr_hi = ((pr >> 14) & 0x3); m_pr_lo = pr & 0x3FFF; } 245 0 yongsun 246 0 yongsun protected: 247 198 tchaikov #ifndef WORDS_BIGENDIAN 248 0 yongsun unsigned m_wid :18; 249 0 yongsun unsigned m_pr_lo :14; 250 0 yongsun unsigned m_bon :23; 251 0 yongsun unsigned m_bol : 2; 252 0 yongsun unsigned m_pr_hi : 2; 253 0 yongsun #else 254 0 yongsun unsigned m_pr_hi : 2; 255 0 yongsun unsigned m_bol : 2; 256 0 yongsun unsigned m_bon :23; 257 0 yongsun unsigned m_pr_lo :14; 258 0 yongsun unsigned m_wid :18; 259 0 yongsun #endif 260 0 yongsun 261 0 yongsun private: 262 0 yongsun /* 263 0 yongsun union TPr { 264 0 yongsun public: 265 0 yongsun inline TPr(unsigned int val) : m_all(val) { } 266 0 yongsun inline TPr(const TPr & b) : m_all(b.m_all) { } 267 0 yongsun inline TPr(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi=hi, anony.m_lo=lo; } 268 0 yongsun 269 0 yongsun inline unsigned int lo() { return anony.m_lo; } 270 0 yongsun inline unsigned int hi() { return anony.m_hi; } 271 0 yongsun inline unsigned int all(){ return m_all; } 272 0 yongsun 273 0 yongsun inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); } 274 0 yongsun inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); } 275 0 yongsun inline unsigned int set_all(unsigned int all) { return (m_all = all); } 276 0 yongsun 277 0 yongsun private: 278 0 yongsun unsigned int m_all; 279 198 tchaikov #ifndef WORDS_BIGENDIAN 280 0 yongsun struct TAnony { 281 0 yongsun unsigned m_lo :14; 282 0 yongsun unsigned m_hi : 2; 283 0 yongsun unsigned NONUSE:16; 284 0 yongsun } anony; 285 0 yongsun #else 286 0 yongsun struct TAnony { 287 0 yongsun unsigned NONUSE:16; 288 0 yongsun unsigned m_hi : 2; 289 0 yongsun unsigned m_lo :14; 290 0 yongsun } anony; 291 0 yongsun #endif 292 0 yongsun }; 293 0 yongsun */ 294 0 yongsun }; 295 0 yongsun 296 0 yongsun public: 297 0 yongsun CThreadSlm() 298 34 tchaikov : m_N(0), m_UseLogPr(0), m_Levels(NULL), m_LevelSizes(NULL), 299 34 tchaikov m_bowTable(NULL), m_prTable(NULL), m_bMMap(false), m_buf(NULL) { } 300 0 yongsun 301 0 yongsun ~CThreadSlm() { free(); } 302 0 yongsun 303 0 yongsun bool 304 0 yongsun load(const char* fname, bool MMap=false); 305 0 yongsun 306 198 tchaikov unsigned 307 198 tchaikov isUseLogPr() const 308 0 yongsun { return m_UseLogPr; } 309 0 yongsun 310 0 yongsun void 311 0 yongsun free(); 312 0 yongsun 313 0 yongsun double 314 0 yongsun transferNegLog(TState history, unsigned int wid, TState& result); 315 0 yongsun 316 0 yongsun double 317 0 yongsun transfer(TState history, unsigned int wid, TState& result); 318 0 yongsun 319 0 yongsun TState 320 0 yongsun history_state_of(TState st); 321 0 yongsun 322 0 yongsun TState& 323 0 yongsun historify(TState& st); 324 0 yongsun 325 0 yongsun unsigned int 326 0 yongsun lastWordId(TState st); 327 0 yongsun 328 0 yongsun protected: 329 0 yongsun double 330 0 yongsun rawTransfer(TState history, unsigned int wid, TState& result); 331 0 yongsun 332 0 yongsun protected: 333 0 yongsun typedef void* PtrVoid; 334 0 yongsun 335 0 yongsun unsigned m_N; 336 0 yongsun unsigned m_UseLogPr; 337 0 yongsun void **m_Levels; 338 0 yongsun unsigned *m_LevelSizes; 339 0 yongsun float *m_bowTable; 340 0 yongsun float *m_prTable; 341 0 yongsun 342 0 yongsun private: 343 198 tchaikov ssize_t m_bufSize; 344 0 yongsun bool m_bMMap; 345 0 yongsun char *m_buf; 346 0 yongsun }; 347 0 yongsun 348 0 yongsun #endif 349