1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef _SUN_AGC_SLM_H 39 #define _SUN_AGC_SLM_H 40 41 #include "../portability.h" 42 43 #include <stdio.h> 44 45 /** 46 * Thread slm make the following modifications to simple back-off language model 47 * -# Word id are limited to 18 bits, about 240K word ids 48 * -# Compact all float value of -log(pr) into 65536 (16 bits) 49 * level and use a table to map the index to a float value; 50 * -# Compact all float value of -log(pr) into 16384 (14 bits) 51 * level and use a table to map the index to a float value; 52 * -# threading infomation embed into binary model file. Threading include 53 * - bol(back-off-level) from current level 54 * - bon(back-off-node)'s index in the bol level array 55 * . 56 * The thread could be used: 57 * - when leaf node are arrived, it could use (bol,bon) as history for 58 * history node. 59 * - when a word could not be found in current node (cl, cn)'s children, 60 * searching could be transfered to (bol, bon) directly and continue 61 * searching the target word 62 * -# Add a basic type TState in Language model, a state is pair of\n 63 * (level, array_idx_of_the level) 64 * -# change all get probability interface to\n 65 * double transfer(TState& history, unsigned int wid, TState& result); 66 */ 67 class CThreadSlm { 68 public: 69 enum { 70 BITS_BOW = 14, 71 BITS_PR = 16, 72 ID_NOT_WORD = 69 73 }; 74 75 /** 76 * (level:idx) located a state in the language model very well 77 * Please note the psuedo unigram state, with level == 0, but idx > 0 78 * it's for used with bigram cache model 79 */ 80 union TState{ 81 TState(const TState& b) : m_all(b.m_all) { } 82 TState(unsigned level=0, unsigned idx=0) { anony.m_Level=level; anony.m_Idx=idx; } 83 84 inline TState& operator++() { ++anony.m_Idx; return *this; } 85 86 inline void setIdx(unsigned int idx) { anony.m_Idx = idx; } 87 inline void setLevel(unsigned int lvl) { anony.m_Level = lvl; } 88 89 inline unsigned int getLevel() const { return anony.m_Level; } 90 inline unsigned int getIdx() const { return anony.m_Idx; } 91 inline operator unsigned() const { return m_all; } //((anony.m_Level << 24) + anony.m_Idx); } 92 inline bool operator==(const TState & b) const { return m_all == b.m_all; } 93 inline bool operator< (const TState & b) const { return unsigned(*this) < unsigned(b); } 94 95 private: 96 unsigned int m_all; 97 #ifndef WORDS_BIGENDIAN 98 struct TAnonymous { 99 unsigned m_Idx :24; 100 unsigned m_Level : 8; 101 } anony; 102 #else 103 struct TAnonymous { 104 unsigned m_Level : 8; 105 unsigned m_Idx :24; 106 } anony; 107 #endif 108 }; 109 110 /** 111 * Machine dependent 112 */ 113 struct TNode { 114 public: 115 unsigned int wid() const 116 { 117 return m_wid; 118 } 119 120 unsigned int bow() const 121 { 122 return m_bow; 123 } 124 125 unsigned int pr() const 126 { 127 return m_pr; 128 } 129 130 unsigned int bon() const 131 { 132 return m_bon; 133 } 134 135 unsigned int bol() const 136 { 137 return m_bol; 138 } 139 140 unsigned int ch() const 141 { 142 return ((m_ch_hi << 16) + m_ch_lo); 143 } 144 145 void set_wid(unsigned int wid) 146 { 147 m_wid = wid; 148 } 149 150 void set_bow(unsigned int bow) 151 { 152 m_bow = bow; 153 } 154 155 void set_pr(unsigned int pr) 156 { 157 m_pr = pr; 158 } 159 160 void set_bon(unsigned int bon) 161 { 162 m_bon = bon; 163 } 164 165 void set_bol(unsigned int bol) 166 { 167 m_bol = bol; 168 } 169 170 void set_ch(unsigned int ch) 171 { 172 m_ch_hi=((ch >> 16) & 0x7F); 173 m_ch_lo=(ch & 0xFFFF); 174 } 175 176 protected: 177 #ifndef WORDS_BIGENDIAN 178 unsigned m_wid :18; 179 unsigned m_bow :14; 180 unsigned m_pr :16; 181 unsigned m_ch_lo :16; 182 unsigned m_bon :23; 183 unsigned m_bol : 2; 184 unsigned m_ch_hi : 7; 185 #else 186 unsigned m_ch_hi : 7; 187 unsigned m_bol : 2; 188 unsigned m_bon :23; 189 unsigned m_ch_lo :16; 190 unsigned m_pr :16; 191 unsigned m_bow :14; 192 unsigned m_wid :18; 193 #endif 194 195 private: 196 /** 197 * Machine dependent 198 union TChildIdx { 199 public: 200 inline TChildIdx(unsigned val) : m_all(val) { } 201 inline TChildIdx(const TChildIdx& b) : m_all(b.m_all) { } 202 inline TChildIdx(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi = hi; anony.m_lo = lo; } 203 204 inline unsigned int lo() { return anony.m_lo; } 205 inline unsigned int hi() { return anony.m_hi; } 206 inline unsigned int all(){ return m_all; } 207 208 inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); } 209 inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); } 210 inline unsigned int set_all(unsigned int all) { return (m_all = all); } 211 212 private: 213 unsigned int m_all; 214 #ifndef WORDS_BIGENDIAN 215 struct TAnony { 216 unsigned m_lo :16; 217 unsigned m_hi : 7; 218 unsigned NOUSE: 9; 219 } anony; 220 #else 221 struct TAnony { 222 unsigned NOUSE: 9; 223 unsigned m_hi : 7; 224 unsigned m_lo :16; 225 } anony; 226 #endif 227 }; 228 */ 229 }; 230 231 /** 232 * Machine dependent 233 */ 234 struct TLeaf { 235 public: 236 inline unsigned int wid() const { return m_wid; } 237 inline unsigned int bon() const { return m_bon; } 238 inline unsigned int bol() const { return m_bol; } 239 inline unsigned int pr() const { return ((m_pr_hi << 14) + m_pr_lo); } 240 241 inline void set_wid(unsigned int wid) { m_wid = wid; } 242 inline void set_bon(unsigned int bon) { m_bon = bon; } 243 inline void set_bol(unsigned int bol) { m_bol = bol; } 244 inline void set_pr(unsigned int pr) { m_pr_hi = ((pr >> 14) & 0x3); m_pr_lo = pr & 0x3FFF; } 245 246 protected: 247 #ifndef WORDS_BIGENDIAN 248 unsigned m_wid :18; 249 unsigned m_pr_lo :14; 250 unsigned m_bon :23; 251 unsigned m_bol : 2; 252 unsigned m_pr_hi : 2; 253 #else 254 unsigned m_pr_hi : 2; 255 unsigned m_bol : 2; 256 unsigned m_bon :23; 257 unsigned m_pr_lo :14; 258 unsigned m_wid :18; 259 #endif 260 261 private: 262 /* 263 union TPr { 264 public: 265 inline TPr(unsigned int val) : m_all(val) { } 266 inline TPr(const TPr & b) : m_all(b.m_all) { } 267 inline TPr(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi=hi, anony.m_lo=lo; } 268 269 inline unsigned int lo() { return anony.m_lo; } 270 inline unsigned int hi() { return anony.m_hi; } 271 inline unsigned int all(){ return m_all; } 272 273 inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); } 274 inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); } 275 inline unsigned int set_all(unsigned int all) { return (m_all = all); } 276 277 private: 278 unsigned int m_all; 279 #ifndef WORDS_BIGENDIAN 280 struct TAnony { 281 unsigned m_lo :14; 282 unsigned m_hi : 2; 283 unsigned NONUSE:16; 284 } anony; 285 #else 286 struct TAnony { 287 unsigned NONUSE:16; 288 unsigned m_hi : 2; 289 unsigned m_lo :14; 290 } anony; 291 #endif 292 }; 293 */ 294 }; 295 296 public: 297 CThreadSlm() 298 : m_N(0), m_UseLogPr(0), m_Levels(NULL), m_LevelSizes(NULL), 299 m_bowTable(NULL), m_prTable(NULL), m_bMMap(false), m_buf(NULL) { } 300 301 ~CThreadSlm() { free(); } 302 303 bool 304 load(const char* fname, bool MMap=false); 305 306 unsigned 307 isUseLogPr() const 308 { return m_UseLogPr; } 309 310 void 311 free(); 312 313 double 314 transferNegLog(TState history, unsigned int wid, TState& result); 315 316 double 317 transfer(TState history, unsigned int wid, TState& result); 318 319 TState 320 history_state_of(TState st); 321 322 TState& 323 historify(TState& st); 324 325 unsigned int 326 lastWordId(TState st); 327 328 protected: 329 double 330 rawTransfer(TState history, unsigned int wid, TState& result); 331 332 protected: 333 typedef void* PtrVoid; 334 335 unsigned m_N; 336 unsigned m_UseLogPr; 337 void **m_Levels; 338 unsigned *m_LevelSizes; 339 float *m_bowTable; 340 float *m_prTable; 341 342 private: 343 ssize_t m_bufSize; 344 bool m_bMMap; 345 char *m_buf; 346 }; 347 348 #endif 349
