1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include <config.h> 40 #endif 41 42 #include <unistd.h> 43 #include <fcntl.h> 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <math.h> 47 #include <errno.h> 48 #include <string.h> 49 50 #include "slm.h" 51 52 #ifdef HAVE_SYS_MMAN_H 53 #include <sys/mman.h> 54 #elif defined(BEOS_OS) 55 #include <be/kernel/OS.h> 56 #endif 57 58 bool 59 CThreadSlm::load(const char* fname, bool MMap) 60 { 61 int fd = open(fname, O_RDONLY); 62 if (fd == -1) { 63 fprintf(stderr, "open %s: %s\n", fname, strerror(errno)); 64 return false; 65 } 66 67 m_bufSize = lseek(fd, 0, SEEK_END); 68 lseek(fd, 0, SEEK_SET); 69 70 m_bMMap = MMap; 71 if (m_bMMap) { 72 #ifdef HAVE_SYS_MMAN_H 73 void* p = mmap(NULL, m_bufSize, PROT_READ, MAP_SHARED, fd, 0); 74 if (p == MAP_FAILED) { 75 close(fd); 76 return false; 77 } 78 m_buf = (char *)p; 79 #elif defined(BEOS_OS) 80 char *p = NULL; 81 area_id area = create_area("tmp", (void**)&p, B_ANY_ADDRESS, 82 (m_bufSize + (B_PAGE_SIZE - 1)) & ~(B_PAGE_SIZE - 1), 83 B_NO_LOCK, B_READ_AREA | B_WRITE_AREA); 84 if (area < 0) { 85 close(fd); 86 return false; 87 } 88 m_buf = p; 89 90 for (ssize_t len = m_bufSize; len > 0; ) { 91 ssize_t n = read(fd, p, len); 92 if (n < 0) break; 93 p += n; 94 len -= n; 95 } 96 #else // Other OS 97 #error "No implementation for mmap()" 98 #endif // HAVE_SYS_MMAN_H 99 } else { 100 if ((m_buf = new char[m_bufSize]) == NULL) { 101 close(fd); 102 return false; 103 } 104 if (read(fd, m_buf, m_bufSize) != m_bufSize) { 105 perror("read lm"); 106 delete [] m_buf; m_buf = NULL; 107 close(fd); 108 return false; 109 } 110 } 111 close(fd); 112 113 m_N = *(unsigned*)m_buf; 114 m_UseLogPr = *(((unsigned*)m_buf)+1); 115 m_LevelSizes = ((unsigned*)m_buf)+2; 116 m_prTable = (float*)(m_buf + 2*sizeof(unsigned) + (m_N+1)*sizeof(unsigned)); 117 m_bowTable = m_prTable + (1 << BITS_PR); 118 119 TNode* pn = (TNode*)(m_bowTable + (1 << BITS_BOW)); 120 121 //Solaris CC would cause error in runtime if using some thing like 122 //following even using (void**) conversion. So add PtrVoid definition 123 //m_Levels = new (void*) [m_N + 1]; 124 m_Levels = new PtrVoid[m_N+1]; 125 126 for (int lvl = 0; lvl <= m_N; ++lvl) { 127 m_Levels[lvl] = (void*)pn; 128 pn += m_LevelSizes[lvl]; 129 } 130 return true; 131 } 132 133 void 134 CThreadSlm::free() 135 { 136 if (m_Levels) { 137 delete [] m_Levels; 138 } 139 if (m_buf) { 140 if (m_bMMap) { 141 #ifdef HAVE_SYS_MMAN_H 142 munmap(m_buf, m_bufSize); 143 #elif defined(BEOS_OS) 144 delete_area(area_for(m_buf)); 145 #else // Other OS 146 #error "No implementation for munmap()" 147 #endif // HAVE_SYS_MMAN_H 148 } else { 149 delete [] m_buf; 150 } 151 } 152 m_buf = NULL; 153 m_Levels = NULL; 154 } 155 156 template<class NodeT> 157 unsigned int 158 find_id(NodeT* base, unsigned int h, unsigned int t, unsigned int id) 159 { 160 unsigned int tail = t; 161 while (h < t) { 162 int m = h + (t-h)/2; 163 NodeT* pm = base+m; 164 unsigned int thisId = pm->wid(); 165 if (thisId < id) 166 h = m+1; 167 else if (thisId > id) 168 t = m; 169 else 170 return m; 171 } 172 return tail; 173 } 174 175 /** 176 * return value as the model suggested. The history state must be historified 177 * or the history's level should be 0. when level == 0 but idx != 0, the 178 * history is a psuedo unigram state used for this model to combine another 179 * bigram cache language model 180 */ 181 double 182 CThreadSlm::rawTransfer(TState history, unsigned int wid, TState& result) 183 { 184 unsigned int lvl = history.getLevel(); 185 unsigned int pos = history.getIdx(); 186 187 double cost = (m_UseLogPr)?0.0:1.0; 188 189 // NON_Word id must be dealed with special, let it transfer to root 190 // without any cost 191 if (ID_NOT_WORD == wid) { 192 result = 0; 193 return cost; 194 } 195 196 while (true) { 197 //for psuedo cache model unigram state 198 TNode* pn = ((TNode *)m_Levels[lvl]) + ((lvl)?pos:0); 199 200 unsigned int t = (pn+1)->ch(); 201 202 if (lvl < m_N-1) { 203 TNode* pBase =(TNode*)m_Levels[lvl+1]; 204 unsigned int idx = find_id(pBase, pn->ch(), t, wid); 205 if (idx != t) { 206 result.setIdx(idx); 207 result.setLevel(lvl+1); 208 double pr = m_prTable[pBase[idx].pr()]; 209 return (m_UseLogPr)?(cost+pr):(cost*pr); 210 } 211 212 } else { 213 TLeaf* pBase =(TLeaf*)m_Levels[lvl+1]; 214 unsigned int idx = find_id(pBase, pn->ch(), t, wid); 215 if (idx != t) { 216 result.setIdx(idx); 217 result.setLevel(lvl+1); 218 double pr = m_prTable[pBase[idx].pr()]; 219 return (m_UseLogPr)?(cost+pr):(cost*pr); 220 } 221 222 } 223 224 if (m_UseLogPr) 225 cost += m_bowTable[pn->bow()]; 226 else 227 cost *= m_bowTable[pn->bow()]; 228 if (lvl == 0) 229 break; 230 lvl = pn->bol(); 231 pos = pn->bon(); 232 } 233 result.setLevel(0); 234 result.setIdx(0); 235 if (m_UseLogPr) 236 return cost + m_prTable[((TNode *)m_Levels[0])->pr()]; 237 else 238 return cost * m_prTable[((TNode *)m_Levels[0])->pr()]; 239 } 240 241 double 242 CThreadSlm::transferNegLog(TState history, unsigned int wid, TState& result) 243 { 244 double cost = rawTransfer(history, wid, result); 245 if (m_UseLogPr) 246 return cost; 247 else 248 return -log(cost); 249 } 250 251 double 252 CThreadSlm::transfer(TState history, unsigned int wid, TState& result) 253 { 254 double cost = rawTransfer(history, wid, result); 255 if (!m_UseLogPr) 256 return cost; 257 else 258 return exp(-cost); 259 } 260 261 unsigned int 262 CThreadSlm::lastWordId(TState st) 263 { 264 unsigned int lvl = st.getLevel(); 265 if (lvl >= m_N) { 266 const TLeaf* pn = ((const TLeaf *)m_Levels[m_N]) + st.getIdx(); 267 return pn->wid(); 268 } else if (lvl > 0) { 269 const TNode *pn = ((const TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 270 return pn->wid(); 271 } else { 272 unsigned int idx = st.getIdx(); 273 if (idx == 0) { 274 const TNode *pn = ((const TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 275 return pn->wid(); 276 } 277 return idx; // return the psuedo state word id 278 } 279 } 280 281 CThreadSlm::TState 282 CThreadSlm::history_state_of(TState st) 283 { 284 if (st.getLevel() >= m_N) { 285 TLeaf* pl = ((TLeaf *)m_Levels[m_N]) + st.getIdx(); 286 return TState(pl->bol(), pl->bon()); 287 } else { 288 TNode* pn = ((TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 289 if (pn->ch() == (pn+1)->ch()) 290 return TState(pn->bol(), pn->bon()); 291 else 292 return st; 293 } 294 } 295 296 CThreadSlm::TState& 297 CThreadSlm::historify(TState& st) 298 { 299 if (st.getLevel() >= m_N) { 300 TLeaf* pl = ((TLeaf *)m_Levels[m_N]) + st.getIdx(); 301 st.setLevel(pl->bol()); 302 st.setIdx(pl->bon()); 303 } else { 304 TNode* pn = ((TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 305 if (pn->ch() == (pn+1)->ch()) { 306 st.setLevel(pn->bol()); 307 st.setIdx(pn->bon()); 308 } 309 } 310 return st; 311 } 312
