1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include <config.h> 40 0 yongsun #endif 41 0 yongsun 42 0 yongsun #include <unistd.h> 43 0 yongsun #include <fcntl.h> 44 0 yongsun #include <sys/types.h> 45 0 yongsun #include <sys/stat.h> 46 0 yongsun #include <math.h> 47 0 yongsun 48 0 yongsun #include "slm.h" 49 90 tonylee 50 90 tonylee #ifdef HAVE_SYS_MMAN_H 51 90 tonylee #include <sys/mman.h> 52 90 tonylee #elif defined(BEOS_OS) 53 90 tonylee #include <be/kernel/OS.h> 54 90 tonylee #endif 55 0 yongsun 56 0 yongsun bool 57 0 yongsun CThreadSlm::load(const char* fname, bool MMap) 58 0 yongsun { 59 0 yongsun int fd = open(fname, O_RDONLY); 60 0 yongsun m_bufSize = lseek(fd, 0, SEEK_END); 61 0 yongsun lseek(fd, 0, SEEK_SET); 62 0 yongsun 63 0 yongsun m_bMMap = MMap; 64 0 yongsun if (m_bMMap) { 65 90 tonylee #ifdef HAVE_SYS_MMAN_H 66 0 yongsun void* p = mmap(NULL, m_bufSize, PROT_READ, MAP_SHARED, fd, 0); 67 0 yongsun if (p == MAP_FAILED) { 68 0 yongsun close(fd); 69 0 yongsun return false; 70 0 yongsun } 71 0 yongsun m_buf = (char *)p; 72 90 tonylee #elif defined(BEOS_OS) 73 90 tonylee char *p = NULL; 74 90 tonylee area_id area = create_area("tmp", (void**)&p, B_ANY_ADDRESS, 75 90 tonylee (m_bufSize + (B_PAGE_SIZE - 1)) & ~(B_PAGE_SIZE - 1), 76 90 tonylee B_NO_LOCK, B_READ_AREA | B_WRITE_AREA); 77 90 tonylee if (area < 0) { 78 90 tonylee close(fd); 79 90 tonylee return false; 80 90 tonylee } 81 90 tonylee m_buf = p; 82 90 tonylee 83 90 tonylee for (ssize_t len = m_bufSize; len > 0; ) { 84 90 tonylee ssize_t n = read(fd, p, len); 85 90 tonylee if (n < 0) break; 86 90 tonylee p += n; 87 90 tonylee len -= n; 88 90 tonylee } 89 90 tonylee #else // Other OS 90 90 tonylee #error "No implementation for mmap()" 91 90 tonylee #endif // HAVE_SYS_MMAN_H 92 0 yongsun } else { 93 0 yongsun if ((m_buf = new char[m_bufSize]) == NULL) { 94 0 yongsun close(fd); 95 0 yongsun return false; 96 0 yongsun } 97 0 yongsun if (read(fd, m_buf, m_bufSize) != m_bufSize) { 98 0 yongsun delete [] m_buf; m_buf = NULL; 99 0 yongsun close(fd); 100 0 yongsun return false; 101 0 yongsun } 102 0 yongsun } 103 0 yongsun close(fd); 104 0 yongsun 105 0 yongsun m_N = *(unsigned*)m_buf; 106 0 yongsun m_UseLogPr = *(((unsigned*)m_buf)+1); 107 0 yongsun m_LevelSizes = ((unsigned*)m_buf)+2; 108 0 yongsun m_prTable = (float*)(m_buf + 2*sizeof(unsigned) + (m_N+1)*sizeof(unsigned)); 109 0 yongsun m_bowTable = m_prTable + (1 << BITS_PR); 110 0 yongsun 111 0 yongsun TNode* pn = (TNode*)(m_bowTable + (1 << BITS_BOW)); 112 0 yongsun 113 0 yongsun //Solaris CC would cause error in runtime if using some thing like 114 0 yongsun //following even using (void**) conversion. So add PtrVoid definition 115 0 yongsun //m_Levels = new (void*) [m_N + 1]; 116 0 yongsun m_Levels = new PtrVoid[m_N+1]; 117 0 yongsun 118 186 tchaikov for (unsigned lvl = 0; lvl <= m_N; ++lvl) { 119 0 yongsun m_Levels[lvl] = (void*)pn; 120 0 yongsun pn += m_LevelSizes[lvl]; 121 0 yongsun } 122 0 yongsun return true; 123 0 yongsun } 124 0 yongsun 125 0 yongsun void 126 0 yongsun CThreadSlm::free() 127 0 yongsun { 128 186 tchaikov delete [] m_Levels; 129 0 yongsun if (m_buf) { 130 0 yongsun if (m_bMMap) { 131 90 tonylee #ifdef HAVE_SYS_MMAN_H 132 0 yongsun munmap(m_buf, m_bufSize); 133 90 tonylee #elif defined(BEOS_OS) 134 90 tonylee delete_area(area_for(m_buf)); 135 90 tonylee #else // Other OS 136 90 tonylee #error "No implementation for munmap()" 137 90 tonylee #endif // HAVE_SYS_MMAN_H 138 0 yongsun } else { 139 0 yongsun delete [] m_buf; 140 0 yongsun } 141 0 yongsun } 142 0 yongsun m_buf = NULL; 143 0 yongsun m_Levels = NULL; 144 0 yongsun } 145 0 yongsun 146 0 yongsun template<class NodeT> 147 0 yongsun unsigned int 148 0 yongsun find_id(NodeT* base, unsigned int h, unsigned int t, unsigned int id) 149 0 yongsun { 150 0 yongsun unsigned int tail = t; 151 0 yongsun while (h < t) { 152 0 yongsun int m = (h+t)/2; 153 0 yongsun NodeT* pm = base+m; 154 0 yongsun unsigned int thisId = pm->wid(); 155 0 yongsun if (thisId < id) 156 0 yongsun h = m+1; 157 0 yongsun else if (thisId > id) 158 0 yongsun t = m; 159 0 yongsun else 160 0 yongsun return m; 161 0 yongsun } 162 0 yongsun return tail; 163 0 yongsun } 164 0 yongsun 165 0 yongsun /** 166 0 yongsun * return value as the model suggested. The history state must be historified 167 0 yongsun * or the history's level should be 0. when level == 0 but idx != 0, the 168 0 yongsun * history is a psuedo unigram state used for this model to combine another 169 0 yongsun * bigram cache language model 170 0 yongsun */ 171 0 yongsun double 172 0 yongsun CThreadSlm::rawTransfer(TState history, unsigned int wid, TState& result) 173 0 yongsun { 174 0 yongsun unsigned int lvl = history.getLevel(); 175 0 yongsun unsigned int pos = history.getIdx(); 176 0 yongsun 177 0 yongsun double cost = (m_UseLogPr)?0.0:1.0; 178 0 yongsun 179 0 yongsun // NON_Word id must be dealed with special, let it transfer to root 180 0 yongsun // without any cost 181 0 yongsun if (ID_NOT_WORD == wid) { 182 0 yongsun result = 0; 183 0 yongsun return cost; 184 0 yongsun } 185 0 yongsun 186 0 yongsun while (true) { 187 0 yongsun //for psuedo cache model unigram state 188 0 yongsun TNode* pn = ((TNode *)m_Levels[lvl]) + ((lvl)?pos:0); 189 0 yongsun 190 0 yongsun unsigned int t = (pn+1)->ch(); 191 0 yongsun 192 0 yongsun if (lvl < m_N-1) { 193 0 yongsun TNode* pBase =(TNode*)m_Levels[lvl+1]; 194 0 yongsun unsigned int idx = find_id(pBase, pn->ch(), t, wid); 195 0 yongsun if (idx != t) { 196 0 yongsun result.setIdx(idx); 197 0 yongsun result.setLevel(lvl+1); 198 0 yongsun double pr = m_prTable[pBase[idx].pr()]; 199 0 yongsun return (m_UseLogPr)?(cost+pr):(cost*pr); 200 0 yongsun } 201 0 yongsun 202 0 yongsun } else { 203 0 yongsun TLeaf* pBase =(TLeaf*)m_Levels[lvl+1]; 204 0 yongsun unsigned int idx = find_id(pBase, pn->ch(), t, wid); 205 0 yongsun if (idx != t) { 206 0 yongsun result.setIdx(idx); 207 0 yongsun result.setLevel(lvl+1); 208 0 yongsun double pr = m_prTable[pBase[idx].pr()]; 209 0 yongsun return (m_UseLogPr)?(cost+pr):(cost*pr); 210 0 yongsun } 211 0 yongsun 212 0 yongsun } 213 0 yongsun 214 0 yongsun if (m_UseLogPr) 215 0 yongsun cost += m_bowTable[pn->bow()]; 216 0 yongsun else 217 0 yongsun cost *= m_bowTable[pn->bow()]; 218 0 yongsun if (lvl == 0) 219 0 yongsun break; 220 0 yongsun lvl = pn->bol(); 221 0 yongsun pos = pn->bon(); 222 0 yongsun } 223 0 yongsun result.setLevel(0); 224 0 yongsun result.setIdx(0); 225 0 yongsun if (m_UseLogPr) 226 0 yongsun return cost + m_prTable[((TNode *)m_Levels[0])->pr()]; 227 0 yongsun else 228 0 yongsun return cost * m_prTable[((TNode *)m_Levels[0])->pr()]; 229 0 yongsun } 230 0 yongsun 231 0 yongsun double 232 0 yongsun CThreadSlm::transferNegLog(TState history, unsigned int wid, TState& result) 233 0 yongsun { 234 0 yongsun double cost = rawTransfer(history, wid, result); 235 0 yongsun if (m_UseLogPr) 236 0 yongsun return cost; 237 0 yongsun else 238 0 yongsun return -log(cost); 239 0 yongsun } 240 0 yongsun 241 0 yongsun double 242 0 yongsun CThreadSlm::transfer(TState history, unsigned int wid, TState& result) 243 0 yongsun { 244 0 yongsun double cost = rawTransfer(history, wid, result); 245 0 yongsun if (!m_UseLogPr) 246 0 yongsun return cost; 247 0 yongsun else 248 0 yongsun return exp(-cost); 249 0 yongsun } 250 0 yongsun 251 0 yongsun unsigned int 252 0 yongsun CThreadSlm::lastWordId(TState st) 253 0 yongsun { 254 0 yongsun unsigned int lvl = st.getLevel(); 255 0 yongsun if (lvl >= m_N) { 256 0 yongsun const TLeaf* pn = ((const TLeaf *)m_Levels[m_N]) + st.getIdx(); 257 0 yongsun return pn->wid(); 258 0 yongsun } else if (lvl > 0) { 259 0 yongsun const TNode *pn = ((const TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 260 0 yongsun return pn->wid(); 261 0 yongsun } else { 262 0 yongsun unsigned int idx = st.getIdx(); 263 0 yongsun if (idx == 0) { 264 0 yongsun const TNode *pn = ((const TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 265 0 yongsun return pn->wid(); 266 0 yongsun } 267 0 yongsun return idx; // return the psuedo state word id 268 0 yongsun } 269 0 yongsun } 270 0 yongsun 271 0 yongsun CThreadSlm::TState 272 0 yongsun CThreadSlm::history_state_of(TState st) 273 0 yongsun { 274 0 yongsun if (st.getLevel() >= m_N) { 275 0 yongsun TLeaf* pl = ((TLeaf *)m_Levels[m_N]) + st.getIdx(); 276 0 yongsun return TState(pl->bol(), pl->bon()); 277 0 yongsun } else { 278 0 yongsun TNode* pn = ((TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 279 0 yongsun if (pn->ch() == (pn+1)->ch()) 280 0 yongsun return TState(pn->bol(), pn->bon()); 281 0 yongsun else 282 0 yongsun return st; 283 0 yongsun } 284 0 yongsun } 285 0 yongsun 286 0 yongsun CThreadSlm::TState& 287 0 yongsun CThreadSlm::historify(TState& st) 288 0 yongsun { 289 0 yongsun if (st.getLevel() >= m_N) { 290 0 yongsun TLeaf* pl = ((TLeaf *)m_Levels[m_N]) + st.getIdx(); 291 0 yongsun st.setLevel(pl->bol()); 292 0 yongsun st.setIdx(pl->bon()); 293 0 yongsun } else { 294 0 yongsun TNode* pn = ((TNode *)m_Levels[st.getLevel()]) + st.getIdx(); 295 0 yongsun if (pn->ch() == (pn+1)->ch()) { 296 0 yongsun st.setLevel(pn->bol()); 297 0 yongsun st.setIdx(pn->bon()); 298 0 yongsun } 299 0 yongsun } 300 0 yongsun return st; 301 0 yongsun } 302