1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include <config.h> 40 0 yongsun #endif 41 198 tchaikov #include <algorithm> 42 0 yongsun #include "ic_history.h" 43 314 yongsun #include <stdint.h> 44 0 yongsun 45 0 yongsun const unsigned int CICHistory::DCWID = (unsigned int)-1; 46 0 yongsun 47 0 yongsun CICHistory::~CICHistory() 48 0 yongsun { 49 0 yongsun } 50 0 yongsun 51 0 yongsun bool CICHistory::seenBefore(unsigned int wid) 52 0 yongsun { 53 0 yongsun return false; 54 0 yongsun } 55 0 yongsun 56 0 yongsun bool CICHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid) 57 0 yongsun { 58 0 yongsun return true; 59 0 yongsun } 60 0 yongsun 61 0 yongsun double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid) 62 0 yongsun { 63 0 yongsun return 0.0; 64 0 yongsun } 65 0 yongsun 66 0 yongsun double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid) 67 0 yongsun { 68 0 yongsun return 0.0; 69 0 yongsun } 70 0 yongsun 71 0 yongsun bool CICHistory::bufferize(void** buf_ptr, size_t* sz) 72 0 yongsun { 73 0 yongsun *buf_ptr = NULL; 74 0 yongsun *sz = 0; 75 0 yongsun return true; 76 0 yongsun } 77 0 yongsun 78 0 yongsun bool CICHistory::loadFromBuffer(void* buf_ptr, size_t sz) 79 0 yongsun { 80 0 yongsun return true; 81 0 yongsun } 82 0 yongsun 83 0 yongsun 84 0 yongsun static bool bBigramHistoryInited = false; 85 0 yongsun const size_t CBigramHistory::contxt_memory_size = 8192; 86 0 yongsun std::set<unsigned int> CBigramHistory::s_stopWords; 87 0 yongsun 88 0 yongsun /** 89 0 yongsun * Adding stop words including : 90 0 yongsun * 91 0 yongsun * 92 0 yongsun * ?? 93 0 yongsun */ 94 0 yongsun 95 0 yongsun void CBigramHistory::initClass() 96 0 yongsun { 97 0 yongsun if (bBigramHistoryInited == false) { 98 0 yongsun bBigramHistoryInited = true; 99 0 yongsun s_stopWords.clear(); 100 0 yongsun 101 0 yongsun s_stopWords.insert(0); //unknown world 102 0 yongsun s_stopWords.insert(DCWID); //seperator word id used by history memory interanlly 103 0 yongsun 104 0 yongsun s_stopWords.insert(67659); // 105 0 yongsun s_stopWords.insert(24261); // 106 0 yongsun s_stopWords.insert(37471); // 107 0 yongsun s_stopWords.insert(68920); // 108 0 yongsun s_stopWords.insert(5071); // 109 0 yongsun 110 0 yongsun s_stopWords.insert(8396); // 111 0 yongsun s_stopWords.insert(40646); // 112 0 yongsun s_stopWords.insert(6755); // 113 0 yongsun s_stopWords.insert(28369); // 114 0 yongsun s_stopWords.insert(30143); // 115 0 yongsun s_stopWords.insert(8398); // 116 0 yongsun s_stopWords.insert(40648); // 117 0 yongsun s_stopWords.insert(6759); // 118 0 yongsun s_stopWords.insert(28370); // 119 0 yongsun s_stopWords.insert(30114); // 120 0 yongsun 121 0 yongsun s_stopWords.insert(7121); // 122 0 yongsun } 123 0 yongsun } 124 0 yongsun 125 0 yongsun //FIXME: CBigramHistory need to be thread safe 126 0 yongsun CBigramHistory::CBigramHistory() : m_memory(), m_unifreq(), m_bifreq() 127 0 yongsun { 128 0 yongsun 129 0 yongsun } 130 0 yongsun 131 0 yongsun CBigramHistory::~CBigramHistory() 132 0 yongsun { 133 0 yongsun } 134 0 yongsun 135 0 yongsun bool CBigramHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid) 136 0 yongsun { 137 0 yongsun TBigram bigram(DCWID, DCWID); 138 0 yongsun 139 0 yongsun // First , we insert an DC word id before the context history 140 0 yongsun // to seperated from previous stream. 141 0 yongsun if (m_memory.size() == contxt_memory_size) { 142 0 yongsun TBigram hb; 143 0 yongsun hb.first = m_memory.front(); 144 0 yongsun m_memory.pop_front(); 145 0 yongsun hb.second = m_memory.front(); 146 0 yongsun 147 0 yongsun decUniFreq(hb.first); 148 0 yongsun decBiFreq(hb); 149 0 yongsun } 150 0 yongsun m_memory.push_back(DCWID); 151 0 yongsun 152 0 yongsun //Now trying to memorize new stream and forget oldest 153 0 yongsun for (; its_wid != ite_wid; ++its_wid) { 154 0 yongsun if (m_memory.size() == contxt_memory_size) { 155 0 yongsun TBigram hb; 156 0 yongsun hb.first = m_memory.front(); 157 0 yongsun m_memory.pop_front(); 158 0 yongsun hb.second = m_memory.front(); 159 0 yongsun 160 0 yongsun decUniFreq(hb.first); 161 0 yongsun decBiFreq(hb); 162 0 yongsun } 163 0 yongsun bigram.first = bigram.second; 164 0 yongsun bigram.second = *its_wid; 165 0 yongsun m_memory.push_back(*its_wid); 166 0 yongsun incUniFreq(bigram.second); 167 0 yongsun incBiFreq(bigram); 168 0 yongsun } 169 0 yongsun return true; 170 0 yongsun } 171 0 yongsun 172 0 yongsun double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid) 173 0 yongsun { 174 0 yongsun TBigram bigram(DCWID, DCWID); 175 0 yongsun if (its_wid != ite_wid) { 176 0 yongsun --ite_wid; 177 0 yongsun bigram.second = *ite_wid; 178 0 yongsun if (its_wid != ite_wid) 179 0 yongsun bigram.first = *(ite_wid-1); 180 0 yongsun } 181 0 yongsun return pr(bigram); 182 0 yongsun } 183 0 yongsun 184 0 yongsun double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid) 185 0 yongsun { 186 0 yongsun TBigram bigram(DCWID, DCWID); 187 0 yongsun if (its_wid != ite_wid) 188 0 yongsun bigram.first = *(ite_wid-1); 189 0 yongsun bigram.second = wid; 190 0 yongsun return pr(bigram); 191 0 yongsun } 192 0 yongsun 193 198 tchaikov inline uint16_t swap16(uint16_t x) 194 0 yongsun { 195 0 yongsun return ((x << 8) | ((x >> 8) & 0xff)); 196 0 yongsun } 197 0 yongsun 198 198 tchaikov inline uint32_t swap32(uint32_t x) 199 0 yongsun { 200 0 yongsun return ((swap16(x) << 16) | (swap16(x >> 16) & 0xffff)); 201 0 yongsun } 202 0 yongsun 203 0 yongsun bool CBigramHistory::bufferize(void** buf_ptr, size_t* sz) 204 0 yongsun { 205 0 yongsun *buf_ptr = NULL; 206 0 yongsun *sz = 0; 207 0 yongsun try { 208 198 tchaikov *sz = sizeof(uint32_t) * m_memory.size(); 209 0 yongsun if (*sz > 0) { 210 0 yongsun *buf_ptr = malloc(*sz); // malloc for C compatible 211 198 tchaikov #ifdef WORDS_BIGENDIAN 212 198 tchaikov std::copy(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr); 213 198 tchaikov #else 214 198 tchaikov std::transform(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr, swap32); 215 198 tchaikov #endif 216 0 yongsun } 217 0 yongsun return true; 218 0 yongsun } catch (...) { 219 0 yongsun if (*buf_ptr) 220 0 yongsun free(*buf_ptr); 221 0 yongsun *buf_ptr = NULL; 222 0 yongsun *sz = 0; 223 0 yongsun } 224 0 yongsun return false; 225 0 yongsun } 226 0 yongsun 227 0 yongsun bool CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz) 228 0 yongsun { 229 0 yongsun m_memory.clear(); 230 0 yongsun m_unifreq.clear(); 231 0 yongsun m_bifreq.clear(); 232 0 yongsun 233 198 tchaikov sz /= sizeof(uint32_t); 234 198 tchaikov uint32_t *pw = (uint32_t *)buf_ptr; 235 0 yongsun 236 0 yongsun if (pw && sz > 0) { 237 198 tchaikov #ifndef WORDS_BIGENDIAN 238 198 tchaikov std::transform(pw, pw+sz, pw, swap32); 239 198 tchaikov #endif 240 0 yongsun TBigram bigram(DCWID, DCWID); 241 0 yongsun for (int i=0; i < sz; ++i) { 242 0 yongsun bigram.first = bigram.second; 243 0 yongsun bigram.second = *pw++; 244 0 yongsun m_memory.push_back(bigram.second); 245 0 yongsun incUniFreq(bigram.second); 246 0 yongsun incBiFreq(bigram); 247 0 yongsun } 248 0 yongsun } 249 0 yongsun return true; 250 0 yongsun } 251 0 yongsun 252 0 yongsun double CBigramHistory::pr(TBigram& bigram) 253 0 yongsun { 254 0 yongsun int uf0 = uniFreq(bigram.first); 255 0 yongsun int bf = biFreq(bigram); 256 0 yongsun int uf1 = uniFreq(bigram.second); 257 0 yongsun double pr = 0.0; 258 0 yongsun pr += 0.68*double(bf)/double(uf0+0.5); 259 0 yongsun pr += 0.32*double(uf1)/double(m_memory.size() + (contxt_memory_size-m_memory.size())/10); 260 0 yongsun //if (pr != 0) printf("cache pr(%d|%d) = %lf\n", bigram.second, bigram.first, pr); 261 0 yongsun return pr; 262 0 yongsun } 263 0 yongsun 264 0 yongsun int CBigramHistory::uniFreq(TUnigram& ug) 265 0 yongsun { 266 0 yongsun int freq = 0; 267 0 yongsun if (s_stopWords.find(ug) == s_stopWords.end()) { 268 0 yongsun TUnigramPool::iterator it = m_unifreq.find(ug); 269 0 yongsun if (it != m_unifreq.end()) { 270 0 yongsun freq = it->second; 271 0 yongsun } 272 0 yongsun } 273 0 yongsun //if (freq != 0) printf("uniFreq[%d]-->%d\n", ug, freq); 274 0 yongsun return freq; 275 0 yongsun } 276 0 yongsun 277 0 yongsun int CBigramHistory::biFreq(TBigram& bg) 278 0 yongsun { 279 0 yongsun int freq = 0; 280 0 yongsun //std::set<unsigned int>::const_iterator ite = s_stopWords.end(); 281 0 yongsun if (bg.first != DCWID && bg.second != DCWID) { 282 0 yongsun TBigramPool::const_iterator it = m_bifreq.find(bg); 283 0 yongsun if (it != m_bifreq.end()) 284 0 yongsun freq = it->second; 285 0 yongsun } 286 0 yongsun 287 0 yongsun //if (freq != 0) printf("biFreq[%d,%d]-->%d\n", bg.first, bg.second, freq); 288 0 yongsun return freq; 289 0 yongsun } 290 0 yongsun 291 0 yongsun void CBigramHistory::decUniFreq(TUnigram& ug) 292 0 yongsun { 293 0 yongsun TUnigramPool::iterator it = m_unifreq.find(ug); 294 0 yongsun if (it != m_unifreq.end()) { 295 0 yongsun if (it->second > 1) 296 0 yongsun --(it->second); 297 0 yongsun else 298 0 yongsun m_unifreq.erase(it); 299 0 yongsun } 300 0 yongsun } 301 0 yongsun 302 0 yongsun bool CBigramHistory::seenBefore(unsigned int wid) 303 0 yongsun { 304 0 yongsun return (wid != DCWID && s_stopWords.find(wid) == s_stopWords.end() && 305 0 yongsun m_unifreq.find(wid) != m_unifreq.end()); 306 0 yongsun } 307 0 yongsun 308 0 yongsun void CBigramHistory::decBiFreq(TBigram& bg) 309 0 yongsun { 310 0 yongsun TBigramPool::iterator it = m_bifreq.find(bg); 311 0 yongsun if (it != m_bifreq.end()) { 312 0 yongsun if (it->second > 1) 313 0 yongsun --(it->second); 314 0 yongsun else 315 0 yongsun m_bifreq.erase(it); 316 0 yongsun } 317 0 yongsun } 318 0 yongsun 319 0 yongsun void CBigramHistory::incUniFreq(TUnigram& ug) 320 0 yongsun { 321 0 yongsun ++m_unifreq[ug]; 322 0 yongsun //printf("Remebering uniFreq[%d]-->%d\n", ug, m_unifreq[ug]); 323 0 yongsun } 324 0 yongsun 325 0 yongsun void CBigramHistory::incBiFreq(TBigram& bg) 326 0 yongsun { 327 0 yongsun ++m_bifreq[bg]; 328 0 yongsun //printf("Remebering biFreq[%d,%d]-->%d\n", bg.first, bg.second, m_bifreq[bg]); 329 0 yongsun } 330