1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include <config.h> 40 #endif 41 #include <algorithm> 42 #include "ic_history.h" 43 #include <stdint.h> 44 45 const unsigned int CICHistory::DCWID = (unsigned int)-1; 46 47 CICHistory::~CICHistory() 48 { 49 } 50 51 bool CICHistory::seenBefore(unsigned int wid) 52 { 53 return false; 54 } 55 56 bool CICHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid) 57 { 58 return true; 59 } 60 61 double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid) 62 { 63 return 0.0; 64 } 65 66 double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid) 67 { 68 return 0.0; 69 } 70 71 bool CICHistory::bufferize(void** buf_ptr, size_t* sz) 72 { 73 *buf_ptr = NULL; 74 *sz = 0; 75 return true; 76 } 77 78 bool CICHistory::loadFromBuffer(void* buf_ptr, size_t sz) 79 { 80 return true; 81 } 82 83 84 static bool bBigramHistoryInited = false; 85 const size_t CBigramHistory::contxt_memory_size = 8192; 86 std::set<unsigned int> CBigramHistory::s_stopWords; 87 88 /** 89 * Adding stop words including : 90 * ��� ��� ��� ��� ��� ��� ��� ��� ��� ��� ������ ������ ������ 91 * ��� 92 * ??��� ��� ��� 93 */ 94 95 void CBigramHistory::initClass() 96 { 97 if (bBigramHistoryInited == false) { 98 bBigramHistoryInited = true; 99 s_stopWords.clear(); 100 101 s_stopWords.insert(0); //unknown world 102 s_stopWords.insert(DCWID); //seperator word id used by history memory interanlly 103 104 s_stopWords.insert(67659); //��� 105 s_stopWords.insert(24261); //��� 106 s_stopWords.insert(37471); //��� 107 s_stopWords.insert(68920); //��� 108 s_stopWords.insert(5071); //��� 109 110 s_stopWords.insert(8396); //��� 111 s_stopWords.insert(40646); //��� 112 s_stopWords.insert(6755); //��� 113 s_stopWords.insert(28369); //��� 114 s_stopWords.insert(30143); //��� 115 s_stopWords.insert(8398); //������ 116 s_stopWords.insert(40648); //������ 117 s_stopWords.insert(6759); //������ 118 s_stopWords.insert(28370); //������ 119 s_stopWords.insert(30114); //������ 120 121 s_stopWords.insert(7121); //��� 122 } 123 } 124 125 //FIXME: CBigramHistory need to be thread safe 126 CBigramHistory::CBigramHistory() : m_memory(), m_unifreq(), m_bifreq() 127 { 128 129 } 130 131 CBigramHistory::~CBigramHistory() 132 { 133 } 134 135 bool CBigramHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid) 136 { 137 TBigram bigram(DCWID, DCWID); 138 139 // First , we insert an DC word id before the context history 140 // to seperated from previous stream. 141 if (m_memory.size() == contxt_memory_size) { 142 TBigram hb; 143 hb.first = m_memory.front(); 144 m_memory.pop_front(); 145 hb.second = m_memory.front(); 146 147 decUniFreq(hb.first); 148 decBiFreq(hb); 149 } 150 m_memory.push_back(DCWID); 151 152 //Now trying to memorize new stream and forget oldest 153 for (; its_wid != ite_wid; ++its_wid) { 154 if (m_memory.size() == contxt_memory_size) { 155 TBigram hb; 156 hb.first = m_memory.front(); 157 m_memory.pop_front(); 158 hb.second = m_memory.front(); 159 160 decUniFreq(hb.first); 161 decBiFreq(hb); 162 } 163 bigram.first = bigram.second; 164 bigram.second = *its_wid; 165 m_memory.push_back(*its_wid); 166 incUniFreq(bigram.second); 167 incBiFreq(bigram); 168 } 169 return true; 170 } 171 172 double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid) 173 { 174 TBigram bigram(DCWID, DCWID); 175 if (its_wid != ite_wid) { 176 --ite_wid; 177 bigram.second = *ite_wid; 178 if (its_wid != ite_wid) 179 bigram.first = *(ite_wid-1); 180 } 181 return pr(bigram); 182 } 183 184 double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid) 185 { 186 TBigram bigram(DCWID, DCWID); 187 if (its_wid != ite_wid) 188 bigram.first = *(ite_wid-1); 189 bigram.second = wid; 190 return pr(bigram); 191 } 192 193 inline uint16_t swap16(uint16_t x) 194 { 195 return ((x << 8) | ((x >> 8) & 0xff)); 196 } 197 198 inline uint32_t swap32(uint32_t x) 199 { 200 return ((swap16(x) << 16) | (swap16(x >> 16) & 0xffff)); 201 } 202 203 bool CBigramHistory::bufferize(void** buf_ptr, size_t* sz) 204 { 205 *buf_ptr = NULL; 206 *sz = 0; 207 try { 208 *sz = sizeof(uint32_t) * m_memory.size(); 209 if (*sz > 0) { 210 *buf_ptr = malloc(*sz); // malloc for C compatible 211 #ifdef WORDS_BIGENDIAN 212 std::copy(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr); 213 #else 214 std::transform(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr, swap32); 215 #endif 216 } 217 return true; 218 } catch (...) { 219 if (*buf_ptr) 220 free(*buf_ptr); 221 *buf_ptr = NULL; 222 *sz = 0; 223 } 224 return false; 225 } 226 227 bool CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz) 228 { 229 m_memory.clear(); 230 m_unifreq.clear(); 231 m_bifreq.clear(); 232 233 sz /= sizeof(uint32_t); 234 uint32_t *pw = (uint32_t *)buf_ptr; 235 236 if (pw && sz > 0) { 237 #ifndef WORDS_BIGENDIAN 238 std::transform(pw, pw+sz, pw, swap32); 239 #endif 240 TBigram bigram(DCWID, DCWID); 241 for (int i=0; i < sz; ++i) { 242 bigram.first = bigram.second; 243 bigram.second = *pw++; 244 m_memory.push_back(bigram.second); 245 incUniFreq(bigram.second); 246 incBiFreq(bigram); 247 } 248 } 249 return true; 250 } 251 252 double CBigramHistory::pr(TBigram& bigram) 253 { 254 int uf0 = uniFreq(bigram.first); 255 int bf = biFreq(bigram); 256 int uf1 = uniFreq(bigram.second); 257 double pr = 0.0; 258 pr += 0.68*double(bf)/double(uf0+0.5); 259 pr += 0.32*double(uf1)/double(m_memory.size() + (contxt_memory_size-m_memory.size())/10); 260 //if (pr != 0) printf("cache pr(%d|%d) = %lf\n", bigram.second, bigram.first, pr); 261 return pr; 262 } 263 264 int CBigramHistory::uniFreq(TUnigram& ug) 265 { 266 int freq = 0; 267 if (s_stopWords.find(ug) == s_stopWords.end()) { 268 TUnigramPool::iterator it = m_unifreq.find(ug); 269 if (it != m_unifreq.end()) { 270 freq = it->second; 271 } 272 } 273 //if (freq != 0) printf("uniFreq[%d]-->%d\n", ug, freq); 274 return freq; 275 } 276 277 int CBigramHistory::biFreq(TBigram& bg) 278 { 279 int freq = 0; 280 //std::set<unsigned int>::const_iterator ite = s_stopWords.end(); 281 if (bg.first != DCWID && bg.second != DCWID) { 282 TBigramPool::const_iterator it = m_bifreq.find(bg); 283 if (it != m_bifreq.end()) 284 freq = it->second; 285 } 286 287 //if (freq != 0) printf("biFreq[%d,%d]-->%d\n", bg.first, bg.second, freq); 288 return freq; 289 } 290 291 void CBigramHistory::decUniFreq(TUnigram& ug) 292 { 293 TUnigramPool::iterator it = m_unifreq.find(ug); 294 if (it != m_unifreq.end()) { 295 if (it->second > 1) 296 --(it->second); 297 else 298 m_unifreq.erase(it); 299 } 300 } 301 302 bool CBigramHistory::seenBefore(unsigned int wid) 303 { 304 return (wid != DCWID && s_stopWords.find(wid) == s_stopWords.end() && 305 m_unifreq.find(wid) != m_unifreq.end()); 306 } 307 308 void CBigramHistory::decBiFreq(TBigram& bg) 309 { 310 TBigramPool::iterator it = m_bifreq.find(bg); 311 if (it != m_bifreq.end()) { 312 if (it->second > 1) 313 --(it->second); 314 else 315 m_bifreq.erase(it); 316 } 317 } 318 319 void CBigramHistory::incUniFreq(TUnigram& ug) 320 { 321 ++m_unifreq[ug]; 322 //printf("Remebering uniFreq[%d]-->%d\n", ug, m_unifreq[ug]); 323 } 324 325 void CBigramHistory::incBiFreq(TBigram& bg) 326 { 327 ++m_bifreq[bg]; 328 //printf("Remebering biFreq[%d,%d]-->%d\n", bg.first, bg.second, m_bifreq[bg]); 329 } 330
