OpenGrok

Cross Reference: ic_history.cpp
xref: /nv-g11n/inputmethod/sunpinyin/ime/src/ic_history.cpp
Home | History | Annotate | Line # | Download | only in src
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifdef HAVE_CONFIG_H
     39 #include <config.h>
     40 #endif
     41 #include <algorithm>
     42 #include "ic_history.h"
     43 #include <stdint.h>
     44 
     45 const unsigned int CICHistory::DCWID = (unsigned int)-1;
     46 
     47 CICHistory::~CICHistory()
     48 {
     49 }
     50 
     51 bool CICHistory::seenBefore(unsigned int wid)
     52 {
     53     return false;
     54 }
     55 
     56 bool CICHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid)
     57 {
     58     return true;
     59 }
     60 
     61 double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid)
     62 {
     63     return 0.0;
     64 }
     65 
     66 double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid)
     67 {
     68     return 0.0;
     69 }
     70 
     71 bool CICHistory::bufferize(void** buf_ptr, size_t* sz)
     72 {
     73     *buf_ptr = NULL;
     74     *sz = 0;
     75     return true;
     76 }
     77 
     78 bool CICHistory::loadFromBuffer(void* buf_ptr, size_t sz)
     79 {
     80     return true;
     81 }
     82 
     83 
     84 static bool bBigramHistoryInited = false;
     85 const size_t  CBigramHistory::contxt_memory_size = 8192;
     86 std::set<unsigned int>  CBigramHistory::s_stopWords;
     87 
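/**
 * Adds the stop words.
 *
 * NOTE(review): the Chinese words this comment originally listed were
 * destroyed by a character-encoding error in this copy of the file. What
 * survives is only their shape: ten one-character words followed by three
 * two-character words, one further one-character word, and finally three
 * one-character words prefixed with "??". Restore the actual words from a
 * correctly encoded copy of the source.
 */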
     94 
     95 void CBigramHistory::initClass()
     96 {
     97     if (bBigramHistoryInited == false) {
     98         bBigramHistoryInited = true;
     99         s_stopWords.clear();
    100 
    101         s_stopWords.insert(0);     //unknown world
    102         s_stopWords.insert(DCWID); //seperator word id used by history memory interanlly
    103 
    104         s_stopWords.insert(67659); //���
    105         s_stopWords.insert(24261); //���
    106         s_stopWords.insert(37471); //���
    107         s_stopWords.insert(68920); //���
    108         s_stopWords.insert(5071);  //���
    109 
    110         s_stopWords.insert(8396);  //���
    111         s_stopWords.insert(40646); //���
    112         s_stopWords.insert(6755);  //���
    113         s_stopWords.insert(28369); //���
    114         s_stopWords.insert(30143); //���
    115         s_stopWords.insert(8398);  //������
    116         s_stopWords.insert(40648); //������
    117         s_stopWords.insert(6759);  //������
    118         s_stopWords.insert(28370); //������
    119         s_stopWords.insert(30114); //������
    120 
    121         s_stopWords.insert(7121);  //���
    122     }
    123 }
    124 
    125 //FIXME: CBigramHistory need to be thread safe
    126 CBigramHistory::CBigramHistory() : m_memory(), m_unifreq(), m_bifreq()
    127 {
    128 
    129 }
    130 
    131 CBigramHistory::~CBigramHistory()
    132 {
    133 }
    134 
    135 bool CBigramHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid)
    136 {
    137     TBigram bigram(DCWID, DCWID);
    138 
    139     // First , we insert an DC word id before the context history
    140     // to seperated from previous stream.
    141     if (m_memory.size() == contxt_memory_size) {
    142         TBigram hb;
    143         hb.first = m_memory.front();
    144         m_memory.pop_front();
    145         hb.second = m_memory.front();
    146 
    147         decUniFreq(hb.first);
    148         decBiFreq(hb);
    149     }
    150     m_memory.push_back(DCWID);
    151 
    152     //Now trying to memorize new stream and forget oldest
    153     for (; its_wid != ite_wid; ++its_wid) {
    154         if (m_memory.size() == contxt_memory_size) {
    155             TBigram hb;
    156             hb.first = m_memory.front();
    157             m_memory.pop_front();
    158             hb.second = m_memory.front();
    159 
    160             decUniFreq(hb.first);
    161             decBiFreq(hb);
    162         }
    163         bigram.first = bigram.second;
    164         bigram.second = *its_wid;
    165         m_memory.push_back(*its_wid);
    166         incUniFreq(bigram.second);
    167         incBiFreq(bigram);
    168     }
    169     return true;
    170 }
    171 
    172 double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid)
    173 {
    174     TBigram bigram(DCWID, DCWID);
    175     if (its_wid != ite_wid) {
    176         --ite_wid;
    177         bigram.second = *ite_wid;
    178         if (its_wid != ite_wid)
    179             bigram.first = *(ite_wid-1);
    180     }
    181     return pr(bigram);
    182 }
    183 
    184 double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid)
    185 {
    186     TBigram bigram(DCWID, DCWID);
    187     if (its_wid != ite_wid)
    188         bigram.first = *(ite_wid-1);
    189     bigram.second = wid;
    190     return pr(bigram);
    191 }
    192 
/**
 * Reverses the byte order of a 16-bit value.
 *
 * @param x value to convert
 * @return x with its two bytes exchanged
 */
inline uint16_t swap16(uint16_t x)
{
    // x is promoted to int here, but both operands stay within 16 bits, so
    // nothing can reach the sign bit.
    return static_cast<uint16_t>(((x & 0xffu) << 8) | ((x >> 8) & 0xffu));
}

/**
 * Reverses the byte order of a 32-bit value.
 *
 * @param x value to convert
 * @return x with its four bytes in reverse order
 */
inline uint32_t swap32(uint32_t x)
{
    // Widen the swapped halves to uint32_t before shifting: shifting the
    // uint16_t result directly would promote it to (signed) int and move a
    // set bit into the sign position.
    const uint32_t swappedLow  = swap16(static_cast<uint16_t>(x & 0xffffu));
    const uint32_t swappedHigh = swap16(static_cast<uint16_t>(x >> 16));
    return (swappedLow << 16) | swappedHigh;
}
    202 
    203 bool CBigramHistory::bufferize(void** buf_ptr, size_t* sz)
    204 {
    205     *buf_ptr = NULL;
    206     *sz = 0;
    207     try {
    208         *sz = sizeof(uint32_t) * m_memory.size();
    209         if (*sz > 0) {
    210             *buf_ptr = malloc(*sz); // malloc for C compatible
    211 #ifdef WORDS_BIGENDIAN
    212             std::copy(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr);
    213 #else
    214             std::transform(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr, swap32);
    215 #endif
    216         }
    217         return true;
    218     } catch (...) {
    219         if (*buf_ptr)
    220             free(*buf_ptr);
    221         *buf_ptr = NULL;
    222         *sz = 0;
    223     }
    224     return false;
    225 }
    226 
    227 bool CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz)
    228 {
    229     m_memory.clear();
    230     m_unifreq.clear();
    231     m_bifreq.clear();
    232 
    233     sz /= sizeof(uint32_t);
    234     uint32_t *pw = (uint32_t *)buf_ptr;
    235 
    236     if (pw && sz > 0) {
    237 #ifndef WORDS_BIGENDIAN
    238         std::transform(pw, pw+sz, pw, swap32);
    239 #endif
    240         TBigram bigram(DCWID, DCWID);
    241         for (int i=0; i < sz; ++i) {
    242             bigram.first = bigram.second;
    243             bigram.second = *pw++;
    244             m_memory.push_back(bigram.second);
    245             incUniFreq(bigram.second);
    246             incBiFreq(bigram);
    247         }
    248     }
    249     return true;
    250 }
    251 
    252 double CBigramHistory::pr(TBigram& bigram)
    253 {
    254     int uf0 = uniFreq(bigram.first);
    255     int bf = biFreq(bigram);
    256     int uf1 = uniFreq(bigram.second);
    257     double pr = 0.0;
    258     pr += 0.68*double(bf)/double(uf0+0.5);
    259     pr += 0.32*double(uf1)/double(m_memory.size() + (contxt_memory_size-m_memory.size())/10);
    260     //if (pr != 0) printf("cache pr(%d|%d) = %lf\n", bigram.second, bigram.first, pr);
    261     return pr;
    262 }
    263 
    264 int  CBigramHistory::uniFreq(TUnigram& ug)
    265 {
    266     int freq = 0;
    267     if (s_stopWords.find(ug) == s_stopWords.end()) {
    268         TUnigramPool::iterator it = m_unifreq.find(ug);
    269         if (it != m_unifreq.end()) {
    270             freq = it->second;
    271         }
    272     }
    273     //if (freq != 0) printf("uniFreq[%d]-->%d\n", ug, freq);
    274     return freq;
    275 }
    276 
    277 int  CBigramHistory::biFreq(TBigram& bg)
    278 {
    279     int freq = 0;
    280     //std::set<unsigned int>::const_iterator ite = s_stopWords.end();
    281     if (bg.first != DCWID && bg.second != DCWID) {
    282         TBigramPool::const_iterator it = m_bifreq.find(bg);
    283         if (it != m_bifreq.end())
    284             freq =  it->second;
    285     }
    286 
    287     //if (freq != 0) printf("biFreq[%d,%d]-->%d\n", bg.first, bg.second, freq);
    288     return freq;
    289 }
    290 
    291 void CBigramHistory::decUniFreq(TUnigram& ug)
    292 {
    293     TUnigramPool::iterator it = m_unifreq.find(ug);
    294     if (it != m_unifreq.end()) {
    295         if (it->second > 1)
    296             --(it->second);
    297         else
    298             m_unifreq.erase(it);
    299     }
    300 }
    301 
    302 bool CBigramHistory::seenBefore(unsigned int wid)
    303 {
    304     return (wid != DCWID && s_stopWords.find(wid) == s_stopWords.end() &&
    305             m_unifreq.find(wid) != m_unifreq.end());
    306 }
    307 
    308 void CBigramHistory::decBiFreq(TBigram& bg)
    309 {
    310     TBigramPool::iterator it = m_bifreq.find(bg);
    311     if (it != m_bifreq.end()) {
    312         if (it->second > 1)
    313             --(it->second);
    314         else
    315             m_bifreq.erase(it);
    316     }
    317 }
    318 
    319 void CBigramHistory::incUniFreq(TUnigram& ug)
    320 {
    321     ++m_unifreq[ug];
    322     //printf("Remebering uniFreq[%d]-->%d\n", ug, m_unifreq[ug]);
    323 }
    324 
    325 void CBigramHistory::incBiFreq(TBigram& bg)
    326 {
    327     ++m_bifreq[bg];
    328     //printf("Remebering biFreq[%d,%d]-->%d\n", bg.first, bg.second, m_bifreq[bg]);
    329 }
    330