Home | History | Annotate | Download | only in src
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifdef HAVE_CONFIG_H
     39    0   yongsun #include <config.h>
     40    0   yongsun #endif
     41  198  tchaikov #include <algorithm>
     42    0   yongsun #include "ic_history.h"
     43  314   yongsun #include <stdint.h>
     44    0   yongsun 
     45    0   yongsun const unsigned int CICHistory::DCWID = (unsigned int)-1;
     46    0   yongsun 
     47    0   yongsun CICHistory::~CICHistory()
     48    0   yongsun {
     49    0   yongsun }
     50    0   yongsun 
     51    0   yongsun bool CICHistory::seenBefore(unsigned int wid)
     52    0   yongsun {
     53    0   yongsun     return false;
     54    0   yongsun }
     55    0   yongsun 
     56    0   yongsun bool CICHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid)
     57    0   yongsun {
     58    0   yongsun     return true;
     59    0   yongsun }
     60    0   yongsun 
     61    0   yongsun double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid)
     62    0   yongsun {
     63    0   yongsun     return 0.0;
     64    0   yongsun }
     65    0   yongsun 
     66    0   yongsun double CICHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid)
     67    0   yongsun {
     68    0   yongsun     return 0.0;
     69    0   yongsun }
     70    0   yongsun 
     71    0   yongsun bool CICHistory::bufferize(void** buf_ptr, size_t* sz)
     72    0   yongsun {
     73    0   yongsun     *buf_ptr = NULL;
     74    0   yongsun     *sz = 0;
     75    0   yongsun     return true;
     76    0   yongsun }
     77    0   yongsun 
     78    0   yongsun bool CICHistory::loadFromBuffer(void* buf_ptr, size_t sz)
     79    0   yongsun {
     80    0   yongsun     return true;
     81    0   yongsun }
     82    0   yongsun 
     83    0   yongsun 
     84    0   yongsun static bool bBigramHistoryInited = false;
     85    0   yongsun const size_t  CBigramHistory::contxt_memory_size = 8192;
     86    0   yongsun std::set<unsigned int>  CBigramHistory::s_stopWords;
     87    0   yongsun 
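/**
 * Stop words registered by CBigramHistory::initClass().
 *
 * NOTE: the list of words that originally appeared in this comment was lost
 * to a character-encoding problem (only "??" survived). The word ids
 * themselves are still listed in initClass().
 */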
     94    0   yongsun 
     95    0   yongsun void CBigramHistory::initClass()
     96    0   yongsun {
     97    0   yongsun     if (bBigramHistoryInited == false) {
     98    0   yongsun         bBigramHistoryInited = true;
     99    0   yongsun         s_stopWords.clear();
    100    0   yongsun 
    101    0   yongsun         s_stopWords.insert(0);     //unknown world
    102    0   yongsun         s_stopWords.insert(DCWID); //seperator word id used by history memory interanlly
    103    0   yongsun 
    104    0   yongsun         s_stopWords.insert(67659); //
    105    0   yongsun         s_stopWords.insert(24261); //
    106    0   yongsun         s_stopWords.insert(37471); //
    107    0   yongsun         s_stopWords.insert(68920); //
    108    0   yongsun         s_stopWords.insert(5071);  //
    109    0   yongsun 
    110    0   yongsun         s_stopWords.insert(8396);  //
    111    0   yongsun         s_stopWords.insert(40646); //
    112    0   yongsun         s_stopWords.insert(6755);  //
    113    0   yongsun         s_stopWords.insert(28369); //
    114    0   yongsun         s_stopWords.insert(30143); //
    115    0   yongsun         s_stopWords.insert(8398);  //
    116    0   yongsun         s_stopWords.insert(40648); //
    117    0   yongsun         s_stopWords.insert(6759);  //
    118    0   yongsun         s_stopWords.insert(28370); //
    119    0   yongsun         s_stopWords.insert(30114); //
    120    0   yongsun 
    121    0   yongsun         s_stopWords.insert(7121);  //
    122    0   yongsun     }
    123    0   yongsun }
    124    0   yongsun 
    125    0   yongsun //FIXME: CBigramHistory need to be thread safe
    126    0   yongsun CBigramHistory::CBigramHistory() : m_memory(), m_unifreq(), m_bifreq()
    127    0   yongsun {
    128    0   yongsun 
    129    0   yongsun }
    130    0   yongsun 
    131    0   yongsun CBigramHistory::~CBigramHistory()
    132    0   yongsun {
    133    0   yongsun }
    134    0   yongsun 
    135    0   yongsun bool CBigramHistory::memorize(unsigned int* its_wid, unsigned int* ite_wid)
    136    0   yongsun {
    137    0   yongsun     TBigram bigram(DCWID, DCWID);
    138    0   yongsun 
    139    0   yongsun     // First , we insert an DC word id before the context history
    140    0   yongsun     // to seperated from previous stream.
    141    0   yongsun     if (m_memory.size() == contxt_memory_size) {
    142    0   yongsun         TBigram hb;
    143    0   yongsun         hb.first = m_memory.front();
    144    0   yongsun         m_memory.pop_front();
    145    0   yongsun         hb.second = m_memory.front();
    146    0   yongsun 
    147    0   yongsun         decUniFreq(hb.first);
    148    0   yongsun         decBiFreq(hb);
    149    0   yongsun     }
    150    0   yongsun     m_memory.push_back(DCWID);
    151    0   yongsun 
    152    0   yongsun     //Now trying to memorize new stream and forget oldest
    153    0   yongsun     for (; its_wid != ite_wid; ++its_wid) {
    154    0   yongsun         if (m_memory.size() == contxt_memory_size) {
    155    0   yongsun             TBigram hb;
    156    0   yongsun             hb.first = m_memory.front();
    157    0   yongsun             m_memory.pop_front();
    158    0   yongsun             hb.second = m_memory.front();
    159    0   yongsun 
    160    0   yongsun             decUniFreq(hb.first);
    161    0   yongsun             decBiFreq(hb);
    162    0   yongsun         }
    163    0   yongsun         bigram.first = bigram.second;
    164    0   yongsun         bigram.second = *its_wid;
    165    0   yongsun         m_memory.push_back(*its_wid);
    166    0   yongsun         incUniFreq(bigram.second);
    167    0   yongsun         incBiFreq(bigram);
    168    0   yongsun     }
    169    0   yongsun     return true;
    170    0   yongsun }
    171    0   yongsun 
    172    0   yongsun double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid)
    173    0   yongsun {
    174    0   yongsun     TBigram bigram(DCWID, DCWID);
    175    0   yongsun     if (its_wid != ite_wid) {
    176    0   yongsun         --ite_wid;
    177    0   yongsun         bigram.second = *ite_wid;
    178    0   yongsun         if (its_wid != ite_wid)
    179    0   yongsun             bigram.first = *(ite_wid-1);
    180    0   yongsun     }
    181    0   yongsun     return pr(bigram);
    182    0   yongsun }
    183    0   yongsun 
    184    0   yongsun double CBigramHistory::pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid)
    185    0   yongsun {
    186    0   yongsun     TBigram bigram(DCWID, DCWID);
    187    0   yongsun     if (its_wid != ite_wid)
    188    0   yongsun         bigram.first = *(ite_wid-1);
    189    0   yongsun     bigram.second = wid;
    190    0   yongsun     return pr(bigram);
    191    0   yongsun }
    192    0   yongsun 
    193  198  tchaikov inline uint16_t swap16(uint16_t x)
    194    0   yongsun {
    195    0   yongsun     return ((x << 8) | ((x >> 8) & 0xff));
    196    0   yongsun }
    197    0   yongsun 
    198  198  tchaikov inline uint32_t swap32(uint32_t x)
    199    0   yongsun {
    200    0   yongsun     return ((swap16(x) << 16) | (swap16(x >> 16) & 0xffff));
    201    0   yongsun }
    202    0   yongsun 
    203    0   yongsun bool CBigramHistory::bufferize(void** buf_ptr, size_t* sz)
    204    0   yongsun {
    205    0   yongsun     *buf_ptr = NULL;
    206    0   yongsun     *sz = 0;
    207    0   yongsun     try {
    208  198  tchaikov         *sz = sizeof(uint32_t) * m_memory.size();
    209    0   yongsun         if (*sz > 0) {
    210    0   yongsun             *buf_ptr = malloc(*sz); // malloc for C compatible
    211  198  tchaikov #ifdef WORDS_BIGENDIAN
    212  198  tchaikov             std::copy(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr);
    213  198  tchaikov #else
    214  198  tchaikov             std::transform(m_memory.begin(), m_memory.end(), (uint32_t*)*buf_ptr, swap32);
    215  198  tchaikov #endif
    216    0   yongsun         }
    217    0   yongsun         return true;
    218    0   yongsun     } catch (...) {
    219    0   yongsun         if (*buf_ptr)
    220    0   yongsun             free(*buf_ptr);
    221    0   yongsun         *buf_ptr = NULL;
    222    0   yongsun         *sz = 0;
    223    0   yongsun     }
    224    0   yongsun     return false;
    225    0   yongsun }
    226    0   yongsun 
    227    0   yongsun bool CBigramHistory::loadFromBuffer(void* buf_ptr, size_t sz)
    228    0   yongsun {
    229    0   yongsun     m_memory.clear();
    230    0   yongsun     m_unifreq.clear();
    231    0   yongsun     m_bifreq.clear();
    232    0   yongsun 
    233  198  tchaikov     sz /= sizeof(uint32_t);
    234  198  tchaikov     uint32_t *pw = (uint32_t *)buf_ptr;
    235    0   yongsun 
    236    0   yongsun     if (pw && sz > 0) {
    237  198  tchaikov #ifndef WORDS_BIGENDIAN
    238  198  tchaikov         std::transform(pw, pw+sz, pw, swap32);
    239  198  tchaikov #endif
    240    0   yongsun         TBigram bigram(DCWID, DCWID);
    241    0   yongsun         for (int i=0; i < sz; ++i) {
    242    0   yongsun             bigram.first = bigram.second;
    243    0   yongsun             bigram.second = *pw++;
    244    0   yongsun             m_memory.push_back(bigram.second);
    245    0   yongsun             incUniFreq(bigram.second);
    246    0   yongsun             incBiFreq(bigram);
    247    0   yongsun         }
    248    0   yongsun     }
    249    0   yongsun     return true;
    250    0   yongsun }
    251    0   yongsun 
    252    0   yongsun double CBigramHistory::pr(TBigram& bigram)
    253    0   yongsun {
    254    0   yongsun     int uf0 = uniFreq(bigram.first);
    255    0   yongsun     int bf = biFreq(bigram);
    256    0   yongsun     int uf1 = uniFreq(bigram.second);
    257    0   yongsun     double pr = 0.0;
    258    0   yongsun     pr += 0.68*double(bf)/double(uf0+0.5);
    259    0   yongsun     pr += 0.32*double(uf1)/double(m_memory.size() + (contxt_memory_size-m_memory.size())/10);
    260    0   yongsun     //if (pr != 0) printf("cache pr(%d|%d) = %lf\n", bigram.second, bigram.first, pr);
    261    0   yongsun     return pr;
    262    0   yongsun }
    263    0   yongsun 
    264    0   yongsun int  CBigramHistory::uniFreq(TUnigram& ug)
    265    0   yongsun {
    266    0   yongsun     int freq = 0;
    267    0   yongsun     if (s_stopWords.find(ug) == s_stopWords.end()) {
    268    0   yongsun         TUnigramPool::iterator it = m_unifreq.find(ug);
    269    0   yongsun         if (it != m_unifreq.end()) {
    270    0   yongsun             freq = it->second;
    271    0   yongsun         }
    272    0   yongsun     }
    273    0   yongsun     //if (freq != 0) printf("uniFreq[%d]-->%d\n", ug, freq);
    274    0   yongsun     return freq;
    275    0   yongsun }
    276    0   yongsun 
    277    0   yongsun int  CBigramHistory::biFreq(TBigram& bg)
    278    0   yongsun {
    279    0   yongsun     int freq = 0;
    280    0   yongsun     //std::set<unsigned int>::const_iterator ite = s_stopWords.end();
    281    0   yongsun     if (bg.first != DCWID && bg.second != DCWID) {
    282    0   yongsun         TBigramPool::const_iterator it = m_bifreq.find(bg);
    283    0   yongsun         if (it != m_bifreq.end())
    284    0   yongsun             freq =  it->second;
    285    0   yongsun     }
    286    0   yongsun 
    287    0   yongsun     //if (freq != 0) printf("biFreq[%d,%d]-->%d\n", bg.first, bg.second, freq);
    288    0   yongsun     return freq;
    289    0   yongsun }
    290    0   yongsun 
    291    0   yongsun void CBigramHistory::decUniFreq(TUnigram& ug)
    292    0   yongsun {
    293    0   yongsun     TUnigramPool::iterator it = m_unifreq.find(ug);
    294    0   yongsun     if (it != m_unifreq.end()) {
    295    0   yongsun         if (it->second > 1)
    296    0   yongsun             --(it->second);
    297    0   yongsun         else
    298    0   yongsun             m_unifreq.erase(it);
    299    0   yongsun     }
    300    0   yongsun }
    301    0   yongsun 
    302    0   yongsun bool CBigramHistory::seenBefore(unsigned int wid)
    303    0   yongsun {
    304    0   yongsun     return (wid != DCWID && s_stopWords.find(wid) == s_stopWords.end() &&
    305    0   yongsun             m_unifreq.find(wid) != m_unifreq.end());
    306    0   yongsun }
    307    0   yongsun 
    308    0   yongsun void CBigramHistory::decBiFreq(TBigram& bg)
    309    0   yongsun {
    310    0   yongsun     TBigramPool::iterator it = m_bifreq.find(bg);
    311    0   yongsun     if (it != m_bifreq.end()) {
    312    0   yongsun         if (it->second > 1)
    313    0   yongsun             --(it->second);
    314    0   yongsun         else
    315    0   yongsun             m_bifreq.erase(it);
    316    0   yongsun     }
    317    0   yongsun }
    318    0   yongsun 
    319    0   yongsun void CBigramHistory::incUniFreq(TUnigram& ug)
    320    0   yongsun {
    321    0   yongsun     ++m_unifreq[ug];
    322    0   yongsun     //printf("Remebering uniFreq[%d]-->%d\n", ug, m_unifreq[ug]);
    323    0   yongsun }
    324    0   yongsun 
    325    0   yongsun void CBigramHistory::incBiFreq(TBigram& bg)
    326    0   yongsun {
    327    0   yongsun     ++m_bifreq[bg];
    328    0   yongsun     //printf("Remebering biFreq[%d,%d]-->%d\n", bg.first, bg.second, m_bifreq[bg]);
    329    0   yongsun }
    330