Home | History | Annotate | Download | only in src
      1   0  yongsun /*
      2  82  yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  82  yongsun  *
      4  82  yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  82  yongsun  *
      6  82  yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7  82  yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  82  yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  82  yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  82  yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  82  yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  82  yongsun  * specific language governing permissions and limitations under the License. When
     13  82  yongsun  * distributing the software, include this License Header Notice in each file and
     14  82  yongsun  * include the full text of the License in the License file as well as the
     15  82  yongsun  * following notice:
     16  82  yongsun  *
     17  82  yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  82  yongsun  * (CDDL)
     19  82  yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20  82  yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21  82  yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  82  yongsun  * the Federal Courts of the Northern District of California and the state courts
     23  82  yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24  82  yongsun  *
     25  82  yongsun  * Contributor(s):
     26  82  yongsun  *
     27  82  yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28  82  yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  82  yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  82  yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31  82  yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32  82  yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  82  yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  82  yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35  82  yongsun  * to such option by the copyright holder.
     36   0  yongsun  */
     37  82  yongsun 
     38   0  yongsun #ifndef _SUNPINYIN_CONTEXT_HISTORY_H
     39   0  yongsun #define _SUNPINYIN_CONTEXT_HISTORY_H
     40   0  yongsun 
     41   0  yongsun #include "portability.h"
     42   0  yongsun 
     43   0  yongsun #include <map>
     44   0  yongsun #include <deque>
     45   0  yongsun #include <set>
     46   0  yongsun 
     47   0  yongsun /**
     48   0  yongsun * A forget all history memory
     49   0  yongsun */
     50   0  yongsun class CICHistory {
     51   0  yongsun public:
     52   0  yongsun     /** don't care word id, or seperator word id */
     53   0  yongsun     static const unsigned int DCWID;
     54   0  yongsun 
     55   0  yongsun     virtual ~CICHistory();
     56   0  yongsun 
     57   0  yongsun     virtual bool seenBefore(unsigned int wid);
     58   0  yongsun 
     59   0  yongsun     /**
     60   0  yongsun     * memorize the context stream pointed by [its_wid, ite_wid)
     61   0  yongsun     */
     62   0  yongsun     virtual bool memorize(unsigned int* its_wid, unsigned int* ite_wid);
     63   0  yongsun 
     64   0  yongsun     /**
     65   0  yongsun     * @param its_wid is the first word pointer of the context stream
     66   0  yongsun     * @param ite_wid is the last (exclusive) word pointer of the context stream
     67   0  yongsun     * @return pr(*(ite_wid-1) | *its_wid, ..., *(ite_wid-2))
     68   0  yongsun     * The return value could be zero, i.e. no need to smooth the probabilities
     69   0  yongsun     */
     70   0  yongsun     virtual double pr(unsigned int* its_wid, unsigned int* ite_wid);
     71   0  yongsun 
     72   0  yongsun     /**
     73   0  yongsun     * @param its_wid is the first word pointer of the history stream
     74   0  yongsun     * @param ite_wid is the last (exclusive) word pointer of the history stream
     75   0  yongsun     * @return pr(*wid | *its_wid, ..., *(ite_wid-1))
     76   0  yongsun     * The return value could be zero, i.e. no need to smooth the probabilities
     77   0  yongsun     */
     78   0  yongsun     virtual double pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid);
     79   0  yongsun 
     80   0  yongsun     /**
     81   0  yongsun     * allocate a buffer, and put the context memory's contect into it
     82   0  yongsun     * @param buf_ptr would be stored the buffer pointer
     83   0  yongsun     * @param sz would be the size in byte of the buffer allocated
     84   0  yongsun     * @return false on error
     85   0  yongsun     * Note: the buf_ptr should be used free(*buf_ptr) to free after usage
     86   0  yongsun     */
     87   0  yongsun     virtual bool
     88   0  yongsun     bufferize(void** buf_ptr, size_t* sz);
     89   0  yongsun 
     90   0  yongsun     /**
     91   0  yongsun     * Load context memory according to the buf
     92   0  yongsun     * @param buf_ptr uffer pointer
     93   0  yongsun     * @param sz is the size in byte of the buffer
     94   0  yongsun     * @return false on error
     95   0  yongsun     * call with buf_ptr with NULL value would clear the context memory
     96   0  yongsun     */
     97   0  yongsun     virtual bool
     98   0  yongsun     loadFromBuffer(void* buf_ptr, size_t sz);
     99   0  yongsun };
    100   0  yongsun 
    101   0  yongsun class CBigramHistory : public CICHistory {
    102   0  yongsun public:
    103   0  yongsun     static void initClass();
    104   0  yongsun 
    105   0  yongsun     CBigramHistory();
    106   0  yongsun 
    107   0  yongsun     virtual ~CBigramHistory();
    108   0  yongsun 
    109   0  yongsun     virtual bool seenBefore(unsigned int wid);
    110   0  yongsun 
    111   0  yongsun     virtual bool memorize(unsigned int* its_wid, unsigned int* ite_wid);
    112   0  yongsun 
    113   0  yongsun     /**
    114   0  yongsun     * @param its_wid is the first word pointer of the context stream
    115   0  yongsun     * @param ite_wid is the last (exclusive) word pointer of the context stream
    116   0  yongsun     * @return pr(*(ite_wid-1) | *(ite_wid-2))
    117   0  yongsun     */
    118   0  yongsun     virtual double pr(unsigned int* its_wid, unsigned int* ite_wid);
    119   0  yongsun 
    120   0  yongsun     /**
    121   0  yongsun     * @param its_wid is the first word pointer of the history stream
    122   0  yongsun     * @param ite_wid is the last (exclusive) word pointer of the history stream
    123   0  yongsun     * @return pr(*wid | *(ite_wid-1))
    124   0  yongsun     */
    125   0  yongsun     virtual double pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid);
    126   0  yongsun 
    127   0  yongsun     virtual bool
    128   0  yongsun     bufferize(void** buf_ptr, size_t* sz);
    129   0  yongsun 
    130   0  yongsun     virtual bool
    131   0  yongsun     loadFromBuffer(void* buf_ptr, size_t sz);
    132   0  yongsun 
    133   0  yongsun protected:
    134   0  yongsun     typedef unsigned                              TWordId;
    135   0  yongsun     typedef std::pair<TWordId, TWordId>           TBigram;
    136   0  yongsun     typedef TWordId                               TUnigram;
    137   0  yongsun     typedef std::map<TBigram, int>                TBigramPool;
    138   0  yongsun     typedef std::map<TUnigram, int>               TUnigramPool;
    139   0  yongsun     typedef std::deque<TWordId>                   TContextMemory;
    140   0  yongsun 
    141   0  yongsun     static const size_t contxt_memory_size;
    142   0  yongsun 
    143   0  yongsun     TContextMemory          m_memory;
    144   0  yongsun     TUnigramPool            m_unifreq;
    145   0  yongsun     TBigramPool             m_bifreq;
    146   0  yongsun 
    147   0  yongsun     static std::set<unsigned int>                  s_stopWords;
    148   0  yongsun 
    149   0  yongsun protected:
    150   0  yongsun     double pr(TBigram& bg);
    151   0  yongsun     int  uniFreq(TUnigram& ug);
    152   0  yongsun     int  biFreq(TBigram& bg);
    153   0  yongsun 
    154   0  yongsun     void decUniFreq(TUnigram& ug);
    155   0  yongsun     void decBiFreq(TBigram& bg);
    156   0  yongsun     void incUniFreq(TUnigram& ug);
    157   0  yongsun     void incBiFreq(TBigram& bg);
    158   0  yongsun };
    159   0  yongsun 
    160   0  yongsun #endif
    161