OpenGrok

Cross Reference: sim_slmbuilder.h
xref: /nv-g11n/inputmethod/sunpinyin/slm/src/slm/sim_slmbuilder.h
Home | History | Annotate | Line # | Download | only in slm
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifndef _SIM_SLM_BUILDER_H
     39 #define _SIM_SLM_BUILDER_H
     40 
     41 #include "../portability.h"
     42 
     43 #include "sim_slm.h"
     44 
     45 class CSlmDiscounter;
     46 
     47 class CSlmBuilder {
     48 public:
     49     static const int SLM_MAX_R=16;
     50     typedef CSIMSlm::FREQ_TYPE FREQ_TYPE;
     51     typedef CSIMSlm::PR_TYPE PR_TYPE;
     52     typedef CSIMSlm::TNode TNode;
     53     typedef CSIMSlm::TLeaf TLeaf;
     54 
     55 public:
     56     CSlmBuilder()
     57         : m_nWord(0), nlevel(0), level(NULL), cut(NULL), discounter(NULL),
     58           nr(NULL), breaker(), m_excludes(), bUseLogPr(0) { }
     59     ~CSlmBuilder()
     60         { Close(); }
     61 
     62     void Create(int n);
     63     void SetNumberOfWord(int nWord) { this->m_nWord = nWord; }
     64     void SetCut(FREQ_TYPE threshold[]);
     65     void SetDiscounter(CSlmDiscounter* dis[]);
     66     void SetBreakerIds(int nId, TSIMWordId brks[]);
     67     void SetExcludeIds(int nId, TSIMWordId excludes[]);
     68     void SetUseLogPr(int bUse)
     69          { bUseLogPr = bUse; }
     70 
     71     void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr);
     72     void Build();
     73     void Write(FILE* out);
     74     void Close();
     75 
     76     //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels)
     77     double getPr(int n, TSIMWordId* w);
     78 
     79 public:
     80     typedef std::vector<TNode> TNodeLevel;
     81     typedef std::vector<TLeaf> TLeafLevel;
     82     typedef TNodeLevel::iterator TNodeIterator;
     83     typedef TLeafLevel::iterator TLeafIterator;
     84 
     85 protected:
     86     bool isBreakId(TSIMWordId id);
     87     bool isExcludeId(TSIMWordId id);
     88     void CountNr();
     89     void AppendTails();
     90     void Cut();
     91     void Discount();
     92     void CalcBOW();
     93     void*FindChild(int lvl, TNode* root, TSIMWordId id);
     94     int  CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast,
     95                       TNodeIterator chfirst, TNodeIterator chlast, int thred);
     96     int  CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast,
     97                       TLeafIterator chfirst, TLeafIterator chlast, int thred);
     98 
     99 private:
    100     int    nlevel, bUseLogPr;
    101     void** level;
    102     //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type
    103 
    104     int m_nWord;
    105     FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ...
    106     CSlmDiscounter** discounter; // discounter[1] is for 1-gram...
    107     FREQ_TYPE (*nr)[SLM_MAX_R];//nr[1][SLM_MAX_R] is for 1-gram...
    108     std::vector<TSIMWordId> breaker;
    109     std::vector<TSIMWordId> m_excludes;
    110 };
    111 
    112 class CSlmDiscounter {
    113 public:
    114     // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr;
    115     // nr[1] is number of ngram items with freq 1, ...
    116     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0;
    117 
    118     // freq is the ngram frequence, not the conditional pr
    119     virtual double discount(int freq) = 0;
    120     virtual const char* getName() = 0;
    121 };
    122 
    123 //Good-Turing discount
    124 class CSlmGTDiscounter : public CSlmDiscounter {
    125 public:
    126     CSlmGTDiscounter(int threshold=10, double highfreq_discount=0.95) : thres(threshold), hd(highfreq_discount), dis(NULL) {}
    127     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
    128     virtual double discount(int freq);
    129     virtual const char* getName()
    130         { return "Good-Turing"; }
    131 protected:
    132     int thres;
    133     double hd;
    134     double *dis;
    135 };
    136 
    137 class CSlmAbsoluteDiscounter : public CSlmDiscounter {
    138 public:
    139     CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {}
    140     //c == 0 mean this value should be count according to r[]
    141     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
    142     virtual double discount(int freq);	// return freq - c
    143     virtual const char* getName()
    144         { return "Absolution"; }
    145 protected:
    146         double c;
    147 };
    148 
    149 class CSlmLinearDiscounter : public CSlmDiscounter {
    150 public:
    151     CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {}
    152     //dis == 0 mean this value should be count according to r[]
    153     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
    154     virtual double discount(int freq);	// return freq * dis
    155     virtual const char* getName()
    156         { return "Linear"; }
    157 protected:
    158     double dis;
    159 };
    160 
    161 #endif
    162