Home | History | Annotate | Download | only in slm
      1   0  yongsun /*
      2  82  yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  82  yongsun  *
      4  82  yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  82  yongsun  *
      6  82  yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7  82  yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  82  yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  82  yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  82  yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  82  yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  82  yongsun  * specific language governing permissions and limitations under the License. When
     13  82  yongsun  * distributing the software, include this License Header Notice in each file and
     14  82  yongsun  * include the full text of the License in the License file as well as the
     15  82  yongsun  * following notice:
     16  82  yongsun  *
     17  82  yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  82  yongsun  * (CDDL)
     19  82  yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20  82  yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21  82  yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  82  yongsun  * the Federal Courts of the Northern District of California and the state courts
     23  82  yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24  82  yongsun  *
     25  82  yongsun  * Contributor(s):
     26  82  yongsun  *
     27  82  yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28  82  yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  82  yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  82  yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31  82  yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32  82  yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  82  yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  82  yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35  82  yongsun  * to such option by the copyright holder.
     36   0  yongsun  */
     37  82  yongsun 
     38   0  yongsun #ifndef _SIM_SLM_BUILDER_H
     39   0  yongsun #define _SIM_SLM_BUILDER_H
     40   0  yongsun 
     41   0  yongsun #include "../portability.h"
     42   0  yongsun 
     43   0  yongsun #include "sim_slm.h"
     44   0  yongsun 
     45   0  yongsun class CSlmDiscounter;
     46   0  yongsun 
     47   0  yongsun class CSlmBuilder {
     48   0  yongsun public:
     49   0  yongsun     static const int SLM_MAX_R=16;
     50   0  yongsun     typedef CSIMSlm::FREQ_TYPE FREQ_TYPE;
     51   0  yongsun     typedef CSIMSlm::PR_TYPE PR_TYPE;
     52   0  yongsun     typedef CSIMSlm::TNode TNode;
     53   0  yongsun     typedef CSIMSlm::TLeaf TLeaf;
     54   0  yongsun 
     55   0  yongsun public:
     56   0  yongsun     CSlmBuilder()
     57   0  yongsun         : m_nWord(0), nlevel(0), level(NULL), cut(NULL), discounter(NULL),
     58   0  yongsun           nr(NULL), breaker(), m_excludes(), bUseLogPr(0) { }
     59   0  yongsun     ~CSlmBuilder()
     60   0  yongsun         { Close(); }
     61   0  yongsun 
     62   0  yongsun     void Create(int n);
     63   0  yongsun     void SetNumberOfWord(int nWord) { this->m_nWord = nWord; }
     64   0  yongsun     void SetCut(FREQ_TYPE threshold[]);
     65   0  yongsun     void SetDiscounter(CSlmDiscounter* dis[]);
     66   0  yongsun     void SetBreakerIds(int nId, TSIMWordId brks[]);
     67   0  yongsun     void SetExcludeIds(int nId, TSIMWordId excludes[]);
     68   0  yongsun     void SetUseLogPr(int bUse)
     69   0  yongsun          { bUseLogPr = bUse; }
     70   0  yongsun 
     71   0  yongsun     void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr);
     72   0  yongsun     void Build();
     73   0  yongsun     void Write(FILE* out);
     74   0  yongsun     void Close();
     75   0  yongsun 
     76   0  yongsun     //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels)
     77   0  yongsun     double getPr(int n, TSIMWordId* w);
     78   0  yongsun 
     79   0  yongsun public:
     80   0  yongsun     typedef std::vector<TNode> TNodeLevel;
     81   0  yongsun     typedef std::vector<TLeaf> TLeafLevel;
     82   0  yongsun     typedef TNodeLevel::iterator TNodeIterator;
     83   0  yongsun     typedef TLeafLevel::iterator TLeafIterator;
     84   0  yongsun 
     85   0  yongsun protected:
     86   0  yongsun     bool isBreakId(TSIMWordId id);
     87   0  yongsun     bool isExcludeId(TSIMWordId id);
     88   0  yongsun     void CountNr();
     89   0  yongsun     void AppendTails();
     90   0  yongsun     void Cut();
     91   0  yongsun     void Discount();
     92   0  yongsun     void CalcBOW();
     93   0  yongsun     void*FindChild(int lvl, TNode* root, TSIMWordId id);
     94   0  yongsun     int  CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast,
     95   0  yongsun                       TNodeIterator chfirst, TNodeIterator chlast, int thred);
     96   0  yongsun     int  CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast,
     97   0  yongsun                       TLeafIterator chfirst, TLeafIterator chlast, int thred);
     98   0  yongsun 
     99   0  yongsun private:
    100   0  yongsun     int    nlevel, bUseLogPr;
    101   0  yongsun     void** level;
    102   0  yongsun     //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type
    103   0  yongsun 
    104   0  yongsun     int m_nWord;
    105   0  yongsun     FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ...
    106   0  yongsun     CSlmDiscounter** discounter; // discounter[1] is for 1-gram...
    107   0  yongsun     FREQ_TYPE (*nr)[SLM_MAX_R];//nr[1][SLM_MAX_R] is for 1-gram...
    108   0  yongsun     std::vector<TSIMWordId> breaker;
    109   0  yongsun     std::vector<TSIMWordId> m_excludes;
    110   0  yongsun };
    111   0  yongsun 
    112   0  yongsun class CSlmDiscounter {
    113   0  yongsun public:
    114   0  yongsun     // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr;
    115   0  yongsun     // nr[1] is number of ngram items with freq 1, ...
    116   0  yongsun     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0;
    117   0  yongsun 
    118   0  yongsun     // freq is the ngram frequence, not the conditional pr
    119   0  yongsun     virtual double discount(int freq) = 0;
    120   0  yongsun     virtual const char* getName() = 0;
    121   0  yongsun };
    122   0  yongsun 
    123   0  yongsun //Good-Turing discount
    124   0  yongsun class CSlmGTDiscounter : public CSlmDiscounter {
    125   0  yongsun public:
    126   0  yongsun     CSlmGTDiscounter(int threshold=10, double highfreq_discount=0.95) : thres(threshold), hd(highfreq_discount), dis(NULL) {}
    127   0  yongsun     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
    128   0  yongsun     virtual double discount(int freq);
    129   0  yongsun     virtual const char* getName()
    130   0  yongsun         { return "Good-Turing"; }
    131   0  yongsun protected:
    132   0  yongsun     int thres;
    133   0  yongsun     double hd;
    134   0  yongsun     double *dis;
    135   0  yongsun };
    136   0  yongsun 
    137   0  yongsun class CSlmAbsoluteDiscounter : public CSlmDiscounter {
    138   0  yongsun public:
    139   0  yongsun     CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {}
    140   0  yongsun     //c == 0 mean this value should be count according to r[]
    141   0  yongsun     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
    142   0  yongsun     virtual double discount(int freq);	// return freq - c
    143   0  yongsun     virtual const char* getName()
    144   0  yongsun         { return "Absolution"; }
    145   0  yongsun protected:
    146   0  yongsun         double c;
    147   0  yongsun };
    148   0  yongsun 
    149   0  yongsun class CSlmLinearDiscounter : public CSlmDiscounter {
    150   0  yongsun public:
    151   0  yongsun     CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {}
    152   0  yongsun     //dis == 0 mean this value should be count according to r[]
    153   0  yongsun     virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr);
    154   0  yongsun     virtual double discount(int freq);	// return freq * dis
    155   0  yongsun     virtual const char* getName()
    156   0  yongsun         { return "Linear"; }
    157   0  yongsun protected:
    158   0  yongsun     double dis;
    159   0  yongsun };
    160   0  yongsun 
    161   0  yongsun #endif
    162