1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifndef _SIM_SLM_BUILDER_H 39 0 yongsun #define _SIM_SLM_BUILDER_H 40 0 yongsun 41 0 yongsun #include "../portability.h" 42 0 yongsun 43 0 yongsun #include "sim_slm.h" 44 0 yongsun 45 0 yongsun class CSlmDiscounter; 46 0 yongsun 47 0 yongsun class CSlmBuilder { 48 0 yongsun public: 49 0 yongsun static const int SLM_MAX_R=16; 50 0 yongsun typedef CSIMSlm::FREQ_TYPE FREQ_TYPE; 51 0 yongsun typedef CSIMSlm::PR_TYPE PR_TYPE; 52 0 yongsun typedef CSIMSlm::TNode TNode; 53 0 yongsun typedef CSIMSlm::TLeaf TLeaf; 54 0 yongsun 55 0 yongsun public: 56 0 yongsun CSlmBuilder() 57 0 yongsun : m_nWord(0), nlevel(0), level(NULL), cut(NULL), discounter(NULL), 58 0 yongsun nr(NULL), breaker(), m_excludes(), bUseLogPr(0) { } 59 0 yongsun ~CSlmBuilder() 60 0 yongsun { Close(); } 61 0 yongsun 62 0 yongsun void Create(int n); 63 0 yongsun void SetNumberOfWord(int nWord) { this->m_nWord = nWord; } 64 0 yongsun void SetCut(FREQ_TYPE threshold[]); 65 0 yongsun void SetDiscounter(CSlmDiscounter* dis[]); 66 0 yongsun void SetBreakerIds(int nId, TSIMWordId brks[]); 67 0 yongsun void SetExcludeIds(int nId, TSIMWordId excludes[]); 68 0 yongsun void SetUseLogPr(int bUse) 69 0 yongsun { bUseLogPr = bUse; } 70 0 yongsun 71 0 yongsun void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr); 72 0 yongsun void Build(); 73 0 yongsun void Write(FILE* out); 74 0 yongsun void Close(); 75 0 yongsun 76 0 yongsun //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels) 77 0 yongsun double getPr(int n, TSIMWordId* w); 78 0 yongsun 79 0 yongsun public: 80 0 yongsun typedef std::vector<TNode> TNodeLevel; 81 0 yongsun typedef std::vector<TLeaf> TLeafLevel; 82 0 yongsun typedef TNodeLevel::iterator TNodeIterator; 83 0 yongsun typedef TLeafLevel::iterator TLeafIterator; 84 0 yongsun 85 0 yongsun protected: 86 0 yongsun bool isBreakId(TSIMWordId id); 87 0 yongsun bool isExcludeId(TSIMWordId id); 88 0 yongsun void CountNr(); 89 0 yongsun void AppendTails(); 90 0 yongsun void Cut(); 91 0 yongsun void Discount(); 92 0 yongsun void CalcBOW(); 93 0 yongsun void*FindChild(int lvl, TNode* root, TSIMWordId id); 94 0 yongsun int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast, 95 0 yongsun TNodeIterator chfirst, TNodeIterator chlast, int thred); 96 0 yongsun int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast, 97 0 yongsun TLeafIterator chfirst, TLeafIterator chlast, int thred); 98 0 yongsun 99 0 yongsun private: 100 0 yongsun int nlevel, bUseLogPr; 101 0 yongsun void** level; 102 0 yongsun //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type 103 0 yongsun 104 0 yongsun int m_nWord; 105 0 yongsun FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ... 106 0 yongsun CSlmDiscounter** discounter; // discounter[1] is for 1-gram... 107 0 yongsun FREQ_TYPE (*nr)[SLM_MAX_R];//nr[1][SLM_MAX_R] is for 1-gram... 108 0 yongsun std::vector<TSIMWordId> breaker; 109 0 yongsun std::vector<TSIMWordId> m_excludes; 110 0 yongsun }; 111 0 yongsun 112 0 yongsun class CSlmDiscounter { 113 0 yongsun public: 114 0 yongsun // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr; 115 0 yongsun // nr[1] is number of ngram items with freq 1, ... 116 0 yongsun virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0; 117 0 yongsun 118 0 yongsun // freq is the ngram frequence, not the conditional pr 119 0 yongsun virtual double discount(int freq) = 0; 120 0 yongsun virtual const char* getName() = 0; 121 0 yongsun }; 122 0 yongsun 123 0 yongsun //Good-Turing discount 124 0 yongsun class CSlmGTDiscounter : public CSlmDiscounter { 125 0 yongsun public: 126 0 yongsun CSlmGTDiscounter(int threshold=10, double highfreq_discount=0.95) : thres(threshold), hd(highfreq_discount), dis(NULL) {} 127 0 yongsun virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); 128 0 yongsun virtual double discount(int freq); 129 0 yongsun virtual const char* getName() 130 0 yongsun { return "Good-Turing"; } 131 0 yongsun protected: 132 0 yongsun int thres; 133 0 yongsun double hd; 134 0 yongsun double *dis; 135 0 yongsun }; 136 0 yongsun 137 0 yongsun class CSlmAbsoluteDiscounter : public CSlmDiscounter { 138 0 yongsun public: 139 0 yongsun CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {} 140 0 yongsun //c == 0 mean this value should be count according to r[] 141 0 yongsun virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); 142 0 yongsun virtual double discount(int freq); // return freq - c 143 0 yongsun virtual const char* getName() 144 0 yongsun { return "Absolution"; } 145 0 yongsun protected: 146 0 yongsun double c; 147 0 yongsun }; 148 0 yongsun 149 0 yongsun class CSlmLinearDiscounter : public CSlmDiscounter { 150 0 yongsun public: 151 0 yongsun CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {} 152 0 yongsun //dis == 0 mean this value should be count according to r[] 153 0 yongsun virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); 154 0 yongsun virtual double discount(int freq); // return freq * dis 155 0 yongsun virtual const char* getName() 156 0 yongsun { return "Linear"; } 157 0 yongsun protected: 158 0 yongsun double dis; 159 0 yongsun }; 160 0 yongsun 161 0 yongsun #endif 162