1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef _SIM_SLM_BUILDER_H 39 #define _SIM_SLM_BUILDER_H 40 41 #include "../portability.h" 42 43 #include "sim_slm.h" 44 45 class CSlmDiscounter; 46 47 class CSlmBuilder { 48 public: 49 static const int SLM_MAX_R=16; 50 typedef CSIMSlm::FREQ_TYPE FREQ_TYPE; 51 typedef CSIMSlm::PR_TYPE PR_TYPE; 52 typedef CSIMSlm::TNode TNode; 53 typedef CSIMSlm::TLeaf TLeaf; 54 55 public: 56 CSlmBuilder() 57 : m_nWord(0), nlevel(0), level(NULL), cut(NULL), discounter(NULL), 58 nr(NULL), breaker(), m_excludes(), bUseLogPr(0) { } 59 ~CSlmBuilder() 60 { Close(); } 61 62 void Create(int n); 63 void SetNumberOfWord(int nWord) { this->m_nWord = nWord; } 64 void SetCut(FREQ_TYPE threshold[]); 65 void SetDiscounter(CSlmDiscounter* dis[]); 66 void SetBreakerIds(int nId, TSIMWordId brks[]); 67 void SetExcludeIds(int nId, TSIMWordId excludes[]); 68 void SetUseLogPr(int bUse) 69 { bUseLogPr = bUse; } 70 71 void AddNGram(TSIMWordId* ngram, FREQ_TYPE fr); 72 void Build(); 73 void Write(FILE* out); 74 void Close(); 75 76 //get pr(w[n-1] | w[0]...w[n-2]) on constructed partial model (low levels) 77 double getPr(int n, TSIMWordId* w); 78 79 public: 80 typedef std::vector<TNode> TNodeLevel; 81 typedef std::vector<TLeaf> TLeafLevel; 82 typedef TNodeLevel::iterator TNodeIterator; 83 typedef TLeafLevel::iterator TLeafIterator; 84 85 protected: 86 bool isBreakId(TSIMWordId id); 87 bool isExcludeId(TSIMWordId id); 88 void CountNr(); 89 void AppendTails(); 90 void Cut(); 91 void Discount(); 92 void CalcBOW(); 93 void*FindChild(int lvl, TNode* root, TSIMWordId id); 94 int CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast, 95 TNodeIterator chfirst, TNodeIterator chlast, int thred); 96 int CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast, 97 TLeafIterator chfirst, TLeafIterator chlast, int thred); 98 99 private: 100 int nlevel, bUseLogPr; 101 void** level; 102 //level[0] is psudeo root level, level[1] is unigram level, ..., all are vector type 103 104 int m_nWord; 105 FREQ_TYPE* cut; // cut[1] is not cut threshold for 1-gram, ... 106 CSlmDiscounter** discounter; // discounter[1] is for 1-gram... 107 FREQ_TYPE (*nr)[SLM_MAX_R];//nr[1][SLM_MAX_R] is for 1-gram... 108 std::vector<TSIMWordId> breaker; 109 std::vector<TSIMWordId> m_excludes; 110 }; 111 112 class CSlmDiscounter { 113 public: 114 // n is array size, nr is FREQ_TYPE[n], nr[0] is corpuse size,or sigma r*nr; 115 // nr[1] is number of ngram items with freq 1, ... 116 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr) = 0; 117 118 // freq is the ngram frequence, not the conditional pr 119 virtual double discount(int freq) = 0; 120 virtual const char* getName() = 0; 121 }; 122 123 //Good-Turing discount 124 class CSlmGTDiscounter : public CSlmDiscounter { 125 public: 126 CSlmGTDiscounter(int threshold=10, double highfreq_discount=0.95) : thres(threshold), hd(highfreq_discount), dis(NULL) {} 127 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); 128 virtual double discount(int freq); 129 virtual const char* getName() 130 { return "Good-Turing"; } 131 protected: 132 int thres; 133 double hd; 134 double *dis; 135 }; 136 137 class CSlmAbsoluteDiscounter : public CSlmDiscounter { 138 public: 139 CSlmAbsoluteDiscounter(double substract = 0.0) : c(substract) {} 140 //c == 0 mean this value should be count according to r[] 141 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); 142 virtual double discount(int freq); // return freq - c 143 virtual const char* getName() 144 { return "Absolution"; } 145 protected: 146 double c; 147 }; 148 149 class CSlmLinearDiscounter : public CSlmDiscounter { 150 public: 151 CSlmLinearDiscounter(double shrink = 0.0) : dis(shrink) {} 152 //dis == 0 mean this value should be count according to r[] 153 virtual void init(int n, CSlmBuilder::FREQ_TYPE *nr); 154 virtual double discount(int freq); // return freq * dis 155 virtual const char* getName() 156 { return "Linear"; } 157 protected: 158 double dis; 159 }; 160 161 #endif 162
