OpenGrok

Cross Reference: slm.h
xref: /nv-g11n/inputmethod/sunpinyin/slm/src/slm/slm.h
Home | History | Annotate | Line # | Download | only in slm
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifndef _SUN_AGC_SLM_H
     39 #define _SUN_AGC_SLM_H
     40 
     41 #include "../portability.h"
     42 
     43 #include <stdio.h>
     44 
/**
 * Threaded back-off language model.
 *
 * Thread slm makes the following modifications to a simple back-off
 * language model:
 *    -# Word ids are limited to 18 bits (2^18 = 262144 distinct ids).
 *    -# All float values of -log(pr) are compacted into 65536 (16 bits)
 *       levels, and a table maps the index to a float value (see BITS_PR);
 *    -# All float values of the back-off weight (bow) are compacted into
 *       16384 (14 bits) levels, and a table maps the index to a float
 *       value (see BITS_BOW);
 *    -# Threading information is embedded into the binary model file.
 *       Threading includes
 *         - bol (back-off-level) from the current level
 *         - bon (back-off-node)'s index in the bol level array
 *         .
 *       The thread can be used:
 *         - when a leaf node is reached, (bol, bon) can be used as the
 *           history for the history node.
 *         - when a word cannot be found among the children of the current
 *           node (cl, cn), searching can be transferred to (bol, bon)
 *           directly and continue looking for the target word
 *    -# A basic type TState is added to the language model; a state is a
 *       pair of\n
 *           (level, array_idx_of_the_level)
 *    -# All get-probability interfaces are changed to\n
 *          double transfer(TState history, unsigned int wid, TState& result);
 *
 * NOTE(review): the destructor calls free() and the class holds raw
 * pointers, yet no copy constructor / assignment operator is declared, so
 * instances are implicitly copyable. Presumably copies must not outlive or
 * coexist with the original -- confirm that no caller copies a CThreadSlm.
 */
class CThreadSlm {
public:
    enum {
        BITS_BOW = 14,      /**< width in bits of a quantized bow index (matches TNode::m_bow) */
        BITS_PR  = 16,      /**< width in bits of a quantized pr index (matches TNode::m_pr)   */
        ID_NOT_WORD = 69    /**< reserved word id; presumably marks "not a word" -- confirm against the lexicon */
    };

    /**
     * (level:idx) locates a state in the language model.
     *
     * Please note the pseudo unigram state, with level == 0 but idx > 0;
     * it is meant to be used with the bigram cache model.
     *
     * The state is stored as one 32-bit word that can be viewed either as
     * a whole (m_all) or as an 8-bit level plus a 24-bit index (anony).
     */
    union TState{
        /** Copy the packed word of another state. */
        TState(const TState& b) : m_all(b.m_all) { }

        /** Build a state from a level (kept in 8 bits) and an index (kept in 24 bits). */
        TState(unsigned level=0, unsigned idx=0) { anony.m_Level=level; anony.m_Idx=idx; }

        /** Advance to the next index on the same level (the index wraps within its 24 bits). */
        inline TState& operator++()              { ++anony.m_Idx; return *this; }

        inline void setIdx(unsigned int idx)     { anony.m_Idx = idx; }
        inline void setLevel(unsigned int lvl)   { anony.m_Level = lvl; }

        inline unsigned int getLevel() const     { return anony.m_Level; }
        inline unsigned int getIdx() const       { return anony.m_Idx; }

        /**
         * The packed word. The endian-specific field order below is meant
         * to make this equal to (m_Level << 24) + m_Idx.
         */
        inline operator unsigned() const         { return m_all; }

        /** States are equal when both level and index are equal. */
        inline bool operator==(const TState & b) const  { return m_all == b.m_all; }

        /** Orders states by their packed word. */
        inline bool operator< (const TState & b) const  { return unsigned(*this) <  unsigned(b); }

    private:
        unsigned int m_all;
#ifndef WORDS_BIGENDIAN
        struct TAnonymous {
            unsigned m_Idx   :24;
            unsigned m_Level : 8;
        } anony;
#else
        struct TAnonymous {
            unsigned m_Level : 8;
            unsigned m_Idx   :24;
        } anony;
#endif
    };

    /**
     * Inner (non-leaf) node record, packed into bit-fields.
     *
     * Machine dependent: the bit-field order is selected by
     * WORDS_BIGENDIAN. Every setter stores into a fixed-width bit-field,
     * so bits above the field width are discarded.
     */
    struct TNode {
    public:
        /** Word id (18 bits). */
        unsigned int wid() const
        {
            return m_wid;
        }

        /** Quantized back-off weight index (14 bits, see BITS_BOW). */
        unsigned int bow() const
        {
            return m_bow;
        }

        /** Quantized probability index (16 bits, see BITS_PR). */
        unsigned int pr()  const
        {
            return m_pr;
        }

        /** Back-off node: index inside the bol level array (23 bits). */
        unsigned int bon() const
        {
            return m_bon;
        }

        /** Back-off level (2 bits). */
        unsigned int bol() const
        {
            return m_bol;
        }

        /**
         * 23-bit child value reassembled from its 7-bit high part and
         * 16-bit low part (presumably the index of the first child in the
         * next level -- confirm against the loader).
         */
        unsigned int ch()  const
        {
            return ((m_ch_hi << 16) + m_ch_lo);
        }

        void set_wid(unsigned int wid)
        {
            m_wid = wid;
        }

        void set_bow(unsigned int bow)
        {
            m_bow = bow;
        }

        void set_pr(unsigned int pr)
        {
            m_pr = pr;
        }

        void set_bon(unsigned int bon)
        {
            m_bon = bon;
        }

        void set_bol(unsigned int bol)
        {
            m_bol = bol;
        }

        /** Store the low 23 bits of ch, split into a 7-bit high and a 16-bit low part. */
        void set_ch(unsigned int ch)
        {
            m_ch_hi=((ch >> 16) & 0x7F);
            m_ch_lo=(ch & 0xFFFF);
        }

    protected:
#ifndef WORDS_BIGENDIAN
        unsigned m_wid       :18;
        unsigned m_bow       :14;
        unsigned m_pr        :16;
        unsigned m_ch_lo     :16;
        unsigned m_bon       :23;
        unsigned m_bol       : 2;
        unsigned m_ch_hi     : 7;
#else
        unsigned m_ch_hi     : 7;
        unsigned m_bol       : 2;
        unsigned m_bon       :23;
        unsigned m_ch_lo     :16;
        unsigned m_pr        :16;
        unsigned m_bow       :14;
        unsigned m_wid       :18;
#endif
    };

    /**
     * Leaf node record, packed into bit-fields. A leaf carries no bow and
     * no child value; its 16-bit pr index is split into a 2-bit high part
     * and a 14-bit low part.
     *
     * Machine dependent: the bit-field order is selected by
     * WORDS_BIGENDIAN.
     */
    struct TLeaf {
    public:
        /** Word id (18 bits). */
        inline unsigned int wid() const { return m_wid; }
        /** Back-off node: index inside the bol level array (23 bits). */
        inline unsigned int bon() const { return m_bon; }
        /** Back-off level (2 bits). */
        inline unsigned int bol() const { return m_bol; }
        /** Quantized probability index, reassembled from its two parts. */
        inline unsigned int pr()  const { return ((m_pr_hi << 14) + m_pr_lo); }

        inline void set_wid(unsigned int wid) { m_wid = wid; }
        inline void set_bon(unsigned int bon) { m_bon = bon; }
        inline void set_bol(unsigned int bol) { m_bol = bol; }
        /** Store the low 16 bits of pr; higher bits are discarded. */
        inline void set_pr(unsigned int pr)   { m_pr_hi = ((pr >> 14) & 0x3); m_pr_lo = pr & 0x3FFF; }

    protected:
#ifndef WORDS_BIGENDIAN
        unsigned m_wid       :18;
        unsigned m_pr_lo     :14;
        unsigned m_bon       :23;
        unsigned m_bol       : 2;
        unsigned m_pr_hi     : 2;
#else
        unsigned m_pr_hi     : 2;
        unsigned m_bol       : 2;
        unsigned m_bon       :23;
        unsigned m_pr_lo     :14;
        unsigned m_wid       :18;
#endif
    };

public:
    /**
     * Create an empty model: no levels, no tables, no buffer.
     * Every data member is initialized (m_bufSize included) so the object
     * never holds an indeterminate value before load() is called.
     */
    CThreadSlm()
        : m_N(0), m_UseLogPr(0), m_Levels(NULL), m_LevelSizes(NULL),
          m_bowTable(NULL), m_prTable(NULL), m_bufSize(0), m_bMMap(false),
          m_buf(NULL) { }

    /** Calls free(). */
    ~CThreadSlm() { free(); }

    /**
     * Load a binary model file.
     * @param fname  path of the model file
     * @param MMap   presumably selects memory-mapping the file instead of
     *               reading it into a buffer -- confirm in the implementation
     * @return presumably true on success -- confirm in the implementation
     */
    bool
    load(const char* fname, bool MMap=false);

    /** @return the stored m_UseLogPr flag (0 until a model sets it). */
    unsigned
    isUseLogPr() const
    { return m_UseLogPr; }

    /** Invoked by the destructor; presumably releases whatever load() acquired. */
    void
    free();

    /**
     * Transition from state history on word wid, storing the new state in
     * result. Presumably returns the score as a negative logarithm --
     * confirm in the implementation.
     */
    double
    transferNegLog(TState history, unsigned int wid, TState& result);

    /**
     * Transition from state history on word wid, storing the new state in
     * result; this is the get-probability interface of the model.
     */
    double
    transfer(TState history, unsigned int wid, TState& result);

    /** Presumably returns the history state corresponding to st -- confirm. */
    TState
    history_state_of(TState st);

    /** Presumably converts st in place into its history state and returns it -- confirm. */
    TState&
    historify(TState& st);

    /** Presumably returns the id of the last word of state st -- confirm. */
    unsigned int
    lastWordId(TState st);

protected:
    /** Presumably the shared implementation behind transfer()/transferNegLog() -- confirm. */
    double
    rawTransfer(TState history, unsigned int wid, TState& result);

protected:
    typedef  void*   PtrVoid;

    unsigned  m_N;           /* presumably the order (N) of the N-gram model */
    unsigned  m_UseLogPr;    /* flag reported by isUseLogPr()                */
    void    **m_Levels;      /* one node array per level                     */
    unsigned *m_LevelSizes;  /* presumably the element count of each level   */
    float    *m_bowTable;    /* maps a quantized bow index to a float        */
    float    *m_prTable;     /* maps a quantized pr index to a float         */

private:
    ssize_t   m_bufSize;     /* presumably the size in bytes of m_buf        */
    bool      m_bMMap;       /* presumably whether m_buf is a memory mapping */
    char     *m_buf;         /* presumably the raw model image               */
};
    347 
    348 #endif
    349