Home | History | Annotate | Download | only in slm
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifndef _SUN_AGC_SLM_H
     39    0   yongsun #define _SUN_AGC_SLM_H
     40    0   yongsun 
     41    0   yongsun #include "../portability.h"
     42    0   yongsun 
     43    0   yongsun #include <stdio.h>
     44    0   yongsun 
     45    0   yongsun /**
     46    0   yongsun  * The threaded slm makes the following modifications to a simple back-off language model:
     47    0   yongsun  *    -# Word ids are limited to 18 bits, i.e. at most 262144 (256K) word ids
     48    0   yongsun  *    -# Compact all float values of -log(pr) into 65536 (16 bits)
     49    0   yongsun  *       levels and use a table to map the index to a float value;
     50    0   yongsun  *    -# Compact all float values of the back-off weights (bow) into 16384 (14 bits)
     51    0   yongsun  *       levels and use a table to map the index to a float value;
     52    0   yongsun  *    -# Threading information is embedded into the binary model file. Threading includes
     53    0   yongsun  *         - bol(back-off-level) from current level
     54    0   yongsun  *         - bon(back-off-node)'s index in the bol level array
     55    0   yongsun  *         .
     56    0   yongsun  *       The thread could be used:
     57    0   yongsun  *         - when a leaf node is reached, (bol, bon) can be used as the
     58    0   yongsun  *           history node.
     59    0   yongsun  *         - when a word cannot be found among the children of the current node (cl, cn),
     60    0   yongsun  *           searching can be transferred to (bol, bon) directly and continue
     61    0   yongsun  *           looking for the target word
     62    0   yongsun  *    -# Add a basic type TState in Language model, a state is pair of\n
     63    0   yongsun  *           (level, array_idx_of_the level)
     64    0   yongsun  *    -# change all get probability interface to\n
     65    0   yongsun  *          double transfer(TState& history, unsigned int wid, TState& result);
     66    0   yongsun  */
     67    0   yongsun class CThreadSlm {
     68    0   yongsun public:
     69    0   yongsun     enum {
     70    0   yongsun         BITS_BOW = 14,
     71    0   yongsun         BITS_PR  = 16,
     72    0   yongsun         ID_NOT_WORD = 69
     73    0   yongsun     };
     74    0   yongsun 
     75    0   yongsun     /**
     76    0   yongsun     * (level:idx) located a state in the language model very well
     77    0   yongsun     * Please note the psuedo unigram state, with level == 0, but idx > 0
     78    0   yongsun     * it's for used with bigram cache model
     79    0   yongsun     */
     80    0   yongsun     union TState{
     81    0   yongsun         TState(const TState& b) : m_all(b.m_all) { }
     82    0   yongsun         TState(unsigned level=0, unsigned idx=0) { anony.m_Level=level; anony.m_Idx=idx; }
     83    0   yongsun 
     84    0   yongsun         inline TState& operator++()              { ++anony.m_Idx; return *this; }
     85    0   yongsun 
     86    0   yongsun         inline void setIdx(unsigned int idx)     { anony.m_Idx = idx; }
     87    0   yongsun         inline void setLevel(unsigned int lvl)   { anony.m_Level = lvl; }
     88    0   yongsun 
     89    0   yongsun         inline unsigned int getLevel() const     { return anony.m_Level; }
     90    0   yongsun         inline unsigned int getIdx() const       { return anony.m_Idx; }
     91    0   yongsun         inline operator unsigned() const         { return m_all; } //((anony.m_Level << 24) + anony.m_Idx); }
     92    0   yongsun         inline bool operator==(const TState & b) const  { return m_all == b.m_all; }
     93    0   yongsun         inline bool operator< (const TState & b) const  { return unsigned(*this) <  unsigned(b); }
     94    0   yongsun 
     95    0   yongsun     private:
     96    0   yongsun         unsigned int m_all;
     97  198  tchaikov #ifndef WORDS_BIGENDIAN
     98    0   yongsun         struct TAnonymous {
     99    0   yongsun             unsigned m_Idx   :24;
    100    0   yongsun             unsigned m_Level : 8;
    101    0   yongsun         } anony;
    102    0   yongsun #else
    103    0   yongsun         struct TAnonymous {
    104    0   yongsun             unsigned m_Level : 8;
    105    0   yongsun             unsigned m_Idx   :24;
    106    0   yongsun         } anony;
    107    0   yongsun #endif
    108    0   yongsun     };
    109    0   yongsun 
    110    0   yongsun     /**
    111    0   yongsun      * Machine dependent
    112    0   yongsun      */
    113    0   yongsun     struct TNode {
    114    0   yongsun     public:
    115    0   yongsun         unsigned int wid() const
    116    0   yongsun         {
    117    0   yongsun             return m_wid;
    118    0   yongsun         }
    119    0   yongsun 
    120    0   yongsun         unsigned int bow() const
    121    0   yongsun         {
    122    0   yongsun             return m_bow;
    123    0   yongsun         }
    124    0   yongsun 
    125    0   yongsun         unsigned int pr()  const
    126    0   yongsun         {
    127    0   yongsun             return m_pr;
    128    0   yongsun         }
    129    0   yongsun 
    130    0   yongsun         unsigned int bon() const
    131    0   yongsun         {
    132    0   yongsun             return m_bon;
    133    0   yongsun         }
    134    0   yongsun 
    135    0   yongsun         unsigned int bol() const
    136    0   yongsun         {
    137    0   yongsun             return m_bol;
    138    0   yongsun         }
    139    0   yongsun 
    140    0   yongsun         unsigned int ch()  const
    141    0   yongsun         {
    142    0   yongsun             return ((m_ch_hi << 16) + m_ch_lo);
    143    0   yongsun         }
    144    0   yongsun 
    145    0   yongsun         void set_wid(unsigned int wid)
    146    0   yongsun         {
    147    0   yongsun             m_wid = wid;
    148    0   yongsun         }
    149    0   yongsun 
    150    0   yongsun         void set_bow(unsigned int bow)
    151    0   yongsun         {
    152    0   yongsun             m_bow = bow;
    153    0   yongsun         }
    154    0   yongsun 
    155    0   yongsun         void set_pr(unsigned int pr)
    156    0   yongsun         {
    157    0   yongsun             m_pr = pr;
    158    0   yongsun         }
    159    0   yongsun 
    160    0   yongsun         void set_bon(unsigned int bon)
    161    0   yongsun         {
    162    0   yongsun             m_bon = bon;
    163    0   yongsun         }
    164    0   yongsun 
    165    0   yongsun         void set_bol(unsigned int bol)
    166    0   yongsun         {
    167    0   yongsun             m_bol = bol;
    168    0   yongsun         }
    169    0   yongsun 
    170    0   yongsun         void set_ch(unsigned int ch)
    171    0   yongsun         {
    172    0   yongsun             m_ch_hi=((ch >> 16) & 0x7F);
    173    0   yongsun             m_ch_lo=(ch & 0xFFFF);
    174    0   yongsun         }
    175    0   yongsun 
    176    0   yongsun     protected:
    177  198  tchaikov #ifndef WORDS_BIGENDIAN
    178    0   yongsun         unsigned m_wid       :18;
    179    0   yongsun         unsigned m_bow       :14;
    180    0   yongsun         unsigned m_pr        :16;
    181    0   yongsun         unsigned m_ch_lo     :16;
    182    0   yongsun         unsigned m_bon       :23;
    183    0   yongsun         unsigned m_bol       : 2;
    184    0   yongsun         unsigned m_ch_hi     : 7;
    185    0   yongsun #else
    186    0   yongsun         unsigned m_ch_hi     : 7;
    187    0   yongsun         unsigned m_bol       : 2;
    188    0   yongsun         unsigned m_bon       :23;
    189    0   yongsun         unsigned m_ch_lo     :16;
    190    0   yongsun         unsigned m_pr        :16;
    191    0   yongsun         unsigned m_bow       :14;
    192    0   yongsun         unsigned m_wid       :18;
    193    0   yongsun #endif
    194    0   yongsun 
    195    0   yongsun     private:
    196    0   yongsun         /**
    197    0   yongsun          * Machine dependent
    198    0   yongsun         union TChildIdx {
    199    0   yongsun         public:
    200    0   yongsun             inline TChildIdx(unsigned val) : m_all(val) { }
    201    0   yongsun             inline TChildIdx(const TChildIdx& b) : m_all(b.m_all) { }
    202    0   yongsun             inline TChildIdx(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi = hi; anony.m_lo = lo; }
    203    0   yongsun 
    204    0   yongsun             inline unsigned int lo() { return anony.m_lo; }
    205    0   yongsun             inline unsigned int hi() { return anony.m_hi; }
    206    0   yongsun             inline unsigned int all(){ return m_all; }
    207    0   yongsun 
    208    0   yongsun             inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); }
    209    0   yongsun             inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); }
    210    0   yongsun             inline unsigned int set_all(unsigned int all) { return (m_all = all); }
    211    0   yongsun 
    212    0   yongsun         private:
    213    0   yongsun             unsigned int m_all;
    214  198  tchaikov #ifndef WORDS_BIGENDIAN
    215    0   yongsun             struct TAnony {
    216    0   yongsun                 unsigned m_lo :16;
    217    0   yongsun                 unsigned m_hi : 7;
    218    0   yongsun                 unsigned NOUSE: 9;
    219    0   yongsun             } anony;
    220    0   yongsun #else
    221    0   yongsun             struct TAnony {
    222    0   yongsun                 unsigned NOUSE: 9;
    223    0   yongsun                 unsigned m_hi : 7;
    224    0   yongsun                 unsigned m_lo :16;
    225    0   yongsun             } anony;
    226    0   yongsun #endif
    227    0   yongsun         };
    228    0   yongsun         */
    229    0   yongsun     };
    230    0   yongsun 
    231    0   yongsun     /**
    232    0   yongsun      * Machine dependent
    233    0   yongsun      */
    234    0   yongsun     struct TLeaf {
    235    0   yongsun     public:
    236    0   yongsun         inline unsigned int wid() const { return m_wid; }
    237    0   yongsun         inline unsigned int bon() const { return m_bon; }
    238    0   yongsun         inline unsigned int bol() const { return m_bol; }
    239    0   yongsun         inline unsigned int pr()  const { return ((m_pr_hi << 14) + m_pr_lo); }
    240    0   yongsun 
    241    0   yongsun         inline void set_wid(unsigned int wid) { m_wid = wid; }
    242    0   yongsun         inline void set_bon(unsigned int bon) { m_bon = bon; }
    243    0   yongsun         inline void set_bol(unsigned int bol) { m_bol = bol; }
    244    0   yongsun         inline void set_pr(unsigned int pr)   { m_pr_hi = ((pr >> 14) & 0x3); m_pr_lo = pr & 0x3FFF; }
    245    0   yongsun 
    246    0   yongsun     protected:
    247  198  tchaikov #ifndef WORDS_BIGENDIAN
    248    0   yongsun         unsigned m_wid       :18;
    249    0   yongsun         unsigned m_pr_lo     :14;
    250    0   yongsun         unsigned m_bon       :23;
    251    0   yongsun         unsigned m_bol       : 2;
    252    0   yongsun         unsigned m_pr_hi     : 2;
    253    0   yongsun #else
    254    0   yongsun         unsigned m_pr_hi     : 2;
    255    0   yongsun         unsigned m_bol       : 2;
    256    0   yongsun         unsigned m_bon       :23;
    257    0   yongsun         unsigned m_pr_lo     :14;
    258    0   yongsun         unsigned m_wid       :18;
    259    0   yongsun #endif
    260    0   yongsun 
    261    0   yongsun     private:
    262    0   yongsun     /*
    263    0   yongsun         union TPr {
    264    0   yongsun         public:
    265    0   yongsun             inline TPr(unsigned int val) : m_all(val) { }
    266    0   yongsun             inline TPr(const TPr & b) : m_all(b.m_all) { }
    267    0   yongsun             inline TPr(unsigned int hi, unsigned lo) : m_all(0) { anony.m_hi=hi, anony.m_lo=lo; }
    268    0   yongsun 
    269    0   yongsun             inline unsigned int lo() { return anony.m_lo; }
    270    0   yongsun             inline unsigned int hi() { return anony.m_hi; }
    271    0   yongsun             inline unsigned int all(){ return m_all; }
    272    0   yongsun 
    273    0   yongsun             inline unsigned int set_lo(unsigned int lo) { return (anony.m_lo = lo); }
    274    0   yongsun             inline unsigned int set_hi(unsigned int hi) { return (anony.m_hi = hi); }
    275    0   yongsun             inline unsigned int set_all(unsigned int all) { return (m_all = all); }
    276    0   yongsun 
    277    0   yongsun         private:
    278    0   yongsun             unsigned int m_all;
    279  198  tchaikov #ifndef WORDS_BIGENDIAN
    280    0   yongsun             struct TAnony {
    281    0   yongsun                 unsigned m_lo  :14;
    282    0   yongsun                 unsigned m_hi  : 2;
    283    0   yongsun                 unsigned NONUSE:16;
    284    0   yongsun             } anony;
    285    0   yongsun #else
    286    0   yongsun             struct TAnony {
    287    0   yongsun                 unsigned NONUSE:16;
    288    0   yongsun                 unsigned m_hi  : 2;
    289    0   yongsun                 unsigned m_lo  :14;
    290    0   yongsun             } anony;
    291    0   yongsun #endif
    292    0   yongsun         };
    293    0   yongsun         */
    294    0   yongsun     };
    295    0   yongsun 
    296    0   yongsun public:
    297    0   yongsun     CThreadSlm()
    298   34  tchaikov         : m_N(0), m_UseLogPr(0), m_Levels(NULL), m_LevelSizes(NULL),
    299   34  tchaikov           m_bowTable(NULL), m_prTable(NULL), m_bMMap(false), m_buf(NULL) { }
    300    0   yongsun 
    301    0   yongsun     ~CThreadSlm() { free(); }
    302    0   yongsun 
    303    0   yongsun     bool
    304    0   yongsun     load(const char* fname, bool MMap=false);
    305    0   yongsun 
    306  198  tchaikov     unsigned
    307  198  tchaikov     isUseLogPr() const
    308    0   yongsun     { return m_UseLogPr; }
    309    0   yongsun 
    310    0   yongsun     void
    311    0   yongsun     free();
    312    0   yongsun 
    313    0   yongsun     double
    314    0   yongsun     transferNegLog(TState history, unsigned int wid, TState& result);
    315    0   yongsun 
    316    0   yongsun     double
    317    0   yongsun     transfer(TState history, unsigned int wid, TState& result);
    318    0   yongsun 
    319    0   yongsun     TState
    320    0   yongsun     history_state_of(TState st);
    321    0   yongsun 
    322    0   yongsun     TState&
    323    0   yongsun     historify(TState& st);
    324    0   yongsun 
    325    0   yongsun     unsigned int
    326    0   yongsun     lastWordId(TState st);
    327    0   yongsun 
    328    0   yongsun protected:
    329    0   yongsun     double
    330    0   yongsun     rawTransfer(TState history, unsigned int wid, TState& result);
    331    0   yongsun 
    332    0   yongsun protected:
    333    0   yongsun     typedef  void*   PtrVoid;
    334    0   yongsun 
    335    0   yongsun     unsigned  m_N;
    336    0   yongsun     unsigned  m_UseLogPr;
    337    0   yongsun     void    **m_Levels;
    338    0   yongsun     unsigned *m_LevelSizes;
    339    0   yongsun     float    *m_bowTable;
    340    0   yongsun     float    *m_prTable;
    341    0   yongsun 
    342    0   yongsun private:
    343  198  tchaikov     ssize_t   m_bufSize;
    344    0   yongsun     bool      m_bMMap;
    345    0   yongsun     char     *m_buf;
    346    0   yongsun };
    347    0   yongsun 
    348    0   yongsun #endif
    349