1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include "config.h" 40 0 yongsun #endif 41 0 yongsun 42 0 yongsun #ifdef HAVE_ASSERT_H 43 0 yongsun #include <assert.h> 44 0 yongsun #endif 45 0 yongsun 46 0 yongsun #ifdef HAVE_GETOPT_H 47 0 yongsun #include <getopt.h> 48 0 yongsun #endif 49 0 yongsun 50 0 yongsun #include <stdio.h> 51 0 yongsun #include <map> 52 0 yongsun #include <vector> 53 0 yongsun #include <algorithm> 54 0 yongsun 55 0 yongsun #include "../sim_fmerge.h" 56 0 yongsun #include "idngram.h" 57 0 yongsun #include "idngram_merge.h" 58 0 yongsun 59 0 yongsun template<int N> 60 0 yongsun void WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map) 61 0 yongsun { 62 0 yongsun typedef typename std::map<CSIM_Idngram<N>,unsigned int>::iterator TMapIterator; 63 0 yongsun TMapIterator its=map.begin(), ite=map.end(); 64 0 yongsun for (; its != ite; ++its) { 65 0 yongsun fwrite(its->first.ids, sizeof(TSIMWordId), N, out); 66 0 yongsun fwrite(&(its->second), sizeof(unsigned int), 1, out); 67 0 yongsun } 68 0 yongsun map.clear(); 69 0 yongsun } 70 0 yongsun 71 0 yongsun template<int N> 72 0 yongsun void ProcessingRead(FILE *fp, FILE* swap, std::vector<long>& para_offsets, size_t paraMax) 73 0 yongsun { 74 0 yongsun typedef CSIM_Idngram<N> TNgram; 75 0 yongsun typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap; 76 0 yongsun 77 0 yongsun TMap map; 78 0 yongsun TNgram ngram; 79 0 yongsun 80 0 yongsun TSIMWordId* ids = ngram.ids; 81 0 yongsun fread(ids, sizeof(TSIMWordId), N-1, fp); 82 0 yongsun while (fread(ids+N-1, sizeof(TSIMWordId), 1, fp) == 1) { 83 0 yongsun ++map[ngram]; 84 0 yongsun if (map.size() >= paraMax) 85 0 yongsun { 86 0 yongsun printf("."); fflush(stdout); 87 0 yongsun WriteOut(swap, map); 88 0 yongsun para_offsets.push_back(ftell(swap)); 89 0 yongsun } 90 0 yongsun for (int i=0; i<N-1; ++i) ids[i] = ids[i+1]; 91 0 yongsun } 92 0 yongsun if (map.size() > 0) { 93 0 yongsun printf("."); fflush(stdout); 94 0 yongsun WriteOut(swap, map); 95 0 yongsun para_offsets.push_back(ftell(swap)); 96 0 yongsun } 97 0 yongsun } 98 0 yongsun 99 0 yongsun static struct option long_options[] = 100 0 yongsun { 101 0 yongsun {"NMax", 1, 0, 'n'}, 102 0 yongsun {"out", 1, 0, 'o'}, 103 0 yongsun {"swap", 1, 0, 's'}, 104 0 yongsun {"para", 1, 0, 'p'}, 105 0 yongsun {0, 0, 0, 0} 106 0 yongsun }; 107 0 yongsun 108 0 yongsun static int N=0; 109 0 yongsun static int paraMax=0; 110 0 yongsun static char* output=NULL; 111 0 yongsun static char* swapfile=NULL; 112 0 yongsun 113 0 yongsun void ShowUsage() 114 0 yongsun { 115 209 tchaikov printf("Usage:\n\tids2ngram options idsfile[ idsfile...]\n"); 116 0 yongsun printf("\nDescription\n"); 117 0 yongsun printf(" This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n"); 118 0 yongsun printf("\nInput:\n"); 119 0 yongsun printf("\tBinary id stream files looks like [id0,...,idX]\n"); 120 0 yongsun printf("\nOptions:\n"); 121 0 yongsun printf("\t -n N # N-gram\n"); 122 0 yongsun printf("\t -s swapfile # intermedia temporary file\n"); 123 0 yongsun printf("\t -o outputfile # result idngram file [id1, ... idN, freq]*\n"); 124 0 yongsun printf("\t -p para_size # maxium ngram-items per para\n"); 125 0 yongsun printf("\nExample:\n"); 126 0 yongsun printf(" Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n"); 127 0 yongsun printf("\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n"); 128 0 yongsun exit(100); 129 0 yongsun } 130 0 yongsun 131 0 yongsun static void getParameters(int argc, char* const argv[]) 132 0 yongsun { 133 0 yongsun int option_index = 0; 134 0 yongsun int c; 135 0 yongsun while ((c=getopt_long(argc, argv, "p:n:s:o:", long_options, &option_index)) != -1) 136 0 yongsun { 137 0 yongsun switch (c) { 138 0 yongsun case 'n': 139 0 yongsun N = atoi(strdup(optarg)); 140 0 yongsun break; 141 0 yongsun case 'p': 142 0 yongsun paraMax = atoi(strdup(optarg)); 143 0 yongsun break; 144 0 yongsun case 'o': 145 0 yongsun output = strdup(optarg); 146 0 yongsun break; 147 0 yongsun case 's': 148 0 yongsun swapfile = strdup(optarg); 149 0 yongsun break; 150 0 yongsun default: 151 0 yongsun ShowUsage(); 152 0 yongsun } 153 0 yongsun } 154 0 yongsun if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL) 155 0 yongsun ShowUsage(); 156 0 yongsun } 157 0 yongsun 158 0 yongsun static std::vector<long> para_offsets; 159 0 yongsun 160 0 yongsun int main(int argc, char* argv[]) 161 0 yongsun { 162 0 yongsun getParameters(argc, argv); 163 0 yongsun FILE *swap = fopen(swapfile, "wb+"); 164 0 yongsun FILE *out = fopen(output, "wb+"); 165 0 yongsun if (optind >= argc) ShowUsage(); 166 0 yongsun while (optind < argc) { 167 0 yongsun printf("Processing %s:", argv[optind]); fflush(stdout); 168 0 yongsun FILE *fp = fopen(argv[optind], "rb"); 169 0 yongsun switch (N) { 170 0 yongsun case 1: 171 0 yongsun ProcessingRead<1>(fp, swap, para_offsets, paraMax); 172 0 yongsun break; 173 0 yongsun case 2: 174 0 yongsun ProcessingRead<2>(fp, swap, para_offsets, paraMax); 175 0 yongsun break; 176 0 yongsun case 3: 177 0 yongsun ProcessingRead<3>(fp, swap, para_offsets, paraMax); 178 0 yongsun break; 179 0 yongsun } 180 0 yongsun fclose(fp); 181 0 yongsun printf ("\n"); fflush(stdout); 182 0 yongsun ++optind; 183 0 yongsun } 184 0 yongsun printf("Merging..."); fflush(stdout); 185 0 yongsun switch (N) { 186 0 yongsun case 1: 187 0 yongsun ProcessingIdngramMerge<1>(swap, out, para_offsets); 188 0 yongsun break; 189 0 yongsun case 2: 190 0 yongsun ProcessingIdngramMerge<2>(swap, out, para_offsets); 191 0 yongsun break; 192 0 yongsun case 3: 193 0 yongsun ProcessingIdngramMerge<3>(swap, out, para_offsets); 194 0 yongsun break; 195 0 yongsun } 196 0 yongsun printf ("Done\n"); fflush(stdout); 197 0 yongsun fclose(out); 198 0 yongsun fclose(swap); 199 0 yongsun return 0; 200 0 yongsun } 201 0 yongsun 202