Home | History | Annotate | Download | only in ids2ngram
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_ASSERT_H
#include <assert.h>
#endif

#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <map>
#include <vector>

#include "../sim_fmerge.h"
#include "idngram.h"
#include "idngram_merge.h"
     58 
     59 template<int N>
     60 void WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map)
     61 {
     62 	typedef typename std::map<CSIM_Idngram<N>,unsigned int>::iterator TMapIterator;
     63 	TMapIterator its=map.begin(), ite=map.end();
     64 	for (; its != ite; ++its) {
     65 		fwrite(its->first.ids, sizeof(TSIMWordId), N, out);
     66 		fwrite(&(its->second), sizeof(unsigned int), 1, out);
     67 	}
     68 	map.clear();
     69 }
     70 
     71 template<int N>
     72 void ProcessingRead(FILE *fp, FILE* swap, std::vector<long>& para_offsets, size_t paraMax)
     73 {
     74 	typedef CSIM_Idngram<N> TNgram;
     75 	typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap;
     76 
     77 	TMap map;
     78 	TNgram ngram;
     79 
     80 	TSIMWordId* ids = ngram.ids;
     81 	fread(ids, sizeof(TSIMWordId), N-1, fp);
     82 	while (fread(ids+N-1, sizeof(TSIMWordId), 1, fp) == 1) {
     83 		++map[ngram];
     84 		if (map.size() >= paraMax)
     85 		{
     86 			printf("."); fflush(stdout);
     87 			WriteOut(swap, map);
     88 			para_offsets.push_back(ftell(swap));
     89 		}
     90 		for (int i=0; i<N-1; ++i) ids[i] = ids[i+1];
     91 	}
     92 	if (map.size() > 0) {
     93 		printf("."); fflush(stdout);
     94 		WriteOut(swap, map);
     95 		para_offsets.push_back(ftell(swap));
     96 	}
     97 }
     98 
     99 static struct option long_options[] =
    100 {
    101 	{"NMax", 1, 0, 'n'},
    102 	{"out", 1, 0, 'o'},
    103 	{"swap", 1, 0, 's'},
    104 	{"para", 1, 0, 'p'},
    105 	{0, 0, 0, 0}
    106 };
    107 
    108 static int N=0;
    109 static int paraMax=0;
    110 static char* output=NULL;
    111 static char* swapfile=NULL;
    112 
    113 void ShowUsage()
    114 {
    115 	printf("Usage:\n\tids2ngram options idsfile[ idsfile...]\n");
    116 	printf("\nDescription\n");
    117 	printf("   This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n");
    118 	printf("\nInput:\n");
    119 	printf("\tBinary id stream files looks like [id0,...,idX]\n");
    120 	printf("\nOptions:\n");
    121 	printf("\t  -n N               # N-gram\n");
    122 	printf("\t  -s swapfile        # intermedia temporary file\n");
    123 	printf("\t  -o outputfile      # result idngram file [id1, ... idN, freq]*\n");
    124 	printf("\t  -p para_size       # maxium ngram-items per para\n");
    125 	printf("\nExample:\n");
    126 	printf("   Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n");
    127 	printf("\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n");
    128 	exit(100);
    129 }
    130 
    131 static void getParameters(int argc, char* const argv[])
    132 {
    133 	int option_index = 0;
    134 	int c;
    135 	while ((c=getopt_long(argc, argv, "p:n:s:o:", long_options, &option_index)) != -1)
    136 	{
    137 		switch (c) {
    138 		case 'n':
    139 			N = atoi(strdup(optarg));
    140 			break;
    141 		case 'p':
    142 			paraMax = atoi(strdup(optarg));
    143 			break;
    144 		case 'o':
    145 			output = strdup(optarg);
    146 			break;
    147 		case 's':
    148 			swapfile = strdup(optarg);
    149 			break;
    150 		default:
    151 			ShowUsage();
    152 		}
    153 	}
    154 	if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL)
    155 		ShowUsage();
    156 }
    157 
    158 static std::vector<long> para_offsets;
    159 
    160 int main(int argc, char* argv[])
    161 {
    162 	getParameters(argc, argv);
    163 	FILE *swap = fopen(swapfile, "wb+");
    164 	FILE *out = fopen(output, "wb+");
    165 	if (optind >= argc) ShowUsage();
    166    while (optind < argc) {
    167     	printf("Processing %s:", argv[optind]); fflush(stdout);
    168     	FILE *fp = fopen(argv[optind], "rb");
    169     	switch (N) {
    170     	case 1:
    171   			ProcessingRead<1>(fp, swap, para_offsets, paraMax);
    172   			break;
    173     	case 2:
    174   			ProcessingRead<2>(fp, swap, para_offsets, paraMax);
    175   			break;
    176     	case 3:
    177   			ProcessingRead<3>(fp, swap, para_offsets, paraMax);
    178   			break;
    179     	}
    180 		fclose(fp);
    181       printf ("\n"); fflush(stdout);
    182 		++optind;
    183    }
    184    printf("Merging..."); fflush(stdout);
    185 	switch (N) {
    186   	case 1:
    187 		ProcessingIdngramMerge<1>(swap, out, para_offsets);
    188 		break;
    189   	case 2:
    190 		ProcessingIdngramMerge<2>(swap, out, para_offsets);
    191 		break;
    192   	case 3:
    193 		ProcessingIdngramMerge<3>(swap, out, para_offsets);
    194 		break;
    195   	}
    196    printf ("Done\n"); fflush(stdout);
    197    fclose(out);
    198    fclose(swap);
    199   	return 0;
    200 }
    201 
    202