Home | History | Annotate | Download | only in csplit
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License, Version 1.0 only
      6  * (the "License").  You may not use this file except in compliance
      7  * with the License.
      8  *
      9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10  * or http://www.opensolaris.org/os/licensing.
     11  * See the License for the specific language governing permissions
     12  * and limitations under the License.
     13  *
     14  * When distributing Covered Code, include this CDDL HEADER in each
     15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16  * If applicable, add the following below this CDDL HEADER, with the
     17  * fields enclosed by brackets "[]" replaced with your own identifying
     18  * information: Portions Copyright [yyyy] [name of copyright owner]
     19  *
     20  * CDDL HEADER END
     21  */
     22 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     23 /*	  All Rights Reserved  	*/
     24 
     25 
     26 /*
     27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
     28  * Use is subject to license terms.
     29  */
     30 
     31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     32 
     33 /*
     34  * csplit - Context or line file splitter
     35  * Compile: cc -O -s -o csplit csplit.c
     36  */
     37 
     38 #include <stdio.h>
     39 #include <stdlib.h>
     40 #include <unistd.h>
     41 #include <string.h>
     42 #include <ctype.h>
     43 #include <errno.h>
     44 #include <limits.h>
     45 #include <regexpr.h>
     46 #include <signal.h>
     47 #include <locale.h>
     48 #include <libintl.h>
     49 
     50 #define	LAST	0LL
     51 #define	ERR	-1
     52 #define	FALSE	0
     53 #define	TRUE	1
     54 #define	EXPMODE	2
     55 #define	LINMODE	3
     56 #define	LINSIZ	LINE_MAX	/* POSIX.2 - read lines LINE_MAX long */
     57 
     58 	/* Globals */
     59 
     60 char linbuf[LINSIZ];		/* Input line buffer */
     61 char *expbuf;
     62 char tmpbuf[BUFSIZ];		/* Temporary buffer for stdin */
     63 char file[8192] = "xx";		/* File name buffer */
     64 char *targ;			/* Arg ptr for error messages */
     65 char *sptr;
     66 FILE *infile, *outfile;		/* I/O file streams */
     67 int silent, keep, create;	/* Flags: -s(ilent), -k(eep), (create) */
     68 int errflg;
     69 int fiwidth = 2;		/* file index width (output file names) */
     70 extern int optind;
     71 extern char *optarg;
     72 offset_t offset;		/* Regular expression offset value */
     73 offset_t curline;		/* Current line in input file */
     74 
     75 /*
     76  * These defines are needed for regexp handling(see regexp(7))
     77  */
     78 #define	PERROR(x)	fatal("%s: Illegal Regular Expression\n", targ);
     79 
     80 static int asc_to_ll(char *, long long *);
     81 static void closefile(void);
     82 static void fatal(char *, char *);
     83 static offset_t findline(char *, offset_t);
     84 static void flush(void);
     85 static FILE *getfile(void);
     86 static char *getline(int);
     87 static void line_arg(char *);
     88 static void num_arg(char *, int);
     89 static void re_arg(char *);
     90 static void sig(int);
     91 static void to_line(offset_t);
     92 static void usage(void);
     93 
     94 int
     95 main(int argc, char **argv)
     96 {
     97 	int ch, mode;
     98 	char *ptr;
     99 
    100 	(void) setlocale(LC_ALL, "");
    101 #if !defined(TEXT_DOMAIN)		/* Should be defined by cc -D */
    102 #define	TEXT_DOMAIN	"SYS_TEST"	/* Use this only if it weren't */
    103 #endif
    104 	(void) textdomain(TEXT_DOMAIN);
    105 
    106 	while ((ch = getopt(argc, argv, "skf:n:")) != EOF) {
    107 		switch (ch) {
    108 			case 'f':
    109 				(void) strcpy(file, optarg);
    110 				if ((ptr = strrchr(optarg, '/')) == NULL)
    111 					ptr = optarg;
    112 				else
    113 					ptr++;
    114 
    115 				break;
    116 			case 'n':		/* POSIX.2 */
    117 				for (ptr = optarg; *ptr != NULL; ptr++)
    118 					if (!isdigit((int)*ptr))
    119 						fatal("-n num\n", NULL);
    120 				fiwidth = atoi(optarg);
    121 				break;
    122 			case 'k':
    123 				keep++;
    124 				break;
    125 			case 's':
    126 				silent++;
    127 				break;
    128 			case '?':
    129 				errflg++;
    130 		}
    131 	}
    132 
    133 	argv = &argv[optind];
    134 	argc -= optind;
    135 	if (argc <= 1 || errflg)
    136 		usage();
    137 
    138 	if (strcmp(*argv, "-") == 0) {
    139 		infile = tmpfile();
    140 
    141 		while (fread(tmpbuf, 1, BUFSIZ, stdin) != 0) {
    142 			if (fwrite(tmpbuf, 1, BUFSIZ, infile) == 0)
    143 				if (errno == ENOSPC) {
    144 					(void) fprintf(stderr, "csplit: ");
    145 					(void) fprintf(stderr, gettext(
    146 						"No space left on device\n"));
    147 					exit(1);
    148 				} else {
    149 					(void) fprintf(stderr, "csplit: ");
    150 					(void) fprintf(stderr, gettext(
    151 						"Bad write to temporary "
    152 							"file\n"));
    153 					exit(1);
    154 				}
    155 
    156 	/* clear the buffer to get correct size when writing buffer */
    157 
    158 			(void) memset(tmpbuf, '\0', sizeof (tmpbuf));
    159 		}
    160 		rewind(infile);
    161 	} else if ((infile = fopen(*argv, "r")) == NULL)
    162 		fatal("Cannot open %s\n", *argv);
    163 	++argv;
    164 	curline = (offset_t)1;
    165 	(void) signal(SIGINT, sig);
    166 
    167 	/*
    168 	 * The following for loop handles the different argument types.
    169 	 * A switch is performed on the first character of the argument
    170 	 * and each case calls the appropriate argument handling routine.
    171 	 */
    172 
    173 	for (; *argv; ++argv) {
    174 		targ = *argv;
    175 		switch (**argv) {
    176 		case '/':
    177 			mode = EXPMODE;
    178 			create = TRUE;
    179 			re_arg(*argv);
    180 			break;
    181 		case '%':
    182 			mode = EXPMODE;
    183 			create = FALSE;
    184 			re_arg(*argv);
    185 			break;
    186 		case '{':
    187 			num_arg(*argv, mode);
    188 			mode = FALSE;
    189 			break;
    190 		default:
    191 			mode = LINMODE;
    192 			create = TRUE;
    193 			line_arg(*argv);
    194 			break;
    195 		}
    196 	}
    197 	create = TRUE;
    198 	to_line(LAST);
    199 	return (0);
    200 }
    201 
    202 /*
    203  * asc_to_ll takes an ascii argument(str) and converts it to a long long(plc)
    204  * It returns ERR if an illegal character.  The reason that asc_to_ll
    205  * does not return an answer(long long) is that any value for the long
    206  * long is legal, and this version of asc_to_ll detects error strings.
    207  */
    208 
    209 static int
    210 asc_to_ll(char *str, long long *plc)
    211 {
    212 	int f;
    213 	*plc = 0;
    214 	f = 0;
    215 	for (; ; str++) {
    216 		switch (*str) {
    217 		case ' ':
    218 		case '\t':
    219 			continue;
    220 		case '-':
    221 			f++;
    222 			/* FALLTHROUGH */
    223 		case '+':
    224 			str++;
    225 		}
    226 		break;
    227 	}
    228 	for (; *str != NULL; str++)
    229 		if (*str >= '0' && *str <= '9')
    230 			*plc = *plc * 10 + *str - '0';
    231 		else
    232 			return (ERR);
    233 	if (f)
    234 		*plc = -(*plc);
    235 	return (TRUE);	/* not error */
    236 }
    237 
    238 /*
    239  * Closefile prints the byte count of the file created,(via fseeko
    240  * and ftello), if the create flag is on and the silent flag is not on.
    241  * If the create flag is on closefile then closes the file(fclose).
    242  */
    243 
    244 static void
    245 closefile()
    246 {
    247 	if (!silent && create) {
    248 		(void) fseeko(outfile, (offset_t)0, SEEK_END);
    249 		(void) fprintf(stdout, "%lld\n", (offset_t)ftello(outfile));
    250 	}
    251 	if (create)
    252 		(void) fclose(outfile);
    253 }
    254 
    255 /*
    256  * Fatal handles error messages and cleanup.
    257  * Because "arg" can be the global file, and the cleanup processing
    258  * uses the global file, the error message is printed first.  If the
    259  * "keep" flag is not set, fatal unlinks all created files.  If the
    260  * "keep" flag is set, fatal closes the current file(if there is one).
    261  * Fatal exits with a value of 1.
    262  */
    263 
    264 static void
    265 fatal(char *string, char *arg)
    266 {
    267 	char *fls;
    268 	int num;
    269 
    270 	(void) fprintf(stderr, "csplit: ");
    271 
    272 	/* gettext dynamically replaces string */
    273 
    274 	(void) fprintf(stderr, gettext(string), arg);
    275 	if (!keep) {
    276 		if (outfile) {
    277 			(void) fclose(outfile);
    278 			for (fls = file; *fls != '\0'; fls++)
    279 				continue;
    280 			fls -= fiwidth;
    281 			for (num = atoi(fls); num >= 0; num--) {
    282 				(void) sprintf(fls, "%.*d", fiwidth, num);
    283 				(void) unlink(file);
    284 			}
    285 		}
    286 	} else
    287 		if (outfile)
    288 			closefile();
    289 	exit(1);
    290 }
    291 
    292 /*
    293  * Findline returns the line number referenced by the current argument.
    294  * Its arguments are a pointer to the compiled regular expression(expr),
    295  * and an offset(oset).  The variable lncnt is used to count the number
    296  * of lines searched.  First the current stream location is saved via
    297  * ftello(), and getline is called so that R.E. searching starts at the
    298  * line after the previously referenced line.  The while loop checks
    299  * that there are more lines(error if none), bumps the line count, and
    300  * checks for the R.E. on each line.  If the R.E. matches on one of the
    301  * lines the old stream location is restored, and the line number
    302  * referenced by the R.E. and the offset is returned.
    303  */
    304 
    305 static offset_t
    306 findline(char *expr, offset_t oset)
    307 {
    308 	static int benhere = 0;
    309 	offset_t lncnt = 0, saveloc;
    310 
    311 	saveloc = ftello(infile);
    312 	if (curline != (offset_t)1 || benhere)	/* If first line, first time, */
    313 		(void) getline(FALSE);		/* then don't skip */
    314 	else
    315 		lncnt--;
    316 	benhere = 1;
    317 	while (getline(FALSE) != NULL) {
    318 		lncnt++;
    319 		if ((sptr = strrchr(linbuf, '\n')) != NULL)
    320 			*sptr = '\0';
    321 		if (step(linbuf, expr)) {
    322 			(void) fseeko(infile, (offset_t)saveloc, SEEK_SET);
    323 			return (curline+lncnt+oset);
    324 		}
    325 	}
    326 	(void) fseeko(infile, (offset_t)saveloc, SEEK_SET);
    327 	return (curline+lncnt+oset+2);
    328 }
    329 
    330 /*
    331  * Flush uses fputs to put lines on the output file stream(outfile)
    332  * Since fputs does its own buffering, flush doesn't need to.
    333  * Flush does nothing if the create flag is not set.
    334  */
    335 
    336 static void
    337 flush()
    338 {
    339 	if (create)
    340 		(void) fputs(linbuf, outfile);
    341 }
    342 
    343 /*
    344  * Getfile does nothing if the create flag is not set.  If the create
    345  * flag is set, getfile positions the file pointer(fptr) at the end of
    346  * the file name prefix on the first call(fptr=0).  The file counter is
    347  * stored in the file name and incremented.  If the subsequent fopen
    348  * fails, the file name is copied to tfile for the error message, the
    349  * previous file name is restored for cleanup, and fatal is called.  If
    350  * the fopen succeeds, the stream(opfil) is returned.
    351  */
    352 
    353 FILE *
    354 getfile()
    355 {
    356 	static char *fptr;
    357 	static int ctr;
    358 	FILE *opfil;
    359 	char tfile[15];
    360 	char *delim;
    361 	char savedelim;
    362 
    363 	if (create) {
    364 		if (fptr == 0)
    365 			for (fptr = file; *fptr != NULL; fptr++);
    366 		(void) sprintf(fptr, "%.*d", fiwidth, ctr++);
    367 
    368 		/* check for suffix length overflow */
    369 		if (strlen(fptr) > fiwidth) {
    370 			fatal("Suffix longer than %ld chars; increase -n\n",
    371 			    (char *)fiwidth);
    372 		}
    373 
    374 		/* check for filename length overflow */
    375 
    376 		delim = strrchr(file, '/');
    377 		if (delim == (char *)NULL) {
    378 			if (strlen(file) > pathconf(".", _PC_NAME_MAX)) {
    379 				fatal("Name too long: %s\n", file);
    380 			}
    381 		} else {
    382 			/* truncate file at pathname delim to do pathconf */
    383 			savedelim = *delim;
    384 			*delim = '\0';
    385 			/*
    386 			 * file: pppppppp\0fffff\0
    387 			 * ..... ^ file
    388 			 * ............. ^ delim
    389 			 */
    390 			if (strlen(delim + 1) > pathconf(file, _PC_NAME_MAX)) {
    391 				fatal("Name too long: %s\n", delim + 1);
    392 			}
    393 			*delim = savedelim;
    394 		}
    395 
    396 		if ((opfil = fopen(file, "w")) == NULL) {
    397 			(void) strcpy(tfile, file);
    398 			(void) sprintf(fptr, "%.*d", fiwidth, (ctr-2));
    399 			fatal("Cannot create %s\n", tfile);
    400 		}
    401 		return (opfil);
    402 	}
    403 	return (NULL);
    404 }
    405 
    406 /*
    407  * Getline gets a line via fgets from the input stream "infile".
    408  * The line is put into linbuf and may not be larger than LINSIZ.
    409  * If getline is called with a non-zero value, the current line
    410  * is bumped, otherwise it is not(for R.E. searching).
    411  */
    412 
    413 static char *
    414 getline(int bumpcur)
    415 {
    416 	char *ret;
    417 	if (bumpcur)
    418 		curline++;
    419 	ret = fgets(linbuf, LINSIZ, infile);
    420 	return (ret);
    421 }
    422 
    423 /*
    424  * Line_arg handles line number arguments.
    425  * line_arg takes as its argument a pointer to a character string
    426  * (assumed to be a line number).  If that character string can be
    427  * converted to a number(long long), to_line is called with that number,
    428  * otherwise error.
    429  */
    430 
    431 static void
    432 line_arg(char *line)
    433 {
    434 	long long to;
    435 
    436 	if (asc_to_ll(line, &to) == ERR)
    437 		fatal("%s: bad line number\n", line);
    438 	to_line(to);
    439 }
    440 
    441 /*
    442  * Num_arg handles repeat arguments.
    443  * Num_arg copies the numeric argument to "rep" (error if number is
    444  * larger than 20 characters or } is left off).  Num_arg then converts
    445  * the number and checks for validity.  Next num_arg checks the mode
    446  * of the previous argument, and applys the argument the correct number
    447  * of times. If the mode is not set properly its an error.
    448  */
    449 
    450 static void
    451 num_arg(char *arg, int md)
    452 {
    453 	offset_t repeat, toline;
    454 	char rep[21];
    455 	char *ptr;
    456 	int		len;
    457 
    458 	ptr = rep;
    459 	for (++arg; *arg != '}'; arg += len) {
    460 		if (*arg == NULL)
    461 			fatal("%s: missing '}'\n", targ);
    462 		if ((len = mblen(arg, MB_LEN_MAX)) <= 0)
    463 			len = 1;
    464 		if ((ptr + len) >= &rep[20])
    465 			fatal("%s: Repeat count too large\n", targ);
    466 		(void) memcpy(ptr, arg, len);
    467 		ptr += len;
    468 	}
    469 	*ptr = NULL;
    470 	if ((asc_to_ll(rep, &repeat) == ERR) || repeat < 0L)
    471 		fatal("Illegal repeat count: %s\n", targ);
    472 	if (md == LINMODE) {
    473 		toline = offset = curline;
    474 		for (; repeat > 0LL; repeat--) {
    475 			toline += offset;
    476 			to_line(toline);
    477 		}
    478 	} else	if (md == EXPMODE)
    479 			for (; repeat > 0LL; repeat--)
    480 				to_line(findline(expbuf, offset));
    481 		else
    482 			fatal("No operation for %s\n", targ);
    483 }
    484 
    485 /*
    486  * Re_arg handles regular expression arguments.
    487  * Re_arg takes a csplit regular expression argument.  It checks for
    488  * delimiter balance, computes any offset, and compiles the regular
    489  * expression.  Findline is called with the compiled expression and
    490  * offset, and returns the corresponding line number, which is used
    491  * as input to the to_line function.
    492  */
    493 
    494 static void
    495 re_arg(char *string)
    496 {
    497 	char *ptr;
    498 	char ch;
    499 	int		len;
    500 
    501 	ch = *string;
    502 	ptr = string;
    503 	ptr++;
    504 	while (*ptr != ch) {
    505 		if (*ptr == '\\')
    506 			++ptr;
    507 
    508 		if (*ptr == NULL)
    509 			fatal("%s: missing delimiter\n", targ);
    510 
    511 		if ((len = mblen(ptr, MB_LEN_MAX)) <= 0)
    512 			len = 1;
    513 		ptr += len;
    514 	}
    515 
    516 	/*
    517 	 * The line below was added because compile no longer supports
    518 	 * the fourth argument being passed.  The fourth argument used
    519 	 * to be '/' or '%'.
    520 	 */
    521 
    522 	*ptr = NULL;
    523 	if (asc_to_ll(++ptr, &offset) == ERR)
    524 		fatal("%s: illegal offset\n", string);
    525 
    526 	/*
    527 	 * The line below was added because INIT which did this for us
    528 	 * was removed from compile in regexp.h
    529 	 */
    530 
    531 	string++;
    532 	expbuf = compile(string, (char *)0, (char *)0);
    533 	if (regerrno)
    534 		PERROR(regerrno);
    535 	to_line(findline(expbuf, offset));
    536 }
    537 
    538 /*
    539  * Sig handles breaks.  When a break occurs the signal is reset,
    540  * and fatal is called to clean up and print the argument which
    541  * was being processed at the time the interrupt occured.
    542  */
    543 
    544 /* ARGSUSED */
    545 static void
    546 sig(int s)
    547 {
    548 	(void) signal(SIGINT, sig);
    549 	fatal("Interrupt - program aborted at arg '%s'\n", targ);
    550 }
    551 
    552 /*
    553  * To_line creates split files.
    554  * To_line gets as its argument the line which the current argument
    555  * referenced.  To_line calls getfile for a new output stream, which
    556  * does nothing if create is False.  If to_line's argument is not LAST
    557  * it checks that the current line is not greater than its argument.
    558  * While the current line is less than the desired line to_line gets
    559  * lines and flushes(error if EOF is reached).
    560  * If to_line's argument is LAST, it checks for more lines, and gets
    561  * and flushes lines till the end of file.
    562  * Finally, to_line calls closefile to close the output stream.
    563  */
    564 
    565 static void
    566 to_line(offset_t ln)
    567 {
    568 	outfile = getfile();
    569 	if (ln != LAST) {
    570 		if (curline > ln)
    571 			fatal("%s - out of range\n", targ);
    572 		while (curline < ln) {
    573 			if (getline(TRUE) == NULL)
    574 				fatal("%s - out of range\n", targ);
    575 			flush();
    576 		}
    577 	} else		/* last file */
    578 		if (getline(TRUE) != NULL) {
    579 			flush();
    580 			for (;;) {
    581 				if (getline(TRUE) == NULL)
    582 					break;
    583 				flush();
    584 			}
    585 		} else
    586 			fatal("%s - out of range\n", targ);
    587 	closefile();
    588 }
    589 
    590 static void
    591 usage()
    592 {
    593 	(void) fprintf(stderr, gettext(
    594 		"usage: csplit [-ks] [-f prefix] [-n number] "
    595 			"file arg1 ...argn\n"));
    596 	exit(1);
    597 }
    598