Home | History | Annotate | Download | only in apache
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the License).
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/CDDL.txt
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/CDDL.txt.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets [] replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  *
     26  * apache.c - Common utilities for highly available apache
     27  */
     28 
     29 #pragma ident	"@(#)apache.c	1.66	07/06/06 SMI"
     30 
     31 #include <stdio.h>
     32 #include <stdlib.h>
     33 #include <strings.h>
     34 #include <unistd.h>
     35 #include <libintl.h>
     36 #include <errno.h>
     37 #include <sys/stat.h>
     38 #include <sys/wait.h>
     39 #include <rgm/libdsdev.h>
     40 #include <ds_common.h>
     41 #include "apache.h"
     42 
     43 static int get_apachectl_value(const char *ctlfile, const char *var,
     44     char *value);
     45 static void remove_pidfile(scds_handle_t scds_handle);
     46 static int validate_monitor_uri_list(scds_handle_t scds_handle,
     47     scds_net_resource_list_t *snrlp, boolean_t print_messages);
     48 static int validate_server_url(char *uri, scds_net_resource_list_t *snrlp,
     49     boolean_t print_messages);
     50 
     51 typedef struct run_cmd_return_struct {
     52 	/* B_TRUE if system() returns -1, meaning fork or exec failed */
     53 	boolean_t system_failed;
     54 	/*
     55 	 * B_TRUE if cmd was interrupted by a signal, i.e. if
     56 	 * WIFSIGNALED() is true.
     57 	 */
     58 	boolean_t cmd_did_not_finish;
     59 
     60 	/* Return code from system() */
     61 	int rc_system;
     62 	/*
     63 	 * Return code from cmd, i.e. WEXITSTATUS(system(run_cmd))
     64 	 * Value not valid if system_failed == B_TRUE or
     65 	 * if cmd_did_not_finish == B_TRUE
     66 	 */
     67 	int rc_cmd;
     68 } run_cmd_return_t;
     69 
     70 /*
     71  * run_cmd() runs the command requested, redirecting stdout and stderr to the
     72  * requested place(s). A debug message is issued, if requested before
     73  * attempting to run the command.  See typedef definitions of run_cmd_args_t
     74  * and run_cmd_return_t for explanation of args to run_cmd() and return
     75  * information from run_cmd().
     76  *
     77  * Parameters:
     78  *   arg_cmd_to_run - Command run_cmd() will run
     79  *   arg_log_error  - whether or not to issue scds_syslog () messages
     80  */
     81 
     82 void run_cmd(char *arg_cmd_to_run, boolean_t arg_log_error,
     83 	run_cmd_return_t *argp_rc, boolean_t print_messages)
     84 {
     85 	/* Declare return code for internal functions */
     86 	int rc;
     87 
     88 	/* Initialize return info */
     89 	argp_rc->system_failed = B_FALSE;
     90 
     91 	argp_rc->cmd_did_not_finish = B_FALSE;
     92 	argp_rc->rc_system = 0;
     93 	argp_rc->rc_cmd = 0;
     94 
     95 	/* Run the command */
     96 	rc = system(arg_cmd_to_run);
     97 	/* Set return value. */
     98 	argp_rc->rc_system = rc;
     99 
    100 	/* If system() has an error fork-ing or exec-ing... */
    101 	if (rc == -1) {
    102 		/* ...set return value and... */
    103 		argp_rc->system_failed = B_TRUE;
    104 		/* ...say so, if we're logging messages */
    105 		rc = errno; /*lint !e746 */
    106 		if (arg_log_error == B_TRUE) {
    107 			/*
    108 			 * SCMSGS
    109 			 * @explanation
    110 			 * Failure in executing the command.
    111 			 * @user_action
    112 			 * Check the syslog message for the command
    113 			 * description. Check whether the system is low in
    114 			 * memory or the process table is full and take
    115 			 * appropriate action. Make sure that the executable
    116 			 * exists.
    117 			 */
    118 			scds_syslog(LOG_ERR,
    119 				"Cannot Execute %s: %s.",
    120 				arg_cmd_to_run,
    121 				strerror(rc));
    122 		}
    123 		if (print_messages) {
    124 			(void) fprintf(stderr, gettext("Cannot Execute %s: "
    125 				"%s.\n"), arg_cmd_to_run,
    126 				gettext(strerror(rc)));
    127 		}
    128 	/* If system() ran OK... */
    129 	} else {
    130 		/* If we didn't complete due to being signaled... */
    131 		if (WIFSIGNALED((uint_t)rc)) {
    132 			/* ...set return value and... */
    133 			argp_rc->cmd_did_not_finish = B_TRUE;
    134 			/* ...say so, if we're logging messages */
    135 			if (arg_log_error == B_TRUE) {
    136 				/*
    137 				 * SCMSGS
    138 				 * @explanation
    139 				 * Need explanation of this message!
    140 				 * @user_action
    141 				 * Need a user action for this message.
    142 				 */
    143 				scds_syslog(LOG_ERR,
    144 					"%s failed to complete.",
    145 					arg_cmd_to_run);
    146 			}
    147 			if (print_messages) {
    148 				(void) fprintf(stderr, gettext("%s failed to "
    149 					"complete.\n"), arg_cmd_to_run);
    150 			}
    151 		/* If we completed... */
    152 		} else {
    153 			/* ...get arg_cmd_to_run's exit status */
    154 			argp_rc->rc_cmd = WEXITSTATUS((uint_t)rc);
    155 		}
    156 	}
    157 }
    158 
    159 
/*
 * The initial wait allowed for the apache dataservice to be fully up
 * and running. We wait for SVC_WAIT_PCT percent (2%) of the
 * start_timeout time before probing the service.
 */
    165 #define	SVC_WAIT_PCT		2
    166 
    167 /*
    168  * We need to use 95% of probe_timeout to connect to the port and the
    169  * remaining time is used to disconnect from port in the svc_probe function.
    170  */
    171 #define	SVC_CONNECT_TIMEOUT_PCT		95
    172 
/*
 * We need to wait for SVC_WAIT_TIME (5 seconds) for pmf
 * to send the failure message before probing the service.
 */
    177 
    178 #define	SVC_WAIT_TIME		5
    179 
    180 /*
    181  * This value will be used as disconnect timeout, if there is no
    182  * time left from the probe_timeout.
    183  */
    184 
    185 #define	SVC_DISCONNECT_TIMEOUT_SECONDS		2
    186 
    187 /*
    188  * This variable will be set by svc_validate and will also be
    189  * used by svc_start.
    190  */
    191 
    192 static char binpath[SCDS_CMD_SIZE] = "";
    193 
    194 /*
    195  * svc_validate():
    196  * Do apache specific validation of the resource configration.
    197  * Called by start/validate/update/monitor methods.
    198  * Return 0 on success, > 0 on failures.
    199  *
    200  * svc_validate will check for the following
    201  * 1. Bin_dir
    202  * 2. Executable permissions, if filesystem is mounted on this node
    203  * 3. Parse httpd.conf file
    204  * 4. Port_list
    205  * 5. Logical hostname resources
    206  * 6. Extension properties
    207  */
    208 
    209 int
    210 svc_validate(scds_handle_t scds_handle, boolean_t print_messages)
    211 {
    212 	struct stat statbuf;
    213 	char apache_cmd[SCDS_ARRAY_SIZE];
    214 	char cmd_buffer[SCDS_ARRAY_SIZE];
    215 	run_cmd_return_t rcrc;
    216 	int err = 0, rc = 0, i;
    217 	scds_net_resource_list_t *snrlp = NULL;
    218 	scds_port_list_t *portlist = NULL;
    219 	scha_extprop_value_t *bindir = NULL;
    220 
    221 	scds_hasp_status_t	hasp_status;
    222 	/* default is to perform all fs related checks */
    223 	boolean_t do_file_checks = B_TRUE;
    224 
    225 	/*
    226 	 * apachectl is the control file for "regular" apache and
    227 	 * for mod_ssl+apache. apache-ssl uses httpsdctl instead.
    228 	 * these are the *ctls we can work with, NULL marks end of list
    229 	 */
    230 	char *xctl[] = {"httpsdctl", "apachectl", NULL};
    231 
    232 	(void) scds_get_ext_property(scds_handle, "Bin_dir",
    233 		SCHA_PTYPE_STRING, &bindir);
    234 	/* Check that the bindir or bindir string is not NULL */
    235 	if (bindir == NULL || bindir->val.val_str == NULL) {
    236 		/*
    237 		 * SCMSGS
    238 		 * @explanation
    239 		 * The property has not been set by the user and must be.
    240 		 * @user_action
    241 		 * Reissue the scrgadm command with the required property and
    242 		 * value.
    243 		 */
    244 		scds_syslog(LOG_ERR, "Property %s is not set.", "Bin_dir");
    245 		if (print_messages) {
    246 			(void) fprintf(stderr, gettext("Property %s is not "
    247 				"set.\n"), "Bin_dir");
    248 		}
    249 		return (1);
    250 	} else {
    251 		/* Copy Bin_dir path to static global buffer */
    252 		if (strlcpy(binpath, bindir->val.val_str, sizeof (binpath))
    253 			>= sizeof (binpath)) {
    254 			/*
    255 			 * SCMSGS
    256 			 * @explanation
    257 			 * An internal error has occurred.
    258 			 * @user_action
    259 			 * Save a copy of the /var/adm/messages files on all
    260 			 * nodes. Contact your authorized Sun service provider
    261 			 * for assistance in diagnosing the problem.
    262 			 */
    263 			scds_syslog(LOG_ERR, "INTERNAL ERROR: %s.",
    264 				"Insufficient space in buffer");
    265 			if (print_messages) {
    266 				(void) fprintf(stderr, gettext("INTERNAL "
    267 					"ERROR: %s.\n"), gettext("Insufficient "
    268 					"space in buffer"));
    269 			}
    270 			return (1);
    271 		}
    272 	}
    273 
    274 	/* check for HAStoragePlus resources */
    275 	rc = scds_hasp_check(scds_handle, &hasp_status);
    276 	if (rc != SCHA_ERR_NOERR) {
    277 		/* scha_hasp_check() logs everytime it fails */
    278 		if (print_messages) {
    279 			(void) fprintf(stderr, gettext("INTERNAL ERROR: %s.\n"),
    280 				gettext("scds_hasp_check failed"));
    281 		}
    282 		return (1);
    283 	}
    284 
    285 	if (hasp_status == SCDS_HASP_NOT_ONLINE) {
    286 		/*
    287 		 * SCMSGS
    288 		 * @explanation
    289 		 * The resource depends on a SUNW.HAStoragePlus resource that
    290 		 * is not online on any cluster node.
    291 		 * @user_action
    292 		 * Bring all SUNW.HAStoragePlus resources, that this HA-NFS
    293 		 * resource depends on, online before performing the operation
    294 		 * that caused this error.
    295 		 */
    296 		scds_syslog(LOG_ERR, "Resource depends on a "
    297 			"SUNW.HAStoragePlus type resource that is "
    298 			"not online anywhere.");
    299 		if (print_messages) {
    300 			(void) fprintf(stderr, gettext("Resource depends on a "
    301 				"SUNW.HAStoragePlus type resource that is "
    302 				"not online anywhere.\n"));
    303 		}
    304 		return (1);
    305 	} else if (hasp_status == SCDS_HASP_ERR_CONFIG) {
    306 		/* problem syslogged by scds_hasp_check */
    307 		if (print_messages) {
    308 			(void) fprintf(stderr, gettext("This resource depends "
    309 				"on a HAStoragePlus resouce that is in a "
    310 				"different Resource Group. This configuration "
    311 				"is not supported.\n"));
    312 		}
    313 		return (1);
    314 	}
    315 
    316 	/* zero out the contents of statbuf, helps avoid lint lint 644 */
    317 	bzero(&statbuf, sizeof (statbuf));
    318 
    319 	/*
    320 	 * We need to work with either httpsdctl or apachectl,
    321 	 * cant call webserver_type() to decide because it might
    322 	 * falsely return REGULAR for some HAStoragePlus configs
    323 	 */
    324 	for (i = 0; xctl[i] != NULL; i++) {
    325 		(void) snprintf(apache_cmd, sizeof (apache_cmd), "%s/%s",
    326 				binpath, xctl[i]);
    327 		if (stat(apache_cmd, &statbuf) != 0) {
    328 			if (errno == ENOENT) {
    329 				/* no cause for alarm, check next in list */
    330 				continue;
    331 			} else {
    332 				/*
    333 				 * failed for this ctl, and *not* with
    334 				 * ENOENT. This cant be good...
    335 				 */
    336 				rc = errno;
    337 				/*
    338 				 * SCMSGS
    339 				 * @explanation
    340 				 * The start script is not accessible and
    341 				 * executable. This may be due to the script
    342 				 * not existing or the permissions not being
    343 				 * set properly.
    344 				 * @user_action
    345 				 * Make sure the script exists, is in the
    346 				 * proper directory, and has read nd execute
    347 				 * permissions set appropriately.
    348 				 */
    349 				scds_syslog(LOG_ERR,
    350 					"Cannot access start script %s: %s",
    351 					apache_cmd, strerror(rc));
    352 				if (print_messages) {
    353 					(void) fprintf(stderr,
    354 						gettext("Cannot access start "
    355 						"script %s: %s\n"), apache_cmd,
    356 						gettext(strerror(rc)));
    357 				}
    358 				return (1);
    359 			}
    360 		} else {
    361 			/* stat worked: found a ctl, no need to look further */
    362 			do_file_checks = B_TRUE;
    363 			break;
    364 		}
    365 	}
    366 
    367 	/* if xtl[i] is NULL then all stats failed with ENOENT */
    368 	if (xctl[i] == NULL) {
    369 		if (hasp_status == SCDS_HASP_ONLINE_NOT_LOCAL) {
    370 			/* Bin_dir is on a failover fs thats not here */
    371 			do_file_checks = B_FALSE;
    372 		} else {
    373 			/*
    374 			 * all ENOENTs when there should have been one
    375 			 * ctl here!! Print an error message for the
    376 			 * last one: that should be apachectl
    377 			 */
    378 			scds_syslog(LOG_ERR,
    379 					"Cannot access start script %s: %s",
    380 					apache_cmd, strerror(ENOENT));
    381 			if (print_messages) {
    382 				(void) fprintf(stderr, gettext("Cannot access "
    383 					"start script %s: %s\n"), apache_cmd,
    384 					gettext(strerror(ENOENT)));
    385 			}
    386 			return (1);
    387 		}
    388 	}
    389 
    390 	/*
    391 	 * apache_cmd contains the right *ctl, statbuf has the stat()
    392 	 * results for that ctl and do_file_checks has been (re)set,
    393 	 * everything is fine at this point.
    394 	 */
    395 	if (do_file_checks) {
    396 		/* check that the binary is executable */
    397 		if (!(statbuf.st_mode & S_IXUSR)) {
    398 			/*
    399 			 * SCMSGS
    400 			 * @explanation
    401 			 * This file does not have the expected default
    402 			 * execute permissions.
    403 			 * @user_action
    404 			 * Reset the permissions to allow execute permissions
    405 			 * using the chmod command.
    406 			 */
    407 			scds_syslog(LOG_ERR,
    408 				"Incorrect permissions set for %s.",
    409 				apache_cmd);
    410 			if (print_messages) {
    411 				(void) fprintf(stderr, gettext("Incorrect "
    412 					"permissions set for %s.\n"),
    413 					apache_cmd);
    414 			}
    415 			return (1);
    416 		}
    417 
    418 		/* Run apachectl configtest if everything is available here */
    419 		if ((hasp_status == SCDS_HASP_ONLINE_LOCAL) ||
    420 			(hasp_status == SCDS_HASP_NO_RESOURCE)) {
    421 			/* Assemble the command. */
    422 			rc = snprintf(cmd_buffer, sizeof (cmd_buffer),
    423 				"%s configtest >/dev/null 2>&1", apache_cmd);
    424 
    425 			/* If the snprintf has an error... */
    426 			if (rc < 0) {
    427 				/* ...syslog an error message */
    428 				char *internal_err_str = "String handling "
    429 					"error creating apachectl configtest "
    430 					"command";
    431 
    432 				scds_syslog(LOG_ERR, "INTERNAL ERROR: %s.",
    433 					internal_err_str);
    434 				if (print_messages) {
    435 					(void) fprintf(stderr,
    436 						gettext("INTERNAL ERROR: "
    437 						"%s.\n"),
    438 						gettext(internal_err_str));
    439 				}
    440 				return (1);
    441 			}
    442 
    443 			/* Issue debug message before starting. */
    444 			scds_syslog_debug(DBG_LEVEL_LOW, "Starting server to "
    445 				"check config file with apachectl configtest "
    446 				"command.");
    447 
    448 			/* Run "apachectl configtest" */
    449 			run_cmd(cmd_buffer, B_TRUE, &rcrc, print_messages);
    450 
    451 			/* If config file doesn't validate, return with error */
    452 			if (rcrc.system_failed || rcrc.cmd_did_not_finish) {
    453 				/*
    454 				 * If system() failed or command did not finish
    455 				 * (was interrupted), error message was already
    456 				 * issued
    457 				 */
    458 				return (1);
    459 			}
    460 			if (rcrc.rc_cmd != 0) {
    461 				/* non-zero return code means error */
    462 				/*
    463 				 * SCMSGS
    464 				 * @explanation
    465 				 * The command noted did not return the
    466 				 * expected value. Additional information may
    467 				 * be found in the error message after the
    468 				 * ":", or in subsequent messages in syslog.
    469 				 * @user_action
    470 				 * This message is issued from a general
    471 				 * purpose routine. Appropriate action may be
    472 				 * indicated by the additional information in
    473 				 * the message or in syslog.
    474 				 */
    475 				scds_syslog(LOG_ERR, "Command {%s} failed: %s.",
    476 					cmd_buffer, "httpd cannot parse "
    477 					"httpd.conf");
    478 				if (print_messages) {
    479 					(void) fprintf(stderr,
    480 						gettext("Command {%s} failed: "
    481 						"%s.\n"), cmd_buffer,
    482 						gettext("httpd cannot parse "
    483 						"httpd.conf"));
    484 				}
    485 				return (1);
    486 			}
    487 		}
    488 	} else {
    489 		/*
    490 		 * SCMSGS
    491 		 * @explanation
    492 		 * This resource will not perform some filesystem specific
    493 		 * checks (during VALIDATE or MONITOR_CHECK) on this node
    494 		 * because atleast one SUNW.HAStoragePlus resource that it
    495 		 * depends on is online on some other node.
    496 		 * @user_action
    497 		 * None.
    498 		 */
    499 		scds_syslog(LOG_INFO, "Skipping checks dependant on "
    500 			"HAStoragePlus resources on this node.");
    501 	} /* fs specific checks */
    502 
    503 	/* Network aware service should have at least one port specified */
    504 
    505 	err = scds_get_port_list(scds_handle, &portlist);
    506 	if (err != SCHA_ERR_NOERR) {
    507 		/*
    508 		 * SCMSGS
    509 		 * @explanation
    510 		 * API operation has failed in retrieving the cluster
    511 		 * property.
    512 		 * @user_action
    513 		 * For property name, check the syslog message. For more
    514 		 * details about API call failure, check the syslog messages
    515 		 * from other components.
    516 		 */
    517 		scds_syslog(LOG_ERR,
    518 			"Failed to retrieve the property %s: %s.",
    519 			"Port_list", scds_error_string(err));
    520 		if (print_messages) {
    521 			(void) fprintf(stderr, gettext("Failed to retrieve "
    522 				"the property %s: %s.\n"), "Port_list",
    523 				gettext(scds_error_string(err)));
    524 		}
    525 		goto finished_validate;
    526 	}
    527 
    528 	if (portlist == NULL || portlist->num_ports < 1) {
    529 		scds_syslog(LOG_ERR, "Property %s is not set.", "Port_list");
    530 		if (print_messages) {
    531 			(void) fprintf(stderr, gettext("Property %s is not "
    532 				"set.\n"), "Port_list");
    533 		}
    534 		err = 1;
    535 		goto finished_validate;
    536 	}
    537 
    538 	/*
    539 	 * Return an error if there is an error when trying to get the
    540 	 * available network address resources for this resource
    541 	 */
    542 	if ((err = scds_get_rs_hostnames(scds_handle, &snrlp))
    543 		!= SCHA_ERR_NOERR) {
    544 		/*
    545 		 * SCMSGS
    546 		 * @explanation
    547 		 * Error trying to retrieve network address associated with a
    548 		 * resource.
    549 		 * @user_action
    550 		 * For a failover data service, add a network address resource
    551 		 * to the resource group. For a scalable data service, add a
    552 		 * network resource to the resource group referenced by the
    553 		 * RG_dependencies property.
    554 		 */
    555 		scds_syslog(LOG_ERR,
    556 			"Error in trying to access the configured network "
    557 			"resources : %s.", scds_error_string(err));
    558 		if (print_messages) {
    559 			(void) fprintf(stderr, gettext("Error in trying to "
    560 				"access the configured network resources "
    561 				": %s.\n"), gettext(scds_error_string(err)));
    562 		}
    563 		goto finished_validate;
    564 	}
    565 
    566 	/* Return an error if there are no network address resources */
    567 	if (snrlp == NULL || snrlp->num_netresources == 0) {
    568 		/*
    569 		 * SCMSGS
    570 		 * @explanation
    571 		 * A resource has no associated network address.
    572 		 * @user_action
    573 		 * For a failover data service, add a network address resource
    574 		 * to the resource group. For a scalable data service, add a
    575 		 * network resource to the resource group referenced by the
    576 		 * RG_dependencies property.
    577 		 */
    578 		scds_syslog(LOG_ERR,
    579 			"No network address resource in resource group.");
    580 		if (print_messages) {
    581 			(void) fprintf(stderr, gettext("No network address "
    582 				"resource in resource group.\n"));
    583 		}
    584 		err = 1;
    585 		goto finished_validate;
    586 	}
    587 
    588 	/* Check to make sure other important extension props are set */
    589 	if (scds_get_ext_monitor_retry_count(scds_handle) <= 0) {
    590 		scds_syslog(LOG_ERR,
    591 			"Property %s is not set.",
    592 			"Monitor_retry_count");
    593 		if (print_messages) {
    594 			(void) fprintf(stderr, gettext("Property %s is "
    595 				"not set.\n"), "Monitor_retry_count");
    596 		}
    597 		err = 1; /* Validation Failure */
    598 		goto finished_validate;
    599 	}
    600 	if (scds_get_ext_monitor_retry_interval(scds_handle) <= 0) {
    601 		scds_syslog(LOG_ERR,
    602 			"Property %s is not set.",
    603 			"Monitor_retry_interval");
    604 		if (print_messages) {
    605 			(void) fprintf(stderr, gettext("Property %s is "
    606 				"not set.\n"), "Monitor_retry_interval");
    607 		}
    608 		err = 1; /* Validation Failure */
    609 		goto finished_validate;
    610 	}
    611 	if (scds_get_ext_probe_timeout(scds_handle) <= 0) {
    612 		scds_syslog(LOG_ERR,
    613 			"Property %s is not set.",
    614 			"Probe_timeout");
    615 		if (print_messages) {
    616 			(void) fprintf(stderr, gettext("Property %s is "
    617 				"not set.\n"), "Probe_timeout");
    618 		}
    619 		err = 1; /* Validation Failure */
    620 		goto finished_validate;
    621 	}
    622 
    623 	/*
    624 	 * Make sure that URIs provided (if any) look OK. Also makes sure
    625 	 * that all the Uris have hostnames that are in the list of network
    626 	 * resources used by the resource.
    627 	 */
    628 	if (validate_monitor_uri_list(scds_handle, snrlp, print_messages) !=
    629 	    0) {
    630 		err = 1;
    631 		goto finished_validate;
    632 	}
    633 
    634 	/* All validation checks were successful */
    635 	/*
    636 	 * SCMSGS
    637 	 * @explanation
    638 	 * The validation of the configuration for the data service was
    639 	 * successful.
    640 	 * @user_action
    641 	 * None. This is only an informational message.
    642 	 */
    643 	scds_syslog(LOG_INFO, "Successful validation.");
    644 
    645 finished_validate:
    646 	if (snrlp)
    647 		scds_free_net_list(snrlp);
    648 	if (portlist)
    649 		scds_free_port_list(portlist);
    650 
    651 	return (err);
    652 }
    653 
    654 
    655 /*
    656  * svc_start():
    657  */
    658 
    659 int
    660 svc_start(scds_handle_t scds_handle)
    661 {
    662 	char	cmd[SCDS_CMD_SIZE];
    663 	char	*rsname = NULL, *rgname = NULL;
    664 	int	rc = SCHA_ERR_NOERR;
    665 
    666 	/* Get resource and resource group names */
    667 	rsname = (char *)scds_get_resource_name(scds_handle);
    668 	rgname = (char *)scds_get_resource_group_name(scds_handle);
    669 
    670 	switch (webserver_type(scds_handle)) {
    671 	case MOD_SSL: (void) snprintf(cmd, sizeof (cmd),
    672 			"%s/apachectl startssl", binpath);
    673 		break;
    674 	case APACHE_SSL: (void) snprintf(cmd, sizeof (cmd),
    675 			"%s/httpsdctl start", binpath);
    676 		break;
    677 	case REGULAR: (void) snprintf(cmd, sizeof (cmd),
    678 			"%s/apachectl start", binpath);
    679 		break;
    680 	case ST_ERROR: /* error already logged */
    681 	default: return (1);
    682 	}
    683 
    684 	/*
    685 	 * SCMSGS
    686 	 * @explanation
    687 	 * Sun Cluster is starting the application with the specified command.
    688 	 * @user_action
    689 	 * This is an informational message, no user action is needed.
    690 	 */
    691 	scds_syslog(LOG_NOTICE,
    692 		"Starting server with command %s.", cmd);
    693 
    694 	remove_pidfile(scds_handle);
    695 
    696 	rc = scds_pmf_start(scds_handle, SCDS_PMF_TYPE_SVC, 0, cmd, -1);
    697 
    698 	if (rc == SCHA_ERR_NOERR) {
    699 		/*
    700 		 * SCMSGS
    701 		 * @explanation
    702 		 * The resource successfully started the application.
    703 		 * @user_action
    704 		 * This message is informational; no user action is needed.
    705 		 */
    706 		scds_syslog(LOG_NOTICE,
    707 			"Start of %s completed successfully.", cmd);
    708 		(void) scha_resource_setstatus(rsname, rgname,
    709 			SCHA_RSSTATUS_OK,
    710 			"Completed successfully.");
    711 	} else {
    712 		char msg[SCDS_ARRAY_SIZE];
    713 
    714 		/*
    715 		 * SCMSGS
    716 		 * @explanation
    717 		 * Sun Cluster could not start the application. It would
    718 		 * attempt to start the service on another node if possible.
    719 		 * @user_action
    720 		 * 1) Check prior syslog messages for specific problems and
    721 		 * correct them.
    722 		 *
    723 		 * 2) This problem may occur when the cluster is under load
    724 		 * and Sun Cluster cannot start the application within the
    725 		 * timeout period specified. You may consider increasing the
    726 		 * Start_timeout property.
    727 		 *
    728 		 * 3) If the resource was unable to start on any node,
    729 		 * resource would be in START_FAILED state. In this case, use
    730 		 * scswitch to bring the resource ONLINE on this node.
    731 		 *
    732 		 * 4) If the service was successfully started on another node,
    733 		 * attempt to restart the service on this node using scswitch.
    734 		 *
    735 		 * 5) If the above steps do not help, disable the resource
    736 		 * using scswitch. Check to see that the application can run
    737 		 * outside of the Sun Cluster framework. If it cannot, fix any
    738 		 * problems specific to the application, until the application
    739 		 * can run outside of the Sun Cluster framework. Enable the
    740 		 * resource using scswitch. If the application runs outside of
    741 		 * the Sun Cluster framework but not in response to starting
    742 		 * the data service, contact your authorized Sun service
    743 		 * provider for assistance in diagnosing the problem.
    744 		 */
    745 		scds_syslog(LOG_ERR,
    746 			"Failed to start %s.", cmd);
    747 
    748 		(void) sprintf(msg, "Failed to start %s.", APP_NAME);
    749 		(void) scha_resource_setstatus(rsname, rgname,
    750 			SCHA_RSSTATUS_FAULTED, msg);
    751 	}
    752 
    753 	return (rc);
    754 }
    755 
    756 
    757 /*
    758  * svc_wait():
    759  *
    760  * wait for the data service to start up fully and make sure it is running
    761  * healthy
    762  */
    763 
    764 int
    765 svc_wait(scds_handle_t scds_handle)
    766 {
    767 	int		svc_start_timeout, probe_timeout,
    768 			probe_result, i;
    769 	scha_err_t	err = SCHA_ERR_NOERR;
    770 	scds_netaddr_list_t *netaddr = NULL;
    771 	scha_extprop_value_t *urilist = NULL;
    772 	scha_str_array_t *uris = NULL;
    773 
    774 	/* obtain the network resource to use for probing */
    775 	err = scds_get_netaddr_list(scds_handle, &netaddr);
    776 	if (err != SCHA_ERR_NOERR) {
    777 		scds_syslog(LOG_ERR,
    778 			"Error in trying to access the "
    779 			"configured network resources : %s.",
    780 			scds_error_string(err));
    781 		return (1);
    782 	}
    783 
    784 	/* Return an error if there are no network resources */
    785 	if (netaddr == NULL || netaddr->num_netaddrs == 0) {
    786 		scds_syslog(LOG_ERR,
    787 			"No network address resource in resource group.");
    788 		return (1);
    789 	}
    790 
    791 	/* Retrieve the list of uris that we were told to probe */
    792 	err = scds_get_ext_property(scds_handle, "Monitor_Uri_List",
    793 			SCHA_PTYPE_STRINGARRAY, &urilist);
    794 	if ((err != SCHA_ERR_NOERR) && (err != SCHA_ERR_PROP)) {
    795 		/* failed with something other than SCHA_ERR_PROP */
    796 		scds_syslog(LOG_ERR,
    797 				"Failed to retrieve the "
    798 				"property %s: %s.",
    799 				"Monitor_Uri_List", scds_error_string(err));
    800 		scds_free_netaddr_list(netaddr);
    801 		return (1);
    802 	}
    803 	if (urilist != NULL) {
    804 		uris = urilist->val.val_strarray;
    805 	}
    806 
    807 	/* Get the Start method timeout and the Probe timeout value */
    808 	svc_start_timeout = scds_get_rs_start_timeout(scds_handle);
    809 	probe_timeout = scds_get_ext_probe_timeout(scds_handle);
    810 
    811 	/*
    812 	 * sleep for SVC_WAIT_PCT percentage of start_timeout time
    813 	 * before actually probing the dataservice. This is to allow
    814 	 * the dataservice to be fully up inorder to reply to the
    815 	 * probe.
    816 	 */
    817 	if (scds_svc_wait(scds_handle,
    818 		(svc_start_timeout * SVC_WAIT_PCT)/100)
    819 		!= SCHA_ERR_NOERR) {
    820 		scds_syslog(LOG_ERR, "Failed to start %s.", APP_NAME);
    821 		scds_free_ext_property(urilist);
    822 		scds_free_netaddr_list(netaddr);
    823 		return (1);
    824 	}
    825 
    826 	do {
    827 		scds_syslog_debug(DBG_LEVEL_HIGH,
    828 			"Probing service, t = %d\n", time(NULL));
    829 
    830 		/*
    831 		 * Probe the data service on the logicalhostnames
    832 		 * and the ports.
    833 		 */
    834 		probe_result = 0;
    835 		for (i = 0; (i < netaddr->num_netaddrs) && (probe_result == 0);
    836 		    i++) {
    837 			probe_result = svc_probe(scds_handle,
    838 				netaddr->netaddrs[i].hostname,
    839 				netaddr->netaddrs[i].port_proto.port,
    840 				probe_timeout,
    841 				B_FALSE);
    842 		}
    843 		/* probe all uris (if any) that were supplied */
    844 		if (uris) {
    845 			for (i = 0; (i < (int)uris->array_cnt) &&
    846 			    (probe_result == 0); i++) {
    847 				probe_result = probe_uri(scds_handle,
    848 				    uris->str_array[i], probe_timeout, B_TRUE);
    849 			}
    850 		}
    851 
    852 		if (probe_result == SCHA_ERR_NOERR) {
    853 			/* Success. Free up resources and return */
    854 			scds_free_ext_property(urilist);
    855 			scds_free_netaddr_list(netaddr);
    856 			return (SCHA_ERR_NOERR);
    857 		}
    858 
    859 		/*
    860 		 * Dataservice is still trying to come up. Sleep for a while
    861 		 * before probing again.
    862 		 */
    863 		if (scds_svc_wait(scds_handle, SVC_WAIT_TIME)
    864 			!= SCHA_ERR_NOERR) {
    865 			scds_syslog(LOG_ERR,
    866 				"Failed to start %s.", APP_NAME);
    867 			scds_free_ext_property(urilist);
    868 			scds_free_netaddr_list(netaddr);
    869 			return (1);
    870 		}
    871 
    872 	/* We rely on RGM to timeout and terminate the program */
    873 	} while (1);
    874 }
    875 
    876 
    877 /*
    878  * Stop the apache server
    879  * Return 0 on success, > 0 on failures.
    880  */
    881 
    882 int
    883 svc_stop(scds_handle_t scds_handle)
    884 {
    885 	char	*rsname = NULL, *rgname = NULL;
    886 	char	msg[SCDS_ARRAY_SIZE];
    887 	int	rc;
    888 
    889 	/* Get resource and resource group names */
    890 	rsname = (char *)scds_get_resource_name(scds_handle);
    891 	rgname = (char *)scds_get_resource_group_name(scds_handle);
    892 
    893 	/*
    894 	 * SCMSGS
    895 	 * @explanation
    896 	 * Sun Cluster is stopping the specified application.
    897 	 * @user_action
    898 	 * This is an informational message, no user action is needed.
    899 	 */
    900 	scds_syslog(LOG_NOTICE, "Stopping %s.", APP_NAME);
    901 
    902 	/*
    903 	 * Give the whole timeout to scds_pmf_stop.  scds_pmf_stop()
    904 	 * will 80% of that time to sending the SIGTERM then 15% to
    905 	 * sending SIGKILL.
    906 	 */
    907 
    908 	if (scds_pmf_stop(scds_handle, SCDS_PMF_TYPE_SVC,
    909 		SCDS_PMF_SINGLE_INSTANCE, SIGTERM,
    910 		scds_get_rs_stop_timeout(scds_handle))
    911 		!= SCHA_ERR_NOERR) {
    912 		/*
    913 		 * SCMSGS
    914 		 * @explanation
    915 		 * Sun Cluster failed to stop the application.
    916 		 * @user_action
    917 		 * Use process monitor facility (pmfadm (1M)) with -L option
    918 		 * to retrieve all the tags that are running on the server.
    919 		 * Identify the tag name for the application in this resource.
    920 		 * This can be easily identified as the tag ends in the string
    921 		 * ".svc" and contains the resource group name and the
    922 		 * resource name. Then use pmfadm (1M) with -s option to stop
    923 		 * the application.
    924 		 *
    925 		 * This problem may occur when the cluster is under load and
    926 		 * Sun Cluster cannot stop the application within the timeout
    927 		 * period specified. You may consider increasing the
    928 		 * Stop_timeout property.
    929 		 *
    930 		 * If the error still persists, then reboot the node.
    931 		 */
    932 		scds_syslog(LOG_ERR, "Failed to stop %s.", APP_NAME);
    933 	}
    934 
    935 	/*
    936 	 * Regardless of whether the SIGTERM succeeded send SIGKILL to
    937 	 * the pmf tag. This will ensure that the process tree goes
    938 	 * away if it still exists. If it doesn't exist by then, we
    939 	 * return NOERR.
    940 	 *
    941 	 * Notice that this call will return with success, even if the
    942 	 * tag does not exist by now.
    943 	 *
    944 	 * Timeout of -1 will wait until PMF succeeds or we are timed
    945 	 * out by RGM.
    946 	 */
    947 	if ((rc = scds_pmf_stop(scds_handle, SCDS_PMF_TYPE_SVC, 0,
    948 		SIGKILL, -1)) != SCHA_ERR_NOERR) {
    949 		/*
    950 		 * Failed to stop the application even with SIGKILL.
    951 		 */
    952 		/*
    953 		 * SCMSGS
    954 		 * @explanation
    955 		 * The STOP method was unable to stop the application by
    956 		 * sending it a SIGKILL.
    957 		 * @user_action
    958 		 * Contact your authorized Sun service provider to determine
    959 		 * whether a workaround or patch is available.
    960 		 */
    961 		scds_syslog(LOG_ERR,
    962 			"Failed to stop the application with SIGKILL. "
    963 			"Returning with failure from stop method.");
    964 		(void) sprintf(msg, "Failed to stop %s.", APP_NAME);
    965 		(void) scha_resource_setstatus(rsname, rgname,
    966 			SCHA_RSSTATUS_FAULTED, msg);
    967 
    968 	} else {	/* rc == SCHA_ERR_NOERR */
    969 		/*
    970 		 * SCMSGS
    971 		 * @explanation
    972 		 * The STOP method successfully stopped the resource.
    973 		 * @user_action
    974 		 * This message is informational; no user action is needed.
    975 		 */
    976 		scds_syslog(LOG_NOTICE,
    977 			"Successfully stopped the application");
    978 		(void) sprintf(msg, "Successfully stopped %s.", APP_NAME);
    979 		(void) scha_resource_setstatus(rsname, rgname,
    980 			SCHA_RSSTATUS_OFFLINE, msg);
    981 	}
    982 
    983 	return (rc);
    984 }
    985 
    986 
    987 /*
    988  * This function starts the fault monitor for a apache resource.
    989  * This is done by starting the probe under PMF. The PMF tag
    990  * is derived as <RG-name,RS-name,instance_number.mon>. The restart
    991  * option of PMF is used but not the "infinite restart". Instead
    992  * interval/retry_time is obtained from the RTR file.
    993  */
    994 
    995 int
    996 mon_start(scds_handle_t scds_handle)
    997 {
    998 	scds_syslog_debug(DBG_LEVEL_HIGH,
    999 		"Calling MONITOR_START method for resource %s.",
   1000 		scds_get_resource_name(scds_handle));
   1001 
   1002 	/*
   1003 	 * The probe apache_probe is assumed to be available in the same
   1004 	 * subdirectory where the other callback methods for the RT are
   1005 	 * installed. The last parameter to scds_pmf_start denotes the
   1006 	 * child monitor level. Since we are starting the probe under PMF
   1007 	 * we need to monitor the probe process only and hence we are using
   1008 	 * a value of 0.
   1009 	 */
   1010 	if (scds_pmf_start(scds_handle, SCDS_PMF_TYPE_MON,
   1011 		SCDS_PMF_SINGLE_INSTANCE, "apache_probe", 0)
   1012 		!= SCHA_ERR_NOERR) {
   1013 		/*
   1014 		 * SCMSGS
   1015 		 * @explanation
   1016 		 * The fault monitor for this data service was not started.
   1017 		 * There may be prior messages in syslog indicating specific
   1018 		 * problems.
   1019 		 * @user_action
   1020 		 * The user should correct the problems specified in prior
   1021 		 * syslog messages.
   1022 		 *
   1023 		 * This problem may occur when the cluster is under load and
   1024 		 * Sun Cluster cannot start the application within the timeout
   1025 		 * period specified. You may consider increasing the
   1026 		 * Monitor_Start_timeout property.
   1027 		 *
   1028 		 * Try switching the resource group to another node using
   1029 		 * scswitch (1M).
   1030 		 */
   1031 		scds_syslog(LOG_ERR,
   1032 			"Failed to start fault monitor.");
   1033 		return (1);
   1034 	}
   1035 
   1036 	/*
   1037 	 * SCMSGS
   1038 	 * @explanation
   1039 	 * The fault monitor for this data service was started successfully.
   1040 	 * @user_action
   1041 	 * No action needed.
   1042 	 */
   1043 	scds_syslog(LOG_INFO, "Started the fault monitor.");
   1044 
   1045 	return (SCHA_ERR_NOERR);
   1046 }
   1047 
   1048 
   1049 /*
   1050  * This function stops the fault monitor for a apache resource.
   1051  * This is done via PMF. The PMF tag for the fault monitor is
   1052  * constructed based on <RG-name_RS-name,instance_number.mon>.
   1053  */
   1054 
   1055 int
   1056 mon_stop(scds_handle_t scds_handle)
   1057 {
   1058 	scds_syslog_debug(DBG_LEVEL_HIGH,
   1059 		"Calling MONITOR_STOP method for resource %s.",
   1060 		scds_get_resource_name(scds_handle));
   1061 
   1062 	if (scds_pmf_stop(scds_handle, SCDS_PMF_TYPE_MON,
   1063 		SCDS_PMF_SINGLE_INSTANCE, SIGKILL, -1)
   1064 		!= SCHA_ERR_NOERR) {
   1065 		/*
   1066 		 * SCMSGS
   1067 		 * @explanation
   1068 		 * An attempt was made to stop the fault monitor and it
   1069 		 * failed. There may be prior messages in syslog indicating
   1070 		 * specific problems.
   1071 		 * @user_action
   1072 		 * If there are prior messages in syslog indicating specific
   1073 		 * problems, these should be corrected. If that doesn't
   1074 		 * resolve the issue, the user can try the following.
   1075 		 *
   1076 		 * Use process monitor facility (pmfadm (1M)) with -L option
   1077 		 * to retrieve all the tags that are running on the server.
   1078 		 * Identify the tag name for the fault monitor of this
   1079 		 * resource. This can be easily identified as the tag ends in
   1080 		 * string ".mon" and contains the resource group name and the
   1081 		 * resource name. Then use pmfadm (1M) with -s option to stop
   1082 		 * the fault monitor.
   1083 		 *
   1084 		 * This problem may occur when the cluster is under load and
   1085 		 * Sun Cluster cannot stop the fault monitor within the
   1086 		 * timeout period specified. You may consider increasing the
   1087 		 * Monitor_Stop_timeout property.
   1088 		 *
   1089 		 * If the error still persists, then reboot the node.
   1090 		 */
   1091 		scds_syslog(LOG_ERR,
   1092 			"Failed to stop fault monitor.");
   1093 		return (1);
   1094 	}
   1095 
   1096 	/*
   1097 	 * SCMSGS
   1098 	 * @explanation
   1099 	 * The fault monitor for this data service was stopped successfully.
   1100 	 * @user_action
   1101 	 * No action needed.
   1102 	 */
   1103 	scds_syslog(LOG_INFO, "Stopped the fault monitor.");
   1104 
   1105 	return (SCHA_ERR_NOERR);
   1106 }
   1107 
   1108 
   1109 /*
   1110  * svc_probe(): Do data service specific probing. Return a value
   1111  * between 0 (success) and 100(complete failure).
   1112  *
   1113  * The probe does a simple socket connection to the apache server on the
   1114  * specified port which is configured as the resource extension property
   1115  * (Port_list) and pings the dataservice. If the probe fails to connect to
   1116  * the port, we return a value of 100 indicating that there is a total
   1117  * failure. If the connection goes through and the disconnect to the port
   1118  * fails, then a value of 50 is returned indicating a partial failure.
   1119  */
   1120 
   1121 int
   1122 svc_probe(scds_handle_t scds_handle, char *hostname,
   1123 	int port, int timeout, boolean_t arg_syslog_msgs)
   1124 {
   1125 	ulong_t	t1, t2;
   1126 	int	sock, rc = 0, retval = 0, time_used, time_remaining;
   1127 	size_t	size = 0;
   1128 	char	buf[SCDS_ARRAY_SIZE];
   1129 	long	connect_timeout;
   1130 	server_type st;
   1131 
   1132 
   1133 	/*
   1134 	 * Probe the dataservice by doing a socket connection to the port
   1135 	 * specified in the port_list property to the host that is
   1136 	 * serving the apache dataservice. If the apache service which is
   1137 	 * configured to listen on the specified port, replies to the
   1138 	 * connection, then the probe is successfull. Else we will wait for
   1139 	 * a time period set in probe_timeout property before concluding
   1140 	 * that the probe failed.
   1141 	 */
   1142 
   1143 	/*
   1144 	 * Use the SVC_CONNECT_TIMEOUT_PCT percentage of timeout
   1145 	 * to connect to the port
   1146 	 */
   1147 	connect_timeout = (SVC_CONNECT_TIMEOUT_PCT * timeout)/100;
   1148 	t1 = (ulong_t)(gethrtime()/1E9);
   1149 
   1150 	/*
   1151 	 * the probe makes a connection to the specified hostname and port.
   1152 	 * The connection is timed for 95% of the actual probe_timeout.
   1153 	 */
   1154 	rc = scds_fm_tcp_connect(scds_handle, &sock, hostname, port,
   1155 		connect_timeout);
   1156 	if (rc) {
   1157 		/*
   1158 		 * SCMSGS
   1159 		 * @explanation
   1160 		 * An error occurred while fault monitor attempted to probe
   1161 		 * the health of the data service.
   1162 		 * @user_action
   1163 		 * Wait for the fault monitor to correct this by doing restart
   1164 		 * or failover. For more error description, look at the syslog
   1165 		 * messages.
   1166 		 */
   1167 		scds_syslog(LOG_ERR,
   1168 			"Failed to connect to host %s and "
   1169 			"port %d: %s.",
   1170 			hostname, port, strerror(errno));
   1171 		/* this is a complete failure */
   1172 		return (SCDS_PROBE_COMPLETE_FAILURE);
   1173 	} else {
   1174 		scds_syslog_debug(DBG_LEVEL_LOW,
   1175 			"Successful connection to server %s "
   1176 			"port %d for resource %s.",
   1177 			hostname, port,
   1178 			scds_get_resource_name(scds_handle));
   1179 	}
   1180 
   1181 	t2 = (ulong_t)(gethrtime()/1E9);
   1182 
   1183 	/*
   1184 	 * Compute the actual time it took to connect. This should be less than
   1185 	 * or equal to connect_timeout, the time allocated to connect.
   1186 	 * If the connect uses all the time that is allocated for it,
   1187 	 * then the remaining value from the probe_timeout that is passed to
   1188 	 * this function will be used as disconnect timeout. Otherwise, the
   1189 	 * the remaining time from the connect call will also be added to
   1190 	 * the disconnect timeout.
   1191 	 *
   1192 	 */
   1193 
   1194 	time_used = (int)(t2 - t1);
   1195 
   1196 	/*
   1197 	 * Use the remaining time(timeout - time_took_to_connect) to disconnect
   1198 	 */
   1199 
   1200 	time_remaining = timeout - time_used;
   1201 
   1202 	/* what type of server is it? */
   1203 	st = webserver_type(scds_handle);
   1204 	if (st == ST_ERROR) {
   1205 		/* couldnt even figure out what the server is */
   1206 		retval = SCDS_PROBE_COMPLETE_FAILURE;
   1207 		goto finished;
   1208 	}
   1209 
   1210 	/*
   1211 	 * If all the time is used up, use a small hardcoded timeout
   1212 	 * to still try to disconnect. This will avoid the fd leak.
   1213 	 */
   1214 	if (time_remaining <= 0) {
   1215 		scds_syslog_debug(DBG_LEVEL_LOW,
   1216 			"svc_probe used entire timeout of "
   1217 			"%d seconds during connect operation "
   1218 			"and exceeded the timeout by %d seconds. "
   1219 			"Attempting disconnect with timeout %d",
   1220 			connect_timeout,
   1221 			abs(time_remaining),
   1222 			SVC_DISCONNECT_TIMEOUT_SECONDS);
   1223 
   1224 		time_remaining = SVC_DISCONNECT_TIMEOUT_SECONDS;
   1225 	} else if (st == REGULAR) { /* dont bother secure servers */
   1226 
   1227 		/* Generic HTML/1.0 HEAD check */
   1228 		(void) strcpy(buf, "HEAD / HTTP/1.0\n\n");
   1229 
   1230 		size = strlen(buf);
   1231 		if (scds_fm_tcp_write(scds_handle, sock,
   1232 			buf, &size, time_remaining) < 0) {
   1233 			/*
   1234 			 * write()s should never fail unless the server
   1235 			 * (apache) has closed its end of the socket.
   1236 			 * That sounds like a serious problem. Hence 1
   1237 			 * as probe result.
   1238 			 */
   1239 			retval = SCDS_PROBE_COMPLETE_FAILURE;
   1240 			if (arg_syslog_msgs) {
   1241 				/*
   1242 				 * SCMSGS
   1243 				 * @explanation
   1244 				 * The data service fault monitor probe was
   1245 				 * trying to read from or write to the service
   1246 				 * specified and failed. Sun Cluster will
   1247 				 * attempt to correct the situation by either
   1248 				 * doing a restart or a failover of the data
   1249 				 * service. The problem may be due to an
   1250 				 * overloaded system or other problems,
   1251 				 * causing a timeout to occur before
   1252 				 * communications could be completed.
   1253 				 * @user_action
   1254 				 * If this problem is due to an overloaded
   1255 				 * system, you may consider increasing the
   1256 				 * Probe_timeout property.
   1257 				 */
   1258 				scds_syslog(LOG_ERR,
   1259 					"Failed to communicate with "
   1260 					"server %s port %d: %s.",
   1261 					hostname, port, strerror(errno));
   1262 			}
   1263 
   1264 			goto finished;
   1265 		}
   1266 
   1267 		/*
   1268 		 * Data sent to us by server may span several packets,
   1269 		 * hence must do things in a loop().
   1270 		 */
   1271 		do {
   1272 			t2 = (ulong_t)(gethrtime()/1E9);
   1273 			time_used = (int)(t2 - t1);
   1274 			time_remaining = timeout - time_used;
   1275 			if (time_remaining &l