1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the License). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/CDDL.txt 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/CDDL.txt. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets [] replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * apache.c - Common utilities for highly available apache 27 */ 28 29 #pragma ident "@(#)apache.c 1.66 07/06/06 SMI" 30 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <strings.h> 34 #include <unistd.h> 35 #include <libintl.h> 36 #include <errno.h> 37 #include <sys/stat.h> 38 #include <sys/wait.h> 39 #include <rgm/libdsdev.h> 40 #include <ds_common.h> 41 #include "apache.h" 42 43 static int get_apachectl_value(const char *ctlfile, const char *var, 44 char *value); 45 static void remove_pidfile(scds_handle_t scds_handle); 46 static int validate_monitor_uri_list(scds_handle_t scds_handle, 47 scds_net_resource_list_t *snrlp, boolean_t print_messages); 48 static int validate_server_url(char *uri, scds_net_resource_list_t *snrlp, 49 boolean_t print_messages); 50 51 typedef struct run_cmd_return_struct { 52 /* B_TRUE if system() returns -1, meaning fork or exec failed */ 53 boolean_t system_failed; 54 /* 55 * B_TRUE if cmd was interrupted by a signal, i.e. if 56 * WIFSIGNALED() is true. 57 */ 58 boolean_t cmd_did_not_finish; 59 60 /* Return code from system() */ 61 int rc_system; 62 /* 63 * Return code from cmd, i.e. WEXITSTATUS(system(run_cmd)) 64 * Value not valid if system_failed == B_TRUE or 65 * if cmd_did_not_finish == B_TRUE 66 */ 67 int rc_cmd; 68 } run_cmd_return_t; 69 70 /* 71 * run_cmd() runs the command requested, redirecting stdout and stderr to the 72 * requested place(s). A debug message is issued, if requested before 73 * attempting to run the command. See typedef definitions of run_cmd_args_t 74 * and run_cmd_return_t for explanation of args to run_cmd() and return 75 * information from run_cmd(). 76 * 77 * Parameters: 78 * arg_cmd_to_run - Command run_cmd() will run 79 * arg_log_error - whether or not to issue scds_syslog () messages 80 */ 81 82 void run_cmd(char *arg_cmd_to_run, boolean_t arg_log_error, 83 run_cmd_return_t *argp_rc, boolean_t print_messages) 84 { 85 /* Declare return code for internal functions */ 86 int rc; 87 88 /* Initialize return info */ 89 argp_rc->system_failed = B_FALSE; 90 91 argp_rc->cmd_did_not_finish = B_FALSE; 92 argp_rc->rc_system = 0; 93 argp_rc->rc_cmd = 0; 94 95 /* Run the command */ 96 rc = system(arg_cmd_to_run); 97 /* Set return value. */ 98 argp_rc->rc_system = rc; 99 100 /* If system() has an error fork-ing or exec-ing... */ 101 if (rc == -1) { 102 /* ...set return value and... */ 103 argp_rc->system_failed = B_TRUE; 104 /* ...say so, if we're logging messages */ 105 rc = errno; /*lint !e746 */ 106 if (arg_log_error == B_TRUE) { 107 /* 108 * SCMSGS 109 * @explanation 110 * Failure in executing the command. 111 * @user_action 112 * Check the syslog message for the command 113 * description. Check whether the system is low in 114 * memory or the process table is full and take 115 * appropriate action. Make sure that the executable 116 * exists. 117 */ 118 scds_syslog(LOG_ERR, 119 "Cannot Execute %s: %s.", 120 arg_cmd_to_run, 121 strerror(rc)); 122 } 123 if (print_messages) { 124 (void) fprintf(stderr, gettext("Cannot Execute %s: " 125 "%s.\n"), arg_cmd_to_run, 126 gettext(strerror(rc))); 127 } 128 /* If system() ran OK... */ 129 } else { 130 /* If we didn't complete due to being signaled... */ 131 if (WIFSIGNALED((uint_t)rc)) { 132 /* ...set return value and... */ 133 argp_rc->cmd_did_not_finish = B_TRUE; 134 /* ...say so, if we're logging messages */ 135 if (arg_log_error == B_TRUE) { 136 /* 137 * SCMSGS 138 * @explanation 139 * Need explanation of this message! 140 * @user_action 141 * Need a user action for this message. 142 */ 143 scds_syslog(LOG_ERR, 144 "%s failed to complete.", 145 arg_cmd_to_run); 146 } 147 if (print_messages) { 148 (void) fprintf(stderr, gettext("%s failed to " 149 "complete.\n"), arg_cmd_to_run); 150 } 151 /* If we completed... */ 152 } else { 153 /* ...get arg_cmd_to_run's exit status */ 154 argp_rc->rc_cmd = WEXITSTATUS((uint_t)rc); 155 } 156 } 157 } 158 159 160 /* 161 * The initial timeout allowed for the apache dataservice to 162 * be fully up and running. We will wait for for 3 % (SVC_WAIT_PCT) 163 * of the start_timeout time before probing the service. 164 */ 165 #define SVC_WAIT_PCT 2 166 167 /* 168 * We need to use 95% of probe_timeout to connect to the port and the 169 * remaining time is used to disconnect from port in the svc_probe function. 170 */ 171 #define SVC_CONNECT_TIMEOUT_PCT 95 172 173 /* 174 * We need to wait for SVC_WAIT_TIME ( 5 secs) for pmf 175 * to send the failure message before probing the service 176 */ 177 178 #define SVC_WAIT_TIME 5 179 180 /* 181 * This value will be used as disconnect timeout, if there is no 182 * time left from the probe_timeout. 183 */ 184 185 #define SVC_DISCONNECT_TIMEOUT_SECONDS 2 186 187 /* 188 * This variable will be set by svc_validate and will also be 189 * used by svc_start. 190 */ 191 192 static char binpath[SCDS_CMD_SIZE] = ""; 193 194 /* 195 * svc_validate(): 196 * Do apache specific validation of the resource configration. 197 * Called by start/validate/update/monitor methods. 198 * Return 0 on success, > 0 on failures. 199 * 200 * svc_validate will check for the following 201 * 1. Bin_dir 202 * 2. Executable permissions, if filesystem is mounted on this node 203 * 3. Parse httpd.conf file 204 * 4. Port_list 205 * 5. Logical hostname resources 206 * 6. Extension properties 207 */ 208 209 int 210 svc_validate(scds_handle_t scds_handle, boolean_t print_messages) 211 { 212 struct stat statbuf; 213 char apache_cmd[SCDS_ARRAY_SIZE]; 214 char cmd_buffer[SCDS_ARRAY_SIZE]; 215 run_cmd_return_t rcrc; 216 int err = 0, rc = 0, i; 217 scds_net_resource_list_t *snrlp = NULL; 218 scds_port_list_t *portlist = NULL; 219 scha_extprop_value_t *bindir = NULL; 220 221 scds_hasp_status_t hasp_status; 222 /* default is to perform all fs related checks */ 223 boolean_t do_file_checks = B_TRUE; 224 225 /* 226 * apachectl is the control file for "regular" apache and 227 * for mod_ssl+apache. apache-ssl uses httpsdctl instead. 228 * these are the *ctls we can work with, NULL marks end of list 229 */ 230 char *xctl[] = {"httpsdctl", "apachectl", NULL}; 231 232 (void) scds_get_ext_property(scds_handle, "Bin_dir", 233 SCHA_PTYPE_STRING, &bindir); 234 /* Check that the bindir or bindir string is not NULL */ 235 if (bindir == NULL || bindir->val.val_str == NULL) { 236 /* 237 * SCMSGS 238 * @explanation 239 * The property has not been set by the user and must be. 240 * @user_action 241 * Reissue the scrgadm command with the required property and 242 * value. 243 */ 244 scds_syslog(LOG_ERR, "Property %s is not set.", "Bin_dir"); 245 if (print_messages) { 246 (void) fprintf(stderr, gettext("Property %s is not " 247 "set.\n"), "Bin_dir"); 248 } 249 return (1); 250 } else { 251 /* Copy Bin_dir path to static global buffer */ 252 if (strlcpy(binpath, bindir->val.val_str, sizeof (binpath)) 253 >= sizeof (binpath)) { 254 /* 255 * SCMSGS 256 * @explanation 257 * An internal error has occurred. 258 * @user_action 259 * Save a copy of the /var/adm/messages files on all 260 * nodes. Contact your authorized Sun service provider 261 * for assistance in diagnosing the problem. 262 */ 263 scds_syslog(LOG_ERR, "INTERNAL ERROR: %s.", 264 "Insufficient space in buffer"); 265 if (print_messages) { 266 (void) fprintf(stderr, gettext("INTERNAL " 267 "ERROR: %s.\n"), gettext("Insufficient " 268 "space in buffer")); 269 } 270 return (1); 271 } 272 } 273 274 /* check for HAStoragePlus resources */ 275 rc = scds_hasp_check(scds_handle, &hasp_status); 276 if (rc != SCHA_ERR_NOERR) { 277 /* scha_hasp_check() logs everytime it fails */ 278 if (print_messages) { 279 (void) fprintf(stderr, gettext("INTERNAL ERROR: %s.\n"), 280 gettext("scds_hasp_check failed")); 281 } 282 return (1); 283 } 284 285 if (hasp_status == SCDS_HASP_NOT_ONLINE) { 286 /* 287 * SCMSGS 288 * @explanation 289 * The resource depends on a SUNW.HAStoragePlus resource that 290 * is not online on any cluster node. 291 * @user_action 292 * Bring all SUNW.HAStoragePlus resources, that this HA-NFS 293 * resource depends on, online before performing the operation 294 * that caused this error. 295 */ 296 scds_syslog(LOG_ERR, "Resource depends on a " 297 "SUNW.HAStoragePlus type resource that is " 298 "not online anywhere."); 299 if (print_messages) { 300 (void) fprintf(stderr, gettext("Resource depends on a " 301 "SUNW.HAStoragePlus type resource that is " 302 "not online anywhere.\n")); 303 } 304 return (1); 305 } else if (hasp_status == SCDS_HASP_ERR_CONFIG) { 306 /* problem syslogged by scds_hasp_check */ 307 if (print_messages) { 308 (void) fprintf(stderr, gettext("This resource depends " 309 "on a HAStoragePlus resouce that is in a " 310 "different Resource Group. This configuration " 311 "is not supported.\n")); 312 } 313 return (1); 314 } 315 316 /* zero out the contents of statbuf, helps avoid lint lint 644 */ 317 bzero(&statbuf, sizeof (statbuf)); 318 319 /* 320 * We need to work with either httpsdctl or apachectl, 321 * cant call webserver_type() to decide because it might 322 * falsely return REGULAR for some HAStoragePlus configs 323 */ 324 for (i = 0; xctl[i] != NULL; i++) { 325 (void) snprintf(apache_cmd, sizeof (apache_cmd), "%s/%s", 326 binpath, xctl[i]); 327 if (stat(apache_cmd, &statbuf) != 0) { 328 if (errno == ENOENT) { 329 /* no cause for alarm, check next in list */ 330 continue; 331 } else { 332 /* 333 * failed for this ctl, and *not* with 334 * ENOENT. This cant be good... 335 */ 336 rc = errno; 337 /* 338 * SCMSGS 339 * @explanation 340 * The start script is not accessible and 341 * executable. This may be due to the script 342 * not existing or the permissions not being 343 * set properly. 344 * @user_action 345 * Make sure the script exists, is in the 346 * proper directory, and has read nd execute 347 * permissions set appropriately. 348 */ 349 scds_syslog(LOG_ERR, 350 "Cannot access start script %s: %s", 351 apache_cmd, strerror(rc)); 352 if (print_messages) { 353 (void) fprintf(stderr, 354 gettext("Cannot access start " 355 "script %s: %s\n"), apache_cmd, 356 gettext(strerror(rc))); 357 } 358 return (1); 359 } 360 } else { 361 /* stat worked: found a ctl, no need to look further */ 362 do_file_checks = B_TRUE; 363 break; 364 } 365 } 366 367 /* if xtl[i] is NULL then all stats failed with ENOENT */ 368 if (xctl[i] == NULL) { 369 if (hasp_status == SCDS_HASP_ONLINE_NOT_LOCAL) { 370 /* Bin_dir is on a failover fs thats not here */ 371 do_file_checks = B_FALSE; 372 } else { 373 /* 374 * all ENOENTs when there should have been one 375 * ctl here!! Print an error message for the 376 * last one: that should be apachectl 377 */ 378 scds_syslog(LOG_ERR, 379 "Cannot access start script %s: %s", 380 apache_cmd, strerror(ENOENT)); 381 if (print_messages) { 382 (void) fprintf(stderr, gettext("Cannot access " 383 "start script %s: %s\n"), apache_cmd, 384 gettext(strerror(ENOENT))); 385 } 386 return (1); 387 } 388 } 389 390 /* 391 * apache_cmd contains the right *ctl, statbuf has the stat() 392 * results for that ctl and do_file_checks has been (re)set, 393 * everything is fine at this point. 394 */ 395 if (do_file_checks) { 396 /* check that the binary is executable */ 397 if (!(statbuf.st_mode & S_IXUSR)) { 398 /* 399 * SCMSGS 400 * @explanation 401 * This file does not have the expected default 402 * execute permissions. 403 * @user_action 404 * Reset the permissions to allow execute permissions 405 * using the chmod command. 406 */ 407 scds_syslog(LOG_ERR, 408 "Incorrect permissions set for %s.", 409 apache_cmd); 410 if (print_messages) { 411 (void) fprintf(stderr, gettext("Incorrect " 412 "permissions set for %s.\n"), 413 apache_cmd); 414 } 415 return (1); 416 } 417 418 /* Run apachectl configtest if everything is available here */ 419 if ((hasp_status == SCDS_HASP_ONLINE_LOCAL) || 420 (hasp_status == SCDS_HASP_NO_RESOURCE)) { 421 /* Assemble the command. */ 422 rc = snprintf(cmd_buffer, sizeof (cmd_buffer), 423 "%s configtest >/dev/null 2>&1", apache_cmd); 424 425 /* If the snprintf has an error... */ 426 if (rc < 0) { 427 /* ...syslog an error message */ 428 char *internal_err_str = "String handling " 429 "error creating apachectl configtest " 430 "command"; 431 432 scds_syslog(LOG_ERR, "INTERNAL ERROR: %s.", 433 internal_err_str); 434 if (print_messages) { 435 (void) fprintf(stderr, 436 gettext("INTERNAL ERROR: " 437 "%s.\n"), 438 gettext(internal_err_str)); 439 } 440 return (1); 441 } 442 443 /* Issue debug message before starting. */ 444 scds_syslog_debug(DBG_LEVEL_LOW, "Starting server to " 445 "check config file with apachectl configtest " 446 "command."); 447 448 /* Run "apachectl configtest" */ 449 run_cmd(cmd_buffer, B_TRUE, &rcrc, print_messages); 450 451 /* If config file doesn't validate, return with error */ 452 if (rcrc.system_failed || rcrc.cmd_did_not_finish) { 453 /* 454 * If system() failed or command did not finish 455 * (was interrupted), error message was already 456 * issued 457 */ 458 return (1); 459 } 460 if (rcrc.rc_cmd != 0) { 461 /* non-zero return code means error */ 462 /* 463 * SCMSGS 464 * @explanation 465 * The command noted did not return the 466 * expected value. Additional information may 467 * be found in the error message after the 468 * ":", or in subsequent messages in syslog. 469 * @user_action 470 * This message is issued from a general 471 * purpose routine. Appropriate action may be 472 * indicated by the additional information in 473 * the message or in syslog. 474 */ 475 scds_syslog(LOG_ERR, "Command {%s} failed: %s.", 476 cmd_buffer, "httpd cannot parse " 477 "httpd.conf"); 478 if (print_messages) { 479 (void) fprintf(stderr, 480 gettext("Command {%s} failed: " 481 "%s.\n"), cmd_buffer, 482 gettext("httpd cannot parse " 483 "httpd.conf")); 484 } 485 return (1); 486 } 487 } 488 } else { 489 /* 490 * SCMSGS 491 * @explanation 492 * This resource will not perform some filesystem specific 493 * checks (during VALIDATE or MONITOR_CHECK) on this node 494 * because atleast one SUNW.HAStoragePlus resource that it 495 * depends on is online on some other node. 496 * @user_action 497 * None. 498 */ 499 scds_syslog(LOG_INFO, "Skipping checks dependant on " 500 "HAStoragePlus resources on this node."); 501 } /* fs specific checks */ 502 503 /* Network aware service should have at least one port specified */ 504 505 err = scds_get_port_list(scds_handle, &portlist); 506 if (err != SCHA_ERR_NOERR) { 507 /* 508 * SCMSGS 509 * @explanation 510 * API operation has failed in retrieving the cluster 511 * property. 512 * @user_action 513 * For property name, check the syslog message. For more 514 * details about API call failure, check the syslog messages 515 * from other components. 516 */ 517 scds_syslog(LOG_ERR, 518 "Failed to retrieve the property %s: %s.", 519 "Port_list", scds_error_string(err)); 520 if (print_messages) { 521 (void) fprintf(stderr, gettext("Failed to retrieve " 522 "the property %s: %s.\n"), "Port_list", 523 gettext(scds_error_string(err))); 524 } 525 goto finished_validate; 526 } 527 528 if (portlist == NULL || portlist->num_ports < 1) { 529 scds_syslog(LOG_ERR, "Property %s is not set.", "Port_list"); 530 if (print_messages) { 531 (void) fprintf(stderr, gettext("Property %s is not " 532 "set.\n"), "Port_list"); 533 } 534 err = 1; 535 goto finished_validate; 536 } 537 538 /* 539 * Return an error if there is an error when trying to get the 540 * available network address resources for this resource 541 */ 542 if ((err = scds_get_rs_hostnames(scds_handle, &snrlp)) 543 != SCHA_ERR_NOERR) { 544 /* 545 * SCMSGS 546 * @explanation 547 * Error trying to retrieve network address associated with a 548 * resource. 549 * @user_action 550 * For a failover data service, add a network address resource 551 * to the resource group. For a scalable data service, add a 552 * network resource to the resource group referenced by the 553 * RG_dependencies property. 554 */ 555 scds_syslog(LOG_ERR, 556 "Error in trying to access the configured network " 557 "resources : %s.", scds_error_string(err)); 558 if (print_messages) { 559 (void) fprintf(stderr, gettext("Error in trying to " 560 "access the configured network resources " 561 ": %s.\n"), gettext(scds_error_string(err))); 562 } 563 goto finished_validate; 564 } 565 566 /* Return an error if there are no network address resources */ 567 if (snrlp == NULL || snrlp->num_netresources == 0) { 568 /* 569 * SCMSGS 570 * @explanation 571 * A resource has no associated network address. 572 * @user_action 573 * For a failover data service, add a network address resource 574 * to the resource group. For a scalable data service, add a 575 * network resource to the resource group referenced by the 576 * RG_dependencies property. 577 */ 578 scds_syslog(LOG_ERR, 579 "No network address resource in resource group."); 580 if (print_messages) { 581 (void) fprintf(stderr, gettext("No network address " 582 "resource in resource group.\n")); 583 } 584 err = 1; 585 goto finished_validate; 586 } 587 588 /* Check to make sure other important extension props are set */ 589 if (scds_get_ext_monitor_retry_count(scds_handle) <= 0) { 590 scds_syslog(LOG_ERR, 591 "Property %s is not set.", 592 "Monitor_retry_count"); 593 if (print_messages) { 594 (void) fprintf(stderr, gettext("Property %s is " 595 "not set.\n"), "Monitor_retry_count"); 596 } 597 err = 1; /* Validation Failure */ 598 goto finished_validate; 599 } 600 if (scds_get_ext_monitor_retry_interval(scds_handle) <= 0) { 601 scds_syslog(LOG_ERR, 602 "Property %s is not set.", 603 "Monitor_retry_interval"); 604 if (print_messages) { 605 (void) fprintf(stderr, gettext("Property %s is " 606 "not set.\n"), "Monitor_retry_interval"); 607 } 608 err = 1; /* Validation Failure */ 609 goto finished_validate; 610 } 611 if (scds_get_ext_probe_timeout(scds_handle) <= 0) { 612 scds_syslog(LOG_ERR, 613 "Property %s is not set.", 614 "Probe_timeout"); 615 if (print_messages) { 616 (void) fprintf(stderr, gettext("Property %s is " 617 "not set.\n"), "Probe_timeout"); 618 } 619 err = 1; /* Validation Failure */ 620 goto finished_validate; 621 } 622 623 /* 624 * Make sure that URIs provided (if any) look OK. Also makes sure 625 * that all the Uris have hostnames that are in the list of network 626 * resources used by the resource. 627 */ 628 if (validate_monitor_uri_list(scds_handle, snrlp, print_messages) != 629 0) { 630 err = 1; 631 goto finished_validate; 632 } 633 634 /* All validation checks were successful */ 635 /* 636 * SCMSGS 637 * @explanation 638 * The validation of the configuration for the data service was 639 * successful. 640 * @user_action 641 * None. This is only an informational message. 642 */ 643 scds_syslog(LOG_INFO, "Successful validation."); 644 645 finished_validate: 646 if (snrlp) 647 scds_free_net_list(snrlp); 648 if (portlist) 649 scds_free_port_list(portlist); 650 651 return (err); 652 } 653 654 655 /* 656 * svc_start(): 657 */ 658 659 int 660 svc_start(scds_handle_t scds_handle) 661 { 662 char cmd[SCDS_CMD_SIZE]; 663 char *rsname = NULL, *rgname = NULL; 664 int rc = SCHA_ERR_NOERR; 665 666 /* Get resource and resource group names */ 667 rsname = (char *)scds_get_resource_name(scds_handle); 668 rgname = (char *)scds_get_resource_group_name(scds_handle); 669 670 switch (webserver_type(scds_handle)) { 671 case MOD_SSL: (void) snprintf(cmd, sizeof (cmd), 672 "%s/apachectl startssl", binpath); 673 break; 674 case APACHE_SSL: (void) snprintf(cmd, sizeof (cmd), 675 "%s/httpsdctl start", binpath); 676 break; 677 case REGULAR: (void) snprintf(cmd, sizeof (cmd), 678 "%s/apachectl start", binpath); 679 break; 680 case ST_ERROR: /* error already logged */ 681 default: return (1); 682 } 683 684 /* 685 * SCMSGS 686 * @explanation 687 * Sun Cluster is starting the application with the specified command. 688 * @user_action 689 * This is an informational message, no user action is needed. 690 */ 691 scds_syslog(LOG_NOTICE, 692 "Starting server with command %s.", cmd); 693 694 remove_pidfile(scds_handle); 695 696 rc = scds_pmf_start(scds_handle, SCDS_PMF_TYPE_SVC, 0, cmd, -1); 697 698 if (rc == SCHA_ERR_NOERR) { 699 /* 700 * SCMSGS 701 * @explanation 702 * The resource successfully started the application. 703 * @user_action 704 * This message is informational; no user action is needed. 705 */ 706 scds_syslog(LOG_NOTICE, 707 "Start of %s completed successfully.", cmd); 708 (void) scha_resource_setstatus(rsname, rgname, 709 SCHA_RSSTATUS_OK, 710 "Completed successfully."); 711 } else { 712 char msg[SCDS_ARRAY_SIZE]; 713 714 /* 715 * SCMSGS 716 * @explanation 717 * Sun Cluster could not start the application. It would 718 * attempt to start the service on another node if possible. 719 * @user_action 720 * 1) Check prior syslog messages for specific problems and 721 * correct them. 722 * 723 * 2) This problem may occur when the cluster is under load 724 * and Sun Cluster cannot start the application within the 725 * timeout period specified. You may consider increasing the 726 * Start_timeout property. 727 * 728 * 3) If the resource was unable to start on any node, 729 * resource would be in START_FAILED state. In this case, use 730 * scswitch to bring the resource ONLINE on this node. 731 * 732 * 4) If the service was successfully started on another node, 733 * attempt to restart the service on this node using scswitch. 734 * 735 * 5) If the above steps do not help, disable the resource 736 * using scswitch. Check to see that the application can run 737 * outside of the Sun Cluster framework. If it cannot, fix any 738 * problems specific to the application, until the application 739 * can run outside of the Sun Cluster framework. Enable the 740 * resource using scswitch. If the application runs outside of 741 * the Sun Cluster framework but not in response to starting 742 * the data service, contact your authorized Sun service 743 * provider for assistance in diagnosing the problem. 744 */ 745 scds_syslog(LOG_ERR, 746 "Failed to start %s.", cmd); 747 748 (void) sprintf(msg, "Failed to start %s.", APP_NAME); 749 (void) scha_resource_setstatus(rsname, rgname, 750 SCHA_RSSTATUS_FAULTED, msg); 751 } 752 753 return (rc); 754 } 755 756 757 /* 758 * svc_wait(): 759 * 760 * wait for the data service to start up fully and make sure it is running 761 * healthy 762 */ 763 764 int 765 svc_wait(scds_handle_t scds_handle) 766 { 767 int svc_start_timeout, probe_timeout, 768 probe_result, i; 769 scha_err_t err = SCHA_ERR_NOERR; 770 scds_netaddr_list_t *netaddr = NULL; 771 scha_extprop_value_t *urilist = NULL; 772 scha_str_array_t *uris = NULL; 773 774 /* obtain the network resource to use for probing */ 775 err = scds_get_netaddr_list(scds_handle, &netaddr); 776 if (err != SCHA_ERR_NOERR) { 777 scds_syslog(LOG_ERR, 778 "Error in trying to access the " 779 "configured network resources : %s.", 780 scds_error_string(err)); 781 return (1); 782 } 783 784 /* Return an error if there are no network resources */ 785 if (netaddr == NULL || netaddr->num_netaddrs == 0) { 786 scds_syslog(LOG_ERR, 787 "No network address resource in resource group."); 788 return (1); 789 } 790 791 /* Retrieve the list of uris that we were told to probe */ 792 err = scds_get_ext_property(scds_handle, "Monitor_Uri_List", 793 SCHA_PTYPE_STRINGARRAY, &urilist); 794 if ((err != SCHA_ERR_NOERR) && (err != SCHA_ERR_PROP)) { 795 /* failed with something other than SCHA_ERR_PROP */ 796 scds_syslog(LOG_ERR, 797 "Failed to retrieve the " 798 "property %s: %s.", 799 "Monitor_Uri_List", scds_error_string(err)); 800 scds_free_netaddr_list(netaddr); 801 return (1); 802 } 803 if (urilist != NULL) { 804 uris = urilist->val.val_strarray; 805 } 806 807 /* Get the Start method timeout and the Probe timeout value */ 808 svc_start_timeout = scds_get_rs_start_timeout(scds_handle); 809 probe_timeout = scds_get_ext_probe_timeout(scds_handle); 810 811 /* 812 * sleep for SVC_WAIT_PCT percentage of start_timeout time 813 * before actually probing the dataservice. This is to allow 814 * the dataservice to be fully up inorder to reply to the 815 * probe. 816 */ 817 if (scds_svc_wait(scds_handle, 818 (svc_start_timeout * SVC_WAIT_PCT)/100) 819 != SCHA_ERR_NOERR) { 820 scds_syslog(LOG_ERR, "Failed to start %s.", APP_NAME); 821 scds_free_ext_property(urilist); 822 scds_free_netaddr_list(netaddr); 823 return (1); 824 } 825 826 do { 827 scds_syslog_debug(DBG_LEVEL_HIGH, 828 "Probing service, t = %d\n", time(NULL)); 829 830 /* 831 * Probe the data service on the logicalhostnames 832 * and the ports. 833 */ 834 probe_result = 0; 835 for (i = 0; (i < netaddr->num_netaddrs) && (probe_result == 0); 836 i++) { 837 probe_result = svc_probe(scds_handle, 838 netaddr->netaddrs[i].hostname, 839 netaddr->netaddrs[i].port_proto.port, 840 probe_timeout, 841 B_FALSE); 842 } 843 /* probe all uris (if any) that were supplied */ 844 if (uris) { 845 for (i = 0; (i < (int)uris->array_cnt) && 846 (probe_result == 0); i++) { 847 probe_result = probe_uri(scds_handle, 848 uris->str_array[i], probe_timeout, B_TRUE); 849 } 850 } 851 852 if (probe_result == SCHA_ERR_NOERR) { 853 /* Success. Free up resources and return */ 854 scds_free_ext_property(urilist); 855 scds_free_netaddr_list(netaddr); 856 return (SCHA_ERR_NOERR); 857 } 858 859 /* 860 * Dataservice is still trying to come up. Sleep for a while 861 * before probing again. 862 */ 863 if (scds_svc_wait(scds_handle, SVC_WAIT_TIME) 864 != SCHA_ERR_NOERR) { 865 scds_syslog(LOG_ERR, 866 "Failed to start %s.", APP_NAME); 867 scds_free_ext_property(urilist); 868 scds_free_netaddr_list(netaddr); 869 return (1); 870 } 871 872 /* We rely on RGM to timeout and terminate the program */ 873 } while (1); 874 } 875 876 877 /* 878 * Stop the apache server 879 * Return 0 on success, > 0 on failures. 880 */ 881 882 int 883 svc_stop(scds_handle_t scds_handle) 884 { 885 char *rsname = NULL, *rgname = NULL; 886 char msg[SCDS_ARRAY_SIZE]; 887 int rc; 888 889 /* Get resource and resource group names */ 890 rsname = (char *)scds_get_resource_name(scds_handle); 891 rgname = (char *)scds_get_resource_group_name(scds_handle); 892 893 /* 894 * SCMSGS 895 * @explanation 896 * Sun Cluster is stopping the specified application. 897 * @user_action 898 * This is an informational message, no user action is needed. 899 */ 900 scds_syslog(LOG_NOTICE, "Stopping %s.", APP_NAME); 901 902 /* 903 * Give the whole timeout to scds_pmf_stop. scds_pmf_stop() 904 * will 80% of that time to sending the SIGTERM then 15% to 905 * sending SIGKILL. 906 */ 907 908 if (scds_pmf_stop(scds_handle, SCDS_PMF_TYPE_SVC, 909 SCDS_PMF_SINGLE_INSTANCE, SIGTERM, 910 scds_get_rs_stop_timeout(scds_handle)) 911 != SCHA_ERR_NOERR) { 912 /* 913 * SCMSGS 914 * @explanation 915 * Sun Cluster failed to stop the application. 916 * @user_action 917 * Use process monitor facility (pmfadm (1M)) with -L option 918 * to retrieve all the tags that are running on the server. 919 * Identify the tag name for the application in this resource. 920 * This can be easily identified as the tag ends in the string 921 * ".svc" and contains the resource group name and the 922 * resource name. Then use pmfadm (1M) with -s option to stop 923 * the application. 924 * 925 * This problem may occur when the cluster is under load and 926 * Sun Cluster cannot stop the application within the timeout 927 * period specified. You may consider increasing the 928 * Stop_timeout property. 929 * 930 * If the error still persists, then reboot the node. 931 */ 932 scds_syslog(LOG_ERR, "Failed to stop %s.", APP_NAME); 933 } 934 935 /* 936 * Regardless of whether the SIGTERM succeeded send SIGKILL to 937 * the pmf tag. This will ensure that the process tree goes 938 * away if it still exists. If it doesn't exist by then, we 939 * return NOERR. 940 * 941 * Notice that this call will return with success, even if the 942 * tag does not exist by now. 943 * 944 * Timeout of -1 will wait until PMF succeeds or we are timed 945 * out by RGM. 946 */ 947 if ((rc = scds_pmf_stop(scds_handle, SCDS_PMF_TYPE_SVC, 0, 948 SIGKILL, -1)) != SCHA_ERR_NOERR) { 949 /* 950 * Failed to stop the application even with SIGKILL. 951 */ 952 /* 953 * SCMSGS 954 * @explanation 955 * The STOP method was unable to stop the application by 956 * sending it a SIGKILL. 957 * @user_action 958 * Contact your authorized Sun service provider to determine 959 * whether a workaround or patch is available. 960 */ 961 scds_syslog(LOG_ERR, 962 "Failed to stop the application with SIGKILL. " 963 "Returning with failure from stop method."); 964 (void) sprintf(msg, "Failed to stop %s.", APP_NAME); 965 (void) scha_resource_setstatus(rsname, rgname, 966 SCHA_RSSTATUS_FAULTED, msg); 967 968 } else { /* rc == SCHA_ERR_NOERR */ 969 /* 970 * SCMSGS 971 * @explanation 972 * The STOP method successfully stopped the resource. 973 * @user_action 974 * This message is informational; no user action is needed. 975 */ 976 scds_syslog(LOG_NOTICE, 977 "Successfully stopped the application"); 978 (void) sprintf(msg, "Successfully stopped %s.", APP_NAME); 979 (void) scha_resource_setstatus(rsname, rgname, 980 SCHA_RSSTATUS_OFFLINE, msg); 981 } 982 983 return (rc); 984 } 985 986 987 /* 988 * This function starts the fault monitor for a apache resource. 989 * This is done by starting the probe under PMF. The PMF tag 990 * is derived as <RG-name,RS-name,instance_number.mon>. The restart 991 * option of PMF is used but not the "infinite restart". Instead 992 * interval/retry_time is obtained from the RTR file. 993 */ 994 995 int 996 mon_start(scds_handle_t scds_handle) 997 { 998 scds_syslog_debug(DBG_LEVEL_HIGH, 999 "Calling MONITOR_START method for resource %s.", 1000 scds_get_resource_name(scds_handle)); 1001 1002 /* 1003 * The probe apache_probe is assumed to be available in the same 1004 * subdirectory where the other callback methods for the RT are 1005 * installed. The last parameter to scds_pmf_start denotes the 1006 * child monitor level. Since we are starting the probe under PMF 1007 * we need to monitor the probe process only and hence we are using 1008 * a value of 0. 1009 */ 1010 if (scds_pmf_start(scds_handle, SCDS_PMF_TYPE_MON, 1011 SCDS_PMF_SINGLE_INSTANCE, "apache_probe", 0) 1012 != SCHA_ERR_NOERR) { 1013 /* 1014 * SCMSGS 1015 * @explanation 1016 * The fault monitor for this data service was not started. 1017 * There may be prior messages in syslog indicating specific 1018 * problems. 1019 * @user_action 1020 * The user should correct the problems specified in prior 1021 * syslog messages. 1022 * 1023 * This problem may occur when the cluster is under load and 1024 * Sun Cluster cannot start the application within the timeout 1025 * period specified. You may consider increasing the 1026 * Monitor_Start_timeout property. 1027 * 1028 * Try switching the resource group to another node using 1029 * scswitch (1M). 1030 */ 1031 scds_syslog(LOG_ERR, 1032 "Failed to start fault monitor."); 1033 return (1); 1034 } 1035 1036 /* 1037 * SCMSGS 1038 * @explanation 1039 * The fault monitor for this data service was started successfully. 1040 * @user_action 1041 * No action needed. 1042 */ 1043 scds_syslog(LOG_INFO, "Started the fault monitor."); 1044 1045 return (SCHA_ERR_NOERR); 1046 } 1047 1048 1049 /* 1050 * This function stops the fault monitor for a apache resource. 1051 * This is done via PMF. The PMF tag for the fault monitor is 1052 * constructed based on <RG-name_RS-name,instance_number.mon>. 1053 */ 1054 1055 int 1056 mon_stop(scds_handle_t scds_handle) 1057 { 1058 scds_syslog_debug(DBG_LEVEL_HIGH, 1059 "Calling MONITOR_STOP method for resource %s.", 1060 scds_get_resource_name(scds_handle)); 1061 1062 if (scds_pmf_stop(scds_handle, SCDS_PMF_TYPE_MON, 1063 SCDS_PMF_SINGLE_INSTANCE, SIGKILL, -1) 1064 != SCHA_ERR_NOERR) { 1065 /* 1066 * SCMSGS 1067 * @explanation 1068 * An attempt was made to stop the fault monitor and it 1069 * failed. There may be prior messages in syslog indicating 1070 * specific problems. 1071 * @user_action 1072 * If there are prior messages in syslog indicating specific 1073 * problems, these should be corrected. If that doesn't 1074 * resolve the issue, the user can try the following. 1075 * 1076 * Use process monitor facility (pmfadm (1M)) with -L option 1077 * to retrieve all the tags that are running on the server. 1078 * Identify the tag name for the fault monitor of this 1079 * resource. This can be easily identified as the tag ends in 1080 * string ".mon" and contains the resource group name and the 1081 * resource name. Then use pmfadm (1M) with -s option to stop 1082 * the fault monitor. 1083 * 1084 * This problem may occur when the cluster is under load and 1085 * Sun Cluster cannot stop the fault monitor within the 1086 * timeout period specified. You may consider increasing the 1087 * Monitor_Stop_timeout property. 1088 * 1089 * If the error still persists, then reboot the node. 1090 */ 1091 scds_syslog(LOG_ERR, 1092 "Failed to stop fault monitor."); 1093 return (1); 1094 } 1095 1096 /* 1097 * SCMSGS 1098 * @explanation 1099 * The fault monitor for this data service was stopped successfully. 1100 * @user_action 1101 * No action needed. 1102 */ 1103 scds_syslog(LOG_INFO, "Stopped the fault monitor."); 1104 1105 return (SCHA_ERR_NOERR); 1106 } 1107 1108 1109 /* 1110 * svc_probe(): Do data service specific probing. Return a value 1111 * between 0 (success) and 100(complete failure). 1112 * 1113 * The probe does a simple socket connection to the apache server on the 1114 * specified port which is configured as the resource extension property 1115 * (Port_list) and pings the dataservice. If the probe fails to connect to 1116 * the port, we return a value of 100 indicating that there is a total 1117 * failure. If the connection goes through and the disconnect to the port 1118 * fails, then a value of 50 is returned indicating a partial failure. 1119 */ 1120 1121 int 1122 svc_probe(scds_handle_t scds_handle, char *hostname, 1123 int port, int timeout, boolean_t arg_syslog_msgs) 1124 { 1125 ulong_t t1, t2; 1126 int sock, rc = 0, retval = 0, time_used, time_remaining; 1127 size_t size = 0; 1128 char buf[SCDS_ARRAY_SIZE]; 1129 long connect_timeout; 1130 server_type st; 1131 1132 1133 /* 1134 * Probe the dataservice by doing a socket connection to the port 1135 * specified in the port_list property to the host that is 1136 * serving the apache dataservice. If the apache service which is 1137 * configured to listen on the specified port, replies to the 1138 * connection, then the probe is successfull. Else we will wait for 1139 * a time period set in probe_timeout property before concluding 1140 * that the probe failed. 1141 */ 1142 1143 /* 1144 * Use the SVC_CONNECT_TIMEOUT_PCT percentage of timeout 1145 * to connect to the port 1146 */ 1147 connect_timeout = (SVC_CONNECT_TIMEOUT_PCT * timeout)/100; 1148 t1 = (ulong_t)(gethrtime()/1E9); 1149 1150 /* 1151 * the probe makes a connection to the specified hostname and port. 1152 * The connection is timed for 95% of the actual probe_timeout. 1153 */ 1154 rc = scds_fm_tcp_connect(scds_handle, &sock, hostname, port, 1155 connect_timeout); 1156 if (rc) { 1157 /* 1158 * SCMSGS 1159 * @explanation 1160 * An error occurred while fault monitor attempted to probe 1161 * the health of the data service. 1162 * @user_action 1163 * Wait for the fault monitor to correct this by doing restart 1164 * or failover. For more error description, look at the syslog 1165 * messages. 1166 */ 1167 scds_syslog(LOG_ERR, 1168 "Failed to connect to host %s and " 1169 "port %d: %s.", 1170 hostname, port, strerror(errno)); 1171 /* this is a complete failure */ 1172 return (SCDS_PROBE_COMPLETE_FAILURE); 1173 } else { 1174 scds_syslog_debug(DBG_LEVEL_LOW, 1175 "Successful connection to server %s " 1176 "port %d for resource %s.", 1177 hostname, port, 1178 scds_get_resource_name(scds_handle)); 1179 } 1180 1181 t2 = (ulong_t)(gethrtime()/1E9); 1182 1183 /* 1184 * Compute the actual time it took to connect. This should be less than 1185 * or equal to connect_timeout, the time allocated to connect. 1186 * If the connect uses all the time that is allocated for it, 1187 * then the remaining value from the probe_timeout that is passed to 1188 * this function will be used as disconnect timeout. Otherwise, the 1189 * the remaining time from the connect call will also be added to 1190 * the disconnect timeout. 1191 * 1192 */ 1193 1194 time_used = (int)(t2 - t1); 1195 1196 /* 1197 * Use the remaining time(timeout - time_took_to_connect) to disconnect 1198 */ 1199 1200 time_remaining = timeout - time_used; 1201 1202 /* what type of server is it? */ 1203 st = webserver_type(scds_handle); 1204 if (st == ST_ERROR) { 1205 /* couldnt even figure out what the server is */ 1206 retval = SCDS_PROBE_COMPLETE_FAILURE; 1207 goto finished; 1208 } 1209 1210 /* 1211 * If all the time is used up, use a small hardcoded timeout 1212 * to still try to disconnect. This will avoid the fd leak. 1213 */ 1214 if (time_remaining <= 0) { 1215 scds_syslog_debug(DBG_LEVEL_LOW, 1216 "svc_probe used entire timeout of " 1217 "%d seconds during connect operation " 1218 "and exceeded the timeout by %d seconds. " 1219 "Attempting disconnect with timeout %d", 1220 connect_timeout, 1221 abs(time_remaining), 1222 SVC_DISCONNECT_TIMEOUT_SECONDS); 1223 1224 time_remaining = SVC_DISCONNECT_TIMEOUT_SECONDS; 1225 } else if (st == REGULAR) { /* dont bother secure servers */ 1226 1227 /* Generic HTML/1.0 HEAD check */ 1228 (void) strcpy(buf, "HEAD / HTTP/1.0\n\n"); 1229 1230 size = strlen(buf); 1231 if (scds_fm_tcp_write(scds_handle, sock, 1232 buf, &size, time_remaining) < 0) { 1233 /* 1234 * write()s should never fail unless the server 1235 * (apache) has closed its end of the socket. 1236 * That sounds like a serious problem. Hence 1 1237 * as probe result. 1238 */ 1239 retval = SCDS_PROBE_COMPLETE_FAILURE; 1240 if (arg_syslog_msgs) { 1241 /* 1242 * SCMSGS 1243 * @explanation 1244 * The data service fault monitor probe was 1245 * trying to read from or write to the service 1246 * specified and failed. Sun Cluster will 1247 * attempt to correct the situation by either 1248 * doing a restart or a failover of the data 1249 * service. The problem may be due to an 1250 * overloaded system or other problems, 1251 * causing a timeout to occur before 1252 * communications could be completed. 1253 * @user_action 1254 * If this problem is due to an overloaded 1255 * system, you may consider increasing the 1256 * Probe_timeout property. 1257 */ 1258 scds_syslog(LOG_ERR, 1259 "Failed to communicate with " 1260 "server %s port %d: %s.", 1261 hostname, port, strerror(errno)); 1262 } 1263 1264 goto finished; 1265 } 1266 1267 /* 1268 * Data sent to us by server may span several packets, 1269 * hence must do things in a loop(). 1270 */ 1271 do { 1272 t2 = (ulong_t)(gethrtime()/1E9); 1273 time_used = (int)(t2 - t1); 1274 time_remaining = timeout - time_used; 1275 if (time_remaining &l