1 // 2 // CDDL HEADER START 3 // 4 // The contents of this file are subject to the terms of the 5 // Common Development and Distribution License (the License). 6 // You may not use this file except in compliance with the License. 7 // 8 // You can obtain a copy of the license at usr/src/CDDL.txt 9 // or http://www.opensolaris.org/os/licensing. 10 // See the License for the specific language governing permissions 11 // and limitations under the License. 12 // 13 // When distributing Covered Code, include this CDDL HEADER in each 14 // file and include the License file at usr/src/CDDL.txt. 15 // If applicable, add the following below this CDDL HEADER, with the 16 // fields enclosed by brackets [] replaced with your own identifying 17 // information: Portions Copyright [yyyy] [name of copyright owner] 18 // 19 // CDDL HEADER END 20 // 21 22 // 23 // Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 // Use is subject to license terms. 25 // 26 27 #ifndef _MOUNT_SERVER_IMPL_H 28 #define _MOUNT_SERVER_IMPL_H 29 30 #pragma ident "@(#)mount_server_impl.h 1.48 08/05/20 SMI" 31 32 #ifdef __cplusplus 33 extern "C" { 34 #endif 35 36 #ifdef __cplusplus 37 } 38 #endif 39 40 #include <h/sol.h> 41 #include <h/pxfs.h> 42 #include <h/pxfs_v1.h> 43 #include <h/repl_pxfs.h> 44 #include <sys/list_def.h> // List template 45 #include <sys/threadpool.h> 46 #include <pxfs/common/pxfslib.h> 47 #include <pxfs/mount/mount_replica_impl.h> 48 #include <pxfs/mount/mount_debug.h> 49 #include <pxfs/mount/bitmap.h> 50 51 // Forward declarations. 52 class mount_server_impl; 53 class mount_client_elem; 54 55 // 56 // The mount HA service supports multiple versions of pxfs file system 57 // software. The pxfs file system software does not change instantaneously. 58 // The version of pxfs file system software is chosen at the time the 59 // system mounts the file system. This means that different file systems 60 // can concurrently be using different pxfs software versions. 61 // 62 // A pxfs file system can be mounted on a mount point within another 63 // pxfs file system. Obviously, one should not unmount a file system 64 // while there are still file systems mounted on mount points within this 65 // file system. Thus there can be an unmount dependency order between 66 // pxfs file systems. The mount service does not keep track of the true 67 // dependency order. Instead the mount service simply unmounts file systems 68 // in the reverse order in which the file systems were mounted. This is 69 // cautious, but safe. A file system using new pxfs version software can 70 // be mounted on a file system using old pxfs version software, 71 // and vice versa under unusual cases. So file systems using different 72 // version pxfs software can have dependencies between them. 73 // 74 // The above two facts led to the decision to jointly manage pxfs file 75 // systems using different software versions of pxfs. The convention is that 76 // the string "_v1" means that the entity supports pxfs version 1 file 77 // systems. 78 // 79 80 // 81 // Class used to store one file system mount element in a list of all mounts. 82 // 83 class fs_elem : public _DList::ListElem { 84 public: 85 fs_elem(pxfs_v1::filesystem_ptr fsp, const pxfs_v1::fs_info &finfo, 86 const sol::mounta &md, const char *options, bool is_ha, 87 const char *name, const sol::nodeid_seq_t &nids); 88 89 ~fs_elem(); 90 91 enum fs_elem_ver_t { VERSION_0, VERSION_1 }; 92 93 // 94 // Public data members. 95 // 96 97 // 98 // An fs_elem can support either version 0 or 1, 99 // but not both simultaneously. 100 // A union does not allow members requiring 101 // initialization or destruction. 102 // 103 pxfs_v1::filesystem_var fs_v1_ptr; 104 105 union { 106 pxfs_v1::fs_info fs_v1_info; 107 }; 108 109 sol::mounta ma; 110 CORBA::String_var mntoptions; 111 bool dev_is_ha; 112 CORBA::String_var dev_name; 113 sol::nodeid_seq_t dev_nids; 114 fs_elem_ver_t fs_elem_ver; 115 116 private: 117 // Disallowed operations. 118 fs_elem(); 119 fs_elem & operator = (const fs_elem &); 120 fs_elem(const fs_elem &); 121 }; 122 123 // 124 // Class used to record device locks (see mount_server_impl::devlock()). 125 // 126 class devlock_elem : public _SList::ListElem { 127 public: 128 devlock_elem(fs::mount_client_ptr client_p, sol::nodeid_t nodeid, 129 const char *name); 130 ~devlock_elem(); 131 132 // 133 // Public data members. 134 // 135 fs::mount_client_var owner; // node which owns the lock 136 char *spec; // device path name 137 uint_t nwaiters; // non 0 if someone is waiting 138 sol::nodeid_t ownerid; // nodeid which owns the lock 139 bitmap<NODEID_MAX> waiters; // for deadlock detection 140 bool unlocked; // true if unlock was signaled 141 os::mutex_t waiter_lock; // signal device lock waiters 142 os::condvar_t waiter_cv; // signal device lock waiters 143 144 private: 145 // Disallowed operations. 146 devlock_elem & operator = (const devlock_elem &); 147 devlock_elem(const devlock_elem &); 148 }; 149 150 // 151 // Class used to store mount_client elements. 152 // 153 class mount_client_elem : 154 public mc_replica_of<fs::mount_client_died>, 155 public _SList::ListElem 156 { 157 public: 158 // Primary constructor. 159 mount_client_elem(mount_server_impl *server, 160 fs::mount_client_ptr client, sol::nodeid_t nid, 161 mount_replica_impl *replp); 162 163 // Secondary constructor. 164 mount_client_elem(mount_server_impl *server, 165 fs::mount_client_ptr client, sol::nodeid_t nid, 166 bool is_shutdown, fs::mount_client_died_ptr obj); 167 168 ~mount_client_elem(); 169 void _unreferenced(unref_t arg); 170 171 // 172 // Public data members. 173 // 174 mount_server_impl *serverp; // pointer to mount server 175 fs::mount_client_var clientptr; // mount_client reference 176 sol::nodeid_t nodeid; // nodeid of the client 177 bool shutdown; // true if being shut down 178 179 private: 180 // Disallowed operations. 181 mount_client_elem & operator = (const mount_client_elem &); 182 mount_client_elem(const mount_client_elem &); 183 }; 184 185 // 186 // This is used by mount_server::mount() to detect when the client 187 // crashes and there is a failover of the server. In this case, 188 // we need to unlock the mount point on all nodes. 189 // 190 class mount_state : public transaction_state { 191 public: 192 enum mount_ver_t { VERSION_0, VERSION_1, VERSION_UNKNOWN }; 193 194 mount_state(const sol::mounta &md, fs::mount_client_ptr client_p, 195 mount_server_impl &srvr, mount_ver_t ver); 196 197 virtual ~mount_state(); 198 199 // 200 // Routines to implement the transaction_state template. 201 // 202 virtual void orphaned(Environment &e); 203 virtual void committed(); 204 205 // Public data members. 206 fs::mount_client_var mountpoint_lock_c; 207 sol::mounta ma; 208 sol::error_t error; 209 bool is_remount; 210 211 // 212 // A mount_state can support either version 0 or 1, 213 // but not both simultaneously. 214 // A union does not allow members requiring 215 // initialization or destruction. 216 // 217 pxfs_v1::filesystem_var fs_v1_ptr; 218 219 union { 220 pxfs_v1::fs_info fs_v1_info; 221 }; 222 223 CORBA::String_var mntoptions; 224 mount_ver_t mount_ver; 225 226 private: 227 // Reference to server which created this object. 228 mount_server_impl &server; 229 230 // Disallowed operations. 231 mount_state(); 232 mount_state & operator = (const mount_state &); 233 mount_state(const mount_state &); 234 }; 235 236 // 237 // This is used by mount_server::remove_client() to detect when the client 238 // crashes and there is a failover of the server. In this case, 239 // we need to unlock the mount point on all nodes. 240 // 241 class unmount_state : public transaction_state { 242 public: 243 unmount_state(pxfs_v1::filesystem_ptr fsptr, int32_t flags, 244 solobj::cred_ptr crptr, mount_server_impl &srvr, 245 fs::mount_client_ptr clptr); 246 247 virtual ~unmount_state(); 248 249 enum unmount_ver_t { VERSION_0, VERSION_1 }; 250 251 // 252 // Routines to implement the transaction_state template. 253 // 254 virtual void orphaned(Environment &e); 255 virtual void committed(); 256 257 enum state_t { 258 START, UNMOUNTED, NOTIFIED, COMMITTED 259 }; 260 261 // 262 // Public data members. 263 // 264 fs::mount_client_var skip; // skip prepare_unmount() 265 266 int32_t flags; // umount system call flags 267 // 268 // An unmount_state can support either version 0 or 1, 269 // but not both simultaneously. 270 // A union does not allow members requiring 271 // initialization or destruction. 272 // 273 // file system being unmounted 274 // 275 pxfs_v1::filesystem_var fs_v1_obj; 276 277 solobj::cred_var credobj; 278 state_t state; // checkpoint state for unmount 279 sol::error_t error; 280 CORBA::String_var service_name; // HA FS service to shut down 281 unmount_ver_t unmount_ver; 282 283 private: 284 // Reference to server which created this object. 285 mount_server_impl &server; 286 287 // Disallowed operations. 288 unmount_state(); 289 unmount_state & operator = (const unmount_state &); 290 unmount_state(const unmount_state &); 291 }; 292 293 // 294 // This is used by DCS to notify the mount server of device configuration 295 // changes. 296 // 297 class dc_callback_impl : public mc_replica_of<fs::dc_callback> { 298 public: 299 // Primary constructor. 300 dc_callback_impl(mount_server_impl &srvr, mount_replica_impl *serverp); 301 302 // Secondary constructor. 303 dc_callback_impl(mount_server_impl &srvr, fs::dc_callback_ptr obj); 304 305 ~dc_callback_impl(); 306 void _unreferenced(unref_t arg); 307 308 protected: 309 // fs::dc_callback::* 310 /* dc_callback */ 311 void notify_change(sol::dev_t gdev, const sol::nodeid_seq_t &nodes, 312 Environment &_environment); 313 314 bool still_active(sol::dev_t gdev, Environment &_environment); 315 // 316 317 private: 318 // Reference to server which created this object. 319 mount_server_impl &server; 320 }; 321 322 // 323 // Note: Only the HA version of fs::mount_server interface is 324 // needed since there is only one mount server per cluster. 325 // All nodes need to be able to become the primary so that the list 326 // of mounted files is never lost (also enables rolling upgrade). 327 // 328 class mount_server_impl : public mc_replica_of<fs::mount_server> { 329 public: 330 // Primary constructor. 331 mount_server_impl(mount_replica_impl *serverp); 332 333 // Secondary constructor. 334 mount_server_impl(mount_replica_impl *serverp, 335 fs::mount_server_ptr obj); 336 337 ~mount_server_impl(); 338 339 void _generic_method(CORBA::octet_seq_t &, 340 CORBA::object_seq_t &, Environment &); 341 342 void _unreferenced(unref_t arg); 343 344 // 345 // Called from mount_replica_impl when switching to primary, 346 // secondary, or spare. 347 // 348 void convert_to_primary(); 349 void convert_to_secondary(); 350 void convert_to_spare(); 351 void freeze_primary(); 352 void unfreeze_primary(); 353 354 // 355 // This is called to upgrade mount_client references 356 // during Rolling Upgrade commit. 357 // 358 void upgrade_client_reference(Environment &_environment); 359 360 // 361 // This is called if a mount client dies 362 // (mount_client_elem::_unreferenced()). 363 // 364 void client_died(mount_client_elem *cep); 365 366 // Helper routine for handling mount clients dying. See client_died(). 367 void do_unref(mount_client_elem *unref_clientp); 368 369 // 370 // Helper routine for mount_state::orphaned() to clean up mount(). 371 // Return true if the operation is completed. 372 // 373 bool mount_orphaned(mount_state *statep, bool orph, 374 Environment &env); 375 376 // 377 // Common code for mount(), remount() and mount_orphaned(). 378 // 379 void mount_end_v1(fs::mount_client_ptr skip, const sol::mounta &ma, 380 const char *mntoptions, pxfs_v1::filesystem_ptr fs, 381 const pxfs_v1::fs_info &fsinfo, bool is_remount, Environment &env); 382 383 // Helper routine for unmount_state::orphaned() to clean up unmount(). 384 void unmount_orphaned(unmount_state *statep, bool ret_err, 385 Environment &env); 386 387 // Helper routine to update device configuration changes. 388 void notify_change(sol::dev_t gdev, const sol::nodeid_seq_t &nodes, 389 Environment &env); 390 391 // Helper routine to check if device is in use. 392 bool still_active(sol::dev_t gdev); 393 394 // 395 // Helper functions for checkpointing state on a secondary. 396 // These are similar to the IDL operations above but don't have 397 // the _environment. 398 // 399 void ckpt_add_client(fs::mount_client_died_ptr clobj, 400 fs::mount_client_ptr client_p, sol::nodeid_t nodeid, bool shutdown); 401 402 void ckpt_remove_client(fs::mount_client_ptr client_p); 403 404 void ckpt_mount_start_v1(const sol::mounta &ma, 405 fs::mount_client_ptr client, Environment &env); 406 407 void ckpt_mount_err(sol::error_t error, Environment &env); 408 409 void ckpt_mount_v1( 410 pxfs_v1::filesystem_ptr fs, 411 const pxfs_v1::fs_info &fsinfo, 412 const sol::mounta &ma, 413 const char *mntoptions, 414 bool dev_is_ha, 415 const char *dev_name, 416 const sol::nodeid_seq_t &dev_nids); 417 418 void ckpt_mount_middle_v1( 419 pxfs_v1::filesystem_ptr fs, 420 const pxfs_v1::fs_info &fsinfo, 421 const char *mntoptions, 422 bool dev_is_ha, 423 const char *dev_name, 424 const sol::nodeid_seq_t &dev_nids, 425 Environment &env); 426 427 void ckpt_remount_middle_v1( 428 pxfs_v1::filesystem_ptr fs, 429 uint32_t vfsflags, 430 const char *mntoptions, 431 Environment &env); 432 433 void ckpt_unmount_start_v1( 434 pxfs_v1::filesystem_ptr fs, 435 int32_t flags, 436 solobj::cred_ptr credobj, 437 fs::mount_client_ptr client, 438 bool is_shutdown, 439 Environment &env); 440 441 void ckpt_unmount_middle(sol::error_t error, Environment &env); 442 443 void ckpt_unmount_notified(Environment &env); 444 445 void ckpt_unmount_end(Environment &env); 446 447 void ckpt_unmount_shutdown(fs::mount_client_ptr client); 448 449 void ckpt_devlock(fs::mount_client_ptr client, sol::nodeid_t nodeid, 450 const char *dev_name); 451 452 void ckpt_devunlock(const char *dev_name); 453 454 void ckpt_get_dc_callback(fs::dc_callback_ptr cb); 455 456 void ckpt_notify_change(sol::dev_t gdev, 457 const sol::nodeid_seq_t &dev_nids); 458 459 void ckpt_upgrade_client_list(fs::mount_client_ptr client_p, 460 sol::nodeid_t nodeid); 461 462 void ckpt_upgrade_devlock_list(const char *dev_name, 463 fs::mount_client_ptr client_p); 464 465 void dump_state(repl_pxfs::mount_replica_ptr ckptp, 466 Environment &env); 467 468 // Checkpoint accessor function 469 repl_pxfs::mount_replica_ptr get_checkpoint(); 470 471 protected: 472 // fs::mount_server::* 473 /* mount_server */ 474 void add_client(fs::mount_client_ptr client_p, sol::nodeid_t nodeid, 475 fs::mount_client_died_out clobj, Environment &_environment); 476 477 void remove_client(fs::mount_client_ptr client_p, 478 solobj::cred_ptr credobj, Environment &_environment); 479 480 void mount(const sol::mounta &ma, sol::uintptr_t mvp, 481 solobj::cred_ptr credobj, fs::mount_client_ptr client_p, 482 bool dev_is_ha, const char *dev_name, 483 const sol::nodeid_seq_t &dev_nids, fs::filesystem_out fs, 484 fs::fs_info &fsinfo, CORBA::String_out mntoptions, 485 Environment &_environment); 486 487 void mount_v1(const sol::mounta &ma, sol::uintptr_t mvp, 488 solobj::cred_ptr credobj, fs::mount_client_ptr client_p, 489 bool dev_is_ha, const char *dev_name, 490 const sol::nodeid_seq_t &dev_nids, pxfs_v1::filesystem_out fs, 491 pxfs_v1::fs_info &fsinfo, CORBA::String_out mntoptions, 492 Environment &_environment); 493 494 void remount(fs::filesystem_ptr fs, fs::fobj_ptr mntpnt, 495 const sol::mounta &ma, solobj::cred_ptr credobj, 496 fs::mount_client_ptr client_p, uint32_t &vfsflags, 497 CORBA::String_out mntoptions, Environment &_environment); 498 499 void remount_v1(pxfs_v1::filesystem_ptr fs, pxfs_v1::fobj_ptr mntpnt, 500 const sol::mounta &ma, solobj::cred_ptr credobj, 501 fs::mount_client_ptr client_p, uint32_t &vfsflags, 502 CORBA::String_out mntoptions, Environment &_environment); 503 504 void unmount(fs::filesystem_ptr fs, solobj::cred_ptr credobj, 505 fs::mount_client_ptr client_p, sol::nodeid_t nodeid, 506 bool is_shutdown, Environment &_environment); 507 508 void unmount_1(fs::filesystem_ptr fs, int32_t flags, 509 solobj::cred_ptr credobj, 510 fs::mount_client_ptr c, sol::nodeid_t nodeid, 511 bool is_shutdown, Environment &_environment); 512 513 void unmount_v1(pxfs_v1::filesystem_ptr fs, int32_t flags, 514 solobj::cred_ptr credobj, 515 fs::mount_client_ptr client_p, sol::nodeid_t nodeid, 516 bool is_shutdown, Environment &_environment); 517 518 void devlock(fs::mount_client_ptr client_p, sol::nodeid_t nodeid, 519 const char *dev_name, Environment &_environment); 520 521 void devunlock(const char *dev_name, Environment &_environment); 522 523 void get_devlock_owner(const char *dev_name, 524 sol::nodeid_t &lock_owner, Environment &_environment); 525 526 fs::dc_callback_ptr get_dc_callback(Environment &_environment); 527 // 528 529 private: 530 enum fs_status_t { 531 AVAILABLE, NOT_AVAILABLE, ERROR 532 }; 533 534 // Helper function to test for simultaneous remounts 535 void check_multiple_remounts(const char *pathp, 536 Environment &_environment); 537 538 // Version for forced unmount rolling upgrade support 539 sol::error_t unmount_common_1(fs::mount_client_ptr skip, 540 fs_elem *fep, int32_t flags, 541 solobj::cred_ptr credobj, unmount_state::state_t state, 542 sol::error_t error, const char *service_name, Environment &env); 543 544 // Helper routine to lock mount points. 545 sol::error_t lock_mntpnt(fs::mount_client_ptr skip, 546 const char *mountpoint, int32_t mntflags, Environment &env); 547 548 // Helper routine to unlock mount points already locked. 549 void unlock_mntpnt(fs::mount_client_ptr skip, 550 mount_client_elem *endp, 551 const char *mountpoint, Environment &env); 552 553 // Helper routine to find a mount_client_elem given a node number. 554 mount_client_elem *find_client(nodeid_t nid); 555 mount_client_elem *find_client(fs::mount_client_ptr client_p); 556 557 // Return the fs_elem for fs or NULL if not in fs_list. 558 fs_elem *find_fs(pxfs_v1::filesystem_ptr fs); 559 fs_elem *find_fs(const char *spec); 560 561 // Return the devlock_elem or NULL if not in devlock_list. 562 devlock_elem *find_devlock(const char *spec); 563 564 // Return true if nodeid is waiting for a lock. 565 bool find_devlock_waiter(sol::nodeid_t nodeid); 566 567 // Find out if the given filesystem is available or not. 568 fs_status_t get_fs_status(fs_elem *fep); 569 570 // 571 // Lists of mount clients and file systems. 572 // 573 // lock ordering: 574 // client_list_lock.lock(); 575 // fs_list_lock.lock(); 576 // devlock_list_lock.lock(); 577 // 578 579 typedef IntrList<mount_client_elem, _SList> client_list_t; 580 typedef IntrList<fs_elem, _DList> fs_list_t; 581 typedef IntrList<devlock_elem, _SList> devlock_list_t; 582 583 // List of mount clients 584 client_list_t client_list; 585 586 // Lock to protect the list of clients. 587 os::rwlock_t client_list_lock; 588 589 // List of globally mounted file systems. 590 fs_list_t fs_list; 591 592 // Lock to protect the list of filesystems. 593 os::rwlock_t fs_list_lock; 594 595 // List of device locks to prevent multiple simultaneous fsck's. 596 devlock_list_t devlock_list; 597 598 // Lock to protect the list of device locks. 599 os::rwlock_t devlock_list_lock; 600 601 // Lock protecting current the currentmnt string 602 os::mutex_t current_mount_lock; 603 604 // CV to broadcast change of currentmnt state back to NULL 605 os::condvar_t currentmnt_cv; 606 607 // 608 // IDL pointer to the dc_callback object. 609 // This is protected by fs_list_lock 610 // (could be a separate lock though). 611 // 612 fs::dc_callback_var dc_callback_obj; 613 614 // Pointer to our replica server object. 615 mount_replica_impl *repl_serverp; 616 617 // True if this replica is the primary. 618 bool primary; 619 620 // True if this service is frozen. Protected by fs_list_lock. 621 bool frozen; 622 623 // Current mountpoint that is being processed. 624 char *currentmnt; 625 }; 626 627 #ifdef __cplusplus 628 extern "C" { 629 #endif 630 631 #ifdef __cplusplus 632 } 633 #endif 634 635 #endif // _MOUNT_SERVER_IMPL_H 636