Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * sun4v Memory DR Module
     29  */
     30 
     31 
     32 #include <sys/types.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/vmem.h>
     35 #include <sys/kmem.h>
     36 #include <sys/systm.h>
     37 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
     38 #include <sys/errno.h>
     39 #include <sys/memnode.h>
     40 #include <sys/memlist.h>
     41 #include <sys/memlist_impl.h>
     42 #include <sys/tuneable.h>
     43 #include <sys/proc.h>
     44 #include <sys/disp.h>
     45 #include <sys/debug.h>
     46 #include <sys/vm.h>
     47 #include <sys/callb.h>
     48 #include <sys/memlist_plat.h>	/* for installed_top_size() */
     49 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
     50 #include <sys/dumphdr.h>	/* for dump_resize() */
     51 #include <sys/atomic.h>		/* for use in stats collection */
     52 #include <sys/rwlock.h>
     53 #include <vm/seg_kmem.h>
     54 #include <vm/seg_kpm.h>
     55 #include <vm/page.h>
     56 #include <vm/vm_dep.h>
     57 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
     58 #include <sys/sunddi.h>
     59 #include <sys/mem_config.h>
     60 #include <sys/mem_cage.h>
     61 #include <sys/lgrp.h>
     62 #include <sys/ddi.h>
     63 
     64 #include <sys/modctl.h>
     65 #include <sys/sysevent/dr.h>
     66 #include <sys/mach_descrip.h>
     67 #include <sys/mdesc.h>
     68 #include <sys/ds.h>
     69 #include <sys/drctl.h>
     70 #include <sys/dr_util.h>
     71 #include <sys/dr_mem.h>
     72 
     73 
     74 /*
     75  * DR operations are subject to Memory Alignment restrictions
     76  * for both address and the size of the request.
     77  */
     78 #define	MA_ADDR	0x10000000	/* addr alignment 256M */
     79 #define	MA_SIZE	0x10000000	/* size alignment 256M */
     80 
     81 #define	MBLK_IS_VALID(m) \
     82 	(IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))
     83 
     84 static memhandle_t dr_mh;	/* memory handle for delete */
     85 
     86 static struct modlmisc modlmisc = {
     87 	&mod_miscops,
     88 	"sun4v memory DR"
     89 };
     90 
     91 static struct modlinkage modlinkage = {
     92 	MODREV_1,
     93 	(void *)&modlmisc,
     94 	NULL
     95 };
     96 
     97 static int dr_mem_allow_unload = 0;
     98 
     99 typedef int (*fn_t)(dr_mem_blk_t *, int *);
    100 
    101 /*
    102  * Global Domain Services (DS) Handle
    103  */
    104 static ds_svc_hdl_t ds_handle;
    105 
    106 /*
    107  * Supported DS Capability Versions
    108  */
    109 static ds_ver_t		dr_mem_vers[] = { { 1, 0 } };
    110 #define	DR_MEM_NVERS	(sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))
    111 
    112 /*
    113  * DS Capability Description
    114  */
    115 static ds_capability_t dr_mem_cap = {
    116 	DR_MEM_DS_ID,		/* svc_id */
    117 	dr_mem_vers,		/* vers */
    118 	DR_MEM_NVERS		/* nvers */
    119 };
    120 
    121 /*
    122  * DS Callbacks
    123  */
    124 static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
    125 static void dr_mem_unreg_handler(ds_cb_arg_t arg);
    126 static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
    127 
    128 /*
    129  * DS Client Ops Vector
    130  */
    131 static ds_clnt_ops_t dr_mem_ops = {
    132 	dr_mem_reg_handler,	/* ds_reg_cb */
    133 	dr_mem_unreg_handler,	/* ds_unreg_cb */
    134 	dr_mem_data_handler,	/* ds_data_cb */
    135 	NULL			/* cb_arg */
    136 };
    137 
    138 /*
    139  * Operation Results
    140  *
    141  * Used internally to gather results while an operation on a
    142  * list of mblks is in progress. In particular, it is used to
    143  * keep track of which mblks have already failed so that they are
    144  * not processed further, and the manner in which they failed.
    145  */
    146 typedef struct {
    147 	uint64_t	addr;
    148 	uint64_t	size;
    149 	uint32_t	result;
    150 	uint32_t	status;
    151 	char		*string;
    152 } dr_mem_res_t;
    153 
    154 static char *
    155 dr_mem_estr[] = {
    156 	"operation succeeded",		/* DR_MEM_RES_OK */
    157 	"operation failed",		/* DR_MEM_RES_FAILURE */
    158 	"operation was blocked",	/* DR_MEM_RES_BLOCKED */
    159 	"memory not defined in MD",	/* DR_MEM_RES_NOT_IN_MD */
    160 	"memory already in use",	/* DR_MEM_RES_ESPAN */
    161 	"memory access test failed",	/* DR_MEM_RES_EFAULT */
    162 	"resource not available",	/* DR_MEM_RES_ERESOURCE */
    163 	"permanent pages in span",	/* DR_MEM_RES_PERM */
    164 	"memory span busy",		/* DR_MEM_RES_EBUSY */
    165 	"VM viability test failed",	/* DR_MEM_RES_ENOTVIABLE */
    166 	"no pages to unconfigure",	/* DR_MEM_RES_ENOWORK */
    167 	"operation cancelled",		/* DR_MEM_RES_ECANCELLED */
    168 	"operation refused",		/* DR_MEM_RES_EREFUSED */
    169 	"memory span duplicate",	/* DR_MEM_RES_EDUP */
    170 	"invalid argument"		/* DR_MEM_RES_EINVAL */
    171 };
    172 
    173 typedef struct {
    174 	kcondvar_t cond;
    175 	kmutex_t lock;
    176 	int error;
    177 	int done;
    178 } mem_sync_t;
    179 
    180 /*
    181  * Internal Functions
    182  */
    183 static int dr_mem_init(void);
    184 static int dr_mem_fini(void);
    185 
    186 static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
    187 static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
    188 static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
    189 static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
    190 
    191 static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
    192 static int dr_mem_configure(dr_mem_blk_t *, int *);
    193 static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);
    194 
    195 static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
    196 static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
    197 static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
    198     dr_mem_hdr_t **respp);
    199 
    200 static int dr_mem_find(dr_mem_blk_t *mbp);
    201 static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);
    202 
    203 static int mem_add(pfn_t, pgcnt_t);
    204 static int mem_del(pfn_t, pgcnt_t);
    205 
    206 extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);
    207 
    208 int
    209 _init(void)
    210 {
    211 	int	status;
    212 
    213 	/* check that Memory DR is enabled */
    214 	if (dr_is_disabled(DR_TYPE_MEM))
    215 		return (ENOTSUP);
    216 
    217 	if ((status = dr_mem_init()) != 0) {
    218 		cmn_err(CE_NOTE, "Memory DR initialization failed");
    219 		return (status);
    220 	}
    221 
    222 	if ((status = mod_install(&modlinkage)) != 0) {
    223 		(void) dr_mem_fini();
    224 	}
    225 
    226 	return (status);
    227 }
    228 
    229 int
    230 _info(struct modinfo *modinfop)
    231 {
    232 	return (mod_info(&modlinkage, modinfop));
    233 }
    234 
    235 int
    236 _fini(void)
    237 {
    238 	int	status;
    239 
    240 	if (dr_mem_allow_unload == 0)
    241 		return (EBUSY);
    242 
    243 	if ((status = mod_remove(&modlinkage)) == 0) {
    244 		(void) dr_mem_fini();
    245 	}
    246 
    247 	return (status);
    248 }
    249 
    250 static int
    251 dr_mem_init(void)
    252 {
    253 	int rv;
    254 
    255 	if ((rv = ds_cap_init(&dr_mem_cap, &dr_mem_ops)) != 0) {
    256 		cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", rv);
    257 		return (rv);
    258 	}
    259 
    260 	return (0);
    261 }
    262 
    263 static int
    264 dr_mem_fini(void)
    265 {
    266 	int rv;
    267 
    268 	if ((rv = ds_cap_fini(&dr_mem_cap)) != 0) {
    269 		cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", rv);
    270 	}
    271 
    272 	return (rv);
    273 }
    274 
    275 static void
    276 dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
    277 {
    278 	DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
    279 	    ver->major, ver->minor, hdl);
    280 
    281 	ds_handle = hdl;
    282 }
    283 
    284 static void
    285 dr_mem_unreg_handler(ds_cb_arg_t arg)
    286 {
    287 	DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);
    288 
    289 	ds_handle = DS_INVALID_HDL;
    290 }
    291 
    292 /*ARGSUSED*/
    293 static void
    294 dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
    295 {
    296 	dr_mem_hdr_t	*req = buf;
    297 	dr_mem_hdr_t	err_resp;
    298 	dr_mem_hdr_t	*resp = &err_resp;
    299 	int		resp_len = 0;
    300 	int		rv = EINVAL;
    301 
    302 	/*
    303 	 * Sanity check the message
    304 	 */
    305 	if (buflen < sizeof (dr_mem_hdr_t)) {
    306 		DR_DBG_MEM("incoming message short: expected at least %ld "
    307 		    "bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
    308 		goto done;
    309 	}
    310 
    311 	if (req == NULL) {
    312 		DR_DBG_MEM("empty message: expected at least %ld bytes\n",
    313 		    sizeof (dr_mem_hdr_t));
    314 		goto done;
    315 	}
    316 
    317 	DR_DBG_MEM("incoming request:\n");
    318 	DR_DBG_DUMP_MSG(buf, buflen);
    319 
    320 	/*
    321 	 * Process the command
    322 	 */
    323 	switch (req->msg_type) {
    324 	case DR_MEM_CONFIGURE:
    325 	case DR_MEM_UNCONFIGURE:
    326 		if (req->msg_arg == 0) {
    327 			DR_DBG_MEM("No mblks specified for operation\n");
    328 			goto done;
    329 		}
    330 		if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
    331 			DR_DBG_MEM("%s failed (%d)\n",
    332 			    (req->msg_type == DR_MEM_CONFIGURE) ?
    333 			    "Memory configure" : "Memory unconfigure", rv);
    334 		}
    335 		break;
    336 
    337 	case DR_MEM_UNCONF_STATUS:
    338 		if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
    339 			DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
    340 		break;
    341 
    342 	case DR_MEM_UNCONF_CANCEL:
    343 		if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
    344 			DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
    345 		break;
    346 
    347 	case DR_MEM_QUERY:
    348 		if (req->msg_arg == 0) {
    349 			DR_DBG_MEM("No mblks specified for operation\n");
    350 			goto done;
    351 		}
    352 		if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
    353 			DR_DBG_MEM("Memory query failed (%d)\n", rv);
    354 		break;
    355 
    356 	default:
    357 		cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
    358 		    req->msg_type);
    359 		break;
    360 	}
    361 
    362 done:
    363 	/* check if an error occurred */
    364 	if (resp == &err_resp) {
    365 		resp->req_num = (req) ? req->req_num : 0;
    366 		resp->msg_type = DR_MEM_ERROR;
    367 		resp->msg_arg = rv;
    368 		resp_len = sizeof (dr_mem_hdr_t);
    369 	}
    370 
    371 	DR_DBG_MEM("outgoing response:\n");
    372 	DR_DBG_DUMP_MSG(resp, resp_len);
    373 
    374 	/* send back the response */
    375 	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
    376 		DR_DBG_MEM("ds_send failed\n");
    377 	}
    378 
    379 	/* free any allocated memory */
    380 	if (resp != &err_resp) {
    381 		kmem_free(resp, resp_len);
    382 	}
    383 }
    384 
    385 /*
    386  * Common routine to config or unconfig multiple mblks.
    387  *
    388  * Note: Do not modify result buffer or length on error.
    389  */
    390 static int
    391 dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
    392 {
    393 	int		rv;
    394 	int		idx;
    395 	int		count;
    396 	int		result;
    397 	int		status;
    398 	fn_t		dr_fn;
    399 	int		se_hint;
    400 	dr_mem_blk_t	*req_mblks;
    401 	dr_mem_res_t	*res;
    402 	int		drctl_cmd;
    403 	int		drctl_flags = 0;
    404 	drctl_rsrc_t	*drctl_req;
    405 	size_t		drctl_req_len;
    406 	drctl_resp_t	*drctl_resp;
    407 	drctl_rsrc_t	*drctl_rsrc;
    408 	size_t		drctl_resp_len = 0;
    409 	drctl_cookie_t	drctl_res_ck;
    410 
    411 	ASSERT((req != NULL) && (req->msg_arg != 0));
    412 
    413 	count = req->msg_arg;
    414 
    415 	/*
    416 	 * Extract all information that is specific
    417 	 * to the various types of operations.
    418 	 */
    419 	switch (req->msg_type) {
    420 	case DR_MEM_CONFIGURE:
    421 		dr_fn = dr_mem_configure;
    422 		drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
    423 		se_hint = SE_HINT_INSERT;
    424 		break;
    425 	case DR_MEM_UNCONFIGURE:
    426 		dr_fn = dr_mem_unconfigure;
    427 		drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
    428 		se_hint = SE_HINT_REMOVE;
    429 		break;
    430 	default:
    431 		/* Programming error if we reach this. */
    432 		cmn_err(CE_NOTE, "%s: bad msg_type %d\n",
    433 		    __func__, req->msg_type);
    434 		ASSERT(0);
    435 		return (-1);
    436 	}
    437 
    438 	/* the incoming array of mblks to operate on */
    439 	req_mblks = DR_MEM_CMD_MBLKS(req);
    440 
    441 	/* allocate drctl request msg based on incoming resource count */
    442 	drctl_req_len = sizeof (drctl_rsrc_t) * count;
    443 	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);
    444 
    445 	/* copy the size for the drctl call from the incoming request msg */
    446 	for (idx = 0; idx < count; idx++) {
    447 		drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
    448 		drctl_req[idx].res_mem_size = req_mblks[idx].size;
    449 	}
    450 
    451 	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
    452 	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);
    453 
    454 	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));
    455 
    456 	if (rv != 0) {
    457 		DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
    458 		    __func__, rv);
    459 		kmem_free(drctl_resp, drctl_resp_len);
    460 		kmem_free(drctl_req, drctl_req_len);
    461 		return (rv);
    462 	}
    463 
    464 	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);
    465 
    466 	drctl_rsrc = drctl_resp->resp_resources;
    467 
    468 	/* create the result scratch array */
    469 	res = dr_mem_res_array_init(req, drctl_rsrc, count);
    470 
    471 	/* perform the specified operation on each of the mblks */
    472 	for (idx = 0; idx < count; idx++) {
    473 		/*
    474 		 * If no action will be taken against the current
    475 		 * mblk, update the drctl resource information to
    476 		 * ensure that it gets recovered properly during
    477 		 * the drctl fini() call.
    478 		 */
    479 		if (res[idx].result != DR_MEM_RES_OK) {
    480 			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
    481 			continue;
    482 		}
    483 
    484 		/* call the function to perform the actual operation */
    485 		result = (*dr_fn)(&req_mblks[idx], &status);
    486 
    487 		/* save off results of the operation */
    488 		res[idx].result = result;
    489 		res[idx].status = status;
    490 		res[idx].addr = req_mblks[idx].addr;	/* for partial case */
    491 		res[idx].size = req_mblks[idx].size;	/* for partial case */
    492 		res[idx].string = i_ddi_strdup(dr_mem_estr[result], KM_SLEEP);
    493 
    494 		/* save result for drctl fini() reusing init() msg memory */
    495 		drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
    496 		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;
    497 
    498 		DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
    499 		    __func__, req_mblks[idx].addr, req_mblks[idx].size,
    500 		    drctl_req[idx].status, result,
    501 		    (res[idx].string) ? res[idx].string : "");
    502 	}
    503 
    504 	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
    505 		DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
    506 		    __func__, rv);
    507 
    508 	/*
    509 	 * Operation completed without any fatal errors.
    510 	 * Pack the response for transmission.
    511 	 */
    512 	*resp_len = dr_mem_pack_response(req, res, resp);
    513 
    514 	/* notify interested parties about the operation */
    515 	dr_generate_event(DR_TYPE_MEM, se_hint);
    516 
    517 	/*
    518 	 * Deallocate any scratch memory.
    519 	 */
    520 	kmem_free(drctl_resp, drctl_resp_len);
    521 	kmem_free(drctl_req, drctl_req_len);
    522 
    523 	dr_mem_res_array_fini(res, count);
    524 
    525 	return (0);
    526 }
    527 
    528 /*
    529  * Allocate and initialize a result array based on the initial
    530  * drctl operation. A valid result array is always returned.
    531  */
    532 static dr_mem_res_t *
    533 dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
    534 {
    535 	int		idx;
    536 	dr_mem_res_t	*res;
    537 	char		*err_str;
    538 	size_t		err_len;
    539 
    540 	/* allocate zero filled buffer to initialize fields */
    541 	res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);
    542 
    543 	/*
    544 	 * Fill in the result information for each resource.
    545 	 */
    546 	for (idx = 0; idx < nrsrc; idx++) {
    547 		res[idx].addr = rsrc[idx].res_mem_addr;
    548 		res[idx].size = rsrc[idx].res_mem_size;
    549 		res[idx].result = DR_MEM_RES_OK;
    550 
    551 		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
    552 			continue;
    553 
    554 		/*
    555 		 * Update the state information for this mblk.
    556 		 */
    557 		res[idx].result = DR_MEM_RES_BLOCKED;
    558 		res[idx].status = (req->msg_type == DR_MEM_CONFIGURE) ?
    559 		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;
    560 
    561 		/*
    562 		 * If an error string exists, copy it out of the
    563 		 * message buffer. This eliminates any dependency
    564 		 * on the memory allocated for the message buffer
    565 		 * itself.
    566 		 */
    567 		if (rsrc[idx].offset != NULL) {
    568 			err_str = (char *)rsrc + rsrc[idx].offset;
    569 			err_len = strlen(err_str) + 1;
    570 
    571 			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
    572 			bcopy(err_str, res[idx].string, err_len);
    573 		}
    574 	}
    575 
    576 	return (res);
    577 }
    578 
    579 static void
    580 dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
    581 {
    582 	int	idx;
    583 	size_t	str_len;
    584 
    585 	for (idx = 0; idx < nres; idx++) {
    586 		/* deallocate the error string if present */
    587 		if (res[idx].string) {
    588 			str_len = strlen(res[idx].string) + 1;
    589 			kmem_free(res[idx].string, str_len);
    590 		}
    591 	}
    592 
    593 	/* deallocate the result array itself */
    594 	kmem_free(res, sizeof (dr_mem_res_t) * nres);
    595 }
    596 
    597 /*
    598  * Allocate and pack a response message for transmission based
    599  * on the specified result array. A valid response message and
    600  * valid size information is always returned.
    601  */
    602 static size_t
    603 dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
    604 {
    605 	int		idx;
    606 	dr_mem_hdr_t	*resp;
    607 	dr_mem_stat_t	*resp_stat;
    608 	size_t		resp_len;
    609 	uint32_t	curr_off;
    610 	caddr_t		curr_str;
    611 	size_t		str_len;
    612 	size_t		stat_len;
    613 	int		nstat = req->msg_arg;
    614 
    615 	/*
    616 	 * Calculate the size of the response message
    617 	 * and allocate an appropriately sized buffer.
    618 	 */
    619 	resp_len = sizeof (dr_mem_hdr_t);
    620 
    621 	/* add the stat array size */
    622 	stat_len = sizeof (dr_mem_stat_t) * nstat;
    623 	resp_len += stat_len;
    624 
    625 	/* add the size of any error strings */
    626 	for (idx = 0; idx < nstat; idx++) {
    627 		if (res[idx].string != NULL) {
    628 			resp_len += strlen(res[idx].string) + 1;
    629 		}
    630 	}
    631 
    632 	/* allocate the message buffer */
    633 	resp = kmem_zalloc(resp_len, KM_SLEEP);
    634 
    635 	/*
    636 	 * Fill in the header information.
    637 	 */
    638 	resp->req_num = req->req_num;
    639 	resp->msg_type = DR_MEM_OK;
    640 	resp->msg_arg = nstat;
    641 
    642 	/*
    643 	 * Fill in the stat information.
    644 	 */
    645 	resp_stat = DR_MEM_RESP_STATS(resp);
    646 
    647 	/* string offsets start immediately after stat array */
    648 	curr_off = sizeof (dr_mem_hdr_t) + stat_len;
    649 	curr_str = (char *)resp_stat + stat_len;
    650 
    651 	for (idx = 0; idx < nstat; idx++) {
    652 		resp_stat[idx].addr = res[idx].addr;
    653 		resp_stat[idx].size = res[idx].size;
    654 		resp_stat[idx].result = res[idx].result;
    655 		resp_stat[idx].status = res[idx].status;
    656 
    657 		if (res[idx].string != NULL) {
    658 			/* copy over the error string */
    659 			str_len = strlen(res[idx].string) + 1;
    660 			bcopy(res[idx].string, curr_str, str_len);
    661 			resp_stat[idx].string_off = curr_off;
    662 
    663 			curr_off += str_len;
    664 			curr_str += str_len;
    665 		}
    666 	}
    667 
    668 	/* buffer should be exactly filled */
    669 	ASSERT(curr_off == resp_len);
    670 
    671 	*respp = resp;
    672 	return (resp_len);
    673 }
    674 
    675 static void
    676 dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
    677 {
    678 	memquery_t mq;
    679 
    680 	DR_DBG_MEM("dr_mem_query...\n");
    681 
    682 
    683 	(void) kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq);
    684 
    685 	if (!mq.phys_pages)
    686 		return;
    687 
    688 	mqp->addr = mbp->addr;
    689 	mqp->mq.phys_pages = ptob(mq.phys_pages);
    690 	mqp->mq.managed = ptob(mq.managed);
    691 	mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
    692 	mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
    693 	mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
    694 	/*
    695 	 * Set to the max byte offset within the page.
    696 	 */
    697 	if (mqp->mq.nonrelocatable)
    698 		mqp->mq.last_nonrelocatable += PAGESIZE - 1;
    699 }
    700 
    701 /*
    702  * Do not modify result buffer or length on error.
    703  */
    704 static int
    705 dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
    706 {
    707 	int		idx;
    708 	int		rlen;
    709 	int		nml;
    710 	struct memlist	*ml;
    711 	dr_mem_blk_t	*req_mblks, mb;
    712 	dr_mem_hdr_t	*rp;
    713 	dr_mem_query_t	*stat;
    714 
    715 	drctl_block();
    716 
    717 	/* the incoming array of req_mblks to configure */
    718 	req_mblks = DR_MEM_CMD_MBLKS(req);
    719 
    720 	/* allocate a response message, should be freed by caller */
    721 	nml = 0;
    722 	rlen = sizeof (dr_mem_hdr_t);
    723 	if (req_mblks->addr == NULL && req_mblks->size == 0) {
    724 		/*
    725 		 * Request is for domain's full view of it's memory.
    726 		 */
    727 		memlist_read_lock();
    728 		for (ml = phys_install; ml; ml = ml->next)
    729 			nml++;
    730 
    731 		rlen += nml * sizeof (dr_mem_query_t);
    732 	} else {
    733 		rlen += req->msg_arg * sizeof (dr_mem_query_t);
    734 	}
    735 	rp = kmem_zalloc(rlen, KM_SLEEP);
    736 
    737 	/* fill in the known data */
    738 	rp->req_num = req->req_num;
    739 	rp->msg_type = DR_MEM_OK;
    740 	rp->msg_arg = nml ? nml : req->msg_arg;
    741 
    742 	/* stat array for the response */
    743 	stat = DR_MEM_RESP_QUERY(rp);
    744 
    745 	/* get the status for each of the mblocks */
    746 	if (nml) {
    747 		for (idx = 0, ml = phys_install; ml; ml = ml->next, idx++) {
    748 			mb.addr = ml->address;
    749 			mb.size = ml->size;
    750 			dr_mem_query(&mb, &stat[idx]);
    751 		}
    752 		memlist_read_unlock();
    753 	} else {
    754 		for (idx = 0; idx < req->msg_arg; idx++)
    755 			dr_mem_query(&req_mblks[idx], &stat[idx]);
    756 	}
    757 
    758 	*resp = rp;
    759 	*resp_len = rlen;
    760 
    761 	drctl_unblock();
    762 
    763 	return (0);
    764 }
    765 
    766 static int
    767 cvt_err(int err)
    768 {
    769 	int rv;
    770 
    771 	switch (err) {
    772 	case KPHYSM_OK:
    773 		rv = DR_MEM_RES_OK;
    774 		break;
    775 	case KPHYSM_ESPAN:
    776 		rv = DR_MEM_RES_ESPAN;
    777 		break;
    778 	case KPHYSM_EFAULT:
    779 		rv = DR_MEM_RES_EFAULT;
    780 		break;
    781 	case KPHYSM_ERESOURCE:
    782 		rv = DR_MEM_RES_ERESOURCE;
    783 		break;
    784 	case KPHYSM_ENOTSUP:
    785 	case KPHYSM_ENOHANDLES:
    786 		rv = DR_MEM_RES_FAILURE;
    787 		break;
    788 	case KPHYSM_ENONRELOC:
    789 		rv = DR_MEM_RES_PERM;
    790 		break;
    791 	case KPHYSM_EHANDLE:
    792 		rv = DR_MEM_RES_FAILURE;
    793 		break;
    794 	case KPHYSM_EBUSY:
    795 		rv = DR_MEM_RES_EBUSY;
    796 		break;
    797 	case KPHYSM_ENOTVIABLE:
    798 		rv = DR_MEM_RES_ENOTVIABLE;
    799 		break;
    800 	case KPHYSM_ESEQUENCE:
    801 		rv = DR_MEM_RES_FAILURE;
    802 		break;
    803 	case KPHYSM_ENOWORK:
    804 		rv = DR_MEM_RES_ENOWORK;
    805 		break;
    806 	case KPHYSM_ECANCELLED:
    807 		rv = DR_MEM_RES_ECANCELLED;
    808 		break;
    809 	case KPHYSM_EREFUSED:
    810 		rv = DR_MEM_RES_EREFUSED;
    811 		break;
    812 	case KPHYSM_ENOTFINISHED:
    813 	case KPHYSM_ENOTRUNNING:
    814 		rv = DR_MEM_RES_FAILURE;
    815 		break;
    816 	case KPHYSM_EDUP:
    817 		rv = DR_MEM_RES_EDUP;
    818 		break;
    819 	default:
    820 		rv = DR_MEM_RES_FAILURE;
    821 		break;
    822 	}
    823 
    824 	return (rv);
    825 }
    826 
    827 static int
    828 dr_mem_configure(dr_mem_blk_t *mbp, int *status)
    829 {
    830 	int rv;
    831 	uint64_t addr, size;
    832 
    833 	rv = 0;
    834 	addr = mbp->addr;
    835 	size = mbp->size;
    836 
    837 	DR_DBG_MEM("dr_mem_configure...\n");
    838 
    839 	if (!MBLK_IS_VALID(mbp)) {
    840 		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
    841 		*status = DR_MEM_STAT_UNCONFIGURED;
    842 		rv = DR_MEM_RES_EINVAL;
    843 	} else if (rv = dr_mem_find(mbp)) {
    844 		DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
    845 		    addr, size, rv);
    846 		if (rv == EINVAL) {
    847 			*status = DR_MEM_STAT_NOT_PRESENT;
    848 			rv = DR_MEM_RES_NOT_IN_MD;
    849 		} else {
    850 			*status = DR_MEM_STAT_UNCONFIGURED;
    851 			rv = DR_MEM_RES_FAILURE;
    852 		}
    853 	} else {
    854 		rv = mem_add(btop(addr), btop(size));
    855 		DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, rv);
    856 		if (rv) {
    857 			*status = DR_MEM_STAT_UNCONFIGURED;
    858 		} else {
    859 			*status = DR_MEM_STAT_CONFIGURED;
    860 		}
    861 	}
    862 
    863 	return (rv);
    864 }
    865 
    866 static int
    867 dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
    868 {
    869 	int rv;
    870 
    871 	DR_DBG_MEM("dr_mem_unconfigure...\n");
    872 
    873 	if (!MBLK_IS_VALID(mbp)) {
    874 		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
    875 		    mbp->addr, mbp->size);
    876 			*status = DR_MEM_STAT_CONFIGURED;
    877 			rv = DR_MEM_RES_EINVAL;
    878 	} else if (rv = mem_del(btop(mbp->addr), btop(mbp->size))) {
    879 		*status = DR_MEM_STAT_CONFIGURED;
    880 	} else {
    881 		*status = DR_MEM_STAT_UNCONFIGURED;
    882 		rv = DR_MEM_RES_OK;
    883 		DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
    884 		    mbp->addr, mbp->size);
    885 	}
    886 	return (rv);
    887 }
    888 
    889 static int
    890 dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
    891 {
    892 	int			status;
    893 	int			rlen;
    894 	memdelstat_t		del_stat, *stat;
    895 	dr_mem_hdr_t		*rp;
    896 
    897 	/*
    898 	 * If a mem delete is in progress, get its status.
    899 	 */
    900 	status = (dr_mh && (kphysm_del_status(dr_mh, &del_stat) == KPHYSM_OK));
    901 
    902 	/* allocate a response message, should be freed by caller */
    903 	rlen = sizeof (dr_mem_hdr_t);
    904 	rlen += status * sizeof (memdelstat_t);
    905 	rp = kmem_zalloc(rlen, KM_SLEEP);
    906 
    907 	/* fill in the known data */
    908 	rp->req_num = req->req_num;
    909 	rp->msg_type = DR_MEM_OK;
    910 	rp->msg_arg = status;
    911 
    912 	if (status) {
    913 		/* stat struct for the response */
    914 		stat = DR_MEM_RESP_DEL_STAT(rp);
    915 		stat->phys_pages = ptob(del_stat.phys_pages);
    916 		stat->managed = ptob(del_stat.managed);
    917 		stat->collected = ptob(del_stat.collected);
    918 	}
    919 
    920 	*resp = rp;
    921 	*resp_len = rlen;
    922 
    923 	return (0);
    924 }
    925 
    926 static int
    927 dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
    928 {
    929 	int		rlen;
    930 	dr_mem_hdr_t	*rp;
    931 
    932 	/* allocate a response message, should be freed by caller */
    933 	rlen = sizeof (dr_mem_hdr_t);
    934 	rp = kmem_zalloc(rlen, KM_SLEEP);
    935 
    936 	/* fill in the known data */
    937 	rp->req_num = req->req_num;
    938 	rp->msg_type = DR_MEM_OK;
    939 	rp->msg_arg = (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK) ?
    940 	    DR_MEM_RES_EINVAL : DR_MEM_RES_OK;
    941 
    942 	*resp = rp;
    943 	*resp_len = rlen;
    944 
    945 	return (0);
    946 }
    947 
    948 static int
    949 dr_mem_find(dr_mem_blk_t *mbp)
    950 {
    951 	md_t		*mdp = NULL;
    952 	int		num_nodes;
    953 	int		rv = 0;
    954 	int		listsz;
    955 	mde_cookie_t	*listp = NULL;
    956 	mde_cookie_t	memnode;
    957 	char		*found = "found";
    958 
    959 	if ((mdp = md_get_handle()) == NULL) {
    960 		DR_DBG_MEM("unable to initialize machine description\n");
    961 		return (-1);
    962 	}
    963 
    964 	num_nodes = md_node_count(mdp);
    965 	ASSERT(num_nodes > 0);
    966 
    967 	listsz = num_nodes * sizeof (mde_cookie_t);
    968 	listp = kmem_zalloc(listsz, KM_SLEEP);
    969 
    970 	memnode = dr_mem_find_node_md(mbp, mdp, listp);
    971 
    972 	if (memnode == MDE_INVAL_ELEM_COOKIE) {
    973 		rv = EINVAL;
    974 		found = "not found";
    975 	}
    976 
    977 	DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size, found);
    978 
    979 	kmem_free(listp, listsz);
    980 	(void) md_fini_handle(mdp);
    981 
    982 	return (rv);
    983 }
    984 
    985 /*
    986  * Look up a particular mblk in the MD. Returns the mde_cookie_t
    987  * representing that mblk if present, and MDE_INVAL_ELEM_COOKIE
    988  * otherwise. It is assumed the scratch array has already been
    989  * allocated so that it can accommodate the worst case scenario,
    990  * every node in the MD.
    991  */
    992 static mde_cookie_t
    993 dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
    994 {
    995 	int		idx;
    996 	int		nnodes;
    997 	mde_cookie_t	rootnode;
    998 	uint64_t	base_prop;
    999 	uint64_t	size_prop;
   1000 	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;
   1001 
   1002 	rootnode = md_root_node(mdp);
   1003 	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
   1004 
   1005 	/*
   1006 	 * Scan the DAG for all the mem nodes
   1007 	 */
   1008 	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "mblock"),
   1009 	    md_find_name(mdp, "fwd"), listp);
   1010 
   1011 	if (nnodes < 0) {
   1012 		DR_DBG_MEM("Scan for mblks failed\n");
   1013 		return (result);
   1014 	}
   1015 
   1016 	DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nnodes);
   1017 
   1018 	/*
   1019 	 * Find the mblk of interest
   1020 	 */
   1021 	for (idx = 0; idx < nnodes; idx++) {
   1022 
   1023 		if (md_get_prop_val(mdp, listp[idx], "base", &base_prop)) {
   1024 			DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
   1025 			    idx);
   1026 			break;
   1027 		}
   1028 
   1029 		if (md_get_prop_val(mdp, listp[idx], "size", &size_prop)) {
   1030 			DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
   1031 			    idx);
   1032 			break;
   1033 		}
   1034 
   1035 		if (base_prop <= mbp->addr &&
   1036 		    (base_prop + size_prop) >= (mbp->addr + mbp->size)) {
   1037 			/* found a match */
   1038 			DR_DBG_MEM("dr_mem_find_node_md: found mblk "
   1039 			    "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
   1040 			result = listp[idx];
   1041 			break;
   1042 		}
   1043 	}
   1044 
   1045 	if (result == MDE_INVAL_ELEM_COOKIE) {
   1046 		DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
   1047 		    mbp->addr, mbp->size);
   1048 	}
   1049 
   1050 	return (result);
   1051 }
   1052 
   1053 static int
   1054 mem_add(pfn_t base, pgcnt_t npgs)
   1055 {
   1056 	int rv, rc;
   1057 
   1058 	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);
   1059 
   1060 	if (npgs == 0)
   1061 		return (DR_MEM_RES_OK);
   1062 
   1063 	rv = kphysm_add_memory_dynamic(base, npgs);
   1064 	DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d", __func__, base, npgs,
   1065 	    rv);
   1066 	if (rv == KPHYSM_OK) {
   1067 		if (rc = kcage_range_add(base, npgs, KCAGE_DOWN))
   1068 			cmn_err(CE_WARN, "kcage_range_add() = %d", rc);
   1069 	}
   1070 	rv = cvt_err(rv);
   1071 	return (rv);
   1072 }
   1073 
   1074 static void
   1075 del_done(void *arg, int error)
   1076 {
   1077 	mem_sync_t *ms = arg;
   1078 
   1079 	mutex_enter(&ms->lock);
   1080 	ms->error = error;
   1081 	ms->done = 1;
   1082 	cv_signal(&ms->cond);
   1083 	mutex_exit(&ms->lock);
   1084 }
   1085 
   1086 static int
   1087 mem_del(pfn_t base, pgcnt_t npgs)
   1088 {
   1089 	int rv, err, del_range = 0;
   1090 	int convert = 1;
   1091 	mem_sync_t ms;
   1092 	memquery_t mq;
   1093 	memhandle_t mh;
   1094 	struct memlist *ml;
   1095 	struct memlist *d_ml = NULL;
   1096 
   1097 	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);
   1098 
   1099 	if (npgs == 0)
   1100 		return (DR_MEM_RES_OK);
   1101 
   1102 	if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
   1103 		cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
   1104 		rv = cvt_err(rv);
   1105 		return (rv);
   1106 	}
   1107 	if ((rv = kphysm_del_span_query(base, npgs, &mq))
   1108 	    != KPHYSM_OK) {
   1109 		cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
   1110 		goto done;
   1111 	}
   1112 	if (mq.nonrelocatable) {
   1113 		DR_DBG_MEM("%s: non-reloc pages = %ld",
   1114 		    __func__, mq.nonrelocatable);
   1115 		rv  = KPHYSM_ENONRELOC;
   1116 		goto done;
   1117 	}
   1118 	if (rv = kcage_range_delete(base, npgs)) {
   1119 		switch (rv) {
   1120 		case EBUSY:
   1121 			rv = DR_MEM_RES_ENOTVIABLE;
   1122 			break;
   1123 		default:
   1124 			rv = DR_MEM_RES_FAILURE;
   1125 			break;
   1126 		}
   1127 		convert = 0; /* conversion done */
   1128 		cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
   1129 		goto done;
   1130 	} else {
   1131 		del_range++;
   1132 	}
   1133 	if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
   1134 		cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
   1135 		goto done;
   1136 	}
   1137 	if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
   1138 	    != MEML_SPANOP_OK) {
   1139 		switch (rv) {
   1140 		case MEML_SPANOP_ESPAN:
   1141 			rv = DR_MEM_RES_ESPAN;
   1142 			break;
   1143 		case MEML_SPANOP_EALLOC:
   1144 			rv = DR_MEM_RES_ERESOURCE;
   1145 			break;
   1146 		default:
   1147 			rv = DR_MEM_RES_FAILURE;
   1148 			break;
   1149 		}
   1150 		convert = 0; /* conversion done */
   1151 		cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
   1152 		goto done;
   1153 	}
   1154 
   1155 	DR_DBG_MEM("%s: reserved=0x%lx", __func__, npgs);
   1156 
   1157 	bzero((void *) &ms, sizeof (ms));
   1158 
   1159 	mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
   1160 	cv_init(&ms.cond, NULL, CV_DRIVER, NULL);
   1161 	mutex_enter(&ms.lock);
   1162 
   1163 	if ((rv = kphysm_del_start(mh, del_done, (void *) &ms)) == KPHYSM_OK) {
   1164 		/*
   1165 		 * Since we've called drctl_config_init, we are the only
   1166 		 * DR ctl operation in progress.  Set dr_mh to the
   1167 		 * delete memhandle for use by stat and cancel.
   1168 		 */
   1169 		ASSERT(dr_mh == NULL);
   1170 		dr_mh = mh;
   1171 
   1172 		/*
   1173 		 * Wait for completion or interrupt.
   1174 		 */
   1175 		while (!ms.done) {
   1176 			if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
   1177 				/*
   1178 				 * There is a pending signal.
   1179 				 */
   1180 				(void) kphysm_del_cancel(mh);
   1181 				DR_DBG_MEM("%s: cancel", __func__);
   1182 				/*
   1183 				 * Wait for completion.
   1184 				 */
   1185 				while (!ms.done)
   1186 					cv_wait(&ms.cond, &ms.lock);
   1187 			}
   1188 		}
   1189 		dr_mh = NULL;
   1190 		rv = ms.error;
   1191 	} else {
   1192 		DR_DBG_MEM("%s: del_start() = %d", __func__, rv);
   1193 	}
   1194 
   1195 	mutex_exit(&ms.lock);
   1196 	cv_destroy(&ms.cond);
   1197 	mutex_destroy(&ms.lock);
   1198 
   1199 done:
   1200 	if (rv && del_range) {
   1201 		/*
   1202 		 * Add back the spans to the kcage growth list.
   1203 		 */
   1204 		for (ml = d_ml; ml; ml = ml->next)
   1205 			if (err = kcage_range_add(btop(ml->address),
   1206 			    btop(ml->size), KCAGE_DOWN))
   1207 				cmn_err(CE_WARN, "kcage_range_add() = %d", err);
   1208 	}
   1209 	memlist_free_list(d_ml);
   1210 
   1211 	if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
   1212 		cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
   1213 	if (convert)
   1214 		rv = cvt_err(rv);
   1215 
   1216 	DR_DBG_MEM("%s: rv=%d", __func__, rv);
   1217 
   1218 	return (rv);
   1219 }
   1220