Home | History | Annotate | Download | only in e1000g
      1 /*
      2  * This file is provided under a CDDLv1 license.  When using or
      3  * redistributing this file, you may do so under this license.
      4  * In redistributing this file this license must be included
      5  * and no other modification of this header file is permitted.
      6  *
      7  * CDDL LICENSE SUMMARY
      8  *
      9  * Copyright(c) 1999 - 2009 Intel Corporation. All rights reserved.
     10  *
     11  * The contents of this file are subject to the terms of Version
     12  * 1.0 of the Common Development and Distribution License (the "License").
     13  *
     14  * You should have received a copy of the License with this software.
     15  * You can obtain a copy of the License at
     16  *	http://www.opensolaris.org/os/licensing.
     17  * See the License for the specific language governing permissions
     18  * and limitations under the License.
     19  */
     20 
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * **********************************************************************
     28  *									*
     29  * Module Name:								*
     30  *   e1000g_tx.c							*
     31  *									*
     32  * Abstract:								*
     33  *   This file contains some routines that take care of Transmit,	*
     34  *   make the hardware to send the data pointed by the packet out	*
     35  *   on to the physical medium.						*
     36  *									*
     37  * **********************************************************************
     38  */
     39 
     40 #include "e1000g_sw.h"
     41 #include "e1000g_debug.h"
     42 
     43 static boolean_t e1000g_send(struct e1000g *, mblk_t *);
     44 static int e1000g_tx_copy(e1000g_tx_ring_t *,
     45     p_tx_sw_packet_t, mblk_t *, boolean_t);
     46 static int e1000g_tx_bind(e1000g_tx_ring_t *,
     47     p_tx_sw_packet_t, mblk_t *);
     48 static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
     49 static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
     50 static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
     51     context_data_t *);
     52 static void e1000g_fill_context_descriptor(context_data_t *,
     53     struct e1000_context_desc *);
     54 static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
     55     p_tx_sw_packet_t, uint64_t, size_t);
     56 static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
     57     p_desc_array_t desc_array);
     58 static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
     59 static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
     60 static void e1000g_82547_timeout(void *);
     61 static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
     62 static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);
     63 
     64 #ifndef E1000G_DEBUG
     65 #pragma inline(e1000g_tx_copy)
     66 #pragma inline(e1000g_tx_bind)
     67 #pragma inline(e1000g_retrieve_context)
     68 #pragma inline(e1000g_check_context)
     69 #pragma inline(e1000g_fill_tx_ring)
     70 #pragma inline(e1000g_fill_context_descriptor)
     71 #pragma inline(e1000g_fill_tx_desc)
     72 #pragma inline(e1000g_fill_82544_desc)
     73 #pragma inline(e1000g_tx_workaround_PCIX_82544)
     74 #pragma inline(e1000g_tx_workaround_jumbo_82544)
     75 #pragma inline(e1000g_free_tx_swpkt)
     76 #endif
     77 
     78 /*
     79  * e1000g_free_tx_swpkt	- free up the tx sw packet
     80  *
     81  * Unbind the previously bound DMA handle for a given
     82  * transmit sw packet. And reset the sw packet data.
     83  */
     84 void
     85 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
     86 {
     87 	switch (packet->data_transfer_type) {
     88 	case USE_BCOPY:
     89 		packet->tx_buf->len = 0;
     90 		break;
     91 #ifdef __sparc
     92 	case USE_DVMA:
     93 		dvma_unload(packet->tx_dma_handle, 0, -1);
     94 		break;
     95 #endif
     96 	case USE_DMA:
     97 		(void) ddi_dma_unbind_handle(packet->tx_dma_handle);
     98 		break;
     99 	default:
    100 		break;
    101 	}
    102 
    103 	/*
    104 	 * The mblk has been stripped off the sw packet
    105 	 * and will be freed in a triggered soft intr.
    106 	 */
    107 	ASSERT(packet->mp == NULL);
    108 
    109 	packet->data_transfer_type = USE_NONE;
    110 	packet->num_mblk_frag = 0;
    111 	packet->num_desc = 0;
    112 }
    113 
    114 mblk_t *
    115 e1000g_m_tx(void *arg, mblk_t *mp)
    116 {
    117 	struct e1000g *Adapter = (struct e1000g *)arg;
    118 	mblk_t *next;
    119 
    120 	rw_enter(&Adapter->chip_lock, RW_READER);
    121 
    122 	if ((Adapter->e1000g_state & E1000G_SUSPENDED) ||
    123 	    !(Adapter->e1000g_state & E1000G_STARTED) ||
    124 	    (Adapter->link_state != LINK_STATE_UP)) {
    125 		freemsgchain(mp);
    126 		mp = NULL;
    127 	}
    128 
    129 	while (mp != NULL) {
    130 		next = mp->b_next;
    131 		mp->b_next = NULL;
    132 
    133 		if (!e1000g_send(Adapter, mp)) {
    134 			mp->b_next = next;
    135 			break;
    136 		}
    137 
    138 		mp = next;
    139 	}
    140 
    141 	rw_exit(&Adapter->chip_lock);
    142 	return (mp);
    143 }
    144 
    145 /*
    146  * e1000g_send -  send packets onto the wire
    147  *
    148  * Called from e1000g_m_tx with an mblk ready to send. this
    149  * routine sets up the transmit descriptors and sends data to
    150  * the wire. It also pushes the just transmitted packet to
    151  * the used tx sw packet list.
    152  */
    153 static boolean_t
    154 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
    155 {
    156 	p_tx_sw_packet_t packet;
    157 	LIST_DESCRIBER pending_list;
    158 	size_t len;
    159 	size_t msg_size;
    160 	uint32_t frag_count;
    161 	int desc_count;
    162 	uint32_t desc_total;
    163 	uint32_t bcopy_thresh;
    164 	uint32_t hdr_frag_len;
    165 	boolean_t tx_undersize_flag;
    166 	mblk_t *nmp;
    167 	mblk_t *tmp;
    168 	mblk_t *new_mp;
    169 	mblk_t *pre_mp;
    170 	mblk_t *next_mp;
    171 	e1000g_tx_ring_t *tx_ring;
    172 	context_data_t cur_context;
    173 
    174 	tx_ring = Adapter->tx_ring;
    175 	bcopy_thresh = Adapter->tx_bcopy_thresh;
    176 
    177 	/* Get the total size and frags number of the message */
    178 	tx_undersize_flag = B_FALSE;
    179 	frag_count = 0;
    180 	msg_size = 0;
    181 	for (nmp = mp; nmp; nmp = nmp->b_cont) {
    182 		frag_count++;
    183 		msg_size += MBLKL(nmp);
    184 	}
    185 
    186 	/* retrieve and compute information for context descriptor */
    187 	if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
    188 		freemsg(mp);
    189 		return (B_TRUE);
    190 	}
    191 
    192 	/*
    193 	 * Make sure the packet is less than the allowed size
    194 	 */
    195 	if (!cur_context.lso_flag &&
    196 	    (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
    197 		/*
    198 		 * For the over size packet, we'll just drop it.
    199 		 * So we return B_TRUE here.
    200 		 */
    201 		E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
    202 		    "Tx packet out of bound. length = %d \n", msg_size);
    203 		E1000G_STAT(tx_ring->stat_over_size);
    204 		freemsg(mp);
    205 		return (B_TRUE);
    206 	}
    207 
    208 	/*
    209 	 * Check and reclaim tx descriptors.
    210 	 * This low water mark check should be done all the time as
    211 	 * Transmit interrupt delay can produce Transmit interrupts little
    212 	 * late and that may cause few problems related to reaping Tx
    213 	 * Descriptors... As you may run short of them before getting any
    214 	 * transmit interrupt...
    215 	 */
    216 	if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
    217 		(void) e1000g_recycle(tx_ring);
    218 		E1000G_DEBUG_STAT(tx_ring->stat_recycle);
    219 
    220 		if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
    221 			E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
    222 			goto tx_no_resource;
    223 		}
    224 	}
    225 
    226 	/*
    227 	 * If the message size is less than the minimum ethernet packet size,
    228 	 * we'll use bcopy to send it, and padd it to 60 bytes later.
    229 	 */
    230 	if (msg_size < ETHERMIN) {
    231 		E1000G_DEBUG_STAT(tx_ring->stat_under_size);
    232 		tx_undersize_flag = B_TRUE;
    233 	}
    234 
    235 	/* Initialize variables */
    236 	desc_count = 1;	/* The initial value should be greater than 0 */
    237 	desc_total = 0;
    238 	new_mp = NULL;
    239 	QUEUE_INIT_LIST(&pending_list);
    240 
    241 	/* Process each mblk fragment and fill tx descriptors */
    242 	/*
    243 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
    244 	 * to be within one descriptor. Here we reallocate and refill the
    245 	 * the header if it's physical memory non-contiguous.
    246 	 */
    247 	if (cur_context.lso_flag) {
    248 		/* find the last fragment of the header */
    249 		len = MBLKL(mp);
    250 		ASSERT(len > 0);
    251 		next_mp = mp;
    252 		pre_mp = NULL;
    253 		while (len < cur_context.hdr_len) {
    254 			pre_mp = next_mp;
    255 			next_mp = next_mp->b_cont;
    256 			len += MBLKL(next_mp);
    257 		}
    258 		/*
    259 		 * If the header and the payload are in different mblks,
    260 		 * we simply force the header to be copied into pre-allocated
    261 		 * page-aligned buffer.
    262 		 */
    263 		if (len == cur_context.hdr_len)
    264 			goto adjust_threshold;
    265 
    266 		hdr_frag_len = cur_context.hdr_len - (len - MBLKL(next_mp));
    267 		/*
    268 		 * There are two cases we need to reallocate a mblk for the
    269 		 * last header fragment:
    270 		 * 1. the header is in multiple mblks and the last fragment
    271 		 * share the same mblk with the payload
    272 		 * 2. the header is in a single mblk shared with the payload
    273 		 * and the header is physical memory non-contiguous
    274 		 */
    275 		if ((next_mp != mp) ||
    276 		    (P2NPHASE((uintptr_t)next_mp->b_rptr, Adapter->sys_page_sz)
    277 		    < cur_context.hdr_len)) {
    278 			E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail);
    279 			/*
    280 			 * reallocate the mblk for the last header fragment,
    281 			 * expect to bcopy into pre-allocated page-aligned
    282 			 * buffer
    283 			 */
    284 			new_mp = allocb(hdr_frag_len, NULL);
    285 			if (!new_mp)
    286 				return (B_FALSE);
    287 			bcopy(next_mp->b_rptr, new_mp->b_rptr, hdr_frag_len);
    288 			/* link the new header fragment with the other parts */
    289 			new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
    290 			new_mp->b_cont = next_mp;
    291 			if (pre_mp)
    292 				pre_mp->b_cont = new_mp;
    293 			else
    294 				mp = new_mp;
    295 			next_mp->b_rptr += hdr_frag_len;
    296 			frag_count++;
    297 		}
    298 adjust_threshold:
    299 		/*
    300 		 * adjust the bcopy threshhold to guarantee
    301 		 * the header to use bcopy way
    302 		 */
    303 		if (bcopy_thresh < cur_context.hdr_len)
    304 			bcopy_thresh = cur_context.hdr_len;
    305 	}
    306 
    307 	packet = NULL;
    308 	nmp = mp;
    309 	while (nmp) {
    310 		tmp = nmp->b_cont;
    311 
    312 		len = MBLKL(nmp);
    313 		/* Check zero length mblks */
    314 		if (len == 0) {
    315 			E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
    316 			/*
    317 			 * If there're no packet buffers have been used,
    318 			 * or we just completed processing a buffer, then
    319 			 * skip the empty mblk fragment.
    320 			 * Otherwise, there's still a pending buffer that
    321 			 * needs to be processed (tx_copy).
    322 			 */
    323 			if (desc_count > 0) {
    324 				nmp = tmp;
    325 				continue;
    326 			}
    327 		}
    328 
    329 		/*
    330 		 * Get a new TxSwPacket to process mblk buffers.
    331 		 */
    332 		if (desc_count > 0) {
    333 			mutex_enter(&tx_ring->freelist_lock);
    334 			packet = (p_tx_sw_packet_t)
    335 			    QUEUE_POP_HEAD(&tx_ring->free_list);
    336 			mutex_exit(&tx_ring->freelist_lock);
    337 
    338 			if (packet == NULL) {
    339 				E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
    340 				    "No Tx SwPacket available\n");
    341 				E1000G_STAT(tx_ring->stat_no_swpkt);
    342 				goto tx_send_failed;
    343 			}
    344 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
    345 		}
    346 
    347 		ASSERT(packet);
    348 		/*
    349 		 * If the size of the fragment is less than the tx_bcopy_thresh
    350 		 * we'll use bcopy; Otherwise, we'll use DMA binding.
    351 		 */
    352 		if ((len <= bcopy_thresh) || tx_undersize_flag) {
    353 			desc_count =
    354 			    e1000g_tx_copy(tx_ring, packet, nmp,
    355 			    tx_undersize_flag);
    356 			E1000G_DEBUG_STAT(tx_ring->stat_copy);
    357 		} else {
    358 			desc_count =
    359 			    e1000g_tx_bind(tx_ring, packet, nmp);
    360 			E1000G_DEBUG_STAT(tx_ring->stat_bind);
    361 		}
    362 
    363 		if (desc_count > 0)
    364 			desc_total += desc_count;
    365 		else if (desc_count < 0)
    366 			goto tx_send_failed;
    367 
    368 		nmp = tmp;
    369 	}
    370 
    371 	/* Assign the message to the last sw packet */
    372 	ASSERT(packet);
    373 	ASSERT(packet->mp == NULL);
    374 	packet->mp = mp;
    375 
    376 	/* Try to recycle the tx descriptors again */
    377 	if (tx_ring->tbd_avail < (desc_total + 2)) {
    378 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
    379 		(void) e1000g_recycle(tx_ring);
    380 	}
    381 
    382 	mutex_enter(&tx_ring->tx_lock);
    383 
    384 	/*
    385 	 * If the number of available tx descriptors is not enough for transmit
    386 	 * (one redundant descriptor and one hw checksum context descriptor are
    387 	 * included), then return failure.
    388 	 */
    389 	if (tx_ring->tbd_avail < (desc_total + 2)) {
    390 		E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
    391 		    "No Enough Tx descriptors\n");
    392 		E1000G_STAT(tx_ring->stat_no_desc);
    393 		mutex_exit(&tx_ring->tx_lock);
    394 		goto tx_send_failed;
    395 	}
    396 
    397 	desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
    398 
    399 	mutex_exit(&tx_ring->tx_lock);
    400 
    401 	ASSERT(desc_count > 0);
    402 
    403 	/* Send successful */
    404 	return (B_TRUE);
    405 
    406 tx_send_failed:
    407 	/* Restore mp to original */
    408 	if (new_mp) {
    409 		if (pre_mp) {
    410 			pre_mp->b_cont = next_mp;
    411 		}
    412 		new_mp->b_cont = NULL;
    413 		freemsg(new_mp);
    414 
    415 		next_mp->b_rptr -= hdr_frag_len;
    416 	}
    417 
    418 	/*
    419 	 * Enable Transmit interrupts, so that the interrupt routine can
    420 	 * call mac_tx_update() when transmit descriptors become available.
    421 	 */
    422 	tx_ring->resched_timestamp = ddi_get_lbolt();
    423 	tx_ring->resched_needed = B_TRUE;
    424 	if (!Adapter->tx_intr_enable)
    425 		e1000g_mask_tx_interrupt(Adapter);
    426 
    427 	/* Free pending TxSwPackets */
    428 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
    429 	while (packet) {
    430 		packet->mp = NULL;
    431 		e1000g_free_tx_swpkt(packet);
    432 		packet = (p_tx_sw_packet_t)
    433 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
    434 	}
    435 
    436 	/* Return pending TxSwPackets to the "Free" list */
    437 	mutex_enter(&tx_ring->freelist_lock);
    438 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
    439 	mutex_exit(&tx_ring->freelist_lock);
    440 
    441 	E1000G_STAT(tx_ring->stat_send_fail);
    442 
    443 	/* Message will be scheduled for re-transmit */
    444 	return (B_FALSE);
    445 
    446 tx_no_resource:
    447 	/*
    448 	 * Enable Transmit interrupts, so that the interrupt routine can
    449 	 * call mac_tx_update() when transmit descriptors become available.
    450 	 */
    451 	tx_ring->resched_timestamp = ddi_get_lbolt();
    452 	tx_ring->resched_needed = B_TRUE;
    453 	if (!Adapter->tx_intr_enable)
    454 		e1000g_mask_tx_interrupt(Adapter);
    455 
    456 	/* Message will be scheduled for re-transmit */
    457 	return (B_FALSE);
    458 }
    459 
    460 static boolean_t
    461 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
    462     size_t msg_size)
    463 {
    464 	uintptr_t ip_start;
    465 	uintptr_t tcp_start;
    466 	mblk_t *nmp;
    467 	uint32_t lsoflags;
    468 	uint32_t mss;
    469 
    470 	bzero(cur_context, sizeof (context_data_t));
    471 
    472 	/* first check lso information */
    473 	lso_info_get(mp, &mss, &lsoflags);
    474 
    475 	/* retrieve checksum info */
    476 	hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start,
    477 	    &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
    478 	/* retrieve ethernet header size */
    479 	if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
    480 	    htons(ETHERTYPE_VLAN))
    481 		cur_context->ether_header_size =
    482 		    sizeof (struct ether_vlan_header);
    483 	else
    484 		cur_context->ether_header_size =
    485 		    sizeof (struct ether_header);
    486 
    487 	if (lsoflags & HW_LSO) {
    488 		ASSERT(mss != 0);
    489 
    490 		/* free the invalid packet */
    491 		if (mss == 0 ||
    492 		    !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
    493 		    (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
    494 			return (B_FALSE);
    495 		}
    496 		cur_context->mss = (uint16_t)mss;
    497 		cur_context->lso_flag = B_TRUE;
    498 
    499 		/*
    500 		 * Some fields are cleared for the hardware to fill
    501 		 * in. We don't assume Ethernet header, IP header and
    502 		 * TCP header are always in the same mblk fragment,
    503 		 * while we assume each header is always within one
    504 		 * mblk fragment and Ethernet header is always in the
    505 		 * first mblk fragment.
    506 		 */
    507 		nmp = mp;
    508 		ip_start = (uintptr_t)(nmp->b_rptr)
    509 		    + cur_context->ether_header_size;
    510 		if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
    511 			ip_start = (uintptr_t)nmp->b_cont->b_rptr
    512 			    + (ip_start - (uintptr_t)(nmp->b_wptr));
    513 			nmp = nmp->b_cont;
    514 		}
    515 		tcp_start = ip_start +
    516 		    IPH_HDR_LENGTH((ipha_t *)ip_start);
    517 		if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
    518 			tcp_start = (uintptr_t)nmp->b_cont->b_rptr
    519 			    + (tcp_start - (uintptr_t)(nmp->b_wptr));
    520 			nmp = nmp->b_cont;
    521 		}
    522 		cur_context->hdr_len = cur_context->ether_header_size
    523 		    + IPH_HDR_LENGTH((ipha_t *)ip_start)
    524 		    + TCP_HDR_LENGTH((tcph_t *)tcp_start);
    525 		((ipha_t *)ip_start)->ipha_length = 0;
    526 		((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
    527 		/* calculate the TCP packet payload length */
    528 		cur_context->pay_len = msg_size - cur_context->hdr_len;
    529 	}
    530 	return (B_TRUE);
    531 }
    532 
    533 static boolean_t
    534 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
    535 {
    536 	boolean_t context_reload;
    537 	context_data_t *pre_context;
    538 	struct e1000g *Adapter;
    539 
    540 	context_reload = B_FALSE;
    541 	pre_context = &tx_ring->pre_context;
    542 	Adapter = tx_ring->adapter;
    543 
    544 	/*
    545 	 * The following code determine if the context descriptor is
    546 	 * needed to be reloaded. The sequence of the conditions is
    547 	 * made by their possibilities of changing.
    548 	 */
    549 	/*
    550 	 * workaround for 82546EB, context descriptor must be reloaded
    551 	 * per LSO/hw_cksum packet if LSO is enabled.
    552 	 */
    553 	if (Adapter->lso_premature_issue &&
    554 	    Adapter->lso_enable &&
    555 	    (cur_context->cksum_flags != 0)) {
    556 
    557 		context_reload = B_TRUE;
    558 	} else if (cur_context->lso_flag) {
    559 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
    560 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
    561 		    (cur_context->pay_len != pre_context->pay_len) ||
    562 		    (cur_context->mss != pre_context->mss) ||
    563 		    (cur_context->hdr_len != pre_context->hdr_len) ||
    564 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
    565 		    (cur_context->cksum_start != pre_context->cksum_start) ||
    566 		    (cur_context->ether_header_size !=
    567 		    pre_context->ether_header_size)) {
    568 
    569 			context_reload = B_TRUE;
    570 		}
    571 	} else if (cur_context->cksum_flags != 0) {
    572 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
    573 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
    574 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
    575 		    (cur_context->cksum_start != pre_context->cksum_start) ||
    576 		    (cur_context->ether_header_size !=
    577 		    pre_context->ether_header_size)) {
    578 
    579 			context_reload = B_TRUE;
    580 		}
    581 	}
    582 
    583 	return (context_reload);
    584 }
    585 
    586 static int
    587 e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
    588     context_data_t *cur_context)
    589 {
    590 	struct e1000g *Adapter;
    591 	struct e1000_hw *hw;
    592 	p_tx_sw_packet_t first_packet;
    593 	p_tx_sw_packet_t packet;
    594 	p_tx_sw_packet_t previous_packet;
    595 	boolean_t context_reload;
    596 	struct e1000_tx_desc *first_data_desc;
    597 	struct e1000_tx_desc *next_desc;
    598 	struct e1000_tx_desc *descriptor;
    599 	int desc_count;
    600 	boolean_t buff_overrun_flag;
    601 	int i;
    602 
    603 	Adapter = tx_ring->adapter;
    604 	hw = &Adapter->shared;
    605 
    606 	desc_count = 0;
    607 	first_packet = NULL;
    608 	first_data_desc = NULL;
    609 	descriptor = NULL;
    610 	first_packet = NULL;
    611 	packet = NULL;
    612 	buff_overrun_flag = B_FALSE;
    613 
    614 	next_desc = tx_ring->tbd_next;
    615 
    616 	/* Context descriptor reload check */
    617 	context_reload = e1000g_check_context(tx_ring, cur_context);
    618 
    619 	if (context_reload) {
    620 		first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
    621 
    622 		descriptor = next_desc;
    623 
    624 		e1000g_fill_context_descriptor(cur_context,
    625 		    (struct e1000_context_desc *)descriptor);
    626 
    627 		/* Check the wrap-around case */
    628 		if (descriptor == tx_ring->tbd_last)
    629 			next_desc = tx_ring->tbd_first;
    630 		else
    631 			next_desc++;
    632 
    633 		desc_count++;
    634 	}
    635 
    636 	first_data_desc = next_desc;
    637 
    638 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
    639 	while (packet) {
    640 		ASSERT(packet->num_desc);
    641 
    642 		for (i = 0; i < packet->num_desc; i++) {
    643 			ASSERT(tx_ring->tbd_avail > 0);
    644 
    645 			descriptor = next_desc;
    646 			descriptor->buffer_addr =
    647 			    packet->desc[i].address;
    648 			descriptor->lower.data =
    649 			    packet->desc[i].length;
    650 
    651 			/* Zero out status */
    652 			descriptor->upper.data = 0;
    653 
    654 			descriptor->lower.data |=
    655 			    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
    656 			/* must set RS on every outgoing descriptor */
    657 			descriptor->lower.data |=
    658 			    E1000_TXD_CMD_RS;
    659 
    660 			if (cur_context->lso_flag)
    661 				descriptor->lower.data |= E1000_TXD_CMD_TSE;
    662 
    663 			/* Check the wrap-around case */
    664 			if (descriptor == tx_ring->tbd_last)
    665 				next_desc = tx_ring->tbd_first;
    666 			else
    667 				next_desc++;
    668 
    669 			desc_count++;
    670 
    671 			/*
    672 			 * workaround for 82546EB errata 33, hang in PCI-X
    673 			 * systems due to 2k Buffer Overrun during Transmit
    674 			 * Operation. The workaround applies to all the Intel
    675 			 * PCI-X chips.
    676 			 */
    677 			if (hw->bus.type == e1000_bus_type_pcix &&
    678 			    descriptor == first_data_desc &&
    679 			    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
    680 			    > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
    681 				/* modified the first descriptor */
    682 				descriptor->lower.data &=
    683 				    ~E1000G_TBD_LENGTH_MASK;
    684 				descriptor->lower.flags.length =
    685 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
    686 
    687 				/* insert a new descriptor */
    688 				ASSERT(tx_ring->tbd_avail > 0);
    689 				next_desc->buffer_addr =
    690 				    packet->desc[0].address +
    691 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
    692 				next_desc->lower.data =
    693 				    packet->desc[0].length -
    694 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
    695 
    696 				/* Zero out status */
    697 				next_desc->upper.data = 0;
    698 
    699 				next_desc->lower.data |=
    700 				    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
    701 				/* must set RS on every outgoing descriptor */
    702 				next_desc->lower.data |=
    703 				    E1000_TXD_CMD_RS;
    704 
    705 				if (cur_context->lso_flag)
    706 					next_desc->lower.data |=
    707 					    E1000_TXD_CMD_TSE;
    708 
    709 				descriptor = next_desc;
    710 
    711 				/* Check the wrap-around case */
    712 				if (next_desc == tx_ring->tbd_last)
    713 					next_desc = tx_ring->tbd_first;
    714 				else
    715 					next_desc++;
    716 
    717 				desc_count++;
    718 				buff_overrun_flag = B_TRUE;
    719 			}
    720 		}
    721 
    722 		if (buff_overrun_flag) {
    723 			packet->num_desc++;
    724 			buff_overrun_flag = B_FALSE;
    725 		}
    726 
    727 		if (first_packet != NULL) {
    728 			/*
    729 			 * Count the checksum context descriptor for
    730 			 * the first SwPacket.
    731 			 */
    732 			first_packet->num_desc++;
    733 			first_packet = NULL;
    734 		}
    735 
    736 		packet->tickstamp = ddi_get_lbolt64();
    737 
    738 		previous_packet = packet;
    739 		packet = (p_tx_sw_packet_t)
    740 		    QUEUE_GET_NEXT(pending_list, &packet->Link);
    741 	}
    742 
    743 	/*
    744 	 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
    745 	 */
    746 	if (Adapter->lso_premature_issue && cur_context->lso_flag &&
    747 	    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
    748 		/* modified the previous descriptor */
    749 		descriptor->lower.data -= 4;
    750 
    751 		/* insert a new descriptor */
    752 		ASSERT(tx_ring->tbd_avail > 0);
    753 		/* the lower 20 bits of lower.data is the length field */
    754 		next_desc->buffer_addr =
    755 		    descriptor->buffer_addr +
    756 		    (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
    757 		next_desc->lower.data = 4;
    758 
    759 		/* Zero out status */
    760 		next_desc->upper.data = 0;
    761 		/* It must be part of a LSO packet */
    762 		next_desc->lower.data |=
    763 		    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
    764 		    E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;
    765 
    766 		descriptor = next_desc;
    767 
    768 		/* Check the wrap-around case */
    769 		if (descriptor == tx_ring->tbd_last)
    770 			next_desc = tx_ring->tbd_first;
    771 		else
    772 			next_desc++;
    773 
    774 		desc_count++;
    775 		/* update the number of descriptors */
    776 		previous_packet->num_desc++;
    777 	}
    778 
    779 	ASSERT(descriptor);
    780 
    781 	if (cur_context->cksum_flags) {
    782 		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
    783 			((struct e1000_data_desc *)first_data_desc)->
    784 			    upper.fields.popts |= E1000_TXD_POPTS_IXSM;
    785 		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
    786 			((struct e1000_data_desc *)first_data_desc)->
    787 			    upper.fields.popts |= E1000_TXD_POPTS_TXSM;
    788 	}
    789 
    790 	/*
    791 	 * Last Descriptor of Packet needs End Of Packet (EOP), Report
    792 	 * Status (RS) set.
    793 	 */
    794 	if (Adapter->tx_intr_delay) {
    795 		descriptor->lower.data |= E1000_TXD_CMD_IDE |
    796 		    E1000_TXD_CMD_EOP;
    797 	} else {
    798 		descriptor->lower.data |= E1000_TXD_CMD_EOP;
    799 	}
    800 
    801 	/* Set append Ethernet CRC (IFCS) bits */
    802 	if (cur_context->lso_flag) {
    803 		first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
    804 	} else {
    805 		descriptor->lower.data |= E1000_TXD_CMD_IFCS;
    806 	}
    807 
    808 	/*
    809 	 * Sync the Tx descriptors DMA buffer
    810 	 */
    811 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
    812 	    0, 0, DDI_DMA_SYNC_FORDEV);
    813 
    814 	tx_ring->tbd_next = next_desc;
    815 
    816 	/*
    817 	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
    818 	 * FX1000 that this frame is available to transmit.
    819 	 */
    820 	if (hw->mac.type == e1000_82547)
    821 		e1000g_82547_tx_move_tail(tx_ring);
    822 	else
    823 		E1000_WRITE_REG(hw, E1000_TDT(0),
    824 		    (uint32_t)(next_desc - tx_ring->tbd_first));
    825 
    826 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
    827 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
    828 		Adapter->e1000g_state |= E1000G_ERROR;
    829 	}
    830 
    831 	/* Put the pending SwPackets to the "Used" list */
    832 	mutex_enter(&tx_ring->usedlist_lock);
    833 	QUEUE_APPEND(&tx_ring->used_list, pending_list);
    834 	tx_ring->tbd_avail -= desc_count;
    835 	mutex_exit(&tx_ring->usedlist_lock);
    836 
    837 	/* update LSO related data */
    838 	if (context_reload)
    839 		tx_ring->pre_context = *cur_context;
    840 
    841 	return (desc_count);
    842 }
    843 
    844 /*
    845  * e1000g_tx_setup - setup tx data structures
    846  *
    847  * This routine initializes all of the transmit related
    848  * structures. This includes the Transmit descriptors,
    849  * and the tx_sw_packet structures.
    850  */
    851 void
    852 e1000g_tx_setup(struct e1000g *Adapter)
    853 {
    854 	struct e1000_hw *hw;
    855 	p_tx_sw_packet_t packet;
    856 	uint32_t i;
    857 	uint32_t buf_high;
    858 	uint32_t buf_low;
    859 	uint32_t reg_tipg;
    860 	uint32_t reg_tctl;
    861 	int size;
    862 	e1000g_tx_ring_t *tx_ring;
    863 
    864 	hw = &Adapter->shared;
    865 	tx_ring = Adapter->tx_ring;
    866 
    867 	/* init the lists */
    868 	/*
    869 	 * Here we don't need to protect the lists using the
    870 	 * usedlist_lock and freelist_lock, for they have
    871 	 * been protected by the chip_lock.
    872 	 */
    873 	QUEUE_INIT_LIST(&tx_ring->used_list);
    874 	QUEUE_INIT_LIST(&tx_ring->free_list);
    875 
    876 	/* Go through and set up each SW_Packet */
    877 	packet = tx_ring->packet_area;
    878 	for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
    879 		/* Initialize this tx_sw_apcket area */
    880 		e1000g_free_tx_swpkt(packet);
    881 		/* Add this tx_sw_packet to the free list */
    882 		QUEUE_PUSH_TAIL(&tx_ring->free_list,
    883 		    &packet->Link);
    884 	}
    885 
    886 	/* Setup TX descriptor pointers */
    887 	tx_ring->tbd_next = tx_ring->tbd_first;
    888 	tx_ring->tbd_oldest = tx_ring->tbd_first;
    889 
    890 	/*
    891 	 * Setup Hardware TX Registers
    892 	 */
    893 	/* Setup the Transmit Control Register (TCTL). */
    894 	reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
    895 	reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
    896 	    (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
    897 	    (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
    898 	    E1000_TCTL_RTLC;
    899 
    900 	/* Enable the MULR bit */
    901 	if (hw->bus.type == e1000_bus_type_pci_express)
    902 		reg_tctl |= E1000_TCTL_MULR;
    903 
    904 	E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
    905 
    906 	/* Setup HW Base and Length of Tx descriptor area */
    907 	size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
    908 	E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
    909 	size = E1000_READ_REG(hw, E1000_TDLEN(0));
    910 
    911 	buf_low = (uint32_t)tx_ring->tbd_dma_addr;
    912 	buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
    913 
    914 	/*
    915 	 * Write the highest location first and work backward to the lowest.
    916 	 * This is necessary for some adapter types to
    917 	 * prevent write combining from occurring.
    918 	 */
    919 	E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
    920 	E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
    921 
    922 	/* Setup our HW Tx Head & Tail descriptor pointers */
    923 	E1000_WRITE_REG(hw, E1000_TDH(0), 0);
    924 	E1000_WRITE_REG(hw, E1000_TDT(0), 0);
    925 
    926 	/* Set the default values for the Tx Inter Packet Gap timer */
    927 	if ((hw->mac.type == e1000_82542) &&
    928 	    ((hw->revision_id == E1000_REVISION_2) ||
    929 	    (hw->revision_id == E1000_REVISION_3))) {
    930 		reg_tipg = DEFAULT_82542_TIPG_IPGT;
    931 		reg_tipg |=
    932 		    DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
    933 		reg_tipg |=
    934 		    DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
    935 	} else if (hw->mac.type == e1000_80003es2lan) {
    936 		reg_tipg = DEFAULT_82543_TIPG_IPGR1;
    937 		reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
    938 		    E1000_TIPG_IPGR2_SHIFT;
    939 	} else {
    940 		if (hw->phy.media_type == e1000_media_type_fiber)
    941 			reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
    942 		else
    943 			reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
    944 		reg_tipg |=
    945 		    DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
    946 		reg_tipg |=
    947 		    DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
    948 	}
    949 	E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
    950 
    951 	/* Setup Transmit Interrupt Delay Value */
    952 	E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
    953 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
    954 	    "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
    955 
    956 	if (hw->mac.type >= e1000_82540) {
    957 		E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
    958 		    Adapter->tx_intr_abs_delay);
    959 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
    960 		    "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
    961 	}
    962 
    963 	tx_ring->tbd_avail = Adapter->tx_desc_num;
    964 
    965 	/* Initialize stored context information */
    966 	bzero(&(tx_ring->pre_context), sizeof (context_data_t));
    967 }
    968 
    969 /*
    970  * e1000g_recycle - recycle the tx descriptors and tx sw packets
    971  */
    972 int
    973 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
    974 {
    975 	struct e1000g *Adapter;
    976 	LIST_DESCRIBER pending_list;
    977 	p_tx_sw_packet_t packet;
    978 	mblk_t *mp;
    979 	mblk_t *nmp;
    980 	struct e1000_tx_desc *descriptor;
    981 	int desc_count;
    982 	int64_t delta;
    983 
    984 	/*
    985 	 * This function will examine each TxSwPacket in the 'used' queue
    986 	 * if the e1000g is done with it then the associated resources (Tx
    987 	 * Descriptors) will be "freed" and the TxSwPacket will be
    988 	 * returned to the 'free' queue.
    989 	 */
    990 	Adapter = tx_ring->adapter;
    991 	delta = 0;
    992 
    993 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
    994 	if (packet == NULL) {
    995 		Adapter->stall_flag = B_FALSE;
    996 		return (0);
    997 	}
    998 
    999 	desc_count = 0;
   1000 	QUEUE_INIT_LIST(&pending_list);
   1001 
   1002 	/* Sync the Tx descriptor DMA buffer */
   1003 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
   1004 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
   1005 	if (e1000g_check_dma_handle(
   1006 	    tx_ring->tbd_dma_handle) != DDI_FM_OK) {
   1007 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
   1008 		Adapter->e1000g_state |= E1000G_ERROR;
   1009 		return (0);
   1010 	}
   1011 
   1012 	/*
   1013 	 * While there are still TxSwPackets in the used queue check them
   1014 	 */
   1015 	mutex_enter(&tx_ring->usedlist_lock);
   1016 	while ((packet =
   1017 	    (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
   1018 
   1019 		/*
   1020 		 * Get hold of the next descriptor that the e1000g will
   1021 		 * report status back to (this will be the last descriptor
   1022 		 * of a given sw packet). We only want to free the
   1023 		 * sw packet (and it resources) if the e1000g is done
   1024 		 * with ALL of the descriptors.  If the e1000g is done
   1025 		 * with the last one then it is done with all of them.
   1026 		 */
   1027 		ASSERT(packet->num_desc);
   1028 		descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
   1029 
   1030 		/* Check for wrap case */
   1031 		if (descriptor > tx_ring->tbd_last)
   1032 			descriptor -= Adapter->tx_desc_num;
   1033 
   1034 		/*
   1035 		 * If the descriptor done bit is set free TxSwPacket and
   1036 		 * associated resources
   1037 		 */
   1038 		if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
   1039 			QUEUE_POP_HEAD(&tx_ring->used_list);
   1040 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
   1041 
   1042 			if (descriptor == tx_ring->tbd_last)
   1043 				tx_ring->tbd_oldest =
   1044 				    tx_ring->tbd_first;
   1045 			else
   1046 				tx_ring->tbd_oldest =
   1047 				    descriptor + 1;
   1048 
   1049 			desc_count += packet->num_desc;
   1050 		} else {
   1051 			/*
   1052 			 * Found a sw packet that the e1000g is not done
   1053 			 * with then there is no reason to check the rest
   1054 			 * of the queue.
   1055 			 */
   1056 			delta = ddi_get_lbolt64() - packet->tickstamp;
   1057 			break;
   1058 		}
   1059 	}
   1060 
   1061 	tx_ring->tbd_avail += desc_count;
   1062 	Adapter->tx_pkt_cnt += desc_count;
   1063 
   1064 	mutex_exit(&tx_ring->usedlist_lock);
   1065 
   1066 	if (desc_count == 0) {
   1067 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
   1068 		/*
   1069 		 * If the packet hasn't been sent out for seconds and
   1070 		 * the transmitter is not under paused flowctrl condition,
   1071 		 * the transmitter is considered to be stalled.
   1072 		 */
   1073 		if ((delta > Adapter->stall_threshold) &&
   1074 		    !(E1000_READ_REG(&Adapter->shared,
   1075 		    E1000_STATUS) & E1000_STATUS_TXOFF)) {
   1076 			Adapter->stall_flag = B_TRUE;
   1077 		}
   1078 		return (0);
   1079 	}
   1080 
   1081 	Adapter->stall_flag = B_FALSE;
   1082 
   1083 	mp = NULL;
   1084 	nmp = NULL;
   1085 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
   1086 	ASSERT(packet != NULL);
   1087 	while (packet != NULL) {
   1088 		if (packet->mp != NULL) {
   1089 			ASSERT(packet->mp->b_next == NULL);
   1090 			/* Assemble the message chain */
   1091 			if (mp == NULL) {
   1092 				mp = packet->mp;
   1093 				nmp = packet->mp;
   1094 			} else {
   1095 				nmp->b_next = packet->mp;
   1096 				nmp = packet->mp;
   1097 			}
   1098 			/* Disconnect the message from the sw packet */
   1099 			packet->mp = NULL;
   1100 		}
   1101 
   1102 		/* Free the TxSwPackets */
   1103 		e1000g_free_tx_swpkt(packet);
   1104 
   1105 		packet = (p_tx_sw_packet_t)
   1106 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
   1107 	}
   1108 
   1109 	/* Return the TxSwPackets back to the FreeList */
   1110 	mutex_enter(&tx_ring->freelist_lock);
   1111 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
   1112 	mutex_exit(&tx_ring->freelist_lock);
   1113 
   1114 	if (mp != NULL)
   1115 		freemsgchain(mp);
   1116 
   1117 	return (desc_count);
   1118 }
   1119 /*
   1120  * 82544 Coexistence issue workaround:
   1121  *    There are 2 issues.
   1122  *    1. If a 32 bit split completion happens from P64H2 and another
   1123  *	agent drives a 64 bit request/split completion after ONLY
   1124  *	1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
   1125  *	82544 has a problem where in to clock all the data in, it
   1126  *	looks at REQ64# signal and since it has changed so fast (i.e. 1
   1127  *	idle clock turn around), it will fail to clock all the data in.
   1128  *	Data coming from certain ending addresses has exposure to this issue.
   1129  *
   1130  * To detect this issue, following equation can be used...
   1131  *	SIZE[3:0] + ADDR[2:0] = SUM[3:0].
   1132  *	If SUM[3:0] is in between 1 to 4, we will have this issue.
   1133  *
   1134  * ROOT CAUSE:
   1135  *	The erratum involves the 82544 PCIX elasticity FIFO implementations as
   1136  *	64-bit FIFO's and flushing of the final partial-bytes corresponding
   1137  *	to the end of a requested read burst. Under a specific burst condition
   1138  *	of ending-data alignment and 32-byte split-completions, the final
   1139  *	byte(s) of split-completion data require an extra clock cycle to flush
   1140  *	into 64-bit FIFO orientation.  An incorrect logic dependency on the
 *	REQ64# signal occurring during this clock cycle may cause the
   1142  *	residual byte(s) to be lost, thereby rendering the internal DMA client
   1143  *	forever awaiting the final byte(s) for an outbound data-fetch.  The
   1144  *	erratum is confirmed to *only* occur if certain subsequent external
   1145  *	64-bit PCIX bus transactions occur immediately (minimum possible bus
   1146  *	turn- around) following the odd-aligned 32-bit split-completion
   1147  *	containing the final byte(s).  Intel has confirmed that this has been
   1148  *	seen only with chipset/bridges which have the capability to provide
   1149  *	32-bit split-completion data, and in the presence of newer PCIX bus
   1150  *	agents which fully-optimize the inter-transaction turn-around (zero
   1151  *	additional initiator latency when pre-granted bus ownership).
   1152  *
   1153  *   	This issue does not exist in PCI bus mode, when any agent is operating
   1154  *	in 32 bit only mode or on chipsets that do not do 32 bit split
   1155  *	completions for 64 bit read requests (Serverworks chipsets). P64H2 does
   1156  *	32 bit split completions for any read request that has bit 2 set to 1
   1157  *	for the requested address and read request size is more than 8 bytes.
   1158  *
   1159  *   2. Another issue is related to 82544 driving DACs under the similar
   1160  *	scenario (32 bit split completion followed by 64 bit transaction with
   1161  *	only 1 cycle turnaround). This issue is still being root caused. We
   1162  *	think that both of these issues can be avoided if following workaround
   1163  *	is implemented. It seems DAC issues is related to ending addresses being
   1164  *	0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
   1165  *	FIFO which does not get flushed due to REQ64# dependency. We will only
   1166  *	know the full story after it has been simulated successfully by HW team.
   1167  *
   1168  * WORKAROUND:
   1169  *	Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
   1170  */
   1171 static uint32_t
   1172 e1000g_fill_82544_desc(uint64_t address,
   1173     size_t length, p_desc_array_t desc_array)
   1174 {
   1175 	/*
   1176 	 * Since issue is sensitive to length and address.
   1177 	 * Let us first check the address...
   1178 	 */
   1179 	uint32_t safe_terminator;
   1180 
   1181 	if (length <= 4) {
   1182 		desc_array->descriptor[0].address = address;
   1183 		desc_array->descriptor[0].length = (uint32_t)length;
   1184 		desc_array->elements = 1;
   1185 		return (desc_array->elements);
   1186 	}
   1187 	safe_terminator =
   1188 	    (uint32_t)((((uint32_t)address & 0x7) +
   1189 	    (length & 0xF)) & 0xF);
   1190 	/*
   1191 	 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
   1192 	 * return
   1193 	 */
   1194 	if (safe_terminator == 0 ||
   1195 	    (safe_terminator > 4 && safe_terminator < 9) ||
   1196 	    (safe_terminator > 0xC && safe_terminator <= 0xF)) {
   1197 		desc_array->descriptor[0].address = address;
   1198 		desc_array->descriptor[0].length = (uint32_t)length;
   1199 		desc_array->elements = 1;
   1200 		return (desc_array->elements);
   1201 	}
   1202 
   1203 	desc_array->descriptor[0].address = address;
   1204 	desc_array->descriptor[0].length = length - 4;
   1205 	desc_array->descriptor[1].address = address + (length - 4);
   1206 	desc_array->descriptor[1].length = 4;
   1207 	desc_array->elements = 2;
   1208 	return (desc_array->elements);
   1209 }
   1210 
   1211 static int
   1212 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
   1213     mblk_t *mp, boolean_t tx_undersize_flag)
   1214 {
   1215 	size_t len;
   1216 	size_t len1;
   1217 	dma_buffer_t *tx_buf;
   1218 	mblk_t *nmp;
   1219 	boolean_t finished;
   1220 	int desc_count;
   1221 
   1222 	desc_count = 0;
   1223 	tx_buf = packet->tx_buf;
   1224 	len = MBLKL(mp);
   1225 
   1226 	ASSERT((tx_buf->len + len) <= tx_buf->size);
   1227 
   1228 	if (len > 0) {
   1229 		bcopy(mp->b_rptr,
   1230 		    tx_buf->address + tx_buf->len,
   1231 		    len);
   1232 		tx_buf->len += len;
   1233 
   1234 		packet->num_mblk_frag++;
   1235 	}
   1236 
   1237 	nmp = mp->b_cont;
   1238 	if (nmp == NULL) {
   1239 		finished = B_TRUE;
   1240 	} else {
   1241 		len1 = MBLKL(nmp);
   1242 		if ((tx_buf->len + len1) > tx_buf->size)
   1243 			finished = B_TRUE;
   1244 		else if (tx_undersize_flag)
   1245 			finished = B_FALSE;
   1246 		else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
   1247 			finished = B_TRUE;
   1248 		else
   1249 			finished = B_FALSE;
   1250 	}
   1251 
   1252 	if (finished) {
   1253 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
   1254 		    (tx_buf->len > len));
   1255 
   1256 		/*
   1257 		 * If the packet is smaller than 64 bytes, which is the
   1258 		 * minimum ethernet packet size, pad the packet to make
   1259 		 * it at least 60 bytes. The hardware will add 4 bytes
   1260 		 * for CRC.
   1261 		 */
   1262 		if (tx_undersize_flag) {
   1263 			ASSERT(tx_buf->len < ETHERMIN);
   1264 
   1265 			bzero(tx_buf->address + tx_buf->len,
   1266 			    ETHERMIN - tx_buf->len);
   1267 			tx_buf->len = ETHERMIN;
   1268 		}
   1269 
   1270 #ifdef __sparc
   1271 		if (packet->dma_type == USE_DVMA)
   1272 			dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
   1273 		else
   1274 			(void) ddi_dma_sync(tx_buf->dma_handle, 0,
   1275 			    tx_buf->len, DDI_DMA_SYNC_FORDEV);
   1276 #else
   1277 		(void) ddi_dma_sync(tx_buf->dma_handle, 0,
   1278 		    tx_buf->len, DDI_DMA_SYNC_FORDEV);
   1279 #endif
   1280 
   1281 		packet->data_transfer_type = USE_BCOPY;
   1282 
   1283 		desc_count = e1000g_fill_tx_desc(tx_ring,
   1284 		    packet,
   1285 		    tx_buf->dma_address,
   1286 		    tx_buf->len);
   1287 
   1288 		if (desc_count <= 0)
   1289 			return (-1);
   1290 	}
   1291 
   1292 	return (desc_count);
   1293 }
   1294 
   1295 static int
   1296 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
   1297 {
   1298 	int j;
   1299 	int mystat;
   1300 	size_t len;
   1301 	ddi_dma_cookie_t dma_cookie;
   1302 	uint_t ncookies;
   1303 	int desc_count;
   1304 	uint32_t desc_total;
   1305 
   1306 	desc_total = 0;
   1307 	len = MBLKL(mp);
   1308 
   1309 	/*
   1310 	 * ddi_dma_addr_bind_handle() allocates  DMA  resources  for  a
   1311 	 * memory  object such that a device can perform DMA to or from
   1312 	 * the object.  DMA resources  are  allocated  considering  the
   1313 	 * device's  DMA  attributes  as  expressed by ddi_dma_attr(9S)
   1314 	 * (see ddi_dma_alloc_handle(9F)).
   1315 	 *
   1316 	 * ddi_dma_addr_bind_handle() fills in  the  first  DMA  cookie
   1317 	 * pointed  to by cookiep with the appropriate address, length,
   1318 	 * and bus type. *ccountp is set to the number of DMA  cookies
   1319 	 * representing this DMA object. Subsequent DMA cookies must be
   1320 	 * retrieved by calling ddi_dma_nextcookie(9F)  the  number  of
   1321 	 * times specified by *countp - 1.
   1322 	 */
   1323 	switch (packet->dma_type) {
   1324 #ifdef __sparc
   1325 	case USE_DVMA:
   1326 		dvma_kaddr_load(packet->tx_dma_handle,
   1327 		    (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
   1328 
   1329 		dvma_sync(packet->tx_dma_handle, 0,
   1330 		    DDI_DMA_SYNC_FORDEV);
   1331 
   1332 		ncookies = 1;
   1333 		packet->data_transfer_type = USE_DVMA;
   1334 		break;
   1335 #endif
   1336 	case USE_DMA:
   1337 		if ((mystat = ddi_dma_addr_bind_handle(
   1338 		    packet->tx_dma_handle, NULL,
   1339 		    (caddr_t)mp->b_rptr, len,
   1340 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
   1341 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
   1342 		    &ncookies)) != DDI_DMA_MAPPED) {
   1343 
   1344 			e1000g_log(tx_ring->adapter, CE_WARN,
   1345 			    "Couldn't bind mblk buffer to Tx DMA handle: "
   1346 			    "return: %X, Pkt: %X\n",
   1347 			    mystat, packet);
   1348 			return (-1);
   1349 		}
   1350 
   1351 		/*
   1352 		 * An implicit ddi_dma_sync() is done when the
   1353 		 * ddi_dma_addr_bind_handle() is called. So we
   1354 		 * don't need to explicitly call ddi_dma_sync()
   1355 		 * here any more.
   1356 		 */
   1357 		ASSERT(ncookies);
   1358 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
   1359 		    (ncookies > 1));
   1360 
   1361 		/*
   1362 		 * The data_transfer_type value must be set after the handle
   1363 		 * has been bound, for it will be used in e1000g_free_tx_swpkt()
   1364 		 * to decide whether we need to unbind the handle.
   1365 		 */
   1366 		packet->data_transfer_type = USE_DMA;
   1367 		break;
   1368 	default:
   1369 		ASSERT(B_FALSE);
   1370 		break;
   1371 	}
   1372 
   1373 	packet->num_mblk_frag++;
   1374 
   1375 	/*
   1376 	 * Each address could span thru multpile cookie..
   1377 	 * Each cookie will have one descriptor
   1378 	 */
   1379 	for (j = ncookies; j != 0; j--) {
   1380 
   1381 		desc_count = e1000g_fill_tx_desc(tx_ring,
   1382 		    packet,
   1383 		    dma_cookie.dmac_laddress,
   1384 		    dma_cookie.dmac_size);
   1385 
   1386 		if (desc_count <= 0)
   1387 			return (-1);
   1388 
   1389 		desc_total += desc_count;
   1390 
   1391 		/*
   1392 		 * ddi_dma_nextcookie() retrieves subsequent DMA
   1393 		 * cookies for a DMA object.
   1394 		 * ddi_dma_nextcookie() fills in the
   1395 		 * ddi_dma_cookie(9S) structure pointed to by
   1396 		 * cookiep.  The ddi_dma_cookie(9S) structure
   1397 		 * must be allocated prior to calling
   1398 		 * ddi_dma_nextcookie(). The DMA cookie count
   1399 		 * returned by ddi_dma_buf_bind_handle(9F),
   1400 		 * ddi_dma_addr_bind_handle(9F), or
   1401 		 * ddi_dma_getwin(9F) indicates the number of DMA
   1402 		 * cookies a DMA object consists of.  If the
   1403 		 * resulting cookie count, N, is larger than 1,
   1404 		 * ddi_dma_nextcookie() must be called N-1 times
   1405 		 * to retrieve all DMA cookies.
   1406 		 */
   1407 		if (j > 1) {
   1408 			ddi_dma_nextcookie(packet->tx_dma_handle,
   1409 			    &dma_cookie);
   1410 		}
   1411 	}
   1412 
   1413 	return (desc_total);
   1414 }
   1415 
   1416 static void
   1417 e1000g_fill_context_descriptor(context_data_t *cur_context,
   1418     struct e1000_context_desc *context_desc)
   1419 {
   1420 	if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
   1421 		context_desc->lower_setup.ip_fields.ipcss =
   1422 		    cur_context->ether_header_size;
   1423 		context_desc->lower_setup.ip_fields.ipcso =
   1424 		    cur_context->ether_header_size +
   1425 		    offsetof(struct ip, ip_sum);
   1426 		context_desc->lower_setup.ip_fields.ipcse =
   1427 		    cur_context->ether_header_size +
   1428 		    cur_context->cksum_start - 1;
   1429 	} else
   1430 		context_desc->lower_setup.ip_config = 0;
   1431 
   1432 	if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
   1433 		/*
   1434 		 * The packet with same protocol has the following
   1435 		 * stuff and start offset:
   1436 		 * |  Protocol  | Stuff  | Start  | Checksum
   1437 		 * |		| Offset | Offset | Enable
   1438 		 * | IPv4 + TCP |  0x24  |  0x14  |  Yes
   1439 		 * | IPv4 + UDP |  0x1A  |  0x14  |  Yes
   1440 		 * | IPv6 + TCP |  0x20  |  0x10  |  No
   1441 		 * | IPv6 + UDP |  0x14  |  0x10  |  No
   1442 		 */
   1443 		context_desc->upper_setup.tcp_fields.tucss =
   1444 		    cur_context->cksum_start + cur_context->ether_header_size;
   1445 		context_desc->upper_setup.tcp_fields.tucso =
   1446 		    cur_context->cksum_stuff + cur_context->ether_header_size;
   1447 		context_desc->upper_setup.tcp_fields.tucse = 0;
   1448 	} else
   1449 		context_desc->upper_setup.tcp_config = 0;
   1450 
   1451 	if (cur_context->lso_flag) {
   1452 		context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
   1453 		context_desc->tcp_seg_setup.fields.hdr_len =
   1454 		    cur_context->hdr_len;
   1455 		/*
   1456 		 * workaround for 82546EB errata 23, status-writeback
   1457 		 * reporting (RS) should not be set on context or
   1458 		 * Null descriptors
   1459 		 */
   1460 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
   1461 		    | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
   1462 		    | E1000_TXD_DTYP_C | cur_context->pay_len;
   1463 	} else {
   1464 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
   1465 		    | E1000_TXD_DTYP_C;
   1466 		/*
   1467 		 * Zero out the options for TCP Segmentation Offload
   1468 		 */
   1469 		context_desc->tcp_seg_setup.data = 0;
   1470 	}
   1471 }
   1472 
   1473 static int
   1474 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
   1475     p_tx_sw_packet_t packet, uint64_t address, size_t size)
   1476 {
   1477 	struct e1000_hw *hw = &tx_ring->adapter->shared;
   1478 	p_sw_desc_t desc;
   1479 
   1480 	if (hw->mac.type == e1000_82544) {
   1481 		if (hw->bus.type == e1000_bus_type_pcix)
   1482 			return (e1000g_tx_workaround_PCIX_82544(packet,
   1483 			    address, size));
   1484 
   1485 		if (size > JUMBO_FRAG_LENGTH)
   1486 			return (e1000g_tx_workaround_jumbo_82544(packet,
   1487 			    address, size));
   1488 	}
   1489 
   1490 	ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
   1491 
   1492 	desc = &packet->desc[packet->num_desc];
   1493 	desc->address = address;
   1494 	desc->length = (uint32_t)size;
   1495 
   1496 	packet->num_desc++;
   1497 
   1498 	return (1);
   1499 }
   1500 
   1501 static int
   1502 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
   1503     uint64_t address, size_t size)
   1504 {
   1505 	p_sw_desc_t desc;
   1506 	int desc_count;
   1507 	long size_left;
   1508 	size_t len;
   1509 	uint32_t counter;
   1510 	uint32_t array_elements;
   1511 	desc_array_t desc_array;
   1512 
   1513 	/*
   1514 	 * Coexist Workaround for cordova: RP: 07/04/03
   1515 	 *
   1516 	 * RP: ERRATA: Workaround ISSUE:
   1517 	 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
   1518 	 * Eachbuffer in to 8kb pieces until the
   1519 	 * remainder is < 8kb
   1520 	 */
   1521 	size_left = size;
   1522 	desc_count = 0;
   1523 
   1524 	while (size_left > 0) {
   1525 		if (size_left > MAX_TX_BUF_SIZE)
   1526 			len = MAX_TX_BUF_SIZE;
   1527 		else
   1528 			len = size_left;
   1529 
   1530 		array_elements = e1000g_fill_82544_desc(address,
   1531 		    len, &desc_array);
   1532 
   1533 		for (counter = 0; counter < array_elements; counter++) {
   1534 			ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
   1535 			/*
   1536 			 * Put in the buffer address
   1537 			 */
   1538 			desc = &packet->desc[packet->num_desc];
   1539 
   1540 			desc->address =
   1541 			    desc_array.descriptor[counter].address;
   1542 			desc->length =
   1543 			    desc_array.descriptor[counter].length;
   1544 
   1545 			packet->num_desc++;
   1546 			desc_count++;
   1547 		} /* for */
   1548 
   1549 		/*
   1550 		 * Update the buffer address and length
   1551 		 */
   1552 		address += MAX_TX_BUF_SIZE;
   1553 		size_left -= MAX_TX_BUF_SIZE;
   1554 	} /* while */
   1555 
   1556 	return (desc_count);
   1557 }
   1558 
   1559 static int
   1560 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
   1561     uint64_t address, size_t size)
   1562 {
   1563 	p_sw_desc_t desc;
   1564 	int desc_count;
   1565 	long size_left;
   1566 	uint32_t offset;
   1567 
   1568 	/*
   1569 	 * Workaround for Jumbo Frames on Cordova
   1570 	 * PSD 06/01/2001
   1571 	 */
   1572 	size_left = size;
   1573 	desc_count = 0;
   1574 	offset = 0;
   1575 	while (size_left > 0) {
   1576 		ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
   1577 
   1578 		desc = &packet->desc[packet->num_desc];
   1579 
   1580 		desc->address = address + offset;
   1581 
   1582 		if (size_left > JUMBO_FRAG_LENGTH)
   1583 			desc->length = JUMBO_FRAG_LENGTH;
   1584 		else
   1585 			desc->length = (uint32_t)size_left;
   1586 
   1587 		packet->num_desc++;
   1588 		desc_count++;
   1589 
   1590 		offset += desc->length;
   1591 		size_left -= JUMBO_FRAG_LENGTH;
   1592 	}
   1593 
   1594 	return (desc_count);
   1595 }
   1596 
   1597 #pragma inline(e1000g_82547_tx_move_tail_work)
   1598 
   1599 static void
   1600 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
   1601 {
   1602 	struct e1000_hw *hw;
   1603 	uint16_t hw_tdt;
   1604 	uint16_t sw_tdt;
   1605 	struct e1000_tx_desc *tx_desc;
   1606 	uint16_t length = 0;
   1607 	boolean_t eop = B_FALSE;
   1608 	struct e1000g *Adapter;
   1609 
   1610 	Adapter = tx_ring->adapter;
   1611 	hw = &Adapter->shared;
   1612 
   1613 	hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
   1614 	sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
   1615 
   1616 	while (hw_tdt != sw_tdt) {
   1617 		tx_desc = &(tx_ring->tbd_first[hw_tdt]);
   1618 		length += tx_desc->lower.flags.length;
   1619 		eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
   1620 		if (++hw_tdt == Adapter->tx_desc_num)
   1621 			hw_tdt = 0;
   1622 
   1623 		if (eop) {
   1624 			if ((Adapter->link_duplex == HALF_DUPLEX) &&
   1625 			    (e1000_fifo_workaround_82547(hw, length)
   1626 			    != E1000_SUCCESS)) {
   1627 				if (tx_ring->timer_enable_82547) {
   1628 					ASSERT(tx_ring->timer_id_82547 == 0);
   1629 					tx_ring->timer_id_82547 =
   1630 					    timeout(e1000g_82547_timeout,
   1631 					    (void *)tx_ring,
   1632 					    drv_usectohz(10000));
   1633 				}
   1634 				return;
   1635 
   1636 			} else {
   1637 				E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
   1638 				e1000_update_tx_fifo_head_82547(hw, length);
   1639 				length = 0;
   1640 			}
   1641 		}
   1642 	}
   1643 }
   1644 
   1645 static void
   1646 e1000g_82547_timeout(void *arg)
   1647 {
   1648 	e1000g_tx_ring_t *tx_ring;
   1649 
   1650 	tx_ring = (e1000g_tx_ring_t *)arg;
   1651 
   1652 	mutex_enter(&tx_ring->tx_lock);
   1653 
   1654 	tx_ring->timer_id_82547 = 0;
   1655 	e1000g_82547_tx_move_tail_work(tx_ring);
   1656 
   1657 	mutex_exit(&tx_ring->tx_lock);
   1658 }
   1659 
   1660 static void
   1661 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
   1662 {
   1663 	timeout_id_t tid;
   1664 
   1665 	ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
   1666 
   1667 	tid = tx_ring->timer_id_82547;
   1668 	tx_ring->timer_id_82547 = 0;
   1669 	if (tid != 0) {
   1670 		tx_ring->timer_enable_82547 = B_FALSE;
   1671 		mutex_exit(&tx_ring->tx_lock);
   1672 
   1673 		(void) untimeout(tid);
   1674 
   1675 		mutex_enter(&tx_ring->tx_lock);
   1676 	}
   1677 	tx_ring->timer_enable_82547 = B_TRUE;
   1678 	e1000g_82547_tx_move_tail_work(tx_ring);
   1679 }
   1680