1 789 ahrens /* 2 789 ahrens * CDDL HEADER START 3 789 ahrens * 4 789 ahrens * The contents of this file are subject to the terms of the 5 1544 eschrock * Common Development and Distribution License (the "License"). 6 1544 eschrock * You may not use this file except in compliance with the License. 7 789 ahrens * 8 789 ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 789 ahrens * or http://www.opensolaris.org/os/licensing. 10 789 ahrens * See the License for the specific language governing permissions 11 789 ahrens * and limitations under the License. 12 789 ahrens * 13 789 ahrens * When distributing Covered Code, include this CDDL HEADER in each 14 789 ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 789 ahrens * If applicable, add the following below this CDDL HEADER, with the 16 789 ahrens * fields enclosed by brackets "[]" replaced with your own identifying 17 789 ahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18 789 ahrens * 19 789 ahrens * CDDL HEADER END 20 789 ahrens */ 21 789 ahrens /* 22 8632 Bill * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 789 ahrens * Use is subject to license terms. 24 789 ahrens */ 25 789 ahrens 26 789 ahrens #include <sys/zfs_context.h> 27 789 ahrens #include <sys/spa.h> 28 789 ahrens #include <sys/vdev_impl.h> 29 789 ahrens #include <sys/zio.h> 30 5810 ek110237 #include <sys/kstat.h> 31 789 ahrens 32 789 ahrens /* 33 789 ahrens * Virtual device read-ahead caching. 34 789 ahrens * 35 789 ahrens * This file implements a simple LRU read-ahead cache. When the DMU reads 36 789 ahrens * a given block, it will often want other, nearby blocks soon thereafter. 37 789 ahrens * We take advantage of this by reading a larger disk region and caching 38 5810 ek110237 * the result. In the best case, this can turn 128 back-to-back 512-byte 39 5810 ek110237 * reads into a single 64k read followed by 127 cache hits; this reduces 40 789 ahrens * latency dramatically. In the worst case, it can turn an isolated 512-byte 41 5810 ek110237 * read into a 64k read, which doesn't affect latency all that much but is 42 789 ahrens * terribly wasteful of bandwidth. A more intelligent version of the cache 43 789 ahrens * could keep track of access patterns and not do read-ahead unless it sees 44 4634 ek110237 * at least two temporally close I/Os to the same region. Currently, only 45 4634 ek110237 * metadata I/O is inflated. A futher enhancement could take advantage of 46 4634 ek110237 * more semantic information about the I/O. And it could use something 47 4634 ek110237 * faster than an AVL tree; that was chosen solely for convenience. 48 789 ahrens * 49 789 ahrens * There are five cache operations: allocate, fill, read, write, evict. 50 789 ahrens * 51 789 ahrens * (1) Allocate. This reserves a cache entry for the specified region. 52 789 ahrens * We separate the allocate and fill operations so that multiple threads 53 789 ahrens * don't generate I/O for the same cache miss. 54 789 ahrens * 55 789 ahrens * (2) Fill. When the I/O for a cache miss completes, the fill routine 56 789 ahrens * places the data in the previously allocated cache entry. 57 789 ahrens * 58 789 ahrens * (3) Read. Read data from the cache. 59 789 ahrens * 60 789 ahrens * (4) Write. Update cache contents after write completion. 61 789 ahrens * 62 789 ahrens * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry 63 3059 ahrens * if the total cache size exceeds zfs_vdev_cache_size. 64 789 ahrens */ 65 3059 ahrens 66 3059 ahrens /* 67 3059 ahrens * These tunables are for performance analysis. 68 3059 ahrens */ 69 3059 ahrens /* 70 3059 ahrens * All i/os smaller than zfs_vdev_cache_max will be turned into 71 3059 ahrens * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software 72 5810 ek110237 * track buffer). At most zfs_vdev_cache_size bytes will be kept in each 73 3059 ahrens * vdev's vdev_cache. 74 3059 ahrens */ 75 5810 ek110237 int zfs_vdev_cache_max = 1<<14; /* 16KB */ 76 5810 ek110237 int zfs_vdev_cache_size = 10ULL << 20; /* 10MB */ 77 3059 ahrens int zfs_vdev_cache_bshift = 16; 78 3059 ahrens 79 5810 ek110237 #define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ 80 5810 ek110237 81 5810 ek110237 kstat_t *vdc_ksp = NULL; 82 5810 ek110237 83 5810 ek110237 typedef struct vdc_stats { 84 5810 ek110237 kstat_named_t vdc_stat_delegations; 85 5810 ek110237 kstat_named_t vdc_stat_hits; 86 5810 ek110237 kstat_named_t vdc_stat_misses; 87 5810 ek110237 } vdc_stats_t; 88 5810 ek110237 89 5810 ek110237 static vdc_stats_t vdc_stats = { 90 5810 ek110237 { "delegations", KSTAT_DATA_UINT64 }, 91 5810 ek110237 { "hits", KSTAT_DATA_UINT64 }, 92 5810 ek110237 { "misses", KSTAT_DATA_UINT64 } 93 5810 ek110237 }; 94 5810 ek110237 95 5810 ek110237 #define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); 96 789 ahrens 97 789 ahrens static int 98 789 ahrens vdev_cache_offset_compare(const void *a1, const void *a2) 99 789 ahrens { 100 789 ahrens const vdev_cache_entry_t *ve1 = a1; 101 789 ahrens const vdev_cache_entry_t *ve2 = a2; 102 789 ahrens 103 789 ahrens if (ve1->ve_offset < ve2->ve_offset) 104 789 ahrens return (-1); 105 789 ahrens if (ve1->ve_offset > ve2->ve_offset) 106 789 ahrens return (1); 107 789 ahrens return (0); 108 789 ahrens } 109 789 ahrens 110 789 ahrens static int 111 789 ahrens vdev_cache_lastused_compare(const void *a1, const void *a2) 112 789 ahrens { 113 789 ahrens const vdev_cache_entry_t *ve1 = a1; 114 789 ahrens const vdev_cache_entry_t *ve2 = a2; 115 789 ahrens 116 789 ahrens if (ve1->ve_lastused < ve2->ve_lastused) 117 789 ahrens return (-1); 118 789 ahrens if (ve1->ve_lastused > ve2->ve_lastused) 119 789 ahrens return (1); 120 789 ahrens 121 789 ahrens /* 122 789 ahrens * Among equally old entries, sort by offset to ensure uniqueness. 123 789 ahrens */ 124 789 ahrens return (vdev_cache_offset_compare(a1, a2)); 125 789 ahrens } 126 789 ahrens 127 789 ahrens /* 128 789 ahrens * Evict the specified entry from the cache. 129 789 ahrens */ 130 789 ahrens static void 131 789 ahrens vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) 132 789 ahrens { 133 789 ahrens ASSERT(MUTEX_HELD(&vc->vc_lock)); 134 789 ahrens ASSERT(ve->ve_fill_io == NULL); 135 789 ahrens ASSERT(ve->ve_data != NULL); 136 789 ahrens 137 789 ahrens avl_remove(&vc->vc_lastused_tree, ve); 138 789 ahrens avl_remove(&vc->vc_offset_tree, ve); 139 3059 ahrens zio_buf_free(ve->ve_data, VCBS); 140 789 ahrens kmem_free(ve, sizeof (vdev_cache_entry_t)); 141 789 ahrens } 142 789 ahrens 143 789 ahrens /* 144 789 ahrens * Allocate an entry in the cache. At the point we don't have the data, 145 789 ahrens * we're just creating a placeholder so that multiple threads don't all 146 789 ahrens * go off and read the same blocks. 147 789 ahrens */ 148 789 ahrens static vdev_cache_entry_t * 149 789 ahrens vdev_cache_allocate(zio_t *zio) 150 789 ahrens { 151 789 ahrens vdev_cache_t *vc = &zio->io_vd->vdev_cache; 152 3059 ahrens uint64_t offset = P2ALIGN(zio->io_offset, VCBS); 153 789 ahrens vdev_cache_entry_t *ve; 154 789 ahrens 155 789 ahrens ASSERT(MUTEX_HELD(&vc->vc_lock)); 156 789 ahrens 157 3059 ahrens if (zfs_vdev_cache_size == 0) 158 789 ahrens return (NULL); 159 789 ahrens 160 789 ahrens /* 161 789 ahrens * If adding a new entry would exceed the cache size, 162 789 ahrens * evict the oldest entry (LRU). 163 789 ahrens */ 164 3059 ahrens if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > 165 3059 ahrens zfs_vdev_cache_size) { 166 789 ahrens ve = avl_first(&vc->vc_lastused_tree); 167 7754 Jeff if (ve->ve_fill_io != NULL) 168 789 ahrens return (NULL); 169 789 ahrens ASSERT(ve->ve_hits != 0); 170 789 ahrens vdev_cache_evict(vc, ve); 171 789 ahrens } 172 789 ahrens 173 789 ahrens ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); 174 789 ahrens ve->ve_offset = offset; 175 11066 rafael ve->ve_lastused = ddi_get_lbolt(); 176 3059 ahrens ve->ve_data = zio_buf_alloc(VCBS); 177 789 ahrens 178 789 ahrens avl_add(&vc->vc_offset_tree, ve); 179 789 ahrens avl_add(&vc->vc_lastused_tree, ve); 180 789 ahrens 181 789 ahrens return (ve); 182 789 ahrens } 183 789 ahrens 184 789 ahrens static void 185 789 ahrens vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) 186 789 ahrens { 187 3059 ahrens uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 188 789 ahrens 189 789 ahrens ASSERT(MUTEX_HELD(&vc->vc_lock)); 190 789 ahrens ASSERT(ve->ve_fill_io == NULL); 191 789 ahrens 192 11066 rafael if (ve->ve_lastused != ddi_get_lbolt()) { 193 789 ahrens avl_remove(&vc->vc_lastused_tree, ve); 194 11066 rafael ve->ve_lastused = ddi_get_lbolt(); 195 789 ahrens avl_add(&vc->vc_lastused_tree, ve); 196 789 ahrens } 197 789 ahrens 198 789 ahrens ve->ve_hits++; 199 789 ahrens bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); 200 789 ahrens } 201 789 ahrens 202 789 ahrens /* 203 789 ahrens * Fill a previously allocated cache entry with data. 204 789 ahrens */ 205 789 ahrens static void 206 8632 Bill vdev_cache_fill(zio_t *fio) 207 789 ahrens { 208 8632 Bill vdev_t *vd = fio->io_vd; 209 789 ahrens vdev_cache_t *vc = &vd->vdev_cache; 210 8632 Bill vdev_cache_entry_t *ve = fio->io_private; 211 8632 Bill zio_t *pio; 212 789 ahrens 213 8632 Bill ASSERT(fio->io_size == VCBS); 214 789 ahrens 215 789 ahrens /* 216 789 ahrens * Add data to the cache. 217 789 ahrens */ 218 789 ahrens mutex_enter(&vc->vc_lock); 219 789 ahrens 220 8632 Bill ASSERT(ve->ve_fill_io == fio); 221 8632 Bill ASSERT(ve->ve_offset == fio->io_offset); 222 8632 Bill ASSERT(ve->ve_data == fio->io_data); 223 789 ahrens 224 789 ahrens ve->ve_fill_io = NULL; 225 789 ahrens 226 789 ahrens /* 227 789 ahrens * Even if this cache line was invalidated by a missed write update, 228 789 ahrens * any reads that were queued up before the missed update are still 229 789 ahrens * valid, so we can satisfy them from this line before we evict it. 230 789 ahrens */ 231 8632 Bill while ((pio = zio_walk_parents(fio)) != NULL) 232 8632 Bill vdev_cache_hit(vc, ve, pio); 233 789 ahrens 234 8632 Bill if (fio->io_error || ve->ve_missed_update) 235 789 ahrens vdev_cache_evict(vc, ve); 236 789 ahrens 237 789 ahrens mutex_exit(&vc->vc_lock); 238 789 ahrens } 239 789 ahrens 240 789 ahrens /* 241 789 ahrens * Read data from the cache. Returns 0 on cache hit, errno on a miss. 242 789 ahrens */ 243 789 ahrens int 244 789 ahrens vdev_cache_read(zio_t *zio) 245 789 ahrens { 246 789 ahrens vdev_cache_t *vc = &zio->io_vd->vdev_cache; 247 789 ahrens vdev_cache_entry_t *ve, ve_search; 248 3059 ahrens uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); 249 3059 ahrens uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 250 789 ahrens zio_t *fio; 251 789 ahrens 252 789 ahrens ASSERT(zio->io_type == ZIO_TYPE_READ); 253 789 ahrens 254 789 ahrens if (zio->io_flags & ZIO_FLAG_DONT_CACHE) 255 789 ahrens return (EINVAL); 256 789 ahrens 257 3059 ahrens if (zio->io_size > zfs_vdev_cache_max) 258 789 ahrens return (EOVERFLOW); 259 789 ahrens 260 789 ahrens /* 261 789 ahrens * If the I/O straddles two or more cache blocks, don't cache it. 262 789 ahrens */ 263 7837 Matthew if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) 264 789 ahrens return (EXDEV); 265 789 ahrens 266 3059 ahrens ASSERT(cache_phase + zio->io_size <= VCBS); 267 789 ahrens 268 789 ahrens mutex_enter(&vc->vc_lock); 269 789 ahrens 270 789 ahrens ve_search.ve_offset = cache_offset; 271 789 ahrens ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); 272 789 ahrens 273 789 ahrens if (ve != NULL) { 274 789 ahrens if (ve->ve_missed_update) { 275 789 ahrens mutex_exit(&vc->vc_lock); 276 789 ahrens return (ESTALE); 277 789 ahrens } 278 789 ahrens 279 789 ahrens if ((fio = ve->ve_fill_io) != NULL) { 280 789 ahrens zio_vdev_io_bypass(zio); 281 8632 Bill zio_add_child(zio, fio); 282 789 ahrens mutex_exit(&vc->vc_lock); 283 5810 ek110237 VDCSTAT_BUMP(vdc_stat_delegations); 284 789 ahrens return (0); 285 789 ahrens } 286 789 ahrens 287 789 ahrens vdev_cache_hit(vc, ve, zio); 288 789 ahrens zio_vdev_io_bypass(zio); 289 789 ahrens 290 789 ahrens mutex_exit(&vc->vc_lock); 291 5810 ek110237 VDCSTAT_BUMP(vdc_stat_hits); 292 789 ahrens return (0); 293 4634 ek110237 } 294 4634 ek110237 295 789 ahrens ve = vdev_cache_allocate(zio); 296 789 ahrens 297 789 ahrens if (ve == NULL) { 298 789 ahrens mutex_exit(&vc->vc_lock); 299 789 ahrens return (ENOMEM); 300 789 ahrens } 301 789 ahrens 302 7754 Jeff fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, 303 3059 ahrens ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, 304 7754 Jeff ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); 305 789 ahrens 306 789 ahrens ve->ve_fill_io = fio; 307 789 ahrens zio_vdev_io_bypass(zio); 308 8632 Bill zio_add_child(zio, fio); 309 789 ahrens 310 789 ahrens mutex_exit(&vc->vc_lock); 311 789 ahrens zio_nowait(fio); 312 5810 ek110237 VDCSTAT_BUMP(vdc_stat_misses); 313 789 ahrens 314 789 ahrens return (0); 315 789 ahrens } 316 789 ahrens 317 789 ahrens /* 318 789 ahrens * Update cache contents upon write completion. 319 789 ahrens */ 320 789 ahrens void 321 789 ahrens vdev_cache_write(zio_t *zio) 322 789 ahrens { 323 789 ahrens vdev_cache_t *vc = &zio->io_vd->vdev_cache; 324 789 ahrens vdev_cache_entry_t *ve, ve_search; 325 789 ahrens uint64_t io_start = zio->io_offset; 326 789 ahrens uint64_t io_end = io_start + zio->io_size; 327 3059 ahrens uint64_t min_offset = P2ALIGN(io_start, VCBS); 328 3059 ahrens uint64_t max_offset = P2ROUNDUP(io_end, VCBS); 329 789 ahrens avl_index_t where; 330 789 ahrens 331 789 ahrens ASSERT(zio->io_type == ZIO_TYPE_WRITE); 332 789 ahrens 333 789 ahrens mutex_enter(&vc->vc_lock); 334 789 ahrens 335 789 ahrens ve_search.ve_offset = min_offset; 336 789 ahrens ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); 337 789 ahrens 338 789 ahrens if (ve == NULL) 339 789 ahrens ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); 340 789 ahrens 341 789 ahrens while (ve != NULL && ve->ve_offset < max_offset) { 342 789 ahrens uint64_t start = MAX(ve->ve_offset, io_start); 343 3059 ahrens uint64_t end = MIN(ve->ve_offset + VCBS, io_end); 344 789 ahrens 345 789 ahrens if (ve->ve_fill_io != NULL) { 346 789 ahrens ve->ve_missed_update = 1; 347 789 ahrens } else { 348 789 ahrens bcopy((char *)zio->io_data + start - io_start, 349 789 ahrens ve->ve_data + start - ve->ve_offset, end - start); 350 789 ahrens } 351 789 ahrens ve = AVL_NEXT(&vc->vc_offset_tree, ve); 352 789 ahrens } 353 789 ahrens mutex_exit(&vc->vc_lock); 354 789 ahrens } 355 789 ahrens 356 789 ahrens void 357 4451 eschrock vdev_cache_purge(vdev_t *vd) 358 4451 eschrock { 359 4451 eschrock vdev_cache_t *vc = &vd->vdev_cache; 360 4451 eschrock vdev_cache_entry_t *ve; 361 4451 eschrock 362 4451 eschrock mutex_enter(&vc->vc_lock); 363 4451 eschrock while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) 364 4451 eschrock vdev_cache_evict(vc, ve); 365 4451 eschrock mutex_exit(&vc->vc_lock); 366 4451 eschrock } 367 4451 eschrock 368 4451 eschrock void 369 789 ahrens vdev_cache_init(vdev_t *vd) 370 789 ahrens { 371 789 ahrens vdev_cache_t *vc = &vd->vdev_cache; 372 789 ahrens 373 789 ahrens mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); 374 789 ahrens 375 789 ahrens avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, 376 789 ahrens sizeof (vdev_cache_entry_t), 377 789 ahrens offsetof(struct vdev_cache_entry, ve_offset_node)); 378 789 ahrens 379 789 ahrens avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, 380 789 ahrens sizeof (vdev_cache_entry_t), 381 789 ahrens offsetof(struct vdev_cache_entry, ve_lastused_node)); 382 789 ahrens } 383 789 ahrens 384 789 ahrens void 385 789 ahrens vdev_cache_fini(vdev_t *vd) 386 789 ahrens { 387 789 ahrens vdev_cache_t *vc = &vd->vdev_cache; 388 789 ahrens 389 4451 eschrock vdev_cache_purge(vd); 390 789 ahrens 391 789 ahrens avl_destroy(&vc->vc_offset_tree); 392 789 ahrens avl_destroy(&vc->vc_lastused_tree); 393 789 ahrens 394 789 ahrens mutex_destroy(&vc->vc_lock); 395 789 ahrens } 396 5810 ek110237 397 5810 ek110237 void 398 5810 ek110237 vdev_cache_stat_init(void) 399 5810 ek110237 { 400 5810 ek110237 vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", 401 5810 ek110237 KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), 402 5810 ek110237 KSTAT_FLAG_VIRTUAL); 403 5810 ek110237 if (vdc_ksp != NULL) { 404 5810 ek110237 vdc_ksp->ks_data = &vdc_stats; 405 5810 ek110237 kstat_install(vdc_ksp); 406 5810 ek110237 } 407 5810 ek110237 } 408 5810 ek110237 409 5810 ek110237 void 410 5810 ek110237 vdev_cache_stat_fini(void) 411 5810 ek110237 { 412 5810 ek110237 if (vdc_ksp != NULL) { 413 5810 ek110237 kstat_delete(vdc_ksp); 414 5810 ek110237 vdc_ksp = NULL; 415 5810 ek110237 } 416 5810 ek110237 } 417