Home | History | Annotate | Download | only in intrd
      1 #!/usr/perl5/bin/perl
      2 #
      3 # CDDL HEADER START
      4 #
      5 # The contents of this file are subject to the terms of the
      6 # Common Development and Distribution License (the "License").
      7 # You may not use this file except in compliance with the License.
      8 #
      9 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10 # or http://www.opensolaris.org/os/licensing.
     11 # See the License for the specific language governing permissions
     12 # and limitations under the License.
     13 #
     14 # When distributing Covered Code, include this CDDL HEADER in each
     15 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16 # If applicable, add the following below this CDDL HEADER, with the
     17 # fields enclosed by brackets "[]" replaced with your own identifying
     18 # information: Portions Copyright [yyyy] [name of copyright owner]
     19 #
     20 # CDDL HEADER END
     21 #
     22 
     23 #
     24 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     25 # Use is subject to license terms.
     26 #
     27 
     28 require 5.8.4;
     29 use strict;
     30 use warnings;
     31 use POSIX;
     32 use File::Basename("basename");
     33 
     34 my $cmdname = basename($0);
     35 
     36 my $using_scengen = 0;	# 1 if using scenario simulator
     37 my $debug = 0;
     38 
     39 my $normal_sleeptime = 10;		# time to sleep between samples
     40 my $idle_sleeptime = 45;		# time to sleep when idle
     41 my $onecpu_sleeptime = (60 * 15);	# used if only 1 CPU on system
     42 my $sleeptime = $normal_sleeptime;	# either normal_ or idle_ or onecpu_
     43 
     44 my $idle_intrload = .1;			# idle if interrupt load < 10%
     45 
     46 my $timerange_toohi    = .01;
     47 my $statslen = 60;	# time period (in secs) to keep in @deltas
     48 
     49 
     50 # Parse arguments. intrd does not accept any public arguments; the two
     51 # arguments below are meant for testing purposes. -D generates a significant
     52 # amount of syslog output. -S <filename> loads the filename as a perl
     53 # script. That file is expected to implement a kstat "simulator" which
     54 # can be used to feed information to intrd and verify intrd's responses.
     55 
     56 while ($_ = shift @ARGV) {
     57 	if ($_ eq "-S" && $#ARGV != -1) {
     58 		$using_scengen = 1;
     59 		do $ARGV[0];	# load simulator
     60 		shift @ARGV;
     61 	} elsif ($_ eq "-D") {
     62 		$debug = 1;
     63 	}
     64 }
     65 
     66 if ($using_scengen == 0) {
     67 	require Sun::Solaris::Kstat;
     68 	require Sun::Solaris::Intrs;
     69 	import Sun::Solaris::Intrs(qw(intrmove is_pcplusmp));
     70 	require Sys::Syslog;
     71 	import Sys::Syslog;
     72 	openlog($cmdname, 'pid', 'daemon');
     73 	setlogmask(Sys::Syslog::LOG_UPTO($debug > 0 ? &Sys::Syslog::LOG_DEBUG :
     74 	    &Sys::Syslog::LOG_INFO));
     75 }
     76 
     77 my $asserted = 0;
     78 my $assert_level = 'debug';	# syslog level for assertion failures
     79 sub VERIFY($@)
     80 {
     81 	my $bad = (shift() == 0);	# $_[0] == 0 means assert failed
     82 	if ($bad) {
     83 		my $msg = shift();
     84 		syslog($assert_level, "VERIFY: $msg", @_);
     85 		$asserted++;
     86 	}
     87 	return ($bad);
     88 }
     89 
     90 
     91 
     92 
     93 sub getstat($$);
     94 sub generate_delta($$);
     95 sub compress_deltas($);
     96 sub dumpdelta($);
     97 
     98 sub goodness($);
     99 sub imbalanced($$);
    100 sub do_reconfig($);
    101 
    102 sub goodness_cpu($$);		# private function
    103 sub move_intr($$$$);		# private function
    104 sub ivecs_to_string(@);		# private function
    105 sub do_find_goal($$$$);		# private function
    106 sub find_goal($$);		# private function
    107 sub do_reconfig_cpu2cpu($$$$);	# private function
    108 sub do_reconfig_cpu($$$);	# private function
    109 
    110 
    111 #
# What follows are the basic data-structure routines of intrd.
    113 #
    114 # getstat() is responsible for reading the kstats and generating a "stat" hash.
    115 #
    116 # generate_delta() is responsible for taking two "stat" hashes and creating
    117 # a new "delta" hash that represents what has changed over time.
    118 #
    119 # compress_deltas() is responsible for taking a list of deltas and generating
    120 # a single delta hash that encompasses all the time periods described by the
    121 # deltas.
    122 
    123 
    124 #
    125 # getstat() is handed a reference to a kstat and generates a hash, returned
    126 # by reference, containing all the fields from the kstats which we need.
    127 # If it returns the scalar 0, it failed to gather the kstats, and the caller
    128 # should react accordingly.
    129 #
    130 # getstat() is also responsible for maintaining a reasonable $sleeptime.
    131 #
    132 # {"snaptime"}          kstat's snaptime
    133 # {<cpuid>}             one hash reference per online cpu
    134 #  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
    135 #  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
    136 #  ->{"ivecs"}
    137 #     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
    138 #        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
    139 #        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
    140 #        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
    141 #        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
    142 #        ->{"num_ino"}  == num inos of single device instance sharing this entry
    143 #				Will be > 1 on pcplusmp X86 systems for devices
    144 #				with multiple MSI interrupts.
    145 #        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
    146 #        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
    147 #        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
    148 #
    149 
    150 sub getstat($$)
    151 {
    152 	my ($ks, $pcplusmp_sys) = @_;
    153 
    154 	my $cpucnt = 0;
    155 	my %stat = ();
    156 	my ($minsnap, $maxsnap);
    157 
    158 	# Hash of hash which matches (MSI device, ino) combos to kstats.
    159 	my %msidevs = ();
    160 
    161 	# kstats are not generated atomically. Each kstat hierarchy will
    162 	# have been generated within the kernel at a different time. On a
    163 	# thrashing system, we may not run quickly enough in order to get
    164 	# coherent kstat timing information across all the kstats. To
    165 	# determine if this is occurring, $minsnap/$maxsnap are used to
    166 	# find the breadth between the first and last snaptime of all the
    167 	# kstats we access. $maxsnap - $minsnap roughly represents the
    168 	# total time taken up in getstat(). If this time approaches the
    169 	# time between snapshots, our results may not be useful.
    170 
    171 	$minsnap = -1;		# snaptime is always a positive number
    172 	$maxsnap = $minsnap;
    173 
    174 	# Iterate over the cpus in cpu:<cpuid>::. Check
    175 	# cpu_info:<cpuid>:cpu_info<cpuid>:state to make sure the
    176 	# processor is "on-line". If not, it isn't accepting interrupts
    177 	# and doesn't concern us.
    178 	#
    179 	# Record cpu:<cpuid>:sys:snaptime, and check $minsnap/$maxsnap.
    180 
    181 	while (my ($cpu, $cpst) = each %{$ks->{cpu}}) {
    182 		next if !exists($ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state});
    183 		#"state" fld of kstat w/
    184 		#		  modname    inst name-"cpuinfo0"
    185 		my $state = $ks->{cpu_info}{$cpu}{"cpu_info$cpu"}{state};
    186 		next if ($state !~ /^on-line\0/);
    187 		my $cpu_sys = $cpst->{sys};
    188 
    189 		$stat{$cpu}{tot} = ($cpu_sys->{cpu_nsec_idle} +
    190 				    $cpu_sys->{cpu_nsec_user} +
    191 				    $cpu_sys->{cpu_nsec_kernel});
    192 		$stat{$cpu}{crtime} = $cpu_sys->{crtime};
    193 		$stat{$cpu}{ivecs} = {};
    194 
    195 		if ($minsnap == -1 || $cpu_sys->{snaptime} < $minsnap) {
    196 			$minsnap = $cpu_sys->{snaptime};
    197 		}
    198 		if ($cpu_sys->{snaptime} > $maxsnap) {
    199 			$maxsnap = $cpu_sys->{snaptime};
    200 		}
    201 		$cpucnt++;
    202 	}
    203 
    204 	if ($cpucnt <= 1) {
    205 		$sleeptime = $onecpu_sleeptime;
    206 		return (0);	# nothing to do with 1 CPU
    207 	}
    208 
    209 	# Iterate over the ivecs. If the cpu is not on-line, ignore the
    210 	# ivecs mapped to it, if any.
    211 	#
    212 	# Record pci_intrs:{inum}:<nexus>:time, snaptime, crtime, pil,
    213 	# ino, name, and buspath. Check $minsnap/$maxsnap.
    214 
    215 	foreach my $inst (values(%{$ks->{pci_intrs}})) {
    216 		my $intrcfg = (values(%$inst))[0]; 
    217 		my $cpu = $intrcfg->{cpu};
    218 
    219 		next unless exists $stat{$cpu};
    220 		next if ($intrcfg->{type} =~ /^disabled\0/);
    221 
    222 		# Perl looks beyond NULL chars in pattern matching.
    223 		# Truncate name field at the first NULL
    224 		$intrcfg->{name} =~ s/\0.*$//;
    225 
    226 		if ($intrcfg->{snaptime} < $minsnap) {
    227 			$minsnap = $intrcfg->{snaptime};
    228 		} elsif ($intrcfg->{snaptime} > $maxsnap) {
    229 			$maxsnap = $intrcfg->{snaptime};
    230 		}
    231 
    232 		my $cookie = "$intrcfg->{buspath} $intrcfg->{ino}";
    233 		if (exists $stat{$cpu}{ivecs}{$cookie}) {
    234 			my $cookiestats = $stat{$cpu}{ivecs}{$cookie};
    235 
    236 			$cookiestats->{time} += $intrcfg->{time};
    237 			$cookiestats->{name} .= "/$intrcfg->{name}";
    238 
    239 			# If this new interrupt sharing $cookie represents a
    240 			# change from an earlier getstat, make sure that
    241 			# generate_delta will see the change by setting
    242 			# crtime to the most recent crtime of its components.
    243 
    244 			if ($intrcfg->{crtime} > $cookiestats->{crtime}) {
    245 				$cookiestats->{crtime} = $intrcfg->{crtime};
    246 			}
    247 			$cookiestats->{ihs}++;
    248 			next;
    249 		}
    250 		$stat{$cpu}{ivecs}{$cookie}{time} = $intrcfg->{time};
    251 		$stat{$cpu}{ivecs}{$cookie}{crtime} = $intrcfg->{crtime};
    252 		$stat{$cpu}{ivecs}{$cookie}{pil} = $intrcfg->{pil};
    253 		$stat{$cpu}{ivecs}{$cookie}{ino} = $intrcfg->{ino};
    254 		$stat{$cpu}{ivecs}{$cookie}{num_ino} = 1;
    255 		$stat{$cpu}{ivecs}{$cookie}{buspath} = $intrcfg->{buspath};
    256 		$stat{$cpu}{ivecs}{$cookie}{name} = $intrcfg->{name};
    257 		$stat{$cpu}{ivecs}{$cookie}{ihs} = 1;
    258 
    259 		if ($pcplusmp_sys && ($intrcfg->{type} =~ /^msi\0/)) {
    260 			if (!(exists($msidevs{$intrcfg->{name}}))) {
    261 				$msidevs{$intrcfg->{name}} = {};
    262 			}
    263 			$msidevs{$intrcfg->{name}}{$intrcfg->{ino}} =
    264 			    \$stat{$cpu}{ivecs}{$cookie};
    265 		}
    266 	}
    267 
    268 	# All MSI interrupts of a device instance share a single MSI address.
    269 	# On X86 systems with an APIC, this MSI address is interpreted as CPU
    270 	# routing info by the APIC.  For this reason, on these platforms, all
    271 	# interrupts for MSI devices must be moved to the same CPU at the same
    272 	# time.
    273 	#
    274 	# Since all interrupts will be on the same CPU on these platforms, all
    275 	# interrupts can be consolidated into one ivec entry.  For such devices,
    276 	# num_ino will be > 1 to denote that a group move is needed.  
    277 
    278 	# Loop thru all MSI devices on X86 pcplusmp systems.
    279 	# Nop on other systems.
    280 	foreach my $msidevkey (sort keys %msidevs) {
    281 
    282 		# Loop thru inos of the device, sorted by lowest value first
    283 		# For each cookie found for a device, incr num_ino for the
    284 		# lowest cookie and remove other cookies.
    285 
    286 		# Assumes PIL is the same for first and current cookies
    287 
    288 		my $first_ino = -1;
    289 		my $first_cookiep;
    290 		my $curr_cookiep;
    291 		foreach my $inokey (sort keys %{$msidevs{$msidevkey}}) {
    292 			$curr_cookiep = $msidevs{$msidevkey}{$inokey};
    293 			if ($first_ino == -1) {
    294 				$first_ino = $inokey;
    295 				$first_cookiep = $curr_cookiep;
    296 			} else {
    297 				$$first_cookiep->{num_ino}++;
    298 				$$first_cookiep->{time} +=
    299 				    $$curr_cookiep->{time};
    300 				if ($$curr_cookiep->{crtime} >
    301 				    $$first_cookiep->{crtime}) {
    302 					$$first_cookiep->{crtime} =
    303 					    $$curr_cookiep->{crtime};
    304 				}
    305 				# Invalidate this cookie, less complicated and
    306 				# more efficient than deleting it.
    307 				$$curr_cookiep->{num_ino} = 0;
    308 			}
    309 		}
    310 	}
    311 
    312 	# We define the timerange as the amount of time spent gathering the
    313 	# various kstats, divided by our sleeptime. If we take a lot of time
    314 	# to access the kstats, and then we create a delta comparing these
    315 	# kstats with a prior set of kstats, that delta will cover
    316 	# substaintially different amount of time depending upon which
    317 	# interrupt or CPU is being examined.
    318 	#
    319 	# By checking the timerange here, we guarantee that any deltas
    320 	# created from these kstats will contain self-consistent data,
    321 	# in that all CPUs and interrupts cover a similar span of time.
    322 	#
    323 	# $timerange_toohi is the upper bound. Any timerange above
    324 	# this is thrown out as garbage. If the stat is safely within this
    325 	# bound, we treat the stat as representing an instant in time, rather
    326 	# than the time range it actually spans. We arbitrarily choose minsnap
    327 	# as the snaptime of the stat.
    328 
    329 	$stat{snaptime} = $minsnap;
    330 	my $timerange = ($maxsnap - $minsnap) / $sleeptime;
    331 	return (0) if ($timerange > $timerange_toohi);	# i.e. failure
    332 	return (\%stat);
    333 }
    334 
    335 #
    336 # dumpdelta takes a reference to our "delta" structure:
    337 # {"missing"}           "1" if the delta's component stats had inconsistencies
    338 # {"minsnap"}           time of the first kstat snaptime used in this delta
    339 # {"maxsnap"}           time of the last kstat snaptime used in this delta
    340 # {"goodness"}          cost function applied to this delta
    341 # {"avgintrload"}       avg of interrupt load across cpus, as a percentage
    342 # {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
    343 # {<cpuid>}             iterates over on-line cpus
    344 #  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
    345 #  ->{"tot"}            CPU load from all sources in nsec
    346 #  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
    347 #  ->{"intrload"}       intrs / tot
    348 #  ->{"ivecs"}          
    349 #     ->{<ivec#>}       iterates over ivecs for this cpu
    350 #        ->{"time"}     time used by this interrupt (in nsec)
    351 #        ->{"pil"}      pil level of this interrupt
    352 #        ->{"ino"}      interrupt number (or base vector if MSI group)
    353 #        ->{"buspath"}  filename of the directory of the device's bus
    354 #        ->{"name"}     device name
    355 #        ->{"ihs"}      number of different handlers sharing this ino
    356 #        ->{"num_ino"}  number of interrupt vectors in MSI group
    357 #
    358 # It prints out the delta structure in a nice, human readable display.
    359 #
    360 
    361 sub dumpdelta($)
    362 {
    363 	my ($delta) = @_;
    364 
    365 	# print global info
    366 
    367 	syslog('debug', "dumpdelta:");
    368 	syslog('debug', " RECONFIGURATION IN DELTA") if $delta->{missing} > 0;
    369 	syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
    370 	       $delta->{avgintrload} * 100, $delta->{avgintrnsec});
    371 	syslog('debug', "    goodness: %5.2f%%", $delta->{goodness} * 100)
    372 	    if exists($delta->{goodness});
    373 
    374 	# iterate over cpus
    375 
    376 	while (my ($cpu, $cpst) = each %$delta) {
    377 		next if !ref($cpst);		# skip non-cpuid entries
    378 		my $tot = $cpst->{tot};
    379 		syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
    380 		       $cpu, $cpst->{intrload}*100, $cpst->{bigintr}*100/$tot);
    381 		syslog('debug', "        intrs %d, bigintr %d",
    382 		       $cpst->{intrs}, $cpst->{bigintr});
    383 
    384 		# iterate over ivecs on this cpu
    385 
    386 		while (my ($ivec, $ivst) = each %{$cpst->{ivecs}}) {
    387 			syslog('debug', "    %15s:\"%s\": %7.3f%%  %d",
    388 			    ($ivst->{ihs} > 1 ? "$ivst->{name}($ivst->{ihs})" :
    389 			    $ivst->{name}), $ivec,
    390 			    $ivst->{time}*100 / $tot, $ivst->{time});
    391 		}
    392 	}
    393 }
    394 
    395 #
    396 # generate_delta($stat, $newstat) takes two stat references, returned from
    397 # getstat(), and creates a %delta. %delta (not surprisingly) contains the
    398 # same basic info as stat and newstat, but with the timestamps as deltas
    399 # instead of absolute times. We return a reference to the delta.
    400 #
    401 
    402 sub generate_delta($$)
    403 {
    404 	my ($stat, $newstat) = @_;
    405 
    406 	my %delta = ();
    407 	my $intrload;
    408 	my $intrnsec;
    409 	my $cpus;
    410 
    411 	# Take the worstcase timerange
    412 	$delta{minsnap} = $stat->{snaptime};
    413 	$delta{maxsnap} = $newstat->{snaptime};
    414 	if (VERIFY($delta{maxsnap} > $delta{minsnap},
    415 	    "generate_delta: stats aren't ascending")) {
    416 		$delta{missing} = 1;
    417 		return (\%delta);
    418 	}
    419 
    420 	# if there are a different number of cpus in the stats, set missing
    421 
    422 	$delta{missing} = (keys(%$stat) != keys(%$newstat));
    423 	if (VERIFY($delta{missing} == 0,
    424 	    "generate_delta: number of CPUs changed")) {
    425 		return (\%delta);
    426 	}
    427 
    428 	# scan through every cpu in %newstat and compare against %stat
    429 
    430 	while (my ($cpu, $newcpst) = each %$newstat) {
    431 		next if !ref($newcpst);		# skip non-cpuid fields
    432 
    433 		# If %stat is missing a cpu from %newstat, then it was just
    434 		# onlined. Mark missing.
    435 
    436 		if (VERIFY(exists $stat->{$cpu} &&
    437 		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
    438 		    "generate_delta: cpu $cpu changed")) {
    439 			$delta{missing} = 1;
    440 			return (\%delta);
    441 		}
    442 		my $cpst = $stat->{$cpu};
    443 		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
    444 		if (VERIFY($delta{$cpu}{tot} >= 0,
    445 		    "generate_delta: deltas are not ascending?")) {
    446 			$delta{missing} = 1;
    447 			delete($delta{$cpu});
    448 			return (\%delta);
    449 		}
    450 		# Avoid remote chance of division by zero
    451 		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
    452 		$delta{$cpu}{intrs} = 0;
    453 		$delta{$cpu}{bigintr} = 0;
    454 
    455 		my %ivecs = ();
    456 		$delta{$cpu}{ivecs} = \%ivecs;
    457 
    458 		# if the number of ivecs differs, set missing
    459 
    460 		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
    461 			   keys(%{$newcpst->{ivecs}}),
    462 			   "generate_delta: cpu $cpu has more/less".
    463 			   " interrupts")) {
    464 			$delta{missing} = 1;
    465 			return (\%delta);
    466 		}
    467 
    468 		while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) {
    469 
    470 			# Unused cookie, corresponding to an MSI vector which
    471 			# is part of a group.  The whole group is accounted for
    472 			# by a different cookie.
    473 			next if ($newivec->{num_ino} == 0);
    474 
    475 			# If this ivec doesn't exist in $stat, or if $stat
    476 			# shows a different crtime, set missing.
    477 			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
    478 				   $cpst->{ivecs}{$inum}{crtime} ==
    479 				   $newivec->{crtime},
    480 				   "generate_delta: cpu $cpu inum $inum".
    481 				   " has changed")) {
    482 				$delta{missing} = 1;
    483 				return (\%delta);
    484 			}
    485 			my $ivec = $cpst->{ivecs}{$inum};
    486 
    487 			# Create $delta{$cpu}{ivecs}{$inum}.
    488 
    489 			my %dltivec = ();
    490 			$delta{$cpu}{ivecs}{$inum} = \%dltivec;
    491 
    492 			# calculate time used by this interrupt
    493 
    494 			my $time = $newivec->{time} - $ivec->{time};
    495 			if (VERIFY($time >= 0,
    496 				   "generate_delta: ivec went backwards?")) {
    497 				$delta{missing} = 1;
    498 				delete($delta{$cpu}{ivecs}{$inum});
    499 				return (\%delta);
    500 			}
    501 			$delta{$cpu}{intrs} += $time;
    502 			$dltivec{time} = $time;
    503 			if ($time > $delta{$cpu}{bigintr}) {
    504 				$delta{$cpu}{bigintr} = $time;
    505 			}
    506 
    507 			# Transfer over basic info about the kstat. We
    508 			# don't have to worry about discrepancies between
    509 			# ivec and newivec because we verified that both
    510 			# have the same crtime.
    511 
    512 			$dltivec{pil} = $newivec->{pil};
    513 			$dltivec{ino} = $newivec->{ino};
    514 			$dltivec{buspath} = $newivec->{buspath};
    515 			$dltivec{name} = $newivec->{name};
    516 			$dltivec{ihs} = $newivec->{ihs};
    517 			$dltivec{num_ino} = $newivec->{num_ino};
    518 		}
    519 		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
    520 			# Ewww! Hopefully just a rounding error.
    521 			# Make something up.
    522 			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
    523 		}
    524 		$delta{$cpu}{intrload} =
    525 		       $delta{$cpu}{intrs} / $delta{$cpu}{tot};
    526 		$intrload += $delta{$cpu}{intrload};
    527 		$intrnsec += $delta{$cpu}{intrs};
    528 		$cpus++;
    529 	}
    530 	if ($cpus > 0) {
    531 		$delta{avgintrload} = $intrload / $cpus;
    532 		$delta{avgintrnsec} = $intrnsec / $cpus;
    533 	} else {
    534 		$delta{avgintrload} = 0;
    535 		$delta{avgintrnsec} = 0;
    536 	}
    537 	return (\%delta);
    538 }
    539 
    540 
# compress_deltas() takes a list of deltas, and returns a single new delta
    542 # which represents the combined information from all the deltas. The deltas
    543 # provided are assumed to be sequential in time. The resulting compressed
    544 # delta looks just like any other delta. This new delta is also more accurate
    545 # since its statistics are averaged over a longer period than any of the
    546 # original deltas.
    547 
    548 sub compress_deltas ($)
    549 {
    550 	my ($deltas) = @_;
    551 
    552 	my %newdelta = ();
    553 	my ($intrs, $tot);
    554 	my $cpus = 0;
    555 	my ($high_intrload) = 0;
    556 
    557 	if (VERIFY($#$deltas != -1,
    558 		   "compress_deltas: list of delta is empty?")) {
    559 		return (0);
    560 	}
    561 	$newdelta{minsnap} = $deltas->[0]{minsnap};
    562 	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
    563 	$newdelta{missing} = 0;
    564 
    565 	foreach my $delta (@$deltas) {
    566 		if (VERIFY($delta->{missing} == 0,
    567 		    "compressing bad deltas?")) {
    568 			return (0);
    569 		}
    570 		while (my ($cpuid, $cpu) = each %$delta) {
    571 			next if !ref($cpu);
    572 
    573 			$intrs += $cpu->{intrs};
    574 			$tot += $cpu->{tot};
    575 			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
    576 			$newdelta{$cpuid}{tot} += $cpu->{tot};
    577 			if (!exists $newdelta{$cpuid}{ivecs}) {
    578 				my %ivecs = ();
    579 				$newdelta{$cpuid}{ivecs} = \%ivecs;
    580 			}
    581 			while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
    582 				my $newivecs = $newdelta{$cpuid}{ivecs};
    583 				$newivecs->{$inum}{time} += $ivec->{time};
    584 				$newivecs->{$inum}{pil} = $ivec->{pil};
    585 				$newivecs->{$inum}{ino} = $ivec->{ino};
    586 				$newivecs->{$inum}{buspath} = $ivec->{buspath};
    587 				$newivecs->{$inum}{name} = $ivec->{name};
    588 				$newivecs->{$inum}{ihs} = $ivec->{ihs};
    589 				$newivecs->{$inum}{num_ino} = $ivec->{num_ino};
    590 			}
    591 		}
    592 	}
    593 	foreach my $cpu (values(%newdelta)) {
    594 		next if !ref($cpu); # ignore non-cpu fields
    595 		$cpus++;
    596 
    597 		my $bigintr = 0;
    598 		foreach my $ivec (values(%{$cpu->{ivecs}})) {
    599 			if ($ivec->{time} > $bigintr) {
    600 				$bigintr = $ivec->{time};
    601 			}
    602 		}
    603 		$cpu->{bigintr} = $bigintr;
    604 		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
    605 		if ($high_intrload < $cpu->{intrload}) {
    606 			$high_intrload = $cpu->{intrload};
    607 		}
    608 		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
    609 	}
    610 	if ($cpus == 0) {
    611 		$newdelta{avgintrnsec} = 0;
    612 		$newdelta{avgintrload} = 0;
    613 	} else {
    614 		$newdelta{avgintrnsec} = $intrs / $cpus;
    615 		$newdelta{avgintrload} = $intrs / $tot;
    616 	}
    617 	$sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
    618 	    $normal_sleeptime;
    619 	return (\%newdelta);
    620 }
    621 
    622 
    623 
    624 
    625 
# What follows are the core functions responsible for examining the deltas
# generated above and deciding what to do about them.
#
# goodness() and its helper goodness_cpu() return a heuristic which describes
# how good (or bad) the current interrupt balance is. The value returned will
# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
# maximum badness.
    633 #
    634 # imbalanced() compares a current and historical value of goodness, and
    635 # determines if there has been enough change to warrant evaluating a
    636 # reconfiguration of the interrupts
    637 #
    638 # do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
    639 # find_goal(), do_find_goal(), and move_intr(), are responsible for examining
    640 # a delta and determining the best possible assignment of interrupts to CPUs.
    641 #
    642 # It is important that do_reconfig() be in alignment with goodness(). If
    643 # do_reconfig were to generate a new interrupt distribution that worsened
    644 # goodness, we could get into a pathological loop with intrd fighting itself,
    645 # constantly deciding that things are imbalanced, and then changing things
    646 # only to make them worse.
    647 
    648 
    649 
    650 # any goodness over $goodness_unsafe_load is considered really bad
    651 # goodness must drop by at least $goodness_mindelta for a reconfig
    652 
    653 my $goodness_unsafe_load = .9;
    654 my $goodness_mindelta = .1;
    655 
    656 # goodness(%delta) examines a delta and return its "goodness". goodness will
    657 # be between 0 (best) and 1 (major bad). goodness is determined by evaluating
    658 # the goodness of each individual cpu, and returning the worst case. This
    659 # helps on systems with many CPUs, where otherwise a single pathological CPU
    660 # might otherwise be ignored because the average was OK.
    661 #
    662 # To calculate the goodness of an individual CPU, we start by looking at its
    663 # load due to interrupts. If the load is above a certain high threshold and
    664 # there is more than one interrupt assigned to this CPU, we set goodness
    665 # to worst-case. If the load is below the average interrupt load of all CPUs,
    666 # then we return best-case, since what's to complain about?
    667 #
    668 # Otherwise we look at how much the load is above the average, and return
    669 # that as the goodness, with one caveat: we never return more than the CPU's
    670 # interrupt load ignoring its largest single interrupt source. This is
    671 # because a CPU with one high-load interrupt, and no other interrupts, is
    672 # perfectly balanced. Nothing can be done to improve the situation, and thus
    673 # it is perfectly balanced even if the interrupt's load is 100%.
    674 
    675 sub goodness($)
    676 {
    677 	my ($delta) = @_;
    678 
    679 	return (1) if $delta->{missing} > 0;
    680 
    681 	my $high_goodness = 0;
    682 	my $goodness;
    683 
    684 	foreach my $cpu (values(%$delta)) {
    685 		next if !ref($cpu);		# skip non-cpuid fields
    686 
    687 		$goodness = goodness_cpu($cpu, $delta->{avgintrload});
    688 		if (VERIFY($goodness >= 0 && $goodness <= 1,
    689 			   "goodness: cpu goodness out of range?")) {
    690 			dumpdelta($delta);
    691 			return (1);
    692 		}
    693 		if ($goodness == 1) {
    694 			return (1);	# worst case, no need to continue
    695 		}
    696 		if ($goodness > $high_goodness) {
    697 			$high_goodness = $goodness;
    698 		}
    699 	}
    700 	return ($high_goodness);
    701 }
    702 
    703 sub goodness_cpu($$)		# private function
    704 {
    705 	my ($cpu, $avgintrload) = @_;
    706 
    707 	my $goodness;
    708 	my $load = $cpu->{intrs} / $cpu->{tot};
    709 
    710 	return (0) if ($load < $avgintrload);	# low loads are perfectly good
    711 
    712 	# Calculate $load_no_bigintr, which represents the load
    713 	# due to interrupts, excluding the one biggest interrupt.
    714 	# This is the most gain we can get on this CPU from
    715 	# offloading interrupts.
    716 
    717 	my $load_no_bigintr = ($cpu->{intrs} - $cpu->{bigintr}) / $cpu->{tot};
    718 
    719 	# A major imbalance is indicated if a CPU is saturated
    720 	# with interrupt handling, and it has more than one
    721 	# source of interrupts. Those other interrupts could be
    722 	# starved if of a lower pil. Return a goodness of 1,
    723 	# which is the worst possible return value,
    724 	# which will effectively contaminate this entire delta.
    725 
    726 	my $cnt = keys(%{$cpu->{ivecs}});
    727 
    728 	if ($load > $goodness_unsafe_load && $cnt > 1) {
    729 		return (1);
    730 	}
    731 	$goodness = $load - $avgintrload;
    732 	if ($goodness > $load_no_bigintr) {
    733 		$goodness = $load_no_bigintr;
    734 	}
    735 	return ($goodness);
    736 }
    737 
    738 
    739 # imbalanced() is used by the main routine to determine if the goodness
    740 # has shifted far enough from our last baseline to warrant a reassignment
    741 # of interrupts. A very high goodness indicates that a CPU is way out of
    742 # whack. If the goodness has varied too much since the baseline, then
    743 # perhaps a reconfiguration is worth considering.
    744 
    745 sub imbalanced ($$)
    746 {
    747 	my ($goodness, $baseline) = @_;
    748 
    749 	# Return 1 if we are pathological, or creeping away from the baseline
    750 
    751 	return (1) if $goodness > .50;
    752 	return (1) if abs($goodness - $baseline) > $goodness_mindelta;
    753 	return (0);
    754 }
    755 
    756 # do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
    757 # decision-making functions responsible for generating a new interrupt
    758 # distribution. They are designed with the definition of goodness() in
    759 # mind, i.e. they use the same definition of "good distribution" as does
    760 # goodness().
    761 #
    762 # do_reconfig() is responsible for deciding whether a redistribution is
    763 # actually warranted. If the goodness is already pretty good, it doesn't
    764 # waste the CPU time to generate a new distribution. If it
    765 # calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
    767 # mainly to avoid the disruption to system performance caused by
    768 # rejuggling interrupts.
    769 #
    770 # Its main loop works by going through a list of cpus sorted from
    771 # highest to lowest interrupt load. It removes the highest-load cpus
    772 # one at a time and hands them off to do_reconfig_cpu(). This function
    773 # then re-sorts the remaining CPUs from lowest to highest interrupt load,
    774 # and one at a time attempts to rejuggle interrupts between the original
    775 # high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
    776 # considered finished as soon as its interrupt load is within
    777 # $goodness_mindelta of the average interrupt load. Such a CPU will have
    778 # a goodness of below the $goodness_mindelta threshold.
    779 
    780 #
    781 # move_intr(\%delta, $inum, $oldcpu, $newcpu)
    782 # used by reconfiguration code to move an interrupt between cpus within
    783 # a delta. This manipulates data structures, and does not actually move
    784 # the interrupt on the running system.
    785 #
    786 sub move_intr($$$$)		# private function
    787 {
    788 	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;
    789 
    790 	my $ivec = $delta->{$oldcpuid}{ivecs}{$inum};
    791 
    792 	# Remove ivec from old cpu
    793 
    794 	my $oldcpu = $delta->{$oldcpuid};
    795 	$oldcpu->{intrs} -= $ivec->{time};
    796 	$oldcpu->{intrload} = $oldcpu->{intrs} / $oldcpu->{tot};
    797 	delete($oldcpu->{ivecs}{$inum});
    798 
    799 	VERIFY($oldcpu->{intrs} >= 0, "move_intr: intr's time > total time?");
    800 	VERIFY($ivec->{time} <= $oldcpu->{bigintr},
    801 	       "move_intr: intr's time > bigintr?");
    802 
    803 	if ($ivec->{time} >= $oldcpu->{bigintr}) {
    804 		my $bigtime = 0;
    805 
    806 		foreach my $ivec (values(%{$oldcpu->{ivecs}})) {
    807 			$bigtime = $ivec->{time} if $ivec->{time} > $bigtime;
    808 		}
    809 		$oldcpu->{bigintr} = $bigtime;
    810 	}
    811 
    812 	# Add ivec onto new cpu
    813 
    814 	my $newcpu = $delta->{$newcpuid};
    815 
    816 	$ivec->{nowcpu} = $newcpuid;
    817 	$newcpu->{intrs} += $ivec->{time};
    818 	$newcpu->{intrload} = $newcpu->{intrs} / $newcpu->{tot};
    819 	$newcpu->{ivecs}{$inum} = $ivec;
    820 
    821 	$newcpu->{bigintr} = $ivec->{time}
    822 		if $ivec->{time} > $newcpu->{bigintr};
    823 }
    824 
    825 sub move_intr_check($$$)	# private function
    826 {
    827 	my ($delta, $oldcpuid, $newcpuid) = @_;
    828 
    829 	VERIFY($delta->{$oldcpuid}{tot} >= $delta->{$oldcpuid}{intrs},
    830 	       "Moved interrupts left 100+%% load on src cpu");
    831 	VERIFY($delta->{$newcpuid}{tot} >= $delta->{$newcpuid}{intrs},
    832 	       "Moved interrupts left 100+%% load on tgt cpu");
    833 }
    834 
    835 sub ivecs_to_string(@)		# private function
    836 {
    837 	my $str = "";
    838 	foreach my $ivec (@_) {
    839 		$str = "$str $ivec->{inum}";
    840 	}
    841 	return ($str);
    842 }
    843 
    844 
    845 sub do_reconfig($)
    846 {
    847 	my ($delta) = @_;
    848 
    849 	my $goodness = $delta->{goodness};
    850 
    851 	# We can't improve goodness to better than 0. We should stop here
    852 	# if, even if we achieve a goodness of 0, the improvement is still
    853 	# too small to merit the action.
    854 
    855 	if ($goodness - 0 < $goodness_mindelta) {
    856 		syslog('debug', "goodness good enough, don't reconfig");
    857 		return (0);
    858 	}
    859 
    860 	syslog('notice', "Optimizing interrupt assignments");
    861 
    862 	if (VERIFY ($delta->{missing} == 0, "RECONFIG Aborted: should not ".
    863 	    "have a delta with missing")) {
    864 		return (-1);
    865 	}
    866 
    867 	# Make a list of all cpuids, and also add some extra information
    868 	# to the ivec structures.
    869 
    870 	my @cpusortlist = ();
    871 
    872 	while (my ($cpuid, $cpu) = each %$delta) {
    873 		next if !ref($cpu);	# skip non-cpu entries
    874 
    875 		push(@cpusortlist, $cpuid);
    876 		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
    877 			$ivec->{origcpu} = $cpuid;
    878 			$ivec->{nowcpu} = $cpuid;
    879 			$ivec->{inum} = $inum;
    880 		}
    881 	}
    882 
    883 	# Sort the list of CPUs from highest to lowest interrupt load.
    884 	# Remove the top CPU from that list and attempt to redistribute
    885 	# its interrupts. If the CPU has a goodness below a threshold,
    886 	# just ignore the CPU and move to the next one. If the CPU's
    887 	# load falls below the average load plus that same threshold,
    888 	# then there are no CPUs left worth reconfiguring, and we're done.
    889 
    890 	while (@cpusortlist) {
    891 		# Re-sort cpusortlist each time, since do_reconfig_cpu can
    892 		# move interrupts around.
    893 
    894 		@cpusortlist =
    895 		    sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
    896 		    @cpusortlist);
    897 
    898 		my $cpu = shift(@cpusortlist);
    899 		if (($delta->{$cpu}{intrload} <= $goodness_unsafe_load) &&
    900 		    ($delta->{$cpu}{intrload} <=
    901 		    $delta->{avgintrload} + $goodness_mindelta)) {
    902 			syslog('debug', "finished reconfig: cpu $cpu load ".
    903 			    "$delta->{$cpu}{intrload} avgload ".
    904 			    "$delta->{avgintrload}");
    905 			last;
    906 		}
    907 		if (goodness_cpu($delta->{$cpu}, $delta->{avgintrload}) <
    908 		    $goodness_mindelta) {
    909 			next;
    910 		}
    911 		do_reconfig_cpu($delta, \@cpusortlist, $cpu);
    912 	}
    913 
    914 	# How good a job did we do? If the improvement was minimal, and
    915 	# our goodness wasn't pathological (and thus needing any help it
    916 	# can get), then don't bother moving the interrupts.
    917 
    918 	my $newgoodness = goodness($delta);
    919 	VERIFY($newgoodness <= $goodness,
    920 	       "reconfig: result has worse goodness?");
    921 
    922 	if (($goodness != 1 || $newgoodness == 1) &&
    923 	    $goodness - $newgoodness < $goodness_mindelta) {
    924 		syslog('debug', "goodness already near optimum, ".
    925 		       "don't reconfig");
    926 		return (0);
    927 	}
    928 	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $goodness*100,
    929 	       $newgoodness*100);
    930 
    931 	# Time to move those interrupts!
    932 
    933 	my $ret = 1;
    934 	my $warned = 0;
    935 	while (my ($cpuid, $cpu) = each %$delta) {
    936 		next if $cpuid =~ /\D/;
    937 		while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
    938 			next if ($ivec->{origcpu} == $cpuid);
    939 
    940 			if (!intrmove($ivec->{buspath}, $ivec->{ino},
    941 			    $cpuid, $ivec->{num_ino})) {
    942 				syslog('warning', "Unable to move interrupts")
    943 				    if $warned++ == 0;
    944 				syslog('debug', "Unable to move buspath ".
    945 				    "$ivec->{buspath} ino $ivec->{ino} to ".
    946 				    "cpu $cpuid");
    947 				$ret = -1;
    948 			}
    949 		}
    950 	}
    951 
    952 	syslog('notice', "Interrupt assignments optimized");
    953 	return ($ret);
    954 }
    955 
    956 sub do_reconfig_cpu($$$)	# private function
    957 {
    958 	my ($delta, $cpusortlist, $oldcpuid) = @_;
    959 
    960 	# We have been asked to rejuggle interrupts between $oldcpuid and
    961 	# other CPUs found on $cpusortlist so as to improve the load on
    962 	# $oldcpuid. We reverse $cpusortlist to get our own copy of the
    963 	# list, sorted from lowest to highest interrupt load. One at a
    964 	# time, shift a CPU off of this list of CPUs, and attempt to
    965 	# rejuggle interrupts between the two CPUs. Don't do this if the
    966 	# other CPU has a higher load than oldcpuid. We're done rejuggling
    967 	# once $oldcpuid's goodness falls below a threshold.
    968 
    969 	syslog('debug', "reconfiguring $oldcpuid");
    970 
    971 	my $cpu = $delta->{$oldcpuid};
    972 	my $avgintrload = $delta->{avgintrload};
    973 
    974 	my @cputargetlist = reverse(@$cpusortlist); # make a copy of the list
    975 	while ($#cputargetlist != -1) {
    976  		last if goodness_cpu($cpu, $avgintrload) < $goodness_mindelta;
    977 
    978 		my $tgtcpuid = shift(@cputargetlist);
    979 		my $tgt = $delta->{$tgtcpuid};
    980 		my $load = $cpu->{intrload};
    981 		my $tgtload = $tgt->{intrload};
    982 		last if $tgtload > $load;
    983 		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $load);
    984 	}
    985 }
    986 
    987 sub do_reconfig_cpu2cpu($$$$)	# private function
    988 {
    989 	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;
    990 
    991 	# We've been asked to consider interrupt juggling between srccpuid
    992 	# (with a high interrupt load) and tgtcpuid (with a lower interrupt
    993 	# load). First, make a single list with all of the ivecs from both
    994 	# CPUs, and sort the list from highest to lowest load.
    995 
    996 	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");
    997 
    998 	# Gather together all the ivecs and sort by load
    999 
   1000 	my @ivecs = (values(%{$delta->{$srccpuid}{ivecs}}),
   1001 	    values(%{$delta->{$tgtcpuid}{ivecs}}));
   1002 	return if $#ivecs == -1;
   1003 
   1004 	@ivecs = sort({$b->{time} <=> $a->{time}} @ivecs);
   1005 
   1006 	# Our "goal" load for srccpuid is the average load across all CPUs.
   1007 	# find_goal() will find determine the optimum selection of the
   1008 	# available interrupts which comes closest to this goal without
   1009 	# falling below the goal.
   1010 
   1011 	my $goal = $delta->{avgintrnsec};
   1012 
   1013 	# We know that the interrupt load on tgtcpuid is less than that on
   1014 	# srccpuid, but its load could still be above avgintrnsec. Don't
   1015 	# choose a goal which would bring srccpuid below the load on tgtcpuid.
   1016 
   1017 	my $avgnsec =
   1018 	    ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
   1019 	if ($goal < $avgnsec) {
   1020 		$goal = $avgnsec;
   1021 	}
   1022 
   1023 	# If the largest of the interrupts is on srccpuid, leave it there.
   1024 	# This can help minimize the disruption caused by moving interrupts.
   1025 
   1026 	if ($ivecs[0]->{origcpu} == $srccpuid) {
   1027 		syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
   1028 		$goal -= $ivecs[0]->{time};
   1029 		shift(@ivecs);
   1030 	}
   1031 
   1032 	syslog('debug', "GOAL: inums should total $goal");
   1033 	find_goal(\@ivecs, $goal);
   1034 
   1035 	# find_goal() returned its results to us by setting $ivec->{goal} if
   1036 	# the ivec should be on srccpuid, or clearing it for tgtcpuid.
   1037 	# Call move_intr() to update our $delta with the new results.
   1038 
   1039 	foreach my $ivec (@ivecs) {
   1040 		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");
   1041 		VERIFY($ivec->{nowcpu} == $srccpuid ||
   1042 		    $ivec->{nowcpu} == $tgtcpuid, "cpu2cpu found an ".
   1043 		    "interrupt not currently on src or tgt cpu");
   1044 
   1045 		if ($ivec->{goal} && $ivec->{nowcpu} != $srccpuid) {
   1046 			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
   1047 			    $srccpuid);
   1048 		} elsif ($ivec->{goal} == 0 && $ivec->{nowcpu} != $tgtcpuid) {
   1049 			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
   1050 			    $tgtcpuid);
   1051 		}
   1052 	}
   1053 	move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts
   1054 
   1055 	my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
   1056 	VERIFY($newload <= $srcload && $newload > $delta->{avgintrload},
   1057 	    "cpu2cpu: new load didn't end up in expected range");
   1058 }
   1059 
   1060 
   1061 # find_goal() and its helper do_find_goal() are used to find the best
   1062 # combination of interrupts in order to generate a load that is as close
   1063 # as possible to a goal load without falling below that goal. Before returning
   1064 # to its caller, find_goal() sets a new value in the hash of each interrupt,
   1065 # {goal}, which if set signifies that this interrupt is one of the interrupts
   1066 # identified as part of the set of interrupts which best meet the goal.
   1067 #
   1068 # The arguments to find_goal are a list of ivecs (hash references), sorted
   1069 # by descending {time}, and the goal load. The goal is relative to {time}.
   1070 # The best fit is determined by performing a depth-first search. do_find_goal
   1071 # is the recursive subroutine which carries out the search.
   1072 #
   1073 # It is passed an index as an argument, originally 0. On a given invocation,
   1074 # it is only to consider interrupts in the ivecs array starting at that index.
   1075 # It then considers two possibilities:
   1076 #   1) What is the best goal-fit if I include ivecs[index]?
   1077 #   2) What is the best goal-fit if I exclude ivecs[index]?
   1078 # To determine case 1, it subtracts the load of ivecs[index] from the goal,
   1079 # and calls itself recursively with that new goal and index++.
   1080 # To determine case 2, it calls itself recursively with the same goal and
   1081 # index++.
   1082 #
   1083 # It then compares the two results, decide which one best meets the goals,
   1084 # and returns the result. The return value is the best-fit's interrupt load,
   1085 # followed by a list of all the interrupts which make up that best-fit.
   1086 #
   1087 # As an optimization, a second array loads[] is created which mirrors ivecs[].
   1088 # loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used
   1089 # by do_find_goal to avoid recursing all the way to the end of the ivecs
   1090 # array if including all remaining interrupts will still leave the best-fit
   1091 # at below goal load. If so, it then includes all remaining interrupts on
   1092 # the goal list and returns.
   1093 #
   1094 sub find_goal($$)		# private function
   1095 {
   1096 	my ($ivecs, $goal) = @_;
   1097 
   1098 	my @goals;
   1099 	my $load;
   1100 	my $ivec;
   1101 
   1102 	if ($goal <= 0) {
   1103 		@goals = ();	# the empty set will best meet the goal
   1104 	} else {
   1105 		syslog('debug', "finding goal from intrs %s",
   1106 		    ivecs_to_string(@$ivecs));
   1107 
   1108 		# Generate @loads array
   1109 
   1110 		my $tot = 0;
   1111 		foreach $ivec (@$ivecs) {
   1112 			$tot += $ivec->{time};
   1113 		}
   1114 		my @loads = ();
   1115 		foreach $ivec (@$ivecs) {
   1116 			push(@loads, $tot);
   1117 			$tot -= $ivec->{time};
   1118 		}
   1119 		($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
   1120 		VERIFY($load >= $goal, "find_goal didn't meet goals");
   1121 	}
   1122 	syslog('debug', "goals found: %s", ivecs_to_string(@goals));
   1123 
   1124 	# Set or clear $ivec->{goal} for each ivec, based on returned @goals
   1125 
   1126 	foreach $ivec (@$ivecs) {
   1127 		if ($#goals > -1 && $ivec == $goals[0]) {
   1128 			syslog('debug', "inum $ivec->{inum} on source cpu");
   1129 			$ivec->{goal} = 1;
   1130 			shift(@goals);
   1131 		} else {
   1132 			syslog('debug', "inum $ivec->{inum} on target cpu");
   1133 			$ivec->{goal} = 0;
   1134 		}
   1135 	}
   1136 }
   1137 
   1138 
   1139 sub do_find_goal($$$$)		# private function
   1140 {
   1141 	my ($ivecs, $loads, $goal, $idx) = @_;
   1142 
   1143 	if ($idx > $#{$ivecs}) {
   1144 		return (0);
   1145 	}
   1146 	syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");
   1147 
   1148 	my $load = $ivecs->[$idx]{time};
   1149 	my @goals_with = ();
   1150 	my @goals_without = ();
   1151 	my ($with, $without);
   1152 
   1153 	# If we include all remaining items and we're still below goal,
   1154 	# stop here. We can just return a result that includes $idx and all
   1155 	# subsequent ivecs. Since this will still be below goal, there's
   1156 	# nothing better to be done.
   1157 
   1158 	if ($loads->[$idx] <= $goal) {
   1159 		syslog('debug',
   1160 		    "$idx: including all remaining intrs %s with load %d",
   1161 		    ivecs_to_string(@$ivecs[$idx .. $#{$ivecs}]),
   1162 		    $loads->[$idx]);
   1163 		return ($loads->[$idx], @$ivecs[$idx .. $#{$ivecs}]);
   1164 	}
   1165 
   1166 	# Evaluate the "with" option, i.e. the best matching goal which
   1167 	# includes $ivecs->[$idx]. If idx's load is more than our goal load,
   1168 	# stop here. Once we're above the goal, there is no need to consider
   1169 	# further interrupts since they'll only take us further from the goal.
   1170 
   1171 	if ($goal <= $load) {
   1172 		$with = $load;	# stop here
   1173 	} else {
   1174 		($with, @goals_with) =
   1175 		    do_find_goal($ivecs, $loads, $goal - $load, $idx + 1);
   1176 		$with += $load;
   1177 	}
   1178 	syslog('debug', "$idx: with-load $with intrs %s",
   1179 	       ivecs_to_string($ivecs->[$idx], @goals_with));
   1180 
   1181 	# Evaluate the "without" option, i.e. the best matching goal which
   1182 	# excludes $ivecs->[$idx].
   1183 
   1184 	($without, @goals_without) =
   1185 	    &do_find_goal($ivecs, $loads, $goal, $idx + 1);
   1186 	syslog('debug', "$idx: without-load $without intrs %s",
   1187 	       ivecs_to_string(@goals_without));
   1188 
   1189 	# We now have our "with" and "without" options, and we choose which
   1190 	# best fits the goal. If one is greater than goal and the other is
   1191 	# below goal, we choose the one that is greater. If they are both 
   1192 	# below goal, then we choose the one that is greater. If they are
   1193 	# both above goal, then we choose the smaller.
   1194 
   1195 	my $which;		# 0 == with, 1 == without
   1196 	if ($with >= $goal && $without < $goal) {
   1197 		$which = 0;
   1198 	} elsif ($with < $goal && $without >= $goal) {
   1199 		$which = 1;
   1200 	} elsif ($with >= $goal && $without >= $goal) {
   1201 		$which = ($without < $with);
   1202 	} else {
   1203 		$which = ($without > $with);
   1204 	}
   1205 
   1206 	# Return the load of our best case scenario, followed by all the ivecs
   1207 	# which compose that goal.
   1208 
   1209 	if ($which == 1) {	# without
   1210 		syslog('debug', "$idx: going without");
   1211 		return ($without, @goals_without);
   1212 	} else {
   1213 		syslog('debug', "$idx: going with");
   1214 		return ($with, $ivecs->[$idx], @goals_with);
   1215 	}
   1216 	# Not reached
   1217 }
   1218 
   1219 
   1220 
   1221 
   1222 syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));
   1223 
   1224 my @deltas = ();
   1225 my $deltas_tottime = 0;		# sum of maxsnap-minsnap across @deltas
   1226 my $avggoodness;
   1227 my $baseline_goodness = 0;
   1228 my $compdelta;
   1229 
   1230 my $do_reconfig;
   1231 
   1232 # temp variables
   1233 my $goodness;
   1234 my $deltatime;
   1235 my $olddelta;
   1236 my $olddeltatime;
   1237 my $delta;
   1238 my $newstat;
   1239 my $below_statslen;
   1240 my $newtime;
   1241 my $ret;
   1242 
   1243 
   1244 my $gotsig = 0;
   1245 $SIG{INT} = sub { $gotsig = 1; };     # don't die in the middle of retargeting
   1246 $SIG{HUP} = $SIG{INT};
   1247 $SIG{TERM} = $SIG{INT};
   1248 
   1249 my $ks;
   1250 if ($using_scengen == 0) {
   1251 	$ks = Sun::Solaris::Kstat->new();
   1252 } else {
   1253 	$ks = myks_update();	# supplied by the simulator
   1254 }
   1255 
   1256 # If no pci_intrs kstats were found, we need to exit, but we can't because
   1257 # SMF will restart us and/or report an error to the administrator. But
   1258 # there's nothing an administrator can do. So print out a message for SMF
   1259 # logs and silently pause forever.
   1260 
   1261 if (!exists($ks->{pci_intrs})) {
   1262 	print STDERR "$cmdname: no interrupts were found; ".
   1263 	    "your PCI bus may not yet be supported\n";
   1264 	pause() while $gotsig == 0;
   1265 	exit 0;
   1266 }
   1267 
   1268 # See if this is a system with a pcplusmp APIC.
   1269 # Such systems will get special handling.
   1270 # Assume that if one bus has a pcplusmp APIC that they all do.
   1271 
   1272 # Get a list of pci_intrs kstats.
   1273 my @elem = values(%{$ks->{pci_intrs}});
   1274 my $elem0 = $elem[0];
   1275 my $elemval = (values(%$elem0))[0];
   1276 
   1277 # Use its buspath to query the system.  It is assumed that either all or none
   1278 # of the busses on a system are hosted by the pcplusmp APIC.
   1279 my $pcplusmp_sys = is_pcplusmp($elemval->{buspath});
   1280 
   1281 my $stat = getstat($ks, $pcplusmp_sys);
   1282 
   1283 for (;;) {
   1284 	sub clear_deltas {
   1285 		@deltas = ();
   1286 		$deltas_tottime = 0;
   1287 		$stat = 0;   # prevent next gen_delta() from setting {missing}
   1288 	}
   1289 
   1290 	# 1. Sleep, update the kstats, and save the new stats in $newstat.
   1291 
   1292 	exit 0 if $gotsig;		# if we got ^C / SIGTERM, exit
   1293 	if ($using_scengen == 0) {
   1294 		sleep($sleeptime);
   1295 		exit 0 if $gotsig;	# if we got ^C / SIGTERM, exit
   1296 		$ks->update();
   1297 	} else {
   1298 		$ks = myks_update();
   1299 	}
   1300 	$newstat = getstat($ks, $pcplusmp_sys);
   1301 
   1302 	# $stat or $newstat could be zero if they're uninitialized, or if
   1303 	# getstat() failed. If $stat is zero, move $newstat to $stat, sleep
   1304 	# and try again. If $newstat is zero, then we also sleep and try
   1305 	# again, hoping the problem will clear up.
   1306 
   1307 	next if (!ref $newstat);
   1308 	if (!ref $stat) {
   1309 		$stat = $newstat;
   1310 		next;
   1311 	}
   1312 
   1313 	# 2. Compare $newstat with the prior set of values, result in %$delta.
   1314 
   1315 	$delta = generate_delta($stat, $newstat);
   1316 	dumpdelta($delta) if $debug;	# Dump most recent stats to stdout.
   1317 	$stat = $newstat;	# The new stats now become the old stats.
   1318 
   1319 
   1320 	# 3. If $delta->{missing}, then there has been a reconfiguration of
   1321 	# either cpus or interrupts (probably both). We need to toss out our
   1322 	# old set of statistics and start from scratch.
   1323 	#
   1324 	# Also, if the delta covers a very long range of time, then we've
   1325 	# been experiencing a system overload that has resulted in intrd
   1326 	# not being allowed to run effectively for a while now. As above,
   1327 	# toss our old statistics and start from scratch.
   1328 
   1329 	$deltatime = $delta->{maxsnap} - $delta->{minsnap};
   1330 	if ($delta->{missing} > 0 || $deltatime > $statslen) {
   1331 		clear_deltas();
   1332 		syslog('debug', "evaluating interrupt assignments");
   1333 		next;
   1334 	}
   1335 
   1336 
   1337 	# 4. Incorporate new delta into the list of deltas, and associated
   1338 	# statistics. If we've just now received $statslen deltas, then it's
   1339 	# time to evaluate a reconfiguration.
   1340 
   1341 	$below_statslen = ($deltas_tottime < $statslen);
   1342 	$deltas_tottime += $deltatime;
   1343 	$do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
   1344 	push(@deltas, $delta);
   1345 
   1346 	# 5. Remove old deltas if total time is more than $statslen. We use
   1347 	# @deltas as a moving average of the last $statslen seconds. Shift
   1348 	# off the olders deltas, but only if that doesn't cause us to fall
   1349 	# below $statslen seconds.
   1350 
   1351 	while (@deltas > 1) {
   1352 		$olddelta = $deltas[0];
   1353 		$olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
   1354 		$newtime = $deltas_tottime - $olddeltatime;
   1355 		last if ($newtime < $statslen);
   1356 
   1357 		shift(@deltas);
   1358 		$deltas_tottime = $newtime;
   1359 	}
   1360 
   1361 	# 6. The brains of the operation are here. First, check if we're
   1362 	# imbalanced, and if so set $do_reconfig. If $do_reconfig is set,
   1363 	# either because of imbalance or above in step 4, we evaluate a
   1364 	# new configuration.
   1365 	#
   1366 	# First, take @deltas and generate a single "compressed" delta
   1367 	# which summarizes them all. Pass that to do_reconfig and see
   1368 	# what it does with it:
   1369 	#
   1370 	# $ret == -1 : failure
   1371 	# $ret ==  0 : current config is optimal (or close enough)
   1372 	# $ret ==  1 : reconfiguration has occurred
   1373 	#
   1374 	# If $ret is -1 or 1, dump all our deltas and start from scratch.
   1375 	# Step 4 above will set do_reconfig soon thereafter.
   1376 	#
   1377 	# If $ret is 0, then nothing has happened because we're already
   1378 	# good enough. Set baseline_goodness to current goodness.
   1379 
   1380 	$compdelta = compress_deltas(\@deltas);
   1381 	if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
   1382 		clear_deltas();
   1383 		next;
   1384 	}
   1385 	$compdelta->{goodness} = goodness($compdelta);
   1386 	dumpdelta($compdelta) if $debug;
   1387 
   1388 	$goodness = $compdelta->{goodness};
   1389 	syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);
   1390 
   1391 	if ($deltas_tottime >= $statslen &&
   1392 	    imbalanced($goodness, $baseline_goodness)) {
   1393 		$do_reconfig = 1;
   1394 	}
   1395 
   1396 	if ($do_reconfig) {
   1397 		$ret = do_reconfig($compdelta);
   1398 
   1399 		if ($ret != 0) {
   1400 			clear_deltas();
   1401 			syslog('debug', "do_reconfig FAILED!") if $ret == -1;
   1402 		} else {
   1403 			syslog('debug', "setting new baseline of $goodness");
   1404 			$baseline_goodness = $goodness;
   1405 		}
   1406 	}
   1407 	syslog('debug', "---------------------------------------");
   1408 }		
   1409