You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
450 lines
14 KiB
450 lines
14 KiB
# -*- coding: utf-8 -*-
|
|
# Copyright 2015 The Chromium OS Authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
"""Module to optimize the scheduling of benchmark_run tasks."""
|
|
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import sys
|
|
import traceback
|
|
|
|
from collections import defaultdict
|
|
from threading import Lock
|
|
from threading import Thread
|
|
|
|
import test_flag
|
|
|
|
from machine_image_manager import MachineImageManager
|
|
from cros_utils import command_executer
|
|
from cros_utils import logger
|
|
|
|
|
|
class DutWorker(Thread):
|
|
"""Working thread for a dut."""
|
|
|
|
def __init__(self, dut, sched):
|
|
super(DutWorker, self).__init__(name='DutWorker-{}'.format(dut.name))
|
|
self._dut = dut
|
|
self._sched = sched
|
|
self._stat_num_br_run = 0
|
|
self._stat_num_reimage = 0
|
|
self._stat_annotation = ''
|
|
self._logger = logger.GetLogger(self._sched.get_experiment().log_dir)
|
|
self.daemon = True
|
|
self._terminated = False
|
|
self._active_br = None
|
|
# Race condition accessing _active_br between _execute_benchmark_run and
|
|
# _terminate, so lock it up.
|
|
self._active_br_lock = Lock()
|
|
|
|
def terminate(self):
|
|
self._terminated = True
|
|
with self._active_br_lock:
|
|
if self._active_br is not None:
|
|
# BenchmarkRun.Terminate() terminates any running testcase via
|
|
# suite_runner.Terminate and updates timeline.
|
|
self._active_br.Terminate()
|
|
|
|
def run(self):
|
|
"""Do the "run-test->(optionally reimage)->run-test" chore.
|
|
|
|
Note - 'br' below means 'benchmark_run'.
|
|
"""
|
|
|
|
# Firstly, handle benchmarkruns that have cache hit.
|
|
br = self._sched.get_cached_benchmark_run()
|
|
while br:
|
|
try:
|
|
self._stat_annotation = 'finishing cached {}'.format(br)
|
|
br.run()
|
|
except RuntimeError:
|
|
traceback.print_exc(file=sys.stdout)
|
|
br = self._sched.get_cached_benchmark_run()
|
|
|
|
# Secondly, handle benchmarkruns that needs to be run on dut.
|
|
self._setup_dut_label()
|
|
try:
|
|
self._logger.LogOutput('{} started.'.format(self))
|
|
while not self._terminated:
|
|
br = self._sched.get_benchmark_run(self._dut)
|
|
if br is None:
|
|
# No br left for this label. Considering reimaging.
|
|
label = self._sched.allocate_label(self._dut)
|
|
if label is None:
|
|
# No br even for other labels. We are done.
|
|
self._logger.LogOutput('ImageManager found no label '
|
|
'for dut, stopping working '
|
|
'thread {}.'.format(self))
|
|
break
|
|
if self._reimage(label):
|
|
# Reimage to run other br fails, dut is doomed, stop
|
|
# this thread.
|
|
self._logger.LogWarning('Re-image failed, dut '
|
|
'in an unstable state, stopping '
|
|
'working thread {}.'.format(self))
|
|
break
|
|
else:
|
|
# Execute the br.
|
|
self._execute_benchmark_run(br)
|
|
finally:
|
|
self._stat_annotation = 'finished'
|
|
# Thread finishes. Notify scheduler that I'm done.
|
|
self._sched.dut_worker_finished(self)
|
|
|
|
def _reimage(self, label):
|
|
"""Reimage image to label.
|
|
|
|
Args:
|
|
label: the label to remimage onto dut.
|
|
|
|
Returns:
|
|
0 if successful, otherwise 1.
|
|
"""
|
|
|
|
# Termination could happen anywhere, check it.
|
|
if self._terminated:
|
|
return 1
|
|
|
|
if self._sched.get_experiment().skylab:
|
|
self._logger.LogOutput('Skylab mode, do not image before testing.')
|
|
self._dut.label = label
|
|
return 0
|
|
|
|
self._logger.LogOutput('Reimaging {} using {}'.format(self, label))
|
|
self._stat_num_reimage += 1
|
|
self._stat_annotation = 'reimaging using "{}"'.format(label.name)
|
|
try:
|
|
# Note, only 1 reimage at any given time, this is guaranteed in
|
|
# ImageMachine, so no sync needed below.
|
|
retval = self._sched.get_experiment().machine_manager.ImageMachine(
|
|
self._dut, label)
|
|
|
|
if retval:
|
|
return 1
|
|
except RuntimeError:
|
|
return 1
|
|
|
|
self._dut.label = label
|
|
return 0
|
|
|
|
def _execute_benchmark_run(self, br):
|
|
"""Execute a single benchmark_run.
|
|
|
|
Note - this function never throws exceptions.
|
|
"""
|
|
|
|
# Termination could happen anywhere, check it.
|
|
if self._terminated:
|
|
return
|
|
|
|
self._logger.LogOutput('{} started working on {}'.format(self, br))
|
|
self._stat_num_br_run += 1
|
|
self._stat_annotation = 'executing {}'.format(br)
|
|
# benchmark_run.run does not throws, but just play it safe here.
|
|
try:
|
|
assert br.owner_thread is None
|
|
br.owner_thread = self
|
|
with self._active_br_lock:
|
|
self._active_br = br
|
|
br.run()
|
|
finally:
|
|
self._sched.get_experiment().BenchmarkRunFinished(br)
|
|
with self._active_br_lock:
|
|
self._active_br = None
|
|
|
|
def _setup_dut_label(self):
|
|
"""Try to match dut image with a certain experiment label.
|
|
|
|
If such match is found, we just skip doing reimage and jump to execute
|
|
some benchmark_runs.
|
|
"""
|
|
|
|
checksum_file = '/usr/local/osimage_checksum_file'
|
|
try:
|
|
rv, checksum, _ = command_executer.GetCommandExecuter().\
|
|
CrosRunCommandWOutput(
|
|
'cat ' + checksum_file,
|
|
chromeos_root=self._sched.get_labels(0).chromeos_root,
|
|
machine=self._dut.name,
|
|
print_to_console=False)
|
|
if rv == 0:
|
|
checksum = checksum.strip()
|
|
for l in self._sched.get_labels():
|
|
if l.checksum == checksum:
|
|
self._logger.LogOutput("Dut '{}' is pre-installed with '{}'".format(
|
|
self._dut.name, l))
|
|
self._dut.label = l
|
|
return
|
|
except RuntimeError:
|
|
traceback.print_exc(file=sys.stdout)
|
|
self._dut.label = None
|
|
|
|
def __str__(self):
|
|
return 'DutWorker[dut="{}", label="{}"]'.format(
|
|
self._dut.name, self._dut.label.name if self._dut.label else 'None')
|
|
|
|
def dut(self):
|
|
return self._dut
|
|
|
|
def status_str(self):
|
|
"""Report thread status."""
|
|
|
|
return ('Worker thread "{}", label="{}", benchmark_run={}, '
|
|
'reimage={}, now {}'.format(
|
|
self._dut.name,
|
|
'None' if self._dut.label is None else self._dut.label.name,
|
|
self._stat_num_br_run, self._stat_num_reimage,
|
|
self._stat_annotation))
|
|
|
|
|
|
class BenchmarkRunCacheReader(Thread):
|
|
"""The thread to read cache for a list of benchmark_runs.
|
|
|
|
On creation, each instance of this class is given a br_list, which is a
|
|
subset of experiment._benchmark_runs.
|
|
"""
|
|
|
|
def __init__(self, schedv2, br_list):
|
|
super(BenchmarkRunCacheReader, self).__init__()
|
|
self._schedv2 = schedv2
|
|
self._br_list = br_list
|
|
self._logger = self._schedv2.get_logger()
|
|
|
|
def run(self):
|
|
for br in self._br_list:
|
|
try:
|
|
br.ReadCache()
|
|
if br.cache_hit:
|
|
self._logger.LogOutput('Cache hit - {}'.format(br))
|
|
with self._schedv2.lock_on('_cached_br_list'):
|
|
self._schedv2.get_cached_run_list().append(br)
|
|
else:
|
|
self._logger.LogOutput('Cache not hit - {}'.format(br))
|
|
except RuntimeError:
|
|
traceback.print_exc(file=sys.stderr)
|
|
|
|
|
|
class Schedv2(object):
|
|
"""New scheduler for crosperf."""
|
|
|
|
def __init__(self, experiment):
|
|
self._experiment = experiment
|
|
self._logger = logger.GetLogger(experiment.log_dir)
|
|
|
|
# Create shortcuts to nested data structure. "_duts" points to a list of
|
|
# locked machines. _labels points to a list of all labels.
|
|
self._duts = self._experiment.machine_manager.GetMachines()
|
|
self._labels = self._experiment.labels
|
|
|
|
# Bookkeeping for synchronization.
|
|
self._workers_lock = Lock()
|
|
# pylint: disable=unnecessary-lambda
|
|
self._lock_map = defaultdict(lambda: Lock())
|
|
|
|
# Test mode flag
|
|
self._in_test_mode = test_flag.GetTestMode()
|
|
|
|
# Read benchmarkrun cache.
|
|
self._read_br_cache()
|
|
|
|
# Mapping from label to a list of benchmark_runs.
|
|
self._label_brl_map = dict((l, []) for l in self._labels)
|
|
for br in self._experiment.benchmark_runs:
|
|
assert br.label in self._label_brl_map
|
|
# Only put no-cache-hit br into the map.
|
|
if br not in self._cached_br_list:
|
|
self._label_brl_map[br.label].append(br)
|
|
|
|
# Use machine image manager to calculate initial label allocation.
|
|
self._mim = MachineImageManager(self._labels, self._duts)
|
|
self._mim.compute_initial_allocation()
|
|
|
|
# Create worker thread, 1 per dut.
|
|
self._active_workers = [DutWorker(dut, self) for dut in self._duts]
|
|
self._finished_workers = []
|
|
|
|
# Termination flag.
|
|
self._terminated = False
|
|
|
|
def run_sched(self):
|
|
"""Start all dut worker threads and return immediately."""
|
|
|
|
for w in self._active_workers:
|
|
w.start()
|
|
|
|
def _read_br_cache(self):
|
|
"""Use multi-threading to read cache for all benchmarkruns.
|
|
|
|
We do this by firstly creating a few threads, and then assign each
|
|
thread a segment of all brs. Each thread will check cache status for
|
|
each br and put those with cache into '_cached_br_list'.
|
|
"""
|
|
|
|
self._cached_br_list = []
|
|
n_benchmarkruns = len(self._experiment.benchmark_runs)
|
|
if n_benchmarkruns <= 4:
|
|
# Use single thread to read cache.
|
|
self._logger.LogOutput(('Starting to read cache status for '
|
|
'{} benchmark runs ...').format(n_benchmarkruns))
|
|
BenchmarkRunCacheReader(self, self._experiment.benchmark_runs).run()
|
|
return
|
|
|
|
# Split benchmarkruns set into segments. Each segment will be handled by
|
|
# a thread. Note, we use (x+3)/4 to mimic math.ceil(x/4).
|
|
n_threads = max(2, min(20, (n_benchmarkruns + 3) // 4))
|
|
self._logger.LogOutput(('Starting {} threads to read cache status for '
|
|
'{} benchmark runs ...').format(
|
|
n_threads, n_benchmarkruns))
|
|
benchmarkruns_per_thread = (n_benchmarkruns + n_threads - 1) // n_threads
|
|
benchmarkrun_segments = []
|
|
for i in range(n_threads - 1):
|
|
start = i * benchmarkruns_per_thread
|
|
end = (i + 1) * benchmarkruns_per_thread
|
|
benchmarkrun_segments.append(self._experiment.benchmark_runs[start:end])
|
|
benchmarkrun_segments.append(
|
|
self._experiment.benchmark_runs[(n_threads - 1) *
|
|
benchmarkruns_per_thread:])
|
|
|
|
# Assert: aggregation of benchmarkrun_segments equals to benchmark_runs.
|
|
assert sum(len(x) for x in benchmarkrun_segments) == n_benchmarkruns
|
|
|
|
# Create and start all readers.
|
|
cache_readers = [
|
|
BenchmarkRunCacheReader(self, x) for x in benchmarkrun_segments
|
|
]
|
|
|
|
for x in cache_readers:
|
|
x.start()
|
|
|
|
# Wait till all readers finish.
|
|
for x in cache_readers:
|
|
x.join()
|
|
|
|
# Summarize.
|
|
self._logger.LogOutput(
|
|
'Total {} cache hit out of {} benchmark_runs.'.format(
|
|
len(self._cached_br_list), n_benchmarkruns))
|
|
|
|
def get_cached_run_list(self):
|
|
return self._cached_br_list
|
|
|
|
def get_label_map(self):
|
|
return self._label_brl_map
|
|
|
|
def get_experiment(self):
|
|
return self._experiment
|
|
|
|
def get_labels(self, i=None):
|
|
if i is None:
|
|
return self._labels
|
|
return self._labels[i]
|
|
|
|
def get_logger(self):
|
|
return self._logger
|
|
|
|
def get_cached_benchmark_run(self):
|
|
"""Get a benchmark_run with 'cache hit'.
|
|
|
|
Returns:
|
|
The benchmark that has cache hit, if any. Otherwise none.
|
|
"""
|
|
|
|
with self.lock_on('_cached_br_list'):
|
|
if self._cached_br_list:
|
|
return self._cached_br_list.pop()
|
|
return None
|
|
|
|
def get_benchmark_run(self, dut):
|
|
"""Get a benchmark_run (br) object for a certain dut.
|
|
|
|
Args:
|
|
dut: the dut for which a br is returned.
|
|
|
|
Returns:
|
|
A br with its label matching that of the dut. If no such br could be
|
|
found, return None (this usually means a reimage is required for the
|
|
dut).
|
|
"""
|
|
|
|
# If terminated, stop providing any br.
|
|
if self._terminated:
|
|
return None
|
|
|
|
# If dut bears an unrecognized label, return None.
|
|
if dut.label is None:
|
|
return None
|
|
|
|
# If br list for the dut's label is empty (that means all brs for this
|
|
# label have been done), return None.
|
|
with self.lock_on(dut.label):
|
|
brl = self._label_brl_map[dut.label]
|
|
if not brl:
|
|
return None
|
|
# Return the first br.
|
|
return brl.pop(0)
|
|
|
|
def allocate_label(self, dut):
|
|
"""Allocate a label to a dut.
|
|
|
|
The work is delegated to MachineImageManager.
|
|
|
|
The dut_worker calling this method is responsible for reimage the dut to
|
|
this label.
|
|
|
|
Args:
|
|
dut: the new label that is to be reimaged onto the dut.
|
|
|
|
Returns:
|
|
The label or None.
|
|
"""
|
|
|
|
if self._terminated:
|
|
return None
|
|
|
|
return self._mim.allocate(dut, self)
|
|
|
|
def dut_worker_finished(self, dut_worker):
|
|
"""Notify schedv2 that the dut_worker thread finished.
|
|
|
|
Args:
|
|
dut_worker: the thread that is about to end.
|
|
"""
|
|
|
|
self._logger.LogOutput('{} finished.'.format(dut_worker))
|
|
with self._workers_lock:
|
|
self._active_workers.remove(dut_worker)
|
|
self._finished_workers.append(dut_worker)
|
|
|
|
def is_complete(self):
|
|
return len(self._active_workers) == 0
|
|
|
|
def lock_on(self, my_object):
|
|
return self._lock_map[my_object]
|
|
|
|
def terminate(self):
|
|
"""Mark flag so we stop providing br/reimages.
|
|
|
|
Also terminate each DutWorker, so they refuse to execute br or reimage.
|
|
"""
|
|
|
|
self._terminated = True
|
|
for dut_worker in self._active_workers:
|
|
dut_worker.terminate()
|
|
|
|
def threads_status_as_string(self):
|
|
"""Report the dut worker threads status."""
|
|
|
|
status = '{} active threads, {} finished threads.\n'.format(
|
|
len(self._active_workers), len(self._finished_workers))
|
|
status += ' Active threads:'
|
|
for dw in self._active_workers:
|
|
status += '\n ' + dw.status_str()
|
|
if self._finished_workers:
|
|
status += '\n Finished threads:'
|
|
for dw in self._finished_workers:
|
|
status += '\n ' + dw.status_str()
|
|
return status
|