#!/usr/bin/env python3
|
|
# Copyright (C) 2019 The Android Open Source Project
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
''' Worker main loop. Pulls jobs from the DB and runs them in the sandbox.

It also handles timeouts and graceful container termination.
'''
|
|
|
|
import logging
|
|
import os
|
|
import random
|
|
import signal
|
|
import socket
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import traceback
|
|
|
|
from config import DB, JOB_TIMEOUT_SEC
|
|
from common_utils import req, utc_now_iso, init_logging
|
|
from common_utils import ConcurrentModificationError, SCOPES
|
|
|
|
CUR_DIR = os.path.dirname(__file__)
|
|
SCOPES.append('https://www.googleapis.com/auth/firebase.database')
|
|
SCOPES.append('https://www.googleapis.com/auth/userinfo.email')
|
|
WORKER_NAME = '%s-%s' % (os.getenv('WORKER_HOST', 'local').split('-')[-1],
|
|
socket.gethostname())
|
|
sigterm = threading.Event()
|
|
|
|
|
|
def try_acquire_job(job_id):
  ''' Transactionally acquire the given job.

  Args:
    job_id: key of the job under /jobs in the DB.

  Returns the job JSON object if it managed to acquire and put it into the
  STARTED state, None if another worker got there first or the job entry
  disappeared from the DB in the meantime.
  '''
  logging.debug('Trying to acquire job %s', job_id)

  uri = '%s/jobs/%s.json' % (DB, job_id)
  job, etag = req('GET', uri, req_etag=True)
  # |job| can be None if the entry was deleted (e.g. a full DB wipe when
  # re-deploying the CI) between the queue listing and this GET. Treat that
  # the same as losing the acquisition race.
  if job is None or job['status'] != 'QUEUED':
    return None  # Somebody else took it, or the job is gone.
  try:
    job['status'] = 'STARTED'
    job['time_started'] = utc_now_iso()
    job['worker'] = WORKER_NAME
    # The etag-conditional PUT is what makes this transactional: if another
    # worker modified the job since our GET, the write is rejected and req()
    # raises ConcurrentModificationError.
    req('PUT', uri, body=job, etag=etag)
    return job
  except ConcurrentModificationError:
    return None
|
|
|
|
|
|
def make_worker_obj(status, job_id=None):
  '''Builds the state blob stored under /workers/<WORKER_NAME> in the DB.

  Args:
    status: worker state string (e.g. 'IDLE', 'RUNNING', 'TERMINATED').
    job_id: the job being worked on, or None when not running anything.
  '''
  worker_obj = dict(
      job_id=job_id,
      status=status,
      last_update=utc_now_iso(),
      host=os.getenv('WORKER_HOST', ''),
  )
  return worker_obj
|
|
|
|
|
|
def worker_loop():
  ''' Pulls a job from the queue and runs it invoking run_job.py

  Polls the DB for queued jobs, transactionally acquires one, moves it to the
  running queue and babysits the run_job.py subprocess until it completes,
  times out or gets cancelled. Returns (without error) when there is nothing
  to do; the caller retries on the next poll cycle.
  '''
  # Only fetch the 10 most recent queued jobs to bound the response size.
  uri = '%s/jobs_queued.json?orderBy="$key"&limitToLast=10' % DB
  jobs = req('GET', uri)
  if not jobs:
    return

  # Work out the worker number from the hostname. We try to distribute the load
  # (via the time.sleep below) so that we fill first all the worker-1 of each
  # vm, then worker-2 and so on. This is designed so that if there is only one
  # CL (hence N jobs) in the queue, each VM gets only one job, maximizing the
  # cpu efficiency of each VM.
  try:
    worker_num = int(socket.gethostname().split('-')[-1])
  except ValueError:
    worker_num = 1

  # Transactionally acquire a job. Deal with races (two workers trying to
  # acquire the same job).
  job = None
  job_id = None
  for job_id in sorted(jobs.keys(), reverse=True):
    job = try_acquire_job(job_id)
    if job is not None:
      break
    logging.info('Raced while trying to acquire job %s, retrying', job_id)
    # Back off proportionally to the worker number (plus jitter) so
    # lower-numbered workers tend to win the next race.
    time.sleep(worker_num * 2 + random.random())
  if job is None:
    logging.error('Failed to acquire a job')
    return

  logging.info('Starting job %s', job_id)

  # Update the db, move the job to the running queue.
  patch_obj = {
      'jobs_queued/' + job_id: {},  # = DELETE
      'jobs_running/' + job_id: {
          'worker': WORKER_NAME
      },
      'workers/' + WORKER_NAME: make_worker_obj('RUNNING', job_id=job_id)
  }
  req('PATCH', '%s.json' % DB, body=patch_obj)

  cmd = [os.path.join(CUR_DIR, 'run_job.py'), job_id]

  # Propagate the worker's PERFETTO_ vars and merge with the job-specific vars.
  # NOTE(review): assumes every acquired job has an 'env' dict — confirm the
  # enqueuing side always sets it.
  env = dict(os.environ, **{k: str(v) for (k, v) in job['env'].items()})
  job_runner = subprocess.Popen(cmd, env=env)

  # Run the job in a python subprocess, to isolate the main loop from logs
  # uploader failures. (The Popen above is that subprocess.)
  res = None  # run_job.py exit code; None while it is still running.
  cancelled = False
  timed_out = False
  time_started = time.time()
  time_last_db_poll = time_started
  polled_status = 'STARTED'
  while res is None:
    time.sleep(0.25)
    res = job_runner.poll()
    now = time.time()
    if now - time_last_db_poll > 10:  # Throttle DB polling.
      polled_status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
      time_last_db_poll = now
    if now - time_started > JOB_TIMEOUT_SEC:
      logging.info('Job %s timed out, terminating', job_id)
      timed_out = True
      job_runner.terminate()
    # A DB status other than STARTED means the job was cancelled externally;
    # a set |sigterm| means this worker itself is being shut down.
    if (sigterm.is_set() or polled_status != 'STARTED') and not cancelled:
      logging.info('Job %s cancelled, terminating', job_id)
      cancelled = True
      job_runner.terminate()

  # Precedence: INTERRUPTED (worker shutdown) > CANCELLED > TIMED_OUT >
  # COMPLETED/FAILED by exit code.
  status = ('INTERRUPTED' if sigterm.is_set() else 'CANCELLED' if cancelled else
            'TIMED_OUT' if timed_out else 'COMPLETED' if res == 0 else 'FAILED')
  logging.info('Job %s %s with code %s', job_id, status, res)

  # Update the DB, unless the job has been cancelled. The "is not None"
  # condition deals with a very niche case, that is, avoid creating a partial
  # job entry after doing a full clear of the DB (which is super rare, happens
  # only when re-deploying the CI).
  if polled_status is not None:
    patch = {
        'jobs/%s/status' % job_id: status,
        'jobs/%s/exit_code' % job_id: {} if res is None else res,  # {} = DELETE
        'jobs/%s/time_ended' % job_id: utc_now_iso(),
        'jobs_running/%s' % job_id: {},  # = DELETE
    }
    req('PATCH', '%s.json' % (DB), body=patch)
|
|
|
|
|
|
def sig_handler(_, __):
  '''Handler for SIGTERM/SIGINT, installed in main().

  Only flags the |sigterm| event: the loops in main() and worker_loop() check
  it and wind down gracefully (terminating any running job subprocess).
  '''
  logging.warning('Interrupted by signal, exiting worker')
  sigterm.set()
|
|
|
|
|
|
def main():
  '''Entry point: installs signal handlers and runs the poll loop forever.

  Each cycle runs worker_loop() (acquire + babysit one job) and then writes
  an IDLE heartbeat for this worker; the loop exits when SIGTERM/SIGINT sets
  |sigterm|, at which point the worker marks itself TERMINATED in the DB.
  '''
  init_logging()
  logging.info('Worker started')
  signal.signal(signal.SIGTERM, sig_handler)
  signal.signal(signal.SIGINT, sig_handler)

  while not sigterm.is_set():
    logging.debug('Starting poll cycle')
    try:
      worker_loop()
      req('PUT',
          '%s/workers/%s.json' % (DB, WORKER_NAME),
          body=make_worker_obj('IDLE'))
    except Exception:
      # Deliberately broad (but not bare): the worker must survive transient
      # DB/network failures. A bare "except:" would also swallow SystemExit
      # and KeyboardInterrupt, which must terminate the process.
      logging.error('Exception in worker loop:\n%s', traceback.format_exc())
    if sigterm.is_set():
      break

    # Synchronize sleeping with the wall clock. This is so all VMs wake up at
    # the same time. See comment on distributing load above in this file.
    poll_time_sec = 5
    time.sleep(poll_time_sec - (time.time() % poll_time_sec))

  # The use case here is the VM being terminated by the GCE infrastructure.
  # We mark the worker as terminated and the job as cancelled so we don't wait
  # forever for it.
  logging.warning('Exiting the worker loop, got signal: %s', sigterm.is_set())
  req('PUT',
      '%s/workers/%s.json' % (DB, WORKER_NAME),
      body=make_worker_obj('TERMINATED'))
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|