#!/usr/bin/env python3
# Copyright (C) 2019 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''Worker main loop. Pulls jobs from the DB and runs them in the sandbox

It also handles timeouts and graceful container termination.
'''

import logging
import os
import random
import signal
import socket
import subprocess
import threading
import time
import traceback

from config import DB, JOB_TIMEOUT_SEC
from common_utils import req, utc_now_iso, init_logging
from common_utils import ConcurrentModificationError, SCOPES

CUR_DIR = os.path.dirname(__file__)
SCOPES.append('https://www.googleapis.com/auth/firebase.database')
SCOPES.append('https://www.googleapis.com/auth/userinfo.email')

# Unique worker name, e.g. "2-hostname": trailing number of the WORKER_HOST
# env var (or 'local' when unset) joined with the container hostname.
WORKER_NAME = '%s-%s' % (os.getenv('WORKER_HOST', 'local').split('-')[-1],
                         socket.gethostname())

# Set by sig_handler on SIGTERM/SIGINT to request a graceful shutdown.
sigterm = threading.Event()


def try_acquire_job(job_id):
  '''Transactionally acquire the given job.

  Returns the job JSON object if it managed to acquire and put it into the
  STARTED state, None if another worker got there first.
  '''
  logging.debug('Trying to acquire job %s', job_id)
  uri = '%s/jobs/%s.json' % (DB, job_id)
  job, etag = req('GET', uri, req_etag=True)
  if job['status'] != 'QUEUED':
    return None  # Somebody else took it
  try:
    job['status'] = 'STARTED'
    job['time_started'] = utc_now_iso()
    job['worker'] = WORKER_NAME
    # The etag-conditional PUT raises ConcurrentModificationError if the job
    # entry changed between our GET and this write.
    req('PUT', uri, body=job, etag=etag)
    return job
  except ConcurrentModificationError:
    return None


def make_worker_obj(status, job_id=None):
  '''Builds the worker-state dict that is periodically written to the DB.'''
  return {
      'job_id': job_id,
      'status': status,
      'last_update': utc_now_iso(),
      'host': os.getenv('WORKER_HOST', '')
  }


def worker_loop():
  '''Pulls a job from the queue and runs it invoking run_job.py'''
  uri = '%s/jobs_queued.json?orderBy="$key"&limitToLast=10' % DB
  jobs = req('GET', uri)
  if not jobs:
    return

  # Work out the worker number from the hostname. We try to distribute the load
  # (via the time.sleep below) so that we fill first all the worker-1 of each
  # vm, then worker-2 and so on. This is designed so that if there is only one
  # CL (hence N jobs) in the queue, each VM gets only one job, maximizing the
  # cpu efficiency of each VM.
  try:
    worker_num = int(socket.gethostname().split('-')[-1])
  except ValueError:
    worker_num = 1

  # Transactionally acquire a job. Deal with races (two workers trying to
  # acquire the same job).
  job = None
  job_id = None
  for job_id in sorted(jobs.keys(), reverse=True):
    job = try_acquire_job(job_id)
    if job is not None:
      break
    logging.info('Raced while trying to acquire job %s, retrying', job_id)
    time.sleep(worker_num * 2 + random.random())
  if job is None:
    logging.error('Failed to acquire a job')
    return

  logging.info('Starting job %s', job_id)

  # Update the db, move the job to the running queue.
  patch_obj = {
      'jobs_queued/' + job_id: {},  # = DELETE
      'jobs_running/' + job_id: {
          'worker': WORKER_NAME
      },
      'workers/' + WORKER_NAME: make_worker_obj('RUNNING', job_id=job_id)
  }
  req('PATCH', '%s.json' % DB, body=patch_obj)

  cmd = [os.path.join(CUR_DIR, 'run_job.py'), job_id]

  # Propagate the worker's PERFETTO_ vars and merge with the job-specific vars.
  env = dict(os.environ, **{k: str(v) for (k, v) in job['env'].items()})

  # Run the job in a python subprocess, to isolate the main loop from logs
  # uploader failures.
  job_runner = subprocess.Popen(cmd, env=env)

  res = None
  cancelled = False
  timed_out = False
  time_started = time.time()
  time_last_db_poll = time_started
  polled_status = 'STARTED'
  while res is None:
    time.sleep(0.25)
    res = job_runner.poll()
    now = time.time()
    if now - time_last_db_poll > 10:  # Throttle DB polling.
      polled_status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
      time_last_db_poll = now
    # BUGFIX: guard with |not timed_out| (mirroring the |not cancelled| guard
    # below) so we log and terminate() only once, not on every 0.25s iteration
    # until the subprocess actually exits.
    if now - time_started > JOB_TIMEOUT_SEC and not timed_out:
      logging.info('Job %s timed out, terminating', job_id)
      timed_out = True
      job_runner.terminate()
    if (sigterm.is_set() or polled_status != 'STARTED') and not cancelled:
      logging.info('Job %s cancelled, terminating', job_id)
      cancelled = True
      job_runner.terminate()

  status = ('INTERRUPTED' if sigterm.is_set() else 'CANCELLED' if cancelled else
            'TIMED_OUT' if timed_out else 'COMPLETED' if res == 0 else 'FAILED')
  logging.info('Job %s %s with code %s', job_id, status, res)

  # Update the DB, unless the job has been cancelled. The "is not None"
  # condition deals with a very niche case, that is, avoid creating a partial
  # job entry after doing a full clear of the DB (which is super rare, happens
  # only when re-deploying the CI).
  if polled_status is not None:
    patch = {
        'jobs/%s/status' % job_id: status,
        'jobs/%s/exit_code' % job_id: {} if res is None else res,
        'jobs/%s/time_ended' % job_id: utc_now_iso(),
        'jobs_running/%s' % job_id: {},  # = DELETE
    }
    req('PATCH', '%s.json' % (DB), body=patch)


def sig_handler(_, __):
  '''SIGTERM/SIGINT handler: flags the main loop to exit gracefully.'''
  logging.warning('Interrupted by signal, exiting worker')
  sigterm.set()


def main():
  init_logging()
  logging.info('Worker started')
  signal.signal(signal.SIGTERM, sig_handler)
  signal.signal(signal.SIGINT, sig_handler)

  while not sigterm.is_set():
    logging.debug('Starting poll cycle')
    try:
      worker_loop()
      req('PUT', '%s/workers/%s.json' % (DB, WORKER_NAME),
          body=make_worker_obj('IDLE'))
    # BUGFIX: was a bare |except:|, which also swallowed SystemExit and
    # KeyboardInterrupt. Keep the worker alive on any other error.
    except Exception:
      logging.error('Exception in worker loop:\n%s', traceback.format_exc())
    if sigterm.is_set():
      break

    # Synchronize sleeping with the wall clock. This is so all VMs wake up at
    # the same time. See comment on distributing load above in this file.
    poll_time_sec = 5
    time.sleep(poll_time_sec - (time.time() % poll_time_sec))

  # The use case here is the VM being terminated by the GCE infrastructure.
  # We mark the worker as terminated and the job as cancelled so we don't wait
  # forever for it.
  logging.warning('Exiting the worker loop, got signal: %s', sigterm.is_set())
  req('PUT', '%s/workers/%s.json' % (DB, WORKER_NAME),
      body=make_worker_obj('TERMINATED'))


if __name__ == '__main__':
  main()