/*
* ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for details.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <inttypes.h>
#include <iostream>
#include <list>
#include <mutex>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>
#if (defined __APPLE__ && defined __MACH__)
#include <dlfcn.h>
#endif
#include "omp-tools.h"
#include <sys/resource.h>
// Define an attribute that indicates that the fall through from the previous
// case label is intentional and should not be diagnosed by a compiler
// Code from libcxx/include/__config
// Use a function-like macro to imply that it must be followed by a semicolon
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
#define KMP_FALLTHROUGH() [[fallthrough]]
#elif __has_cpp_attribute(clang::fallthrough)
#define KMP_FALLTHROUGH() [[clang::fallthrough]]
#elif __has_attribute(fallthrough) || __GNUC__ >= 7
#define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
#else
#define KMP_FALLTHROUGH() ((void)0)
#endif
static int runOnTsan;
static int hasReductionCallback;
class ArcherFlags {
public:
#if (LLVM_VERSION) >= 40
int flush_shadow{0};
#endif
int print_max_rss{0};
int verbose{0};
int enabled{1};
int ignore_serial{0};
ArcherFlags(const char *env) {
if (env) {
std::vector<std::string> tokens;
std::string token;
std::string str(env);
std::istringstream iss(str);
while (std::getline(iss, token, ' '))
tokens.push_back(token);
for (std::vector<std::string>::iterator it = tokens.begin();
it != tokens.end(); ++it) {
#if (LLVM_VERSION) >= 40
if (sscanf(it->c_str(), "flush_shadow=%d", &flush_shadow))
continue;
#endif
if (sscanf(it->c_str(), "print_max_rss=%d", &print_max_rss))
continue;
if (sscanf(it->c_str(), "verbose=%d", &verbose))
continue;
if (sscanf(it->c_str(), "enable=%d", &enabled))
continue;
if (sscanf(it->c_str(), "ignore_serial=%d", &ignore_serial))
continue;
std::cerr << "Illegal values for ARCHER_OPTIONS variable: " << token
<< std::endl;
}
}
}
};
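// Usage sketch (hedged example, not from this file): ARCHER_OPTIONS takes
// space-separated key=value tokens, e.g.
//   ARCHER_OPTIONS="verbose=1 print_max_rss=1 ignore_serial=1"
// Each token is matched by the sscanf calls above; unrecognized tokens
// trigger the "Illegal value" diagnostic.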
class TsanFlags {
public:
int ignore_noninstrumented_modules;
TsanFlags(const char *env) : ignore_noninstrumented_modules(0) {
if (env) {
std::vector<std::string> tokens;
std::string str(env);
auto end = str.end();
auto it = str.begin();
auto is_sep = [](char c) {
return c == ' ' || c == ',' || c == ':' || c == '\n' || c == '\t' ||
c == '\r';
};
while (it != end) {
auto next_it = std::find_if(it, end, is_sep);
tokens.emplace_back(it, next_it);
it = next_it;
if (it != end) {
++it;
}
}
for (const auto &token : tokens) {
// We are only interested in ignore_noninstrumented_modules, to decide
// whether to print a warning below.
if (sscanf(token.c_str(), "ignore_noninstrumented_modules=%d",
&ignore_noninstrumented_modules))
continue;
}
}
}
};
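// Usage sketch (hedged example): TSAN_OPTIONS is split on the separators
// accepted by is_sep above, e.g.
//   TSAN_OPTIONS="ignore_noninstrumented_modules=1,halt_on_error=0"
// Only ignore_noninstrumented_modules is inspected here; it decides whether
// the warning in ompt_tsan_initialize is printed.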
#if (LLVM_VERSION) >= 40
extern "C" {
int __attribute__((weak)) __archer_get_omp_status();
void __attribute__((weak)) __tsan_flush_memory() {}
}
#endif
ArcherFlags *archer_flags;
// The following definitions are pasted from "llvm/Support/Compiler.h" to
// allow the code to be compiled with other compilers like gcc:
#ifndef TsanHappensBefore
// ThreadSanitizer is a tool that finds races in code.
// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
// TSan detects these exact functions by name.
extern "C" {
#if (defined __APPLE__ && defined __MACH__)
static void AnnotateHappensAfter(const char *file, int line,
const volatile void *cv) {
void (*fptr)(const char *, int, const volatile void *);
fptr = (void (*)(const char *, int, const volatile void *))dlsym(
RTLD_DEFAULT, "AnnotateHappensAfter");
(*fptr)(file, line, cv);
}
static void AnnotateHappensBefore(const char *file, int line,
const volatile void *cv) {
void (*fptr)(const char *, int, const volatile void *);
fptr = (void (*)(const char *, int, const volatile void *))dlsym(
RTLD_DEFAULT, "AnnotateHappensBefore");
(*fptr)(file, line, cv);
}
static void AnnotateIgnoreWritesBegin(const char *file, int line) {
void (*fptr)(const char *, int);
fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT,
"AnnotateIgnoreWritesBegin");
(*fptr)(file, line);
}
static void AnnotateIgnoreWritesEnd(const char *file, int line) {
void (*fptr)(const char *, int);
fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT,
"AnnotateIgnoreWritesEnd");
(*fptr)(file, line);
}
static void AnnotateNewMemory(const char *file, int line,
const volatile void *cv, size_t size) {
void (*fptr)(const char *, int, const volatile void *, size_t);
fptr = (void (*)(const char *, int, const volatile void *, size_t))dlsym(
RTLD_DEFAULT, "AnnotateNewMemory");
(*fptr)(file, line, cv, size);
}
static int RunningOnValgrind() {
int (*fptr)();
fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind");
if (fptr && fptr != RunningOnValgrind)
runOnTsan = 0;
return 0;
}
#else
void __attribute__((weak))
AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateIgnoreWritesBegin(const char *file, int line) {}
void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) {
}
void __attribute__((weak))
AnnotateNewMemory(const char *file, int line, const volatile void *cv,
size_t size) {}
int __attribute__((weak)) RunningOnValgrind() {
runOnTsan = 0;
return 0;
}
void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {}
void __attribute__((weak)) __tsan_func_exit(void) {}
#endif
}
// This marker is used to define a happens-before arc. The race detector will
// infer an arc from the begin to the end when they share the same pointer
// argument.
#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)
// This marker defines the destination of a happens-before arc.
#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)
// Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
// Resume checking for racy writes.
#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
// We don't really delete the clock for now
#define TsanDeleteClock(cv)
// newMemory
#define TsanNewMemory(addr, size) \
AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#define TsanFreeMemory(addr, size) \
AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#endif
// Function entry/exit
#define TsanFuncEntry(pc) __tsan_func_entry(pc)
#define TsanFuncExit() __tsan_func_exit()
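// Illustrative sketch of how the callbacks below use these annotations: a
// synchronization edge is declared by annotating the same address on both
// sides, e.g.
//   TsanHappensBefore(&sync); // producer side: prior writes ordered before...
//   TsanHappensAfter(&sync);  // consumer side: ...whatever runs after this
// TSan then infers an arc and suppresses race reports across the two sides.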
/// Required OMPT inquiry functions.
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_thread_data_t ompt_get_thread_data;
typedef uint64_t ompt_tsan_clockid;
static uint64_t my_next_id() {
static uint64_t ID = 0;
uint64_t ret = __sync_fetch_and_add(&ID, 1);
return ret;
}
// Data structure to provide a thread-safe pool of reusable objects.
// DataPool<Type of objects, Size of blockalloc>
template <typename T, int N> struct DataPool {
std::mutex DPMutex;
std::stack<T *> DataPointer;
std::list<void *> memory;
int total;
void newDatas() {
// Prefix the data with a pointer to 'this', which allows the memory to be
// returned to this pool without explicitly knowing the source.
//
// To reduce lock contention we use thread-local DataPools, but Data
// objects may move to other threads. The strategy is to take objects from
// the local pool; only if an object has moved to another thread might we
// pay a penalty on release (returnData). In the "single producer" pattern
// a single thread creates tasks that are executed by other threads; the
// master thread has a high demand for TaskData, so objects are returned
// right after use.
struct pooldata {
DataPool<T, N> *dp;
T data;
};
// We allocate the memory without initializing it because we must not run
// the constructors here. Therefore use malloc!
pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N);
memory.push_back(datas);
for (int i = 0; i < N; i++) {
datas[i].dp = this;
DataPointer.push(&(datas[i].data));
}
total += N;
}
T *getData() {
T *ret;
DPMutex.lock();
if (DataPointer.empty())
newDatas();
ret = DataPointer.top();
DataPointer.pop();
DPMutex.unlock();
return ret;
}
void returnData(T *data) {
DPMutex.lock();
DataPointer.push(data);
DPMutex.unlock();
}
void getDatas(int n, T **datas) {
DPMutex.lock();
for (int i = 0; i < n; i++) {
if (DataPointer.empty())
newDatas();
datas[i] = DataPointer.top();
DataPointer.pop();
}
DPMutex.unlock();
}
void returnDatas(int n, T **datas) {
DPMutex.lock();
for (int i = 0; i < n; i++) {
DataPointer.push(datas[i]);
}
DPMutex.unlock();
}
DataPool() : DPMutex(), DataPointer(), total(0) {}
~DataPool() {
// We assume all memory has been returned by the time the thread finishes
// and this destructor is called.
for (auto i : memory)
if (i)
free(i);
}
};
// This function returns the data to its originating DataPool.
// A pointer to the originating DataPool is stored just before the actual data.
template <typename T, int N> static void retData(void *data) {
((DataPool<T, N> **)data)[-1]->returnData((T *)data);
}
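// Usage sketch (hypothetical, for exposition only): every object handed out
// by getData() is preceded in memory by its pool's address, so retData can
// find the way home without a pool argument:
//   DataPool<int, 4> pool;
//   int *v = pool.getData();  // refills via newDatas() if the pool is empty
//   retData<int, 4>(v);       // reads the dp pointer at v[-1], returns v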
struct ParallelData;
__thread DataPool<ParallelData, 4> *pdp;
/// Data structure to store additional information for parallel regions.
struct ParallelData {
// Parallel fork is just another barrier, use Barrier[1]
/// Two addresses for relationships with barriers.
ompt_tsan_clockid Barrier[2];
const void *codePtr;
void *GetParallelPtr() { return &(Barrier[1]); }
void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }
ParallelData(const void *codeptr) : codePtr(codeptr) {}
~ParallelData() {
TsanDeleteClock(&(Barrier[0]));
TsanDeleteClock(&(Barrier[1]));
}
// overload new/delete to use DataPool for memory management.
void *operator new(size_t size) { return pdp->getData(); }
void operator delete(void *p, size_t) { retData<ParallelData, 4>(p); }
};
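// Note on the two barrier addresses: consecutive barriers alternate between
// Barrier[0] and Barrier[1] (BarrierIndex is toggled in ompt_tsan_sync_region
// below), so annotations for barrier N+1 cannot be confused with threads
// still leaving barrier N.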
static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
return reinterpret_cast<ParallelData *>(parallel_data->ptr);
}
struct Taskgroup;
__thread DataPool<Taskgroup, 4> *tgp;
/// Data structure to support stacking of taskgroups and allow synchronization.
struct Taskgroup {
/// Its address is used for relationships of the taskgroup's task set.
ompt_tsan_clockid Ptr;
/// Reference to the parent taskgroup.
Taskgroup *Parent;
Taskgroup(Taskgroup *Parent) : Parent(Parent) {}
~Taskgroup() { TsanDeleteClock(&Ptr); }
void *GetPtr() { return &Ptr; }
// overload new/delete to use DataPool for memory management.
void *operator new(size_t size) { return tgp->getData(); }
void operator delete(void *p, size_t) { retData<Taskgroup, 4>(p); }
};
struct TaskData;
__thread DataPool<TaskData, 4> *tdp;
/// Data structure to store additional information for tasks.
struct TaskData {
/// Its address is used for relationships of this task.
ompt_tsan_clockid Task;
/// Child tasks use its address to declare a relationship to a taskwait in
/// this task.
ompt_tsan_clockid Taskwait;
/// Whether this task is currently executing a barrier.
bool InBarrier;
/// The OMPT task type of this task (a bitmask of ompt_task_flag_t values).
int TaskType{0};
/// Index of which barrier to use next.
char BarrierIndex;
/// Count how often this structure has been put into child tasks + 1.
std::atomic_int RefCount;
/// Reference to the parent that created this task.
TaskData *Parent;
/// Reference to the implicit task in the stack above this task.
TaskData *ImplicitTask;
/// Reference to the team of this task.
ParallelData *Team;
/// Reference to the current taskgroup that this task either belongs to or
/// that it just created.
Taskgroup *TaskGroup;
/// Dependency information for this task.
ompt_dependence_t *Dependencies;
/// Number of dependency entries.
unsigned DependencyCount;
void *PrivateData;
size_t PrivateDataSize;
int execution;
int freed;
TaskData(TaskData *Parent, int taskType)
: InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1),
Parent(Parent), ImplicitTask(nullptr), Team(Parent->Team),
TaskGroup(nullptr), DependencyCount(0), execution(0), freed(0) {
if (Parent != nullptr) {
Parent->RefCount++;
// Copy over pointer to taskgroup. This task may set up its own stack
// but for now belongs to its parent's taskgroup.
TaskGroup = Parent->TaskGroup;
}
}
TaskData(ParallelData *Team, int taskType)
: InBarrier(false), TaskType(taskType), BarrierIndex(0), RefCount(1),
Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr),
DependencyCount(0), execution(1), freed(0) {}
~TaskData() {
TsanDeleteClock(&Task);
TsanDeleteClock(&Taskwait);
}
bool isIncluded() { return TaskType & ompt_task_undeferred; }
bool isUntied() { return TaskType & ompt_task_untied; }
bool isFinal() { return TaskType & ompt_task_final; }
bool isMergable() { return TaskType & ompt_task_mergeable; }
bool isMerged() { return TaskType & ompt_task_merged; }
bool isExplicit() { return TaskType & ompt_task_explicit; }
bool isImplicit() { return TaskType & ompt_task_implicit; }
bool isInitial() { return TaskType & ompt_task_initial; }
bool isTarget() { return TaskType & ompt_task_target; }
void *GetTaskPtr() { return &Task; }
void *GetTaskwaitPtr() { return &Taskwait; }
// overload new/delete to use DataPool for memory management.
void *operator new(size_t size) { return tdp->getData(); }
void operator delete(void *p, size_t) { retData<TaskData, 4>(p); }
};
static inline TaskData *ToTaskData(ompt_data_t *task_data) {
return reinterpret_cast<TaskData *>(task_data->ptr);
}
static inline void *ToInAddr(void *OutAddr) {
// FIXME: This will give false negatives when a second variable lies directly
// behind a variable that only has a width of 1 byte.
// Another approach would be to "negate" the address or to flip the
// first bit...
return reinterpret_cast<char *>(OutAddr) + 1;
}
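// Sketch of the two-address scheme used for task dependencies on a variable x:
//   &x           : completed 'out'/'inout' deps -> any later dep on x
//   ToInAddr(&x) : any completed dep on x       -> later 'out'/'inout' deps
// Thus 'in' deps are not ordered against each other, but are ordered against
// writers (see ompt_tsan_task_schedule below).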
/// Store a mutex for each wait_id to resolve race condition with callbacks.
std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
std::mutex LocksMutex;
static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
ompt_data_t *thread_data) {
pdp = new DataPool<ParallelData, 4>;
TsanNewMemory(pdp, sizeof(pdp));
tgp = new DataPool<Taskgroup, 4>;
TsanNewMemory(tgp, sizeof(tgp));
tdp = new DataPool<TaskData, 4>;
TsanNewMemory(tdp, sizeof(tdp));
thread_data->value = my_next_id();
}
static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
delete pdp;
delete tgp;
delete tdp;
}
/// OMPT event callbacks for handling parallel regions.
static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
const ompt_frame_t *parent_task_frame,
ompt_data_t *parallel_data,
uint32_t requested_team_size, int flag,
const void *codeptr_ra) {
ParallelData *Data = new ParallelData(codeptr_ra);
parallel_data->ptr = Data;
TsanHappensBefore(Data->GetParallelPtr());
if (archer_flags->ignore_serial && ToTaskData(parent_task_data)->isInitial())
TsanIgnoreWritesEnd();
}
static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
ompt_data_t *task_data, int flag,
const void *codeptr_ra) {
if (archer_flags->ignore_serial && ToTaskData(task_data)->isInitial())
TsanIgnoreWritesBegin();
ParallelData *Data = ToParallelData(parallel_data);
TsanHappensAfter(Data->GetBarrierPtr(0));
TsanHappensAfter(Data->GetBarrierPtr(1));
delete Data;
#if (LLVM_VERSION) >= 40
if (&__archer_get_omp_status) {
if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow)
__tsan_flush_memory();
}
#endif
}
static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
ompt_data_t *parallel_data,
ompt_data_t *task_data,
unsigned int team_size,
unsigned int thread_num, int type) {
switch (endpoint) {
case ompt_scope_begin:
if (type & ompt_task_initial) {
parallel_data->ptr = new ParallelData(nullptr);
}
task_data->ptr = new TaskData(ToParallelData(parallel_data), type);
TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
break;
case ompt_scope_end: {
TaskData *Data = ToTaskData(task_data);
assert(Data->freed == 0 && "Implicit task end should only be called once!");
Data->freed = 1;
assert(Data->RefCount == 1 &&
"All tasks should have finished at the implicit barrier!");
delete Data;
TsanFuncExit();
break;
}
case ompt_scope_beginend:
// Should not occur according to OpenMP 5.1
// Tested in OMPT tests
break;
}
}
static void ompt_tsan_sync_region(ompt_sync_region_t kind,
ompt_scope_endpoint_t endpoint,
ompt_data_t *parallel_data,
ompt_data_t *task_data,
const void *codeptr_ra) {
TaskData *Data = ToTaskData(task_data);
switch (endpoint) {
case ompt_scope_begin:
case ompt_scope_beginend:
TsanFuncEntry(codeptr_ra);
switch (kind) {
case ompt_sync_region_barrier_implementation:
case ompt_sync_region_barrier_implicit:
case ompt_sync_region_barrier_explicit:
case ompt_sync_region_barrier_implicit_parallel:
case ompt_sync_region_barrier_implicit_workshare:
case ompt_sync_region_barrier_teams:
case ompt_sync_region_barrier: {
char BarrierIndex = Data->BarrierIndex;
TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));
if (hasReductionCallback < ompt_set_always) {
// We ignore writes inside the barrier. These would either occur during
// 1. reductions performed by the runtime which are guaranteed to be
// race-free.
// 2. execution of another task.
// For the latter case we will re-enable tracking in task_switch.
Data->InBarrier = true;
TsanIgnoreWritesBegin();
}
break;
}
case ompt_sync_region_taskwait:
break;
case ompt_sync_region_taskgroup:
Data->TaskGroup = new Taskgroup(Data->TaskGroup);
break;
case ompt_sync_region_reduction:
// should never be reached
break;
}
if (endpoint == ompt_scope_begin)
break;
KMP_FALLTHROUGH();
case ompt_scope_end:
TsanFuncExit();
switch (kind) {
case ompt_sync_region_barrier_implementation:
case ompt_sync_region_barrier_implicit:
case ompt_sync_region_barrier_explicit:
case ompt_sync_region_barrier_implicit_parallel:
case ompt_sync_region_barrier_implicit_workshare:
case ompt_sync_region_barrier_teams:
case ompt_sync_region_barrier: {
if (hasReductionCallback < ompt_set_always) {
// We want to track writes after the barrier again.
Data->InBarrier = false;
TsanIgnoreWritesEnd();
}
char BarrierIndex = Data->BarrierIndex;
// Barrier will end after it has been entered by all threads.
if (parallel_data)
TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));
// It is not guaranteed that all threads have exited this barrier before
// we enter the next one. So we will use a different address.
// We are however guaranteed that this current barrier is finished
// by the time we exit the next one. So we can then reuse the first
// address.
Data->BarrierIndex = (BarrierIndex + 1) % 2;
break;
}
case ompt_sync_region_taskwait: {
if (Data->execution > 1)
TsanHappensAfter(Data->GetTaskwaitPtr());
break;
}
case ompt_sync_region_taskgroup: {
assert(Data->TaskGroup != nullptr &&
"Should have at least one taskgroup!");
TsanHappensAfter(Data->TaskGroup->GetPtr());
// Delete this allocated taskgroup; all descendant tasks are finished by
// now.
Taskgroup *Parent = Data->TaskGroup->Parent;
delete Data->TaskGroup;
Data->TaskGroup = Parent;
break;
}
case ompt_sync_region_reduction:
// Should not occur according to OpenMP 5.1
// Tested in OMPT tests
break;
}
break;
}
}
static void ompt_tsan_reduction(ompt_sync_region_t kind,
ompt_scope_endpoint_t endpoint,
ompt_data_t *parallel_data,
ompt_data_t *task_data,
const void *codeptr_ra) {
switch (endpoint) {
case ompt_scope_begin:
switch (kind) {
case ompt_sync_region_reduction:
TsanIgnoreWritesBegin();
break;
default:
break;
}
break;
case ompt_scope_end:
switch (kind) {
case ompt_sync_region_reduction:
TsanIgnoreWritesEnd();
break;
default:
break;
}
break;
case ompt_scope_beginend:
// Should not occur according to OpenMP 5.1
// Tested in OMPT tests
// Would have no implications for DR detection
break;
}
}
/// OMPT event callbacks for handling tasks.
static void ompt_tsan_task_create(
ompt_data_t *parent_task_data, /* id of parent task */
const ompt_frame_t *parent_frame, /* frame data for parent task */
ompt_data_t *new_task_data, /* id of created task */
int type, int has_dependences,
const void *codeptr_ra) /* pointer to outlined function */
{
TaskData *Data;
assert(new_task_data->ptr == NULL &&
"Task data should be initialized to NULL");
if (type & ompt_task_initial) {
ompt_data_t *parallel_data;
int team_size = 1;
ompt_get_parallel_info(0, &parallel_data, &team_size);
ParallelData *PData = new ParallelData(nullptr);
parallel_data->ptr = PData;
Data = new TaskData(PData, type);
new_task_data->ptr = Data;
} else if (type & ompt_task_undeferred) {
Data = new TaskData(ToTaskData(parent_task_data), type);
new_task_data->ptr = Data;
} else if (type & ompt_task_explicit || type & ompt_task_target) {
Data = new TaskData(ToTaskData(parent_task_data), type);
new_task_data->ptr = Data;
// Use the newly created address. We cannot use a single address from the
// parent because that would declare wrong relationships with other
// sibling tasks that may be created before this task is started!
TsanHappensBefore(Data->GetTaskPtr());
ToTaskData(parent_task_data)->execution++;
}
}
static void __ompt_tsan_release_task(TaskData *task) {
while (task != nullptr && --task->RefCount == 0) {
TaskData *Parent = task->Parent;
if (task->DependencyCount > 0) {
delete[] task->Dependencies;
}
delete task;
task = Parent;
}
}
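// Note: RefCount counts the task itself plus every child still holding a
// pointer to it, so releasing the last child may cascade: the loop above
// walks the Parent links and frees every ancestor whose count drops to zero.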
static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
ompt_task_status_t prior_task_status,
ompt_data_t *second_task_data) {
//
// The necessary action depends on prior_task_status:
//
// ompt_task_early_fulfill = 5,
// -> ignored
//
// ompt_task_late_fulfill = 6,
// -> first completed, first freed, second ignored
//
// ompt_task_complete = 1,
// ompt_task_cancel = 3,
// -> first completed, first freed, second starts
//
// ompt_task_detach = 4,
// ompt_task_yield = 2,
// ompt_task_switch = 7
// -> first suspended, second starts
//
if (prior_task_status == ompt_task_early_fulfill)
return;
TaskData *FromTask = ToTaskData(first_task_data);
// Legacy handling for missing reduction callback
if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
// We want to ignore writes in the runtime code during barriers,
// but not when executing tasks with user code!
TsanIgnoreWritesEnd();
}
// The late fulfill happens after the detached task finished execution
if (prior_task_status == ompt_task_late_fulfill)
TsanHappensAfter(FromTask->GetTaskPtr());
// task completed execution
if (prior_task_status == ompt_task_complete ||
prior_task_status == ompt_task_cancel ||
prior_task_status == ompt_task_late_fulfill) {
// Included tasks are executed sequentially, so there is no need to track
// synchronization
if (!FromTask->isIncluded()) {
// Task will finish before a barrier in the surrounding parallel region
// ...
ParallelData *PData = FromTask->Team;
TsanHappensBefore(
PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));
// ... and before an eventual taskwait by the parent thread.
TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());
if (FromTask->TaskGroup != nullptr) {
// This task is part of a taskgroup, so it will finish before the
// corresponding taskgroup_end.
TsanHappensBefore(FromTask->TaskGroup->GetPtr());
}
}
// release dependencies
for (unsigned i = 0; i < FromTask->DependencyCount; i++) {
ompt_dependence_t *Dependency = &FromTask->Dependencies[i];
// even 'in' dependencies block later 'inout' and 'out' dependencies!
TsanHappensBefore(ToInAddr(Dependency->variable.ptr));
if (Dependency->dependence_type == ompt_dependence_type_out ||
Dependency->dependence_type == ompt_dependence_type_inout) {
TsanHappensBefore(Dependency->variable.ptr);
}
}
// free the previously running task
__ompt_tsan_release_task(FromTask);
}
// For a late fulfill of a detached task, there is no task to schedule to
if (prior_task_status == ompt_task_late_fulfill) {
return;
}
TaskData *ToTask = ToTaskData(second_task_data);
// Legacy handling for missing reduction callback
if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
// We re-enter runtime code which currently performs a barrier.
TsanIgnoreWritesBegin();
}
// task suspended
if (prior_task_status == ompt_task_switch ||
prior_task_status == ompt_task_yield ||
prior_task_status == ompt_task_detach) {
// Task may be resumed at a later point in time.
TsanHappensBefore(FromTask->GetTaskPtr());
ToTask->ImplicitTask = FromTask->ImplicitTask;
assert(ToTask->ImplicitTask != NULL &&
"A task belongs to a team and has an implicit task on the stack");
}
// Handle dependencies on first execution of the task
if (ToTask->execution == 0) {
ToTask->execution++;
for (unsigned i = 0; i < ToTask->DependencyCount; i++) {
ompt_dependence_t *Dependency = &ToTask->Dependencies[i];
TsanHappensAfter(Dependency->variable.ptr);
// 'out' and 'inout' dependencies are also blocked by prior 'in' dependencies!
if (Dependency->dependence_type == ompt_dependence_type_out ||
Dependency->dependence_type == ompt_dependence_type_inout) {
TsanHappensAfter(ToInAddr(Dependency->variable.ptr));
}
}
}
// 1. Task will begin execution after it has been created.
// 2. Task will resume after it has been switched away.
TsanHappensAfter(ToTask->GetTaskPtr());
}
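// Illustrative sketch (assumed user code, not from this file): for
//   #pragma omp task depend(out: x)  // task A
//   #pragma omp task depend(in: x)   // task B
// A's completion annotates TsanHappensBefore(&x) above, and B's first
// schedule annotates TsanHappensAfter(&x), so TSan orders A's writes before
// B's reads.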
static void ompt_tsan_dependences(ompt_data_t *task_data,
const ompt_dependence_t *deps, int ndeps) {
if (ndeps > 0) {
// Copy the data to use it in task_switch and task_end.
TaskData *Data = ToTaskData(task_data);
Data->Dependencies = new ompt_dependence_t[ndeps];
std::memcpy(Data->Dependencies, deps, sizeof(ompt_dependence_t) * ndeps);
Data->DependencyCount = ndeps;
// This callback is executed before this task is first started.
TsanHappensBefore(Data->GetTaskPtr());
}
}
/// OMPT event callbacks for handling locking.
static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id,
const void *codeptr_ra) {
// Acquire our own lock to make sure that
// 1. the previous release has finished.
// 2. the next acquire doesn't start before we have finished our release.
LocksMutex.lock();
std::mutex &Lock = Locks[wait_id];
LocksMutex.unlock();
Lock.lock();
TsanHappensAfter(&Lock);
}
static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
const void *codeptr_ra) {
LocksMutex.lock();
std::mutex &Lock = Locks[wait_id];
LocksMutex.unlock();
TsanHappensBefore(&Lock);
Lock.unlock();
}
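// Note on the scheme above: for a given wait_id the annotation chain is
//   release: TsanHappensBefore(&Lock); Lock.unlock();
//   acquire: Lock.lock(); TsanHappensAfter(&Lock);
// The std::mutex serializes these pairs, so an acquire can never observe the
// happens-before edge of a release that has not yet finished.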
// callback, signature, variable to store result, required support level
#define SET_OPTIONAL_CALLBACK_T(event, type, result, level) \
do { \
ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event; \
result = ompt_set_callback(ompt_callback_##event, \
(ompt_callback_t)tsan_##event); \
if (result < level) \
printf("Registered callback '" #event "' is not supported at " #level \
" (%i)\n", \
result); \
} while (0)
#define SET_CALLBACK_T(event, type) \
do { \
int res; \
SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always); \
} while (0)
#define SET_CALLBACK(event) SET_CALLBACK_T(event, event)
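// For example, SET_CALLBACK(parallel_begin) expands (roughly) to:
//   ompt_callback_parallel_begin_t tsan_parallel_begin =
//       &ompt_tsan_parallel_begin;
//   res = ompt_set_callback(ompt_callback_parallel_begin,
//                           (ompt_callback_t)tsan_parallel_begin);
// with a warning printed if the runtime supports the callback at less than
// the requested level.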
static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
ompt_data_t *tool_data) {
const char *options = getenv("TSAN_OPTIONS");
TsanFlags tsan_flags(options);
ompt_set_callback_t ompt_set_callback =
(ompt_set_callback_t)lookup("ompt_set_callback");
if (ompt_set_callback == NULL) {
std::cerr << "Could not set callback, exiting..." << std::endl;
std::exit(1);
}
ompt_get_parallel_info =
(ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");
if (ompt_get_parallel_info == NULL) {
fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
"exiting...\n");
exit(1);
}
SET_CALLBACK(thread_begin);
SET_CALLBACK(thread_end);
SET_CALLBACK(parallel_begin);
SET_CALLBACK(implicit_task);
SET_CALLBACK(sync_region);
SET_CALLBACK(parallel_end);
SET_CALLBACK(task_create);
SET_CALLBACK(task_schedule);
SET_CALLBACK(dependences);
SET_CALLBACK_T(mutex_acquired, mutex);
SET_CALLBACK_T(mutex_released, mutex);
SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback,
ompt_set_never);
if (!tsan_flags.ignore_noninstrumented_modules)
fprintf(stderr,
"Warning: please export "
"TSAN_OPTIONS='ignore_noninstrumented_modules=1' "
"to avoid false positive reports from the OpenMP runtime!\n");
if (archer_flags->ignore_serial)
TsanIgnoreWritesBegin();
return 1; // success
}
static void ompt_tsan_finalize(ompt_data_t *tool_data) {
if (archer_flags->ignore_serial)
TsanIgnoreWritesEnd();
if (archer_flags->print_max_rss) {
struct rusage end;
getrusage(RUSAGE_SELF, &end);
printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
}
if (archer_flags)
delete archer_flags;
}
extern "C" ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
const char *options = getenv("ARCHER_OPTIONS");
archer_flags = new ArcherFlags(options);
if (!archer_flags->enabled) {
if (archer_flags->verbose)
std::cout << "Archer disabled, stopping operation" << std::endl;
delete archer_flags;
return NULL;
}
static ompt_start_tool_result_t ompt_start_tool_result = {
&ompt_tsan_initialize, &ompt_tsan_finalize, {0}};
runOnTsan = 1;
RunningOnValgrind();
// If we are not running on TSan, give a different tool the chance to be
// loaded.
if (!runOnTsan) {
if (archer_flags->verbose)
std::cout << "Archer detected OpenMP application without TSan "
"stopping operation"
<< std::endl;
delete archer_flags;
return NULL;
}
if (archer_flags->verbose)
std::cout << "Archer detected OpenMP application with TSan, supplying "
"OpenMP synchronization semantics"
<< std::endl;
return &ompt_start_tool_result;
}