/* * Copyright (c) Hisilicon Technologies Co., Ltd.. 2021-2021. All rights reserved. * Description: part [procdaemon] for archive proc informations. * Create: 2022-09-20 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dft_event.h" #include "proc_node.h" #include "procd_base.h" static int g_max_rotation_count = -1; static int g_max_rotation_size = -1; static float g_max_cpu_utility = 0.0f; static pthread_t g_cycle_archive_tids[MAX_LEN] = {-1}; static pthread_t g_variation_archive_tids[MAX_LEN] = {-1}; static proc_node g_nodes[MAX_LEN]; typedef struct timer_val { long *timestamp; proc_node *proc; } timer_val; static int proc_archive(int node_idx, pthread_t *g_cycle_archive_tid, pthread_t *g_variation_archive_tid); static void inline set_thread_name_from_proc_path(char *proc_path) { char *proc_name = TD_NULL; proc_name = strrchr(proc_path, '/'); prctl(PR_SET_NAME, proc_name); } // 获得时间戳(用于落盘内容分隔) static int get_timestamp_for_archive_content(char *timestamp) { time_t t; struct tm *st = TD_NULL; int ret = TD_FAILURE; (void)time(&t); st = localtime(&t); ret = sprintf_s(timestamp, MAX_LEN, ">>> %04d-%02d-%02d %02d:%02d:%02d\n", st->tm_year + START_YEAR, st->tm_mon + START_MONTH, st->tm_mday, st->tm_hour, st->tm_min, st->tm_sec); if (ret == TD_FAILURE) { log_err("in get_timestamp_for_archive_content, sprintf_s failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 返回时间戳(用于计时器判断时间) static inline long get_timestamp_for_timer() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec * SEC_TO_NSEC + tv.tv_usec; } static int get_timestamp_for_log(char *timestamp) { time_t t; struct tm *st = TD_NULL; int ret = TD_FAILURE; (void)time(&t); st = localtime(&t); ret = sprintf_s(timestamp, MAX_LEN, "%04d-%02d-%02d %02d:%02d:%02d", st->tm_year + START_YEAR, st->tm_mon + START_MONTH, st->tm_mday, st->tm_hour, st->tm_min, st->tm_sec); if (ret == TD_FAILURE) { log_err("in get_timestamp_for_log, sprintf_s failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 获得指定路径文件的大小,单位KB static inline int get_file_size(char *file_path) { struct stat stat_buf; stat(file_path, &stat_buf); int size = stat_buf.st_size; return size / BYTE_KB; } // 检测字符串str是不是以字符串prefix开头 static int starts_with(const char *str, const char *prefix) { if (!str || !prefix) { return TD_FAILURE; } if (strncmp(str, prefix, strlen(prefix)) == 0) { return TD_SUCCESS; } return TD_FAILURE; } // 拼接源文件的全路径 static int get_proc_archive_path(proc_node *proc, char *archive_dir_path) { char *proc_name = TD_NULL; int ret = TD_FAILURE; char proc_path[MAX_LEN] = {'\0'}; if (proc == TD_NULL) { log_err("in get_proc_archive_path, proc is null ptr.\n"); goto OUT; } ret = strcpy_s(proc_path, MAX_LEN, proc->path); if (ret == TD_FAILURE) { log_err("in get_proc_archive_path, strcpy_s failed\n"); goto OUT; } // 拼接proc落盘文件夹路径 proc_name = strrchr(proc_path, '/'); // proc_name = /klad ret = strcpy_s(archive_dir_path, MAX_LEN, PROC_SNAP_ROOT_DIR); if (ret == TD_FAILURE) { log_err("in get_proc_archive_path, strcpy_s failed\n"); goto OUT; } ret = strcat_s(archive_dir_path, MAX_LEN, proc_name); if (ret == TD_FAILURE) { log_err("in get_proc_archive_path, strcat_s failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 创建procsnap落盘的根目录 static int make_snapshot_root_dir() { int ret = TD_FAILURE; DIR *dir = TD_NULL; dir = opendir(PROC_SNAP_ROOT_DIR); if (dir == TD_NULL) { ret = mkdir(PROC_SNAP_ROOT_DIR, DIR_PERM); if (ret == TD_FAILURE) { log_err("in make_snapshot_root_dir, mkdir failed\n"); goto OUT; } } else { closedir(dir); } return TD_SUCCESS; OUT: return TD_FAILURE; } // 为每个落盘proc创建相应的文件夹 static int make_snapshot_dir(proc_node *proc) { int ret = TD_FAILURE; char archive_dir_path[MAX_LEN] = {'\0'}; DIR *dir = TD_NULL; if (proc == TD_NULL) { log_err("in make_snapshot_dir, proc is null ptr.\n"); goto OUT; } ret = get_proc_archive_path(proc, archive_dir_path); if (ret == TD_FAILURE) { log_err("in make_snapshot_dir, get_proc_archive_path failed.\n"); goto OUT; } // 如果文件夹不存在,则创建proc落盘文件夹 dir = opendir(archive_dir_path); if (dir == TD_NULL) { ret = mkdir(archive_dir_path, DIR_PERM); if (ret == TD_FAILURE) { log_err("in make_snapshot_dir, mkdir failed\n"); goto OUT; } } else { if (closedir(dir) == TD_FAILURE) { log_err("closedir %s failed\n", archive_dir_path); goto OUT; } } ret = strcpy_s(proc->archive_path, MAX_LEN, archive_dir_path); if (ret == TD_FAILURE) { log_err("in make_snapshot_dir, strcpy_s failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } static int make_log_dir() { DIR *dir = TD_NULL; int ret = TD_FAILURE; char path[MAX_LEN] = {'\0'}; // 创建procd_log文件夹 ret = sprintf_s(path, MAX_LEN, "%s/%s", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH); dir = opendir(path); if (dir == TD_NULL) { ret = mkdir(path, DIR_PERM); if (ret == TD_FAILURE) { log_err("in make_log_dir, mkdir failed\n"); goto OUT; } } else { if (closedir(dir) == TD_FAILURE) { log_err("closedir %s failed\n", path); goto OUT; } } return TD_SUCCESS; OUT: return TD_FAILURE; } // 检查落盘根文件夹及落盘子文件夹是否存在,如果不存在,则重新新建 static int check_dir_exits(proc_node *proc) { int ret = TD_FAILURE; // 检查根文件夹 /data/vendor/procsnap ret = make_snapshot_root_dir(); if (ret == TD_FAILURE) { log_err("in check_dir_exits, make_snapshot_root_dir failed\n"); goto OUT; } // 检查日志文件夹 /data/vendor/procsnap/procd_log ret = make_log_dir(); if (ret == TD_FAILURE) { log_err("in check_dir_exits, make_log_dir failed\n"); goto OUT; } // 检查子文件夹 /data/vendor/procsnap/xxx if (proc != TD_NULL) { ret = make_snapshot_dir(proc); if (ret == TD_FAILURE) { log_err("in check_dir_exits, make_snapshot_dir failed\n"); goto OUT; } } return TD_SUCCESS; OUT: return TD_FAILURE; } // 输出关键日志 static int write_log(const char *log_content) { int fd = -1; int ret = TD_FAILURE; char log_file_name[BUF_SIZE] = {'\0'}; char buf[BUF_SIZE] = {'\0'}; char time_stamp[MAX_LEN] = {'\0'}; if (check_dir_exits(TD_NULL) == TD_FAILURE) { log_err("in write_log, make_log_dir failed\n"); goto OUT; } ret = sprintf_s(log_file_name, BUF_SIZE, "%s/%s/%s.log", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH, LOG_NAME); if (ret == TD_FAILURE) { log_err("in write_log, sprintf_s failed\n"); goto OUT; } if (get_file_size(log_file_name) >= MAX_LOG_FILE_SIZE) { fd = open(log_file_name, O_CREAT | O_WRONLY | O_TRUNC, FILE_PERM); } else { fd = open(log_file_name, O_CREAT | O_WRONLY | O_APPEND, FILE_PERM); } if (fd == TD_FAILURE) { log_err("in write_log, open %s failed\n", log_file_name); goto OUT; } ret = get_timestamp_for_log(time_stamp); if (ret == TD_FAILURE) { log_err("in write_log, get_timestamp_for_log failed\n"); goto OUT1; } ret = sprintf_s(buf, BUF_SIZE, "%s[%s] [pid:%d]: %s\n", starts_with(log_content, "[Start]") ? "\n" : "", time_stamp, gettid(), log_content); if (ret == TD_FAILURE) { log_err("sprintf_s failed\n"); goto OUT1; } ret = write(fd, buf, BUF_SIZE); if (ret == TD_FAILURE) { log_err("in write_log, write failed\n"); goto OUT1; } close(fd); return TD_SUCCESS; OUT1: close(fd); OUT: return TD_FAILURE; } // 获得时间戳,用于判断系统时间是否初始化完成 static int is_system_timer_init() { time_t t; struct tm *st = TD_NULL; (void)time(&t); st = localtime(&t); if (st->tm_year + START_YEAR == SYSTEM_INIT_YEAR) { log_dbg("system timer not init.\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } static int get_proc_stack() { char proc_stack_path[MAX_LEN]; char buf[MAX_LEN]; int ret = TD_FAILURE; int from_stack_fd = -1, to_stack_fd = -1; int nread, nwrite; ret = sprintf_s(proc_stack_path, MAX_LEN, "/proc/%d/stack", getpid()); if (ret == TD_FAILURE) { log_err("in get_proc_stack, sprintf_s failed\n"); goto OUT; } from_stack_fd = open(proc_stack_path, O_RDONLY); if (from_stack_fd == TD_FAILURE) { log_err("in get_proc_stack, open %s failed\n", proc_stack_path); goto OUT; } to_stack_fd = open(STACK_FILE_PATH, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM); if (to_stack_fd == TD_FAILURE) { log_err("in get_proc_stack, open %s failed\n", STACK_FILE_PATH); goto OUT1; } while ((nread = read(from_stack_fd, buf, MAX_LEN)) > 0) { nwrite = 0; do { ret = write(to_stack_fd, &buf[nwrite], nread - nwrite); if (ret == TD_FAILURE) { log_err("in get_proc_stack, write failed\n"); goto OUT2; } nwrite += nread; } while (nwrite < nread); } if (nread == TD_FAILURE) { log_err("in get_proc_stack, read failed\n"); goto OUT2; } close(from_stack_fd); close(to_stack_fd); return TD_SUCCESS; OUT2: close(to_stack_fd); OUT1: close(from_stack_fd); OUT: return TD_FAILURE; } // 判断字符串是否为整数 static int is_int_digital(char *num) { int len = strlen(num); int i; for (i = 0; i < len; i++) { if ('0' <= num[i] && num[i] <= '9') { continue; } else { return TD_FAILURE; } } return TD_SUCCESS; } // 判断字符串是否为浮点数 static int is_float_digital(char *num) { int len = strlen(num); int i; int dot_num = 0; for (i = 0; i < len; i++) { if (num[i] == '.') { dot_num++; if (dot_num > 1) { return TD_FAILURE; } } else if ('0' <= num[i] && num[i] <= '9') { continue; } else { return TD_FAILURE; } } return TD_SUCCESS; } // 计算cpu占用率 static int get_cpu_usage(float *usage) { int fd = -1; char filename[BUF_SIZE] = {'\0'}; char buf[BUF_SIZE] = {'\0'}; int ret = TD_FAILURE; int nread; char *data = TD_NULL; int user_time = 0; int kernel_time = 0; static int last_user_time = 0; static int last_kernel_time = 0; int counter = 0; ret = sprintf_s(filename, BUF_SIZE, "/proc/%d/stat", getpid()); if (ret == TD_FAILURE) { log_err("in get_cpu_usage, sprintf_s failed\n"); goto OUT; } fd = open(filename, O_RDONLY); if (fd == TD_FAILURE) { log_err("in get_cpu_usage, open %s failed\n", filename); goto OUT; } nread = read(fd, buf, BUF_SIZE); data = strtok(buf, " "); while (data != NULL) { if (counter == UTIME_IDX) { user_time = atoi(data); // 获取用户态占用时间 } else if (counter == CTIME_IDX) { kernel_time = atoi(data); // 获取内核态占用时间 break; } counter++; data = strtok(NULL, " "); } *usage = 1.0 * (user_time - last_user_time + kernel_time - last_kernel_time) / (sysconf(_SC_CLK_TCK) * CPU_TIMER_SEC); last_user_time = user_time; last_kernel_time = kernel_time; close(fd); return TD_SUCCESS; OUT: return TD_FAILURE; } static int fault_report_cpu_overuse(float usage) { int ret = TD_FAILURE; char usage_str[MAX_LEN] = {'\0'}; char buf[MAX_LEN] = {'\0'}; unsigned int handle; ret = get_proc_stack(); if (ret == TD_FAILURE) { log_err("in fault_report_cpu_overuse, get_proc_stack failed\n"); goto OUT; } ret = sprintf_s(usage_str, MAX_LEN, "%.2f", usage); if (ret == TD_FAILURE) { log_err("in fault_report_cpu_overuse, sprintf_s failed\n"); goto OUT; } log_dbg("fault report for overuse cpu.\n"); ret = dft_event_create(FAULT_NO_OVERUSE_CPU, &handle); if (ret == TD_FAILURE) { log_err("in fault_report_cpu_overuse, dft_event_create failed\n"); goto OUT; } dft_event_put_string(handle, "PNAME", "procdaemon"); dft_event_put_string(handle, "F1NAME", "fault_report_cpu_overuse"); dft_event_put_string(handle, "F2NAME", "thread_watch_cpu_usage"); dft_event_put_string(handle, "CPU_USAGE", usage_str); ret = dft_event_report(handle); if (ret == TD_FAILURE) { log_err("in fault_report_cpu_overuse, dft_event_report failed\n"); goto OUT; } dft_event_destroy(handle); ret = sprintf_s(buf, MAX_LEN, "[Fault Report] cpu overuse: %.2f", usage); if (ret == TD_FAILURE) { log_err("in fault_report_cpu_overuse, sprintf_s failed\n"); goto OUT; } write_log(buf); return TD_SUCCESS; OUT: return TD_FAILURE; } // 检测cpu占用率,超过阈值则故障上报 static void check_cpu_usage() { int ret = TD_FAILURE; float usage; char buf[MAX_LEN] = {'\0'}; ret = get_cpu_usage(&usage); if (ret == TD_FAILURE) { log_err("in check_cpu_usage, get_cpu_usage failed\n"); goto OUT; } ret = sprintf_s(buf, MAX_LEN, "[CPU Usage] %.2f", usage); if (ret == TD_FAILURE) { log_err("in check_cpu_usage, sprintf_s failed\n"); goto OUT; } write_log(buf); if (usage > g_max_cpu_utility) { ret = fault_report_cpu_overuse(usage); if (ret == TD_FAILURE) { log_err("in check_cpu_usage, fault_report_cpu_overuse failed\n"); goto OUT; } } OUT: return; } // 获取系统时间作为落盘文件名,与落盘路径拼接成完整路径 static int get_snapshot_path_from_system_time(proc_node *proc, char *snapshot_path) { char snapshot_file_name[MAX_LEN] = {'\0'}; time_t t; struct tm *st = TD_NULL; int ret = TD_FAILURE; (void)time(&t); st = localtime(&t); ret = sprintf_s(snapshot_file_name, MAX_LEN, "%04d%02d%02d%02d%02d%02d.txt", st->tm_year + START_YEAR, st->tm_mon + START_MONTH, st->tm_mday, st->tm_hour, st->tm_min, st->tm_sec); if (ret == TD_FAILURE) { log_err("in get_snapshot_path_from_system_time, sprintf_s failed\n"); goto OUT; } if (proc == TD_NULL) { log_err("in get_snapshot_path_from_system_time, proc is null ptr.\n"); goto OUT; } ret = strcpy_s(snapshot_path, MAX_LEN, proc->archive_path); if (ret == TD_FAILURE) { log_err("in get_snapshot_path_from_system_time, strcpy_s failed\n"); goto OUT; } ret = strcat_s(snapshot_path, MAX_LEN, "/"); if (ret == TD_FAILURE) { log_err("in get_snapshot_path_from_system_time, strcpy_s failed\n"); goto OUT; } ret = strcat_s(snapshot_path, MAX_LEN, snapshot_file_name); if (ret == TD_FAILURE) { log_err("in get_snapshot_path_from_system_time, strcpy_s failed\n"); goto OUT; } return TD_SUCCESS; // 落盘文件的完成路径 OUT: return TD_FAILURE; } // 获取目标文件夹下的文件数量 static int get_dir_files_num(char *dir_path, int *file_num) { DIR *dir = TD_NULL; struct dirent *ptr = TD_NULL; int total = 0; dir = opendir(dir_path); if (dir == TD_NULL) { log_err("in get_dir_files_num, opendir failed\n"); return TD_FAILURE; } while ((ptr = readdir(dir)) != NULL) { if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) { continue; } if (ptr->d_type == DT_REG) { total++; } } closedir(dir); *file_num = total; return TD_SUCCESS; } static int get_file_names(char *dir_path, char file_names[][MAX_LEN]) { DIR *dir = TD_NULL; struct dirent *ptr = TD_NULL; int total = 0; int ret = TD_FAILURE; dir = opendir(dir_path); if (dir == TD_NULL) { log_err("in get_file_names, opendir failed\n"); goto OUT; } while ((ptr = readdir(dir)) != NULL) { if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) { continue; } if (ptr->d_type == DT_REG) { ret = strcpy_s(file_names[total++], MAX_LEN, ptr->d_name); if (ret == TD_FAILURE) { log_err("in get_file_names, strcpy_s failed\n"); goto OUT; } } } ret = closedir(dir); if (ret == TD_FAILURE) { log_err("in get_file_names, closedir failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } static int get_max_file_name(int file_num, char file_names[][MAX_LEN], char *target_file_name) { int i; int ret = TD_FAILURE; for (i = 0; i < file_num; i++) { if (strcmp(file_names[i], target_file_name) > 0) { ret = strcpy_s(target_file_name, MAX_LEN, file_names[i]); if (ret == TD_FAILURE) { log_err("in get_max_file_name, strcpy_s failed\n"); return TD_FAILURE; } } } return TD_SUCCESS; } static int get_min_file_name(int file_num, char file_names[][MAX_LEN], char *target_file_name) { int i; int ret = TD_FAILURE; for (i = 0; i < file_num; i++) { if (strcmp(file_names[i], target_file_name) < 0) { ret = strcpy_s(target_file_name, MAX_LEN, file_names[i]); if (ret == TD_FAILURE) { log_err("in get_min_file_name, strcpy_s failed\n"); return TD_FAILURE; } } } return TD_SUCCESS; } // 获得指定文件夹中文件名最大或最小的文件,get_max==0时,获得最小文件;get_max==1时,获得最大文件 static int get_min_or_max_file_name(char *dir_path, int file_num, int get_max, char *target_file_path) { char file_names[file_num][MAX_LEN]; int ret = TD_FAILURE; char target_file_name[MAX_LEN] = {'\0'}; ret = get_file_names(dir_path, file_names); if (ret == TD_FAILURE) { log_err("in get_min_or_max_file_name, get_file_names failed\n"); goto OUT; } ret = strcpy_s(target_file_name, MAX_LEN, file_names[0]); if (ret == TD_FAILURE) { log_err("in get_min_or_max_file_name, strcpy_s failed\n"); goto OUT; } if (get_max == 1) { ret = get_max_file_name(file_num, file_names, target_file_name); if (ret == TD_FAILURE) { log_err("in get_min_or_max_file_name, get_max_file_name failed\n"); goto OUT; } } else if (get_max == 0) { ret = get_min_file_name(file_num, file_names, target_file_name); if (ret == TD_FAILURE) { log_err("in get_min_or_max_file_name, get_min_file_name failed\n"); goto OUT; } } ret = snprintf_s(target_file_path, MAX_LEN, MAX_LEN - 1, "%s/%s", dir_path, target_file_name); if (ret == TD_FAILURE) { log_err("snprintf_s failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 清理落盘文件夹,保持文件数小于等于阈值 static int clean_snapshot_dir(proc_node *proc) { int file_num; char target_file_name[MAX_LEN] = {'\0'}; char buf[MAX_LEN] = {'\0'}; int ret = TD_FAILURE; if (proc == TD_NULL) { log_err("in clean_snapshot_dir, proc is null ptr.\n"); goto OUT; } ret = get_dir_files_num(proc->archive_path, &file_num); if (ret == TD_FAILURE) { log_err("in clean_snapshot_dir, get_dir_files_num failed\n"); goto OUT; } while (file_num > g_max_rotation_count) { // 文件数 > 3时,删掉多余的小名文件 ret = get_min_or_max_file_name(proc->archive_path, file_num, 0, target_file_name); if (ret == TD_FAILURE) { log_err("in clean_snapshot_dir, get_dir_files_num failed\n"); goto OUT; } ret = sprintf_s(buf, MAX_LEN, "[Remove] %s", target_file_name); if (ret == TD_FAILURE) { log_err("in clean_snapshot_dir, sprintf_s failed\n"); goto OUT; } write_log(buf); ret = remove(target_file_name); if (ret == TD_FAILURE) { log_err("in clean_snapshot_dir, remove failed\n"); goto OUT; } file_num--; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 获取落盘目标文件的文件描述符,保证有效且发生错误时已关闭文件 static int get_target_file(int file_num, char *target_file_name, proc_node *proc, char *snapshot_path) { char buf[MAX_LEN] = {'\0'}; int fd = -1; if (proc == TD_NULL) { log_err("in get_target_file, proc is null ptr.\n"); goto OUT; } // 获取目标落盘文件路径 if (get_min_or_max_file_name(proc->archive_path, file_num, 1, target_file_name) == TD_FAILURE) { log_err("in get_target_file, get_min_or_max_file_name failed\n"); goto OUT; } if (get_file_size(target_file_name) >= g_max_rotation_size) { // 目标文件大小超过阈值,则新建文件 if (get_snapshot_path_from_system_time(proc, snapshot_path) == TD_FAILURE) { log_err("in get_target_file, get_snapshot_path_from_system_time\n"); goto OUT; } fd = open(snapshot_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM); if (fd == -1) { log_err("in get_target_file, open %s failed\n", snapshot_path); goto OUT; } if (sprintf_s(buf, MAX_LEN, "[Create] %s", snapshot_path) == TD_FAILURE) { log_err("in get_target_file, sprintf_s failed\n"); goto OUT1; } if (write_log(buf) == TD_FAILURE) { log_err("write_log failed\n"); goto OUT1; } if (clean_snapshot_dir(proc) == TD_FAILURE) { log_err("in get_target_file, clean_snapshot_dir failed\n"); goto OUT1; } } else { // 目标文件大小在阈值范围内,往目标文件append内容 fd = open(target_file_name, O_WRONLY | O_APPEND); if (fd == -1) { log_err("in get_target_file, open %s failed\n", target_file_name); goto OUT; } } return fd; OUT1: close(fd); OUT: return TD_FAILURE; } // 创建落盘目标文件,返回文件描述符,保证有效且发生错误时已关闭文件 static int create_target_file(char *snapshot_path, proc_node *proc) { int ret = TD_FAILURE; char buf[MAX_LEN] = {'\0'}; int fd = -1; ret = get_snapshot_path_from_system_time(proc, snapshot_path); if (ret == TD_FAILURE) { log_err("in create_target_file, get_snapshot_path_from_system_time failed\n"); goto OUT; } fd = open(snapshot_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM); if (fd == -1) { log_err("in create_target_file, open %s failed\n", snapshot_path); goto OUT; } ret = sprintf_s(buf, MAX_LEN, "[Create] %s", snapshot_path); if (ret == TD_FAILURE) { log_err("in create_target_file, sprintf_s failed\n"); goto OUT1; } ret = write_log(buf); if (ret == TD_FAILURE) { log_err("write_log failed"); goto OUT1; } return fd; OUT1: close(fd); OUT: return TD_FAILURE; } // 打开落盘文件夹中的目标文件,返回其文件描述符 to_fd,保证有效且发生错误时已关闭文件 static int open_or_remove_snapshot_file(char *snapshot_path, proc_node *proc) { int ret = TD_FAILURE; int file_num; char target_file_name[BUF_SIZE] = {'\0'}; int fd = -1; if (proc == TD_NULL) { log_err("in open_or_remove_snapshot_file, proc is null ptr.\n"); goto OUT; } ret = clean_snapshot_dir(proc); // 保证目标文件夹中的文件数量在阈值范围内 if (ret == TD_FAILURE) { log_err("in open_or_remove_snapshot_file, clean_snapshot_dir failed\n"); goto OUT; } ret = get_dir_files_num(proc->archive_path, &file_num); // 获取目标文件夹中的文件数目 if (ret == TD_FAILURE) { log_err("in open_or_remove_snapshot_file, get_dir_files_num failed\n"); goto OUT; } if (file_num > 0) { // 有文件,获取最大名文件作为目标文件,保证文件描述符有效且发生错误时已关闭文件 fd = get_target_file(file_num, target_file_name, proc, snapshot_path); if (fd == -1) { log_err("in open_or_remove_snapshot_file, get_target_file failed\n"); goto OUT; } } else { // 没有文件,创建文件,保证文件描述符有效且发生错误时已关闭文件 fd = create_target_file(snapshot_path, proc); if (fd == -1) { log_err("in open_or_remove_snapshot_file, create_target_file failed\n"); goto OUT; } } return fd; OUT: return TD_FAILURE; } // proc存盘超时5秒,故障上报 static void fault_report_timeout(union sigval val) { long cur_timestamp = get_timestamp_for_timer(); static __thread int has_report = 0; timer_val *tvalue = (timer_val *)(val.sival_ptr); int ret = TD_FAILURE; unsigned int handle; int timeout = 0; char buf[MAX_LEN] = {'\0'}; if (tvalue == TD_NULL || tvalue->proc == TD_NULL) { log_err("in fault_report_timeout, tvalue or tvalue->proc is null ptr.\n"); goto OUT; } timeout = OVERTIME_CO * atoi(tvalue->proc->time); if (has_report == 0 && cur_timestamp - *(tvalue->timestamp) > timeout * SEC_TO_NSEC) { ret = get_proc_stack(); if (ret == TD_FAILURE) { log_err("in fault_report_timeout, get_proc_stack failed\n"); goto OUT; } log_dbg("fault report from %s in %ld\n", tvalue->proc->path, cur_timestamp); ret = dft_event_create(FAULT_NO_TIMEOUT, &handle); if (ret == TD_FAILURE) { log_err("in fault_report_timeout, dft_event_create failed\n"); goto OUT; } dft_event_put_string(handle, "PNAME", "procdaemon"); dft_event_put_string(handle, "F1NAME", "fault_report_timeout"); dft_event_put_string(handle, "F2NAME", "save_proc_file_cycle"); dft_event_put_string(handle, "PROCNAME", tvalue->proc->path); ret = dft_event_report(handle); if (ret == TD_FAILURE) { log_err("in fault_report_timeout, dft_event_report failed\n"); goto OUT; } dft_event_destroy(handle); has_report = 1; ret = sprintf_s(buf, MAX_LEN, "[Fault Report] proc %s archive timeout", tvalue->proc->path); if (ret == TD_FAILURE) { log_err("in fault_report_timeout, sprintf_s failed\n"); goto OUT; } write_log(buf); } OUT: return; } // 初始化计时器 static int init_timer(int init_sec, int sec, struct sigevent event) { int ret = TD_FAILURE; timer_t timerid; ret = timer_create(CLOCK_REALTIME, &event, &timerid); if (ret == TD_FAILURE) { log_err("in init_timer, memset_s failed\n"); goto OUT; } struct itimerspec it; it.it_interval.tv_sec = sec; it.it_interval.tv_nsec = TIMER_NSEC; it.it_value.tv_sec = init_sec; it.it_value.tv_nsec = TIMER_NSEC; ret = timer_settime(timerid, 0, &it, NULL); if (ret == TD_FAILURE) { log_err("in init_timer, timer_settime failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } static int write_snapshot(int to_fd, int from_fd, proc_node *proc) { int nread; int nwrite; int ret = TD_FAILURE; char buf[MAX_LEN] = {'\0'}; char time_stamp[MAX_LEN] = {'\0'}; if (proc == TD_NULL) { log_err("in write_snapshot, proc is null ptr.\n"); goto OUT; } // 写时间戳 ret = get_timestamp_for_archive_content(time_stamp); if (ret == TD_FAILURE) { log_err("in write_snapshot, get_timestamp_for_archive_content failed\n"); goto OUT; } ret = write(to_fd, time_stamp, strlen(time_stamp)); if (ret == TD_FAILURE) { log_err("write time stamp to file %s failed.\n", proc->archive_path); goto OUT; } // 写proc内容 while ((nread = read(from_fd, buf, sizeof(buf))) > 0) { // 读proc nwrite = 0; do { ret = write(to_fd, &buf[nwrite], nread - nwrite); // 写proc if (ret == TD_FAILURE) { log_err("write proc content to file %s failed.\n", proc->archive_path); goto OUT; } nwrite += ret; } while (nwrite < nread); } if (nread == TD_FAILURE) { log_err("in write_snapshot, read %s failed.\n", proc->path); goto OUT; } // 写入换行符 ret = write(to_fd, "\n", 1); if (ret == TD_FAILURE) { log_err("write \\n failed.\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 创建计时器检测read操作响应 static int init_timer_check_response(long *timestamp, proc_node *proc) { int ret = TD_FAILURE; timer_val val; struct sigevent event; *timestamp = get_timestamp_for_timer(); val.timestamp = timestamp; val.proc = proc; ret = memset_s(&event, sizeof(struct sigevent), 0, sizeof(struct sigevent)); if (ret == TD_FAILURE) { log_err("in init_timer_check_response, memset_s failed\n"); goto OUT; } event.sigev_value.sival_ptr = &val; event.sigev_notify = SIGEV_THREAD; event.sigev_notify_function = fault_report_timeout; ret = init_timer(TIMER_SEC, TIMER_NSEC, event); if (ret == TD_FAILURE) { log_err("in init_timer_check_response, init_timer failed.\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 打开proc,打开失败时重试 static int try_open_proc(char *proc_path) { int fd = -1; int has_print_info = TD_FAILURE; fd = open(proc_path, O_RDONLY); while (fd == -1) { if (has_print_info == TD_FAILURE) { log_info("open %s failed, reopen after each %d seconds\n", proc_path, PROC_REOPEN_INTERVAL); has_print_info = TD_SUCCESS; } sleep(PROC_REOPEN_INTERVAL); fd = open(proc_path, O_RDONLY); } if (has_print_info == TD_SUCCESS) { log_info("open %s success\n", proc_path); } return fd; } // 每当落盘次数达到xx次,就会尝试关闭proc并重新打开,打开失败时会重试 static int try_close_and_open_proc(int from_fd, char *path, int loop_time) { if (loop_time % MAX_TIME_TO_ARCHIVE != 0) { return from_fd; } // 如果落盘次数到一定量,则关闭文件后重新尝试打开 if (from_fd != -1) { close(from_fd); } return try_open_proc(path); } // 读取proc文件,并落盘 static void *save_proc_file_cycle(void *args) { int from_fd = -1, to_fd = -1; long timestamp; char snapshot_path[MAX_LEN] = {'\0'}; // 落盘文件名 proc_node *proc = (proc_node *)args; int loop_time = 0; set_thread_name_from_proc_path(proc->path); if (init_timer_check_response(×tamp, proc) == TD_FAILURE) { // 初始化时钟,检测落盘超时 log_err("in save_proc_file_cycle, init_timer_check_response failed.\n"); goto OUT1; } // 循环周期性落盘 while (1) { // 落盘一定次数后,先关闭再尝试打开proc,打开失败时重复尝试 from_fd = try_close_and_open_proc(from_fd, proc->path, loop_time); if (check_dir_exits(proc) == TD_FAILURE) { log_err("in save_proc_file_cycle, make_snapshot_dir failed.\n"); goto OUT; } lseek(from_fd, 0, SEEK_SET); // 将文件指针移动到文件开头 // 打开落盘目标文件,返回有效文件描述符 to_fd = open_or_remove_snapshot_file(snapshot_path, proc); if (to_fd == -1) { log_err("in save_proc_file_cycle, open_or_remove_snapshot_file failed\n"); goto OUT; } timestamp = get_timestamp_for_timer(); if (write_snapshot(to_fd, from_fd, proc) == TD_FAILURE) { // 往目标文件中写入proc内容 log_err("write_snapshot from %s failed\n", proc->path); goto OUT; } loop_time = (loop_time + 1) % MAX_TIME_TO_ARCHIVE; close(to_fd); sleep(atoi(proc->time)); } close(from_fd); return (void *)TD_SUCCESS; OUT: close(from_fd); OUT1: return (void *)TD_FAILURE; } static int process_mode(xmlNodePtr prop_node_ptr, proc_node *node) { xmlChar *sz_attr = TD_NULL; if (node == TD_NULL) { log_err("in process_mode, node is null ptr.\n"); goto OUT; } sz_attr = xmlGetProp(prop_node_ptr, BAD_CAST "mode"); if (strcmp("cycle", (char *)sz_attr) == 0) { node->mode = CYCLE_MODE; } else if (strcmp("variation", (char *)sz_attr) == 0) { node->mode = VARIATION_MODE; } else { node->mode = UNDEFINED_MODE; } xmlFree(sz_attr); return TD_SUCCESS; OUT: return TD_FAILURE; } static int process_time(xmlNodePtr prop_node_ptr, proc_node *node) { int ret = TD_FAILURE; xmlChar *sz_attr = TD_NULL; if (node == TD_NULL) { log_err("in process_time, node is null ptr.\n"); goto OUT; } sz_attr = xmlGetProp(prop_node_ptr, BAD_CAST "time"); ret = is_float_digital((char *)sz_attr); if (ret == TD_FAILURE) { log_err("in process_time, is_float_digital failed\n"); goto OUT1; } if (atoi((char *)sz_attr) < 1) { ret = strcpy_s(node->time, MAX_LEN, "1"); if (ret == TD_FAILURE) { log_err("in process_time, strcpy_s failed\n"); goto OUT1; } log_info("cycle time is less 1 second, auto fix to 1 second.\n"); } else { ret = strcpy_s(node->time, MAX_LEN, (char *)sz_attr); if (ret == TD_FAILURE) { log_err("in process_time, strcpy_s failed\n"); goto OUT1; } } xmlFree(sz_attr); return TD_SUCCESS; OUT1: xmlFree(sz_attr); OUT: return TD_FAILURE; } // 检测xml的node内部属性 static int process_node_properties(xmlNodePtr prop_node_ptr, int node_idx) { xmlAttrPtr attr_ptr = TD_NULL; int ret = TD_FAILURE; proc_node *node = &g_nodes[node_idx]; if (node_idx >= MAX_LEN || node_idx < 0) { log_err("in process_node_properties, node_idx is overflow.\n"); goto OUT; } attr_ptr = prop_node_ptr->properties; while (attr_ptr != NULL) { // 获取mode属性 if (!xmlStrcmp(attr_ptr->name, BAD_CAST "mode")) { ret = process_mode(prop_node_ptr, node); if (ret == TD_FAILURE) { log_err("in process_node_properties, process_mode failed\n"); goto OUT; } } // 获取time属性 if (!xmlStrcmp(attr_ptr->name, BAD_CAST "time")) { ret = process_time(prop_node_ptr, node); if (ret == TD_FAILURE) { log_err("in process_node_properties, process_time failed\n"); goto OUT; } } ret = strcpy_s(node->time, MAX_LEN, node->mode == VARIATION_MODE ? "-1" : node->time); if (ret == TD_FAILURE) { log_err("in process_node_properties, strcpy_s failed\n"); goto OUT; } attr_ptr = attr_ptr->next; if (node->mode == UNDEFINED_MODE) { log_err("node does not have 'mode' property.\n"); return TD_FAILURE; } else if (node->mode == CYCLE_MODE && strcmp(node->time, "-1") == 0) { log_err("cycle mode needs to set archive cycle time.\n"); return TD_FAILURE; } } return TD_SUCCESS; OUT: return TD_FAILURE; } // 获取节点中包裹的合法路径 static int get_safe_path_inner_node(xmlNodePtr cur_node, int node_idx) { xmlChar *sz_key = TD_NULL; int ret = TD_FAILURE; proc_node *node = &g_nodes[node_idx]; if (cur_node == TD_NULL) { log_err("in get_safe_path_inner_node, cur_node is null ptr.\n"); goto OUT; } if (node_idx >= MAX_LEN || node_idx < 0) { log_err("in get_safe_path_inner_node, node_idx is overflow.\n"); goto OUT; } sz_key = xmlNodeGetContent(cur_node); if (sz_key == TD_NULL) { log_err("in get_safe_path_inner_node, xmlNodeGetContent return null ptr.\n"); goto OUT; } ret = starts_with((char *)sz_key, PROC_PATH_PREFIX); if (ret == TD_FAILURE) { log_info("path not start with %s.\n", PROC_PATH_PREFIX); goto OUT1; } ret = strcpy_s(node->path, MAX_LEN, (char *)sz_key); if (ret == TD_FAILURE) { log_err("in get_safe_path_inner_node, strcpy_s failed\n"); goto OUT1; } xmlFree(sz_key); return TD_SUCCESS; OUT1: xmlFree(sz_key); OUT: return TD_FAILURE; } // 解析list节点,获取落盘proc文件列表及相应配置 static void parse_list_xml_node(xmlNodePtr cur_node) { int ret = TD_FAILURE; pthread_t cycle_archive_tid_idx = 0; pthread_t variation_archive_tid_idx = 0; int node_idx = 0; while (cur_node != TD_NULL && node_idx < MAX_LEN) { // 解析xml配置文件 // 1. 是否为snapshot节点(如果不是,则跳过,处理下一个xml节点) if (xmlStrcmp(cur_node->name, (const xmlChar *) "snapshot")) { log_err("cur_node is not , skip & process next node.\n"); cur_node = cur_node->next; continue; } // 2. 获取节点路径,并判断是否合法(如果不合法,则跳过,处理下一个xml节点) ret = get_safe_path_inner_node(cur_node, node_idx); if (ret == TD_FAILURE) { log_err("in parse_list_xml_node, get_safe_path_inner_node failed.\n"); cur_node = cur_node->next; continue; } // 3. 获取节点属性(如果获取失败,处理下一个xml节点) ret = process_node_properties(cur_node, node_idx); if (ret == TD_FAILURE) { log_err("in parse_list_xml_node, process_node_properties failed\n"); cur_node = cur_node->next; continue; } // 4. 开始根据node内容落盘 ret = proc_archive(node_idx, &cycle_archive_tid_idx, &variation_archive_tid_idx); // 5. 处理下一个xml节点 cur_node = cur_node->next; node_idx++; } } static int parse_common_xml_node_max_cpu_utility(xmlNodePtr cur_node) { int ret = TD_FAILURE; xmlChar *sz_key = TD_NULL; sz_key = xmlNodeGetContent(cur_node); // 获取最大cpu占用率 ret = is_float_digital((char *)sz_key); if (ret == TD_FAILURE) { log_err("in parse_common_xml_node_max_cpu_utility, is_float_digital failed\n"); goto OUT; } g_max_cpu_utility = strtod((char *)sz_key, NULL); if (g_max_cpu_utility > MAX_MAX_CPU_UTILITY || g_max_cpu_utility <= MIN_MAX_CPU_UTILITY) { g_max_cpu_utility = DEFAULT_MAX_CPU_UTILITY; log_info("got a wrong g_max_cpu_utility, reset to %.1f\n", DEFAULT_MAX_CPU_UTILITY); } xmlFree(sz_key); return TD_SUCCESS; OUT: xmlFree(sz_key); return TD_FAILURE; } static int parse_common_xml_node_max_rotation_count(xmlNodePtr cur_node) { int ret = TD_FAILURE; xmlChar *sz_key = TD_NULL; sz_key = xmlNodeGetContent(cur_node); // 获取最大落盘文件数量 ret = is_int_digital((char *)sz_key); if (ret == TD_FAILURE) { log_err("in parse_common_xml_node_max_rotation_count, is_int_digital failed\n"); goto OUT; } g_max_rotation_count = atoi((char *)sz_key); if (g_max_rotation_count <= MIN_MAX_ROTATION_COUNT || g_max_rotation_count > MAX_MAX_ROTATION_COUNT) { g_max_rotation_count = DEFAULT_MAX_ROTATION_COUNT; log_info("got a wrong g_max_rotation_count, reset to %d\n", DEFAULT_MAX_ROTATION_COUNT); } xmlFree(sz_key); return TD_SUCCESS; OUT: xmlFree(sz_key); return TD_FAILURE; } static int parse_common_xml_node_max_rotation_size(xmlNodePtr cur_node) { int ret = TD_FAILURE; xmlChar *sz_key = TD_NULL; sz_key = xmlNodeGetContent(cur_node); // 获取最大落盘文件尺寸 ret = is_int_digital((char *)sz_key); if (ret == TD_FAILURE) { log_err("in parse_common_xml_node_max_rotation_size, is_int_digital failed\n"); goto OUT; } g_max_rotation_size = atoi((char *)sz_key); if (g_max_rotation_size <= MIN_MAX_ROTATION_SIZE || g_max_rotation_size > MAX_MAX_ROTATION_SIZE) { g_max_rotation_size = DEFAULT_MAX_ROTATION_SIZE; log_info("got a wrong g_max_rotation_size, reset to %d\n", DEFAULT_MAX_ROTATION_SIZE); } xmlFree(sz_key); return TD_SUCCESS; OUT: xmlFree(sz_key); return TD_FAILURE; } // 解析common节点,获取通用配置信息 static int parse_common_xml_node(xmlNodePtr cur_node) { int ret = TD_FAILURE; while (cur_node != NULL) { if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "max_cpu_utility"))) { ret = parse_common_xml_node_max_cpu_utility(cur_node); if (ret == TD_FAILURE) { log_err("in parse_common_xml_node, parse_common_xml_node_max_cpu_utility failed\n"); goto OUT; } } if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "max_rotation_count"))) { ret = parse_common_xml_node_max_rotation_count(cur_node); if (ret == TD_FAILURE) { log_err("in parse_common_xml_node, parse_common_xml_node_max_rotation_count failed\n"); goto OUT; } } if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "max_rotation_size"))) { ret = parse_common_xml_node_max_rotation_size(cur_node); if (ret == TD_FAILURE) { log_err("in parse_common_xml_node, parse_common_xml_node_max_rotation_size failed\n"); goto OUT; } } cur_node = cur_node->next; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 解析xml文件,获得配置信息 static int procd_main(char *xml_file_path) { xmlDocPtr doc = TD_NULL; xmlNodePtr cur_node = TD_NULL; int ret = TD_FAILURE; xmlKeepBlanksDefault(0); // 过滤空白text节点 doc = xmlReadFile(xml_file_path, "UTF-8", XML_PARSE_RECOVER); if (doc == TD_NULL) { fprintf(stderr, "Document not parsed successfully.\n"); goto OUT1; } // 获取xml的根节点 cur_node = xmlDocGetRootElement(doc); if (cur_node == TD_NULL) { fprintf(stderr, "empty document\n"); goto OUT; } // 判断根节点是否符合预期 if (xmlStrcmp(cur_node->name, BAD_CAST "procd")) { fprintf(stderr, "document of the wrong type, root node != proc_list\n"); goto OUT; } // 遍历xml文件 cur_node = cur_node->xmlChildrenNode; while (cur_node != TD_NULL) { // 处理common的子节点内容 if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "common"))) { ret = parse_common_xml_node(cur_node->xmlChildrenNode); if (ret == TD_FAILURE) { log_err("in procd_main, parse_common_xml_node failed\n"); goto OUT; } } // 处理list的子节点内容,遍历的同事创建线程进行落盘 if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "list"))) { parse_list_xml_node(cur_node->xmlChildrenNode); } cur_node = cur_node->next; } (void)xmlFreeDoc(doc); return TD_SUCCESS; OUT: xmlFreeDoc(doc); OUT1: return TD_FAILURE; } // 变化落盘存文件 static int save_proc_file_variation(proc_node *proc, int from_fd) { int to_fd = -1; int ret = TD_FAILURE; char snapshot_path[MAX_LEN] = {'\0'}; ret = memset_s(snapshot_path, MAX_LEN, '\0', MAX_LEN); if (ret == TD_FAILURE) { log_err("in save_proc_file_variation, memset_s failed\n"); goto OUT; } lseek(from_fd, 0, SEEK_SET); to_fd = open_or_remove_snapshot_file(snapshot_path, proc); // 打开落盘文件,返回有效文件描述符 if (to_fd == -1) { log_err("in save_proc_file_variation, open_or_remove_snapshot_file failed\n"); goto OUT; } log_dbg("variation archive into %s\n", snapshot_path); write_snapshot(to_fd, from_fd, proc); close(to_fd); return TD_SUCCESS; OUT: return TD_FAILURE; } // 变化落盘poll检测 static void *proc_variation_poll(void *arg) { int ret = TD_FAILURE; proc_node *proc = (proc_node *)arg; struct pollfd *pfd = TD_NULL; int ready = 0; int loop_time = 0; pfd = (struct pollfd *)calloc(1, sizeof(struct pollfd)); if (pfd == TD_NULL) { log_err("in proc_variation_poll, calloc failed, pfd is null ptr.\n"); goto OUT; } set_thread_name_from_proc_path(proc->path); pfd->events = POLLIN; pfd->revents = 0; // 循环检测poll的返回值,检测是否有被触发 while (1) { // 落盘次数超过阈值后,会尝试关闭并重新打开proc,打开失败时重复尝试 pfd->fd = try_close_and_open_proc(pfd->fd, proc->path, loop_time); ret = check_dir_exits(proc); if (ret == TD_FAILURE) { log_err("in proc_variation_poll, make_snapshot_dir failed.\n"); goto OUT1; } ready = poll(pfd, 1, -1); if (ready < 0) { log_err("in proc_variation_poll, poll failed.\n"); goto OUT1; } else if ((ready > 0) && ((pfd->revents != 0) && (pfd->revents & POLLIN))) { log_dbg("mention proc changed.\n"); ret = save_proc_file_variation(proc, pfd->fd); if (ret == TD_FAILURE) { log_err("in proc_variation_poll, save_proc_file_variation failed.\n"); goto OUT1; } loop_time = (loop_time + 1) % MAX_TIME_TO_ARCHIVE; } } close(pfd->fd); free(pfd); return (void *)TD_SUCCESS; OUT1: close(pfd->fd); free(pfd); OUT: return (void *)TD_FAILURE; } // 周期落盘主逻辑 static int cycle_archive(proc_node *node, pthread_t *tid) { int ret = TD_FAILURE; if (node == TD_NULL) { log_err("in cycle_archive, node is null ptr.\n"); goto OUT; } ret = check_dir_exits(node); if (ret == TD_FAILURE) { log_err("check_dir_exits failed\n"); goto OUT; } // 创建新线程,执行save_proc_file函数,实现snap落盘 ret = pthread_create(tid, TD_NULL, save_proc_file_cycle, node); if (ret == TD_FAILURE) { log_err("in cycle_archive, pthread_create failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 变化落盘主逻辑 static int variation_archive(proc_node *node, pthread_t *tid) { int ret = TD_FAILURE; if (node == TD_NULL) { log_err("in variation_archive, node is null ptr.\n"); goto OUT; } ret = check_dir_exits(node); if (ret == TD_FAILURE) { log_err("check_dir_exits failed.\n"); goto OUT; } ret = pthread_create(tid, TD_NULL, proc_variation_poll, node); if (ret == TD_FAILURE) { log_err("in variation_archive, pthread_create failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 设定cpu占用率检测时钟 static void *thread_watch_cpu_usage() { int ret = TD_FAILURE; struct sigevent event; ret = memset_s(&event, sizeof(struct sigevent), 0, sizeof(struct sigevent)); if (ret == TD_FAILURE) { log_err("in thread_watch_cpu_usage, memset_s failed\n"); return (void *)TD_FAILURE; } event.sigev_value.sival_ptr = TD_NULL; event.sigev_notify = SIGEV_THREAD; event.sigev_notify_function = check_cpu_usage; ret = init_timer(CPU_TIMER_SEC, CPU_TIMER_SEC, event); if (ret == TD_FAILURE) { log_err("in thread_watch_cpu_usage, init_timer failed\n"); return (void *)TD_FAILURE; } return (void *)TD_SUCCESS; } // cpu占用率检测 static int cpu_monitor() { int ret = TD_FAILURE; pthread_t pid; ret = pthread_create(&pid, NULL, thread_watch_cpu_usage, NULL); if (ret == TD_FAILURE) { log_err("in cpu_monitor, pthread_creat failed\n"); return TD_FAILURE; } pthread_join(pid, NULL); return TD_SUCCESS; } // 落盘逻辑 static int proc_archive(int node_idx, pthread_t *cycle_archive_tid_idx, pthread_t *variation_archive_tid_idx) { int ret = TD_FAILURE; proc_node *node = &g_nodes[node_idx]; if (node_idx >= MAX_LEN || node_idx < 0) { log_err("node_idx is overflow.\n"); goto OUT; } if (node->mode == CYCLE_MODE) { ret = cycle_archive(node, &(g_cycle_archive_tids[*cycle_archive_tid_idx])); // 周期落盘 if (ret == TD_FAILURE) { log_err("in proc_archive, cycle_archive failed.\n"); goto OUT; } *cycle_archive_tid_idx = *cycle_archive_tid_idx + 1; } else if (node->mode == VARIATION_MODE) { ret = variation_archive(node, &(g_variation_archive_tids[*variation_archive_tid_idx])); // 变化落盘 if (ret == TD_FAILURE) { log_err("in proc_archive, cycle_archive failed.\n"); goto OUT; } *variation_archive_tid_idx = *variation_archive_tid_idx + 1; } else { log_err("archive mode is error, please check\n"); } return TD_SUCCESS; OUT: return TD_FAILURE; } static int lock_file(int fd) { struct flock fl; fl.l_type = F_WRLCK; fl.l_start = 0; fl.l_whence = SEEK_SET; fl.l_len = 0; fl.l_pid = -1; return (fcntl(fd, F_SETLK, &fl)); } static int is_running(const char *process_name) { int ret = TD_FAILURE; int fd = -1; char buf[MAX_LEN] = {'\0'}; char file_name[MAX_LEN] = {'\0'}; ret = sprintf_s(file_name, MAX_LEN, "%s/%s/%s.pid", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH, process_name); if (ret == TD_FAILURE) { log_err("in is_running, sprintf_s failed\n"); goto OUT; } fd = open(file_name, O_CREAT | O_RDWR, FILE_PERM); if (fd == TD_FAILURE) { log_err("in is_running, open %s failed\n", file_name); goto OUT; } ret = lock_file(fd); if (ret == TD_FAILURE) { write_log("[Start Error] procdameon is already running, single instance mode refuse start another procdaemon"); goto OUT1; } ftruncate(fd, 0); ret = sprintf_s(buf, MAX_LEN, "%ld\n", (long)getpid()); if (ret == TD_FAILURE) { log_err("in is_running, sprintf_s failed\n"); goto OUT1; } write(fd, buf, MAX_LEN); close(fd); return TD_SUCCESS; OUT1: close(fd); OUT: return TD_FAILURE; } static int start_log() { char buf[MAX_LEN] = {'\0'}; int ret = TD_FAILURE; ret = sprintf_s(buf, MAX_LEN, "[Start] procdaemon"); if (ret == TD_FAILURE) { log_err("in start_log, sprintf_s failed\n"); return TD_FAILURE; } write_log(buf); return TD_SUCCESS; } // 初始化 static int procd_preparation() { // 创建procsnap落盘根目录,/data/vendor/procsnap if (make_snapshot_root_dir() == TD_FAILURE) { log_err("in procd_preparation, make_snapshot_root_dir failed\n"); goto OUT; } // 创建procsnap日志文件夹,/data/vendor/procsnap/procd_log if (make_log_dir() == TD_FAILURE) { log_err("in procd_preparation make_log_dir failed\n"); goto OUT; } // 等待系统时钟初始化完成 while (is_system_timer_init() == TD_FAILURE) { sleep(1); } // 写入启动日志 if (start_log() == TD_FAILURE) { log_err("in main, start_log failed\n"); goto OUT; } // 检查进程单例 if (is_running(PROCESS_NAME) == TD_FAILURE) { log_info("procd is running...\n"); goto OUT; } // 检测cpu占用率 if (cpu_monitor() == TD_FAILURE) { log_err("in main, cpu_monitor failed\n"); goto OUT; } return TD_SUCCESS; OUT: return TD_FAILURE; } // 后续工作,等待线程结束并回收资源 static int procd_clean() { int i; int ret = TD_FAILURE; log_info("start procd_clean\n"); // 回收线程的tid for (i = 0; i < MAX_LEN; i++) { if (g_cycle_archive_tids[i] != -1) { ret = pthread_join(g_cycle_archive_tids[i], TD_NULL); if (ret == TD_FAILURE) { log_err("pthread_join failed for tid=%ld\n", g_cycle_archive_tids[i]); goto OUT; } } if (g_variation_archive_tids[i] != -1) { ret = pthread_join(g_variation_archive_tids[i], TD_NULL); if (ret == TD_FAILURE) { log_err("pthread_join failed for tid=%ld\n", g_variation_archive_tids[i]); goto OUT; } } } return TD_SUCCESS; OUT: return TD_FAILURE; } int main() { int ret = TD_FAILURE; // 预备动作 ret = procd_preparation(); if (ret == TD_FAILURE) { log_err("procd_preparation failed.\n"); goto OUT; } // 主逻辑 ret = procd_main(CONF_PATH); if (ret == TD_FAILURE) { log_err("procd_main failed.\n"); goto OUT; } // 收尾动作 ret = procd_clean(); if (ret == TD_FAILURE) { log_err("procd_clean failed.\n"); goto OUT; } return 0; OUT: return TD_FAILURE; }