You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1883 lines
54 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*
* Copyright (c) Hisilicon Technologies Co., Ltd.. 2021-2021. All rights reserved.
* Description: part [procdaemon] for archive proc informations.
* Create: 2022-09-20
*/
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <pthread.h>
#include <assert.h>
#include <time.h>
#include <string.h>
#include <dirent.h>
#include <poll.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/select.h>
#include <sys/inotify.h>
#include <sys/prctl.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <securec.h>
#include "dft_event.h"
#include "proc_node.h"
#include "procd_base.h"
// Upper bound on snapshot files kept per proc directory; clean_snapshot_dir
// compares it against the directory's file count. Holds -1 until the
// configuration parser assigns it.
static int g_max_rotation_count = -1;
// Size threshold for one snapshot file; get_target_file compares it against
// get_file_size(), which reports KB. Holds -1 until the parser assigns it.
static int g_max_rotation_size = -1;
// CPU usage threshold; check_cpu_usage reports a fault when the measured
// usage is greater than this value.
static float g_max_cpu_utility = 0.0f;
// NOTE: "= {-1}" sets only element 0 to -1; every other element is zero.
// Presumably the thread ids of cycle-mode archive threads (not used in the
// part of the file visible here — confirm against proc_archive).
static pthread_t g_cycle_archive_tids[MAX_LEN] = {-1};
// Presumably the thread ids of variation-mode archive threads; same
// initializer caveat as above.
static pthread_t g_variation_archive_tids[MAX_LEN] = {-1};
// Parsed <snapshot> entries, indexed by the node index used while parsing.
static proc_node g_nodes[MAX_LEN];
// Argument handed to the timeout timer callback (fault_report_timeout).
typedef struct timer_val {
// Points at the archive thread's "last activity" timestamp; the callback
// compares the current time against the value it points to.
long *timestamp;
// The proc entry being archived; the callback reads its path and time.
proc_node *proc;
} timer_val;
// Defined outside the part of the file visible here; called from
// parse_list_xml_node once a node's path and properties have been parsed.
static int proc_archive(int node_idx, pthread_t *g_cycle_archive_tid, pthread_t *g_variation_archive_tid);
// Name the calling thread after the last component of proc_path.
// The name keeps the leading '/' of that component (e.g. "/proc/msp/klad"
// gives "/klad"), as before. A path without any '/' is used as the name
// as-is, and a NULL path leaves the thread name untouched.
static inline void set_thread_name_from_proc_path(char *proc_path)
{
    const char *thread_name;

    if (proc_path == NULL) {
        return;
    }
    thread_name = strrchr(proc_path, '/');
    if (thread_name == NULL) {
        // No separator: never hand a NULL pointer to prctl.
        thread_name = proc_path;
    }
    prctl(PR_SET_NAME, thread_name);
}
// Format the current local time as the separator line written before each
// archived snapshot: ">>> YYYY-MM-DD hh:mm:ss\n".
// timestamp: output buffer of at least MAX_LEN bytes.
// Returns TD_SUCCESS, or TD_FAILURE when the buffer is NULL or the time
// cannot be converted or formatted.
static int get_timestamp_for_archive_content(char *timestamp)
{
    time_t now;
    struct tm local_tm;
    int ret;

    if (timestamp == TD_NULL) {
        log_err("in get_timestamp_for_archive_content, timestamp is null ptr.\n");
        return TD_FAILURE;
    }
    (void)time(&now);
    // localtime() returns a pointer to one buffer shared by every thread and
    // may return NULL; this file runs several archive threads, so use the
    // reentrant variant and check its result.
    if (localtime_r(&now, &local_tm) == TD_NULL) {
        log_err("in get_timestamp_for_archive_content, localtime_r failed\n");
        return TD_FAILURE;
    }
    ret = sprintf_s(timestamp, MAX_LEN, ">>> %04d-%02d-%02d %02d:%02d:%02d\n",
        local_tm.tm_year + START_YEAR, local_tm.tm_mon + START_MONTH, local_tm.tm_mday,
        local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
    if (ret < 0) {
        log_err("in get_timestamp_for_archive_content, sprintf_s failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Current wall-clock time folded into one long, used by the timeout timer:
// the seconds part is scaled by SEC_TO_NSEC and the microseconds part is
// added unscaled.
// NOTE(review): the macro name suggests nanoseconds while tv_usec is in
// microseconds — confirm the intended unit.
static inline long get_timestamp_for_timer()
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_usec + now.tv_sec * SEC_TO_NSEC;
}
// Format the current local time as "YYYY-MM-DD hh:mm:ss" for a log entry.
// timestamp: output buffer of at least MAX_LEN bytes.
// Returns TD_SUCCESS, or TD_FAILURE when the buffer is NULL or the time
// cannot be converted or formatted.
static int get_timestamp_for_log(char *timestamp)
{
    time_t now;
    struct tm local_tm;
    int ret;

    if (timestamp == TD_NULL) {
        log_err("in get_timestamp_for_log, timestamp is null ptr.\n");
        return TD_FAILURE;
    }
    (void)time(&now);
    // Reentrant conversion: write_log is called from several threads, and
    // localtime() would share one static result buffer between them.
    if (localtime_r(&now, &local_tm) == TD_NULL) {
        log_err("in get_timestamp_for_log, localtime_r failed\n");
        return TD_FAILURE;
    }
    ret = sprintf_s(timestamp, MAX_LEN, "%04d-%02d-%02d %02d:%02d:%02d",
        local_tm.tm_year + START_YEAR, local_tm.tm_mon + START_MONTH, local_tm.tm_mday,
        local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
    if (ret < 0) {
        log_err("in get_timestamp_for_log, sprintf_s failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// 获得指定路径文件的大小单位KB
static inline int get_file_size(char *file_path)
{
struct stat stat_buf;
stat(file_path, &stat_buf);
int size = stat_buf.st_size;
return size / BYTE_KB;
}
// Check whether str begins with prefix.
// Returns TD_SUCCESS when it does, TD_FAILURE when it does not or when
// either argument is NULL. The result is a status code, not a boolean.
static int starts_with(const char *str, const char *prefix)
{
    size_t prefix_len;

    if (str == NULL || prefix == NULL) {
        return TD_FAILURE;
    }
    prefix_len = strlen(prefix);
    return (strncmp(str, prefix, prefix_len) == 0) ? TD_SUCCESS : TD_FAILURE;
}
// Build the archive directory path for a proc entry:
// PROC_SNAP_ROOT_DIR followed by the last component of proc->path including
// its leading '/' (e.g. ".../klad" gives PROC_SNAP_ROOT_DIR "/klad").
// archive_dir_path: output buffer of at least MAX_LEN bytes.
// Returns TD_SUCCESS, or TD_FAILURE on NULL arguments, a path without '/',
// or when the result does not fit.
static int get_proc_archive_path(proc_node *proc, char *archive_dir_path)
{
    const char *proc_name;

    if (proc == TD_NULL || archive_dir_path == TD_NULL) {
        log_err("in get_proc_archive_path, proc is null ptr.\n");
        return TD_FAILURE;
    }
    proc_name = strrchr(proc->path, '/');
    if (proc_name == TD_NULL) {
        log_err("in get_proc_archive_path, proc path has no '/'\n");
        return TD_FAILURE;
    }
    // The securec copy routines return 0 on success and a positive error
    // code on failure, so test against 0 rather than TD_FAILURE.
    if (strcpy_s(archive_dir_path, MAX_LEN, PROC_SNAP_ROOT_DIR) != 0) {
        log_err("in get_proc_archive_path, strcpy_s failed\n");
        return TD_FAILURE;
    }
    if (strcat_s(archive_dir_path, MAX_LEN, proc_name) != 0) {
        log_err("in get_proc_archive_path, strcat_s failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Make sure the snapshot root directory (PROC_SNAP_ROOT_DIR) exists,
// creating it with DIR_PERM when it is missing.
// Returns TD_SUCCESS when the directory exists afterwards, else TD_FAILURE.
static int make_snapshot_root_dir()
{
    DIR *dir = opendir(PROC_SNAP_ROOT_DIR);

    if (dir != TD_NULL) {
        closedir(dir);
        return TD_SUCCESS;
    }
    if (mkdir(PROC_SNAP_ROOT_DIR, DIR_PERM) == 0) {
        return TD_SUCCESS;
    }
    // Several threads run this check; another one may have created the
    // directory between our opendir and mkdir. Probe once more before
    // treating the mkdir failure as an error.
    dir = opendir(PROC_SNAP_ROOT_DIR);
    if (dir != TD_NULL) {
        closedir(dir);
        return TD_SUCCESS;
    }
    log_err("in make_snapshot_root_dir, mkdir failed\n");
    return TD_FAILURE;
}
// Make sure the archive directory of one proc entry exists and record its
// path in proc->archive_path.
// Returns TD_SUCCESS, or TD_FAILURE when the path cannot be built, the
// directory cannot be created or closed, or the path cannot be stored.
static int make_snapshot_dir(proc_node *proc)
{
    char archive_dir_path[MAX_LEN] = {'\0'};
    DIR *dir;

    if (proc == TD_NULL) {
        log_err("in make_snapshot_dir, proc is null ptr.\n");
        return TD_FAILURE;
    }
    if (get_proc_archive_path(proc, archive_dir_path) == TD_FAILURE) {
        log_err("in make_snapshot_dir, get_proc_archive_path failed.\n");
        return TD_FAILURE;
    }
    // Create the directory only when it cannot be opened.
    dir = opendir(archive_dir_path);
    if (dir == TD_NULL) {
        if (mkdir(archive_dir_path, DIR_PERM) == TD_FAILURE) {
            log_err("in make_snapshot_dir, mkdir failed\n");
            return TD_FAILURE;
        }
    } else if (closedir(dir) == TD_FAILURE) {
        log_err("closedir %s failed\n", archive_dir_path);
        return TD_FAILURE;
    }
    // strcpy_s returns 0 on success and a positive error code on failure.
    if (strcpy_s(proc->archive_path, MAX_LEN, archive_dir_path) != 0) {
        log_err("in make_snapshot_dir, strcpy_s failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Make sure the log directory PROC_SNAP_ROOT_DIR/LOG_FILE_PATH exists,
// creating it with DIR_PERM when it is missing.
// Returns TD_SUCCESS when the directory exists afterwards, else TD_FAILURE.
static int make_log_dir()
{
    DIR *dir;
    char path[MAX_LEN] = {'\0'};

    // The formatting result used to be ignored; a failure would have left
    // an empty path for opendir/mkdir.
    if (sprintf_s(path, MAX_LEN, "%s/%s", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH) < 0) {
        log_err("in make_log_dir, sprintf_s failed\n");
        return TD_FAILURE;
    }
    dir = opendir(path);
    if (dir != TD_NULL) {
        if (closedir(dir) == TD_FAILURE) {
            log_err("closedir %s failed\n", path);
            return TD_FAILURE;
        }
        return TD_SUCCESS;
    }
    if (mkdir(path, DIR_PERM) == 0) {
        return TD_SUCCESS;
    }
    // write_log runs on several threads; another one may have created the
    // directory in the meantime. Probe again before reporting failure.
    dir = opendir(path);
    if (dir != TD_NULL) {
        closedir(dir);
        return TD_SUCCESS;
    }
    log_err("in make_log_dir, mkdir failed\n");
    return TD_FAILURE;
}
// Re-create any missing archive directory: the snapshot root, the log
// directory, and — when proc is not NULL — that proc's own directory.
// Returns TD_SUCCESS when every step succeeds, TD_FAILURE at the first
// step that fails.
static int check_dir_exits(proc_node *proc)
{
    // Root directory first; the others live beneath it.
    if (make_snapshot_root_dir() == TD_FAILURE) {
        log_err("in check_dir_exits, make_snapshot_root_dir failed\n");
        return TD_FAILURE;
    }
    // Log directory.
    if (make_log_dir() == TD_FAILURE) {
        log_err("in check_dir_exits, make_log_dir failed\n");
        return TD_FAILURE;
    }
    // Per-proc directory is optional: callers that only log pass NULL.
    if (proc == TD_NULL) {
        return TD_SUCCESS;
    }
    if (make_snapshot_dir(proc) == TD_FAILURE) {
        log_err("in check_dir_exits, make_snapshot_dir failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Append one entry "[time] [pid:N]: text\n" to the daemon log file
// PROC_SNAP_ROOT_DIR/LOG_FILE_PATH/LOG_NAME.log. The file is truncated
// first once get_file_size() reports at least MAX_LOG_FILE_SIZE. Entries
// whose text starts with "[Start]" are preceded by an empty line.
// Returns TD_SUCCESS, or TD_FAILURE on any directory, formatting, open or
// write error.
static int write_log(const char *log_content)
{
    int fd;
    int len;
    int result = TD_FAILURE;
    int flags = O_CREAT | O_WRONLY;
    char log_file_name[BUF_SIZE] = {'\0'};
    char buf[BUF_SIZE] = {'\0'};
    char time_stamp[MAX_LEN] = {'\0'};

    if (log_content == TD_NULL) {
        log_err("in write_log, log_content is null ptr.\n");
        return TD_FAILURE;
    }
    if (check_dir_exits(TD_NULL) == TD_FAILURE) {
        log_err("in write_log, make_log_dir failed\n");
        return TD_FAILURE;
    }
    if (sprintf_s(log_file_name, BUF_SIZE, "%s/%s/%s.log", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH, LOG_NAME) < 0) {
        log_err("in write_log, sprintf_s failed\n");
        return TD_FAILURE;
    }
    flags |= (get_file_size(log_file_name) >= MAX_LOG_FILE_SIZE) ? O_TRUNC : O_APPEND;
    fd = open(log_file_name, flags, FILE_PERM);
    if (fd == TD_FAILURE) {
        log_err("in write_log, open %s failed\n", log_file_name);
        return TD_FAILURE;
    }
    if (get_timestamp_for_log(time_stamp) == TD_FAILURE) {
        log_err("in write_log, get_timestamp_for_log failed\n");
        goto OUT;
    }
    // starts_with() returns a status code, so compare it explicitly: used
    // as a bare truth value, the non-zero TD_FAILURE selected the blank
    // line for the wrong entries.
    len = sprintf_s(buf, BUF_SIZE, "%s[%s] [pid:%d]: %s\n",
        starts_with(log_content, "[Start]") == TD_SUCCESS ? "\n" : "", time_stamp, gettid(), log_content);
    if (len < 0) {
        log_err("sprintf_s failed\n");
        goto OUT;
    }
    // Write only the formatted text; writing BUF_SIZE bytes appended the
    // buffer's NUL padding to the log after every entry.
    if (write(fd, buf, (size_t)len) == TD_FAILURE) {
        log_err("in write_log, write failed\n");
        goto OUT;
    }
    result = TD_SUCCESS;
OUT:
    close(fd);
    return result;
}
// Tell whether the system clock has been set: the clock counts as not
// initialized while the current local year still equals SYSTEM_INIT_YEAR.
// Returns TD_SUCCESS when initialized, TD_FAILURE when not (or when the
// current time cannot be converted).
static int is_system_timer_init()
{
    time_t now;
    struct tm local_tm;

    (void)time(&now);
    // Reentrant conversion with a checked result; localtime() may return
    // NULL and shares its buffer between threads.
    if (localtime_r(&now, &local_tm) == TD_NULL) {
        log_err("in is_system_timer_init, localtime_r failed\n");
        return TD_FAILURE;
    }
    if (local_tm.tm_year + START_YEAR == SYSTEM_INIT_YEAR) {
        log_dbg("system timer not init.\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Copy /proc/<pid>/stack of this process into STACK_FILE_PATH (created or
// truncated), for attaching to a fault report.
// Returns TD_SUCCESS after a complete copy, TD_FAILURE on any formatting,
// open, read or write error.
static int get_proc_stack()
{
    char proc_stack_path[MAX_LEN];
    char buf[MAX_LEN];
    int result = TD_FAILURE;
    int from_stack_fd;
    int to_stack_fd;
    int nread;
    int nwrite;
    int written;

    if (sprintf_s(proc_stack_path, MAX_LEN, "/proc/%d/stack", getpid()) < 0) {
        log_err("in get_proc_stack, sprintf_s failed\n");
        return TD_FAILURE;
    }
    from_stack_fd = open(proc_stack_path, O_RDONLY);
    if (from_stack_fd == TD_FAILURE) {
        log_err("in get_proc_stack, open %s failed\n", proc_stack_path);
        return TD_FAILURE;
    }
    to_stack_fd = open(STACK_FILE_PATH, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM);
    if (to_stack_fd == TD_FAILURE) {
        log_err("in get_proc_stack, open %s failed\n", STACK_FILE_PATH);
        goto OUT1;
    }
    while ((nread = read(from_stack_fd, buf, MAX_LEN)) > 0) {
        nwrite = 0;
        do {
            written = write(to_stack_fd, &buf[nwrite], nread - nwrite);
            if (written == TD_FAILURE) {
                log_err("in get_proc_stack, write failed\n");
                goto OUT2;
            }
            // Advance by what was actually written so that a short write is
            // continued; adding nread here dropped the unwritten tail.
            nwrite += written;
        } while (nwrite < nread);
    }
    if (nread == TD_FAILURE) {
        log_err("in get_proc_stack, read failed\n");
        goto OUT2;
    }
    result = TD_SUCCESS;
OUT2:
    close(to_stack_fd);
OUT1:
    close(from_stack_fd);
    return result;
}
// 判断字符串是否为整数
static int is_int_digital(char *num)
{
int len = strlen(num);
int i;
for (i = 0; i < len; i++) {
if ('0' <= num[i] && num[i] <= '9') {
continue;
} else {
return TD_FAILURE;
}
}
return TD_SUCCESS;
}
// 判断字符串是否为浮点数
static int is_float_digital(char *num)
{
int len = strlen(num);
int i;
int dot_num = 0;
for (i = 0; i < len; i++) {
if (num[i] == '.') {
dot_num++;
if (dot_num > 1) {
return TD_FAILURE;
}
} else if ('0' <= num[i] && num[i] <= '9') {
continue;
} else {
return TD_FAILURE;
}
}
return TD_SUCCESS;
}
// Compute this process's CPU usage since the previous call from
// /proc/<pid>/stat: the growth of the two time fields at indexes UTIME_IDX
// and CTIME_IDX, divided by sysconf(_SC_CLK_TCK) * CPU_TIMER_SEC.
// The previous readings are kept in static variables that start at 0.
// usage: receives the result.
// Returns TD_SUCCESS, or TD_FAILURE when usage is NULL or the stat file
// cannot be opened or read.
static int get_cpu_usage(float *usage)
{
    int fd;
    char filename[BUF_SIZE] = {'\0'};
    char buf[BUF_SIZE] = {'\0'};
    ssize_t nread;
    char *data;
    int user_time = 0;
    int kernel_time = 0;
    static int last_user_time = 0;
    static int last_kernel_time = 0;
    int counter = 0;

    if (usage == TD_NULL) {
        log_err("in get_cpu_usage, usage is null ptr.\n");
        return TD_FAILURE;
    }
    if (sprintf_s(filename, BUF_SIZE, "/proc/%d/stat", getpid()) < 0) {
        log_err("in get_cpu_usage, sprintf_s failed\n");
        return TD_FAILURE;
    }
    fd = open(filename, O_RDONLY);
    if (fd == TD_FAILURE) {
        log_err("in get_cpu_usage, open %s failed\n", filename);
        return TD_FAILURE;
    }
    // Leave room for the terminator: a full-size read left buf without a
    // NUL for strtok to stop at.
    nread = read(fd, buf, BUF_SIZE - 1);
    close(fd);
    if (nread <= 0) {
        log_err("in get_cpu_usage, read %s failed\n", filename);
        return TD_FAILURE;
    }
    buf[nread] = '\0';
    // Walk the space-separated fields up to the second time field.
    data = strtok(buf, " ");
    while (data != NULL) {
        if (counter == UTIME_IDX) {
            user_time = atoi(data); // user-mode time
        } else if (counter == CTIME_IDX) {
            kernel_time = atoi(data); // kernel-mode time
            break;
        }
        counter++;
        data = strtok(NULL, " ");
    }
    *usage = 1.0 * (user_time - last_user_time + kernel_time - last_kernel_time)
        / (sysconf(_SC_CLK_TCK) * CPU_TIMER_SEC);
    last_user_time = user_time;
    last_kernel_time = kernel_time;
    return TD_SUCCESS;
}
// Report a CPU-overuse fault: dump this process's stack, send a
// FAULT_NO_OVERUSE_CPU event carrying the usage value, and note it in the
// daemon log.
// usage: the measured CPU usage, formatted with two decimals.
// Returns TD_SUCCESS when the event was reported, TD_FAILURE otherwise.
static int fault_report_cpu_overuse(float usage)
{
    int ret;
    char usage_str[MAX_LEN] = {'\0'};
    char buf[MAX_LEN] = {'\0'};
    unsigned int handle;

    if (get_proc_stack() == TD_FAILURE) {
        log_err("in fault_report_cpu_overuse, get_proc_stack failed\n");
        return TD_FAILURE;
    }
    if (sprintf_s(usage_str, MAX_LEN, "%.2f", usage) < 0) {
        log_err("in fault_report_cpu_overuse, sprintf_s failed\n");
        return TD_FAILURE;
    }
    log_dbg("fault report for overuse cpu.\n");
    if (dft_event_create(FAULT_NO_OVERUSE_CPU, &handle) == TD_FAILURE) {
        log_err("in fault_report_cpu_overuse, dft_event_create failed\n");
        return TD_FAILURE;
    }
    dft_event_put_string(handle, "PNAME", "procdaemon");
    dft_event_put_string(handle, "F1NAME", "fault_report_cpu_overuse");
    dft_event_put_string(handle, "F2NAME", "thread_watch_cpu_usage");
    dft_event_put_string(handle, "CPU_USAGE", usage_str);
    ret = dft_event_report(handle);
    // Destroy the event whether or not the report succeeded; the failure
    // path used to return with the handle still alive.
    dft_event_destroy(handle);
    if (ret == TD_FAILURE) {
        log_err("in fault_report_cpu_overuse, dft_event_report failed\n");
        return TD_FAILURE;
    }
    if (sprintf_s(buf, MAX_LEN, "[Fault Report] cpu overuse: %.2f", usage) < 0) {
        log_err("in fault_report_cpu_overuse, sprintf_s failed\n");
        return TD_FAILURE;
    }
    write_log(buf);
    return TD_SUCCESS;
}
// Measure the daemon's CPU usage, record it in the log, and raise a fault
// report when it is above g_max_cpu_utility. Errors are logged and end the
// check early; nothing is returned to the caller.
static void check_cpu_usage()
{
    float usage;
    char buf[MAX_LEN] = {'\0'};

    if (get_cpu_usage(&usage) == TD_FAILURE) {
        log_err("in check_cpu_usage, get_cpu_usage failed\n");
        return;
    }
    if (sprintf_s(buf, MAX_LEN, "[CPU Usage] %.2f", usage) == TD_FAILURE) {
        log_err("in check_cpu_usage, sprintf_s failed\n");
        return;
    }
    write_log(buf);
    // Within the limit: nothing to report.
    if (!(usage > g_max_cpu_utility)) {
        return;
    }
    if (fault_report_cpu_overuse(usage) == TD_FAILURE) {
        log_err("in check_cpu_usage, fault_report_cpu_overuse failed\n");
    }
}
// Build the full path of a new snapshot file named after the current local
// time: <proc->archive_path>/YYYYMMDDhhmmss.txt.
// snapshot_path: output buffer of at least MAX_LEN bytes.
// Returns TD_SUCCESS, or TD_FAILURE on NULL arguments, a failed time
// conversion, or when the path does not fit.
static int get_snapshot_path_from_system_time(proc_node *proc, char *snapshot_path)
{
    time_t now;
    struct tm local_tm;
    int ret;

    if (proc == TD_NULL || snapshot_path == TD_NULL) {
        log_err("in get_snapshot_path_from_system_time, proc is null ptr.\n");
        return TD_FAILURE;
    }
    (void)time(&now);
    // Reentrant conversion: each archive thread calls this, and localtime()
    // would share one static buffer between them (and may return NULL).
    if (localtime_r(&now, &local_tm) == TD_NULL) {
        log_err("in get_snapshot_path_from_system_time, localtime_r failed\n");
        return TD_FAILURE;
    }
    // One bounded format call replaces the copy/append sequence, whose
    // securec results were compared against a value they never return.
    ret = sprintf_s(snapshot_path, MAX_LEN, "%s/%04d%02d%02d%02d%02d%02d.txt", proc->archive_path,
        local_tm.tm_year + START_YEAR, local_tm.tm_mon + START_MONTH, local_tm.tm_mday,
        local_tm.tm_hour, local_tm.tm_min, local_tm.tm_sec);
    if (ret < 0) {
        log_err("in get_snapshot_path_from_system_time, sprintf_s failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Count the regular files (d_type == DT_REG) directly inside dir_path.
// file_num: receives the count on success.
// Returns TD_SUCCESS, or TD_FAILURE when the directory cannot be opened.
static int get_dir_files_num(char *dir_path, int *file_num)
{
    int regular_files = 0;
    struct dirent *entry;
    DIR *handle = opendir(dir_path);

    if (handle == TD_NULL) {
        log_err("in get_dir_files_num, opendir failed\n");
        return TD_FAILURE;
    }
    for (entry = readdir(handle); entry != NULL; entry = readdir(handle)) {
        int is_dot_entry = (strcmp(entry->d_name, ".") == 0) || (strcmp(entry->d_name, "..") == 0);

        if (!is_dot_entry && entry->d_type == DT_REG) {
            regular_files++;
        }
    }
    closedir(handle);
    *file_num = regular_files;
    return TD_SUCCESS;
}
// Copy the names of the regular files in dir_path into file_names, one per
// row, in readdir order.
// NOTE(review): the number of rows is not passed in; the caller sizes the
// array from an earlier count, so files added to the directory in between
// would write past the end — confirm callers can tolerate this.
// Returns TD_SUCCESS, or TD_FAILURE when the directory cannot be opened or
// closed, or a name cannot be copied.
static int get_file_names(char *dir_path, char file_names[][MAX_LEN])
{
    DIR *dir;
    struct dirent *ptr;
    int total = 0;

    dir = opendir(dir_path);
    if (dir == TD_NULL) {
        log_err("in get_file_names, opendir failed\n");
        return TD_FAILURE;
    }
    while ((ptr = readdir(dir)) != NULL) {
        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
            continue;
        }
        if (ptr->d_type != DT_REG) {
            continue;
        }
        // strcpy_s returns 0 on success and a positive error code on failure.
        if (strcpy_s(file_names[total++], MAX_LEN, ptr->d_name) != 0) {
            log_err("in get_file_names, strcpy_s failed\n");
            // Do not leak the directory stream on the error path.
            closedir(dir);
            return TD_FAILURE;
        }
    }
    if (closedir(dir) == TD_FAILURE) {
        log_err("in get_file_names, closedir failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Raise target_file_name to the greatest name (by strcmp) among the first
// file_num rows of file_names. target_file_name must already hold a
// candidate name and have room for MAX_LEN bytes.
// Returns TD_SUCCESS, or TD_FAILURE when a name cannot be copied.
static int get_max_file_name(int file_num, char file_names[][MAX_LEN], char *target_file_name)
{
    int i;

    for (i = 0; i < file_num; i++) {
        if (strcmp(file_names[i], target_file_name) <= 0) {
            continue;
        }
        // strcpy_s returns 0 on success and a positive error code on
        // failure, so the old comparison against TD_FAILURE never fired.
        if (strcpy_s(target_file_name, MAX_LEN, file_names[i]) != 0) {
            log_err("in get_max_file_name, strcpy_s failed\n");
            return TD_FAILURE;
        }
    }
    return TD_SUCCESS;
}
// Lower target_file_name to the smallest name (by strcmp) among the first
// file_num rows of file_names. target_file_name must already hold a
// candidate name and have room for MAX_LEN bytes.
// Returns TD_SUCCESS, or TD_FAILURE when a name cannot be copied.
static int get_min_file_name(int file_num, char file_names[][MAX_LEN], char *target_file_name)
{
    int i;

    for (i = 0; i < file_num; i++) {
        if (strcmp(file_names[i], target_file_name) >= 0) {
            continue;
        }
        // strcpy_s returns 0 on success and a positive error code on
        // failure, so the old comparison against TD_FAILURE never fired.
        if (strcpy_s(target_file_name, MAX_LEN, file_names[i]) != 0) {
            log_err("in get_min_file_name, strcpy_s failed\n");
            return TD_FAILURE;
        }
    }
    return TD_SUCCESS;
}
// 获得指定文件夹中文件名最大或最小的文件get_max==0时获得最小文件get_max==1时获得最大文件
static int get_min_or_max_file_name(char *dir_path, int file_num, int get_max, char *target_file_path)
{
char file_names[file_num][MAX_LEN];
int ret = TD_FAILURE;
char target_file_name[MAX_LEN] = {'\0'};
ret = get_file_names(dir_path, file_names);
if (ret == TD_FAILURE) {
log_err("in get_min_or_max_file_name, get_file_names failed\n");
goto OUT;
}
ret = strcpy_s(target_file_name, MAX_LEN, file_names[0]);
if (ret == TD_FAILURE) {
log_err("in get_min_or_max_file_name, strcpy_s failed\n");
goto OUT;
}
if (get_max == 1) {
ret = get_max_file_name(file_num, file_names, target_file_name);
if (ret == TD_FAILURE) {
log_err("in get_min_or_max_file_name, get_max_file_name failed\n");
goto OUT;
}
} else if (get_max == 0) {
ret = get_min_file_name(file_num, file_names, target_file_name);
if (ret == TD_FAILURE) {
log_err("in get_min_or_max_file_name, get_min_file_name failed\n");
goto OUT;
}
}
ret = snprintf_s(target_file_path, MAX_LEN, MAX_LEN - 1, "%s/%s", dir_path, target_file_name);
if (ret == TD_FAILURE) {
log_err("snprintf_s failed\n");
goto OUT;
}
return TD_SUCCESS;
OUT:
return TD_FAILURE;
}
// Trim a proc's archive directory: while it holds more regular files than
// g_max_rotation_count, delete the one whose name sorts lowest and log the
// removal.
// Returns TD_SUCCESS once the count is within the limit (or the directory
// is empty), TD_FAILURE on any listing, formatting or removal error.
static int clean_snapshot_dir(proc_node *proc)
{
    int file_num;
    char target_file_name[MAX_LEN] = {'\0'};
    char buf[MAX_LEN] = {'\0'};

    if (proc == TD_NULL) {
        log_err("in clean_snapshot_dir, proc is null ptr.\n");
        return TD_FAILURE;
    }
    if (get_dir_files_num(proc->archive_path, &file_num) == TD_FAILURE) {
        log_err("in clean_snapshot_dir, get_dir_files_num failed\n");
        return TD_FAILURE;
    }
    // "file_num > 0" matters while the limit still holds its initial -1:
    // without it the loop went on to look up a file in an empty directory.
    while (file_num > 0 && file_num > g_max_rotation_count) {
        if (get_min_or_max_file_name(proc->archive_path, file_num, 0, target_file_name) == TD_FAILURE) {
            log_err("in clean_snapshot_dir, get_min_or_max_file_name failed\n");
            return TD_FAILURE;
        }
        if (sprintf_s(buf, MAX_LEN, "[Remove] %s", target_file_name) < 0) {
            log_err("in clean_snapshot_dir, sprintf_s failed\n");
            return TD_FAILURE;
        }
        write_log(buf);
        if (remove(target_file_name) != 0) {
            log_err("in clean_snapshot_dir, remove failed\n");
            return TD_FAILURE;
        }
        file_num--;
    }
    return TD_SUCCESS;
}
// Open the file the next snapshot goes into and return its descriptor.
// The highest-named file in proc->archive_path is looked up into
// target_file_name. While it is smaller than g_max_rotation_size it is
// opened for append; otherwise a new time-named file is created (its path
// is written to snapshot_path), the creation is logged and the directory
// is trimmed.
// Returns the open descriptor, or TD_FAILURE with nothing left open.
static int get_target_file(int file_num, char *target_file_name, proc_node *proc, char *snapshot_path)
{
    char log_line[MAX_LEN] = {'\0'};
    int out_fd = -1;

    if (proc == TD_NULL) {
        log_err("in get_target_file, proc is null ptr.\n");
        return TD_FAILURE;
    }
    // Newest existing snapshot file.
    if (get_min_or_max_file_name(proc->archive_path, file_num, 1, target_file_name) == TD_FAILURE) {
        log_err("in get_target_file, get_min_or_max_file_name failed\n");
        return TD_FAILURE;
    }

    // Still below the threshold: keep appending to it.
    if (!(get_file_size(target_file_name) >= g_max_rotation_size)) {
        out_fd = open(target_file_name, O_WRONLY | O_APPEND);
        if (out_fd == -1) {
            log_err("in get_target_file, open %s failed\n", target_file_name);
            return TD_FAILURE;
        }
        return out_fd;
    }

    // Threshold reached: start a new file.
    if (get_snapshot_path_from_system_time(proc, snapshot_path) == TD_FAILURE) {
        log_err("in get_target_file, get_snapshot_path_from_system_time\n");
        return TD_FAILURE;
    }
    out_fd = open(snapshot_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM);
    if (out_fd == -1) {
        log_err("in get_target_file, open %s failed\n", snapshot_path);
        return TD_FAILURE;
    }
    if (sprintf_s(log_line, MAX_LEN, "[Create] %s", snapshot_path) == TD_FAILURE) {
        log_err("in get_target_file, sprintf_s failed\n");
        close(out_fd);
        return TD_FAILURE;
    }
    if (write_log(log_line) == TD_FAILURE) {
        log_err("write_log failed\n");
        close(out_fd);
        return TD_FAILURE;
    }
    if (clean_snapshot_dir(proc) == TD_FAILURE) {
        log_err("in get_target_file, clean_snapshot_dir failed\n");
        close(out_fd);
        return TD_FAILURE;
    }
    return out_fd;
}
// Create a new, time-named snapshot file for proc, log its creation and
// return its descriptor. The file's path is written to snapshot_path.
// Returns the open descriptor, or TD_FAILURE with nothing left open.
static int create_target_file(char *snapshot_path, proc_node *proc)
{
    char log_line[MAX_LEN] = {'\0'};
    int new_fd;

    if (get_snapshot_path_from_system_time(proc, snapshot_path) == TD_FAILURE) {
        log_err("in create_target_file, get_snapshot_path_from_system_time failed\n");
        return TD_FAILURE;
    }
    new_fd = open(snapshot_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM);
    if (new_fd == -1) {
        log_err("in create_target_file, open %s failed\n", snapshot_path);
        return TD_FAILURE;
    }
    if (sprintf_s(log_line, MAX_LEN, "[Create] %s", snapshot_path) == TD_FAILURE) {
        log_err("in create_target_file, sprintf_s failed\n");
        close(new_fd);
        return TD_FAILURE;
    }
    if (write_log(log_line) == TD_FAILURE) {
        log_err("write_log failed");
        close(new_fd);
        return TD_FAILURE;
    }
    return new_fd;
}
// Open the file that the next snapshot of proc is written to and return
// its descriptor. The archive directory is trimmed to the rotation limit
// first; then the newest existing file is reused (or rotated), or a first
// file is created when the directory is empty.
// snapshot_path: receives the path of a newly created file, if one is made.
// Returns the open descriptor, or TD_FAILURE with nothing left open.
static int open_or_remove_snapshot_file(char *snapshot_path, proc_node *proc)
{
    int file_num;
    // Sized MAX_LEN because the lookup this buffer is handed to formats up
    // to MAX_LEN bytes into it; it was declared with BUF_SIZE before.
    char target_file_name[MAX_LEN] = {'\0'};
    int fd;

    if (proc == TD_NULL) {
        log_err("in open_or_remove_snapshot_file, proc is null ptr.\n");
        return TD_FAILURE;
    }
    // Keep the number of files in the directory within the limit.
    if (clean_snapshot_dir(proc) == TD_FAILURE) {
        log_err("in open_or_remove_snapshot_file, clean_snapshot_dir failed\n");
        return TD_FAILURE;
    }
    if (get_dir_files_num(proc->archive_path, &file_num) == TD_FAILURE) {
        log_err("in open_or_remove_snapshot_file, get_dir_files_num failed\n");
        return TD_FAILURE;
    }
    if (file_num > 0) {
        // Files exist: continue with the highest-named one.
        fd = get_target_file(file_num, target_file_name, proc, snapshot_path);
        if (fd == -1) {
            log_err("in open_or_remove_snapshot_file, get_target_file failed\n");
            return TD_FAILURE;
        }
    } else {
        // Empty directory: create the first file.
        fd = create_target_file(snapshot_path, proc);
        if (fd == -1) {
            log_err("in open_or_remove_snapshot_file, create_target_file failed\n");
            return TD_FAILURE;
        }
    }
    return fd;
}
// Timer callback: report a fault when archiving a proc has stalled, i.e.
// when more than OVERTIME_CO * <proc cycle time> seconds have passed since
// the timestamp the archive thread last stored. On a timeout it dumps the
// process stack, sends a FAULT_NO_TIMEOUT event and writes a log entry.
// val.sival_ptr must point to a timer_val.
// NOTE(review): has_report is thread-local. If the C library delivers each
// SIGEV_THREAD notification on a fresh thread, the flag does not survive
// between expirations and the report repeats — confirm on the target libc.
static void fault_report_timeout(union sigval val)
{
    long cur_timestamp = get_timestamp_for_timer();
    static __thread int has_report = 0;
    timer_val *tvalue = (timer_val *)(val.sival_ptr);
    int ret;
    unsigned int handle;
    int timeout;
    char buf[MAX_LEN] = {'\0'};

    if (tvalue == TD_NULL || tvalue->proc == TD_NULL) {
        log_err("in fault_report_timeout, tvalue or tvalue->proc is null ptr.\n");
        return;
    }
    if (tvalue->timestamp == TD_NULL) {
        log_err("in fault_report_timeout, tvalue->timestamp is null ptr.\n");
        return;
    }
    timeout = OVERTIME_CO * atoi(tvalue->proc->time);
    // Widen before scaling so the product cannot overflow an int.
    if (has_report != 0 || cur_timestamp - *(tvalue->timestamp) <= (long)timeout * SEC_TO_NSEC) {
        return;
    }
    if (get_proc_stack() == TD_FAILURE) {
        log_err("in fault_report_timeout, get_proc_stack failed\n");
        return;
    }
    log_dbg("fault report from %s in %ld\n", tvalue->proc->path, cur_timestamp);
    if (dft_event_create(FAULT_NO_TIMEOUT, &handle) == TD_FAILURE) {
        log_err("in fault_report_timeout, dft_event_create failed\n");
        return;
    }
    dft_event_put_string(handle, "PNAME", "procdaemon");
    dft_event_put_string(handle, "F1NAME", "fault_report_timeout");
    dft_event_put_string(handle, "F2NAME", "save_proc_file_cycle");
    dft_event_put_string(handle, "PROCNAME", tvalue->proc->path);
    ret = dft_event_report(handle);
    // Destroy the event on both outcomes; the failure path used to return
    // with the handle still alive.
    dft_event_destroy(handle);
    if (ret == TD_FAILURE) {
        log_err("in fault_report_timeout, dft_event_report failed\n");
        return;
    }
    has_report = 1;
    if (sprintf_s(buf, MAX_LEN, "[Fault Report] proc %s archive timeout", tvalue->proc->path) < 0) {
        log_err("in fault_report_timeout, sprintf_s failed\n");
        return;
    }
    write_log(buf);
}
// Create a CLOCK_REALTIME timer that notifies as described by event, and
// arm it.
// init_sec: seconds until the first expiration.
// sec: seconds between later expirations.
// Both the initial value and the interval also get TIMER_NSEC nanoseconds.
// Returns TD_SUCCESS, or TD_FAILURE when the timer cannot be created or
// armed. On success the timer id is not kept, so the timer stays armed.
static int init_timer(int init_sec, int sec, struct sigevent event)
{
    timer_t timerid;
    struct itimerspec it;

    if (timer_create(CLOCK_REALTIME, &event, &timerid) == TD_FAILURE) {
        log_err("in init_timer, timer_create failed\n");
        return TD_FAILURE;
    }
    it.it_interval.tv_sec = sec;
    it.it_interval.tv_nsec = TIMER_NSEC;
    it.it_value.tv_sec = init_sec;
    it.it_value.tv_nsec = TIMER_NSEC;
    if (timer_settime(timerid, 0, &it, NULL) == TD_FAILURE) {
        log_err("in init_timer, timer_settime failed\n");
        // The timer was created but never armed; release it.
        (void)timer_delete(timerid);
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Append one snapshot to to_fd: a timestamp separator line, everything
// that can be read from from_fd starting at its current offset, and a
// final newline.
// proc is used only for the paths shown in error messages.
// Returns TD_SUCCESS, or TD_FAILURE on a NULL proc or any timestamp, read
// or write error.
static int write_snapshot(int to_fd, int from_fd, proc_node *proc)
{
    char chunk[MAX_LEN] = {'\0'};
    char header[MAX_LEN] = {'\0'};
    int got;
    int done;
    int rc;

    if (proc == TD_NULL) {
        log_err("in write_snapshot, proc is null ptr.\n");
        return TD_FAILURE;
    }

    // Separator line carrying the time of this snapshot.
    if (get_timestamp_for_archive_content(header) == TD_FAILURE) {
        log_err("in write_snapshot, get_timestamp_for_archive_content failed\n");
        return TD_FAILURE;
    }
    if (write(to_fd, header, strlen(header)) == TD_FAILURE) {
        log_err("write time stamp to file %s failed.\n", proc->archive_path);
        return TD_FAILURE;
    }

    // Proc content, chunk by chunk; each chunk is written until complete.
    for (;;) {
        got = read(from_fd, chunk, sizeof(chunk));
        if (got <= 0) {
            break;
        }
        for (done = 0; done < got; done += rc) {
            rc = write(to_fd, &chunk[done], got - done);
            if (rc == TD_FAILURE) {
                log_err("write proc content to file %s failed.\n", proc->archive_path);
                return TD_FAILURE;
            }
        }
    }
    if (got == TD_FAILURE) {
        log_err("in write_snapshot, read %s failed.\n", proc->path);
        return TD_FAILURE;
    }

    // Trailing newline after the content.
    if (write(to_fd, "\n", 1) == TD_FAILURE) {
        log_err("write \\n failed.\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Start the watchdog timer for one archive thread. *timestamp is set to
// the current time, and a timer is armed whose callback
// (fault_report_timeout) receives a timer_val holding timestamp and proc.
// timestamp and proc must stay valid for as long as the timer is armed.
// Returns TD_SUCCESS, or TD_FAILURE on NULL arguments, allocation failure
// or timer setup failure.
static int init_timer_check_response(long *timestamp, proc_node *proc)
{
    timer_val *val;
    struct sigevent event;

    if (timestamp == TD_NULL || proc == TD_NULL) {
        log_err("in init_timer_check_response, timestamp or proc is null ptr.\n");
        return TD_FAILURE;
    }
    // The callback runs after this function has returned, so its argument
    // must not live on this stack frame; it used to be a local variable.
    val = malloc(sizeof(*val));
    if (val == TD_NULL) {
        log_err("in init_timer_check_response, malloc failed\n");
        return TD_FAILURE;
    }
    *timestamp = get_timestamp_for_timer();
    val->timestamp = timestamp;
    val->proc = proc;
    // memset_s returns 0 on success and a positive error code on failure.
    if (memset_s(&event, sizeof(struct sigevent), 0, sizeof(struct sigevent)) != 0) {
        log_err("in init_timer_check_response, memset_s failed\n");
        free(val);
        return TD_FAILURE;
    }
    event.sigev_value.sival_ptr = val;
    event.sigev_notify = SIGEV_THREAD;
    event.sigev_notify_function = fault_report_timeout;
    if (init_timer(TIMER_SEC, TIMER_NSEC, event) == TD_FAILURE) {
        log_err("in init_timer_check_response, init_timer failed.\n");
        free(val);
        return TD_FAILURE;
    }
    // val is deliberately not freed here: the armed timer keeps using it.
    return TD_SUCCESS;
}
// Open proc_path read-only, retrying every PROC_REOPEN_INTERVAL seconds
// until it succeeds. The first failure and the eventual success are logged
// once each; an immediate success logs nothing.
// Returns the open descriptor; does not return while the open keeps failing.
static int try_open_proc(char *proc_path)
{
    int proc_fd = open(proc_path, O_RDONLY);

    if (proc_fd != -1) {
        return proc_fd;
    }
    log_info("open %s failed, reopen after each %d seconds\n", proc_path, PROC_REOPEN_INTERVAL);
    do {
        sleep(PROC_REOPEN_INTERVAL);
        proc_fd = open(proc_path, O_RDONLY);
    } while (proc_fd == -1);
    log_info("open %s success\n", proc_path);
    return proc_fd;
}
// Refresh the proc descriptor once per MAX_TIME_TO_ARCHIVE archive rounds:
// when loop_time is a multiple of MAX_TIME_TO_ARCHIVE, from_fd is closed
// (unless it is -1) and path is opened again, retrying until that
// succeeds. On every other round from_fd is returned unchanged.
static int try_close_and_open_proc(int from_fd, char *path, int loop_time)
{
    int refresh_due = (loop_time % MAX_TIME_TO_ARCHIVE == 0);

    if (refresh_due) {
        if (from_fd != -1) {
            close(from_fd);
        }
        return try_open_proc(path);
    }
    return from_fd;
}
// 读取proc文件并落盘
static void *save_proc_file_cycle(void *args)
{
int from_fd = -1, to_fd = -1;
long timestamp;
char snapshot_path[MAX_LEN] = {'\0'}; // 落盘文件名
proc_node *proc = (proc_node *)args;
int loop_time = 0;
set_thread_name_from_proc_path(proc->path);
if (init_timer_check_response(&timestamp, proc) == TD_FAILURE) { // 初始化时钟,检测落盘超时
log_err("in save_proc_file_cycle, init_timer_check_response failed.\n");
goto OUT1;
}
// 循环周期性落盘
while (1) {
// 落盘一定次数后先关闭再尝试打开proc打开失败时重复尝试
from_fd = try_close_and_open_proc(from_fd, proc->path, loop_time);
if (check_dir_exits(proc) == TD_FAILURE) {
log_err("in save_proc_file_cycle, make_snapshot_dir failed.\n");
goto OUT;
}
lseek(from_fd, 0, SEEK_SET); // 将文件指针移动到文件开头
// 打开落盘目标文件,返回有效文件描述符
to_fd = open_or_remove_snapshot_file(snapshot_path, proc);
if (to_fd == -1) {
log_err("in save_proc_file_cycle, open_or_remove_snapshot_file failed\n");
goto OUT;
}
timestamp = get_timestamp_for_timer();
if (write_snapshot(to_fd, from_fd, proc) == TD_FAILURE) { // 往目标文件中写入proc内容
log_err("write_snapshot from %s failed\n", proc->path);
goto OUT;
}
loop_time = (loop_time + 1) % MAX_TIME_TO_ARCHIVE;
close(to_fd);
sleep(atoi(proc->time));
}
close(from_fd);
return (void *)TD_SUCCESS;
OUT:
close(from_fd);
OUT1:
return (void *)TD_FAILURE;
}
// Read the "mode" attribute of an XML node into node->mode:
// "cycle" gives CYCLE_MODE, "variation" gives VARIATION_MODE, and anything
// else — including a missing attribute — gives UNDEFINED_MODE.
// Returns TD_SUCCESS, or TD_FAILURE when node is NULL.
static int process_mode(xmlNodePtr prop_node_ptr, proc_node *node)
{
    xmlChar *sz_attr;

    if (node == TD_NULL) {
        log_err("in process_mode, node is null ptr.\n");
        return TD_FAILURE;
    }
    sz_attr = xmlGetProp(prop_node_ptr, BAD_CAST "mode");
    // xmlGetProp returns NULL when the attribute is absent; do not pass
    // that to strcmp.
    if (sz_attr == TD_NULL) {
        node->mode = UNDEFINED_MODE;
        return TD_SUCCESS;
    }
    if (strcmp("cycle", (char *)sz_attr) == 0) {
        node->mode = CYCLE_MODE;
    } else if (strcmp("variation", (char *)sz_attr) == 0) {
        node->mode = VARIATION_MODE;
    } else {
        node->mode = UNDEFINED_MODE;
    }
    xmlFree(sz_attr);
    return TD_SUCCESS;
}
// Read the "time" attribute of an XML node into node->time (kept as text).
// The value must consist of digits with at most one '.'; values whose
// integer part is below 1 are replaced by "1".
// Returns TD_SUCCESS, or TD_FAILURE when node is NULL, the attribute is
// missing or malformed, or the value cannot be stored.
static int process_time(xmlNodePtr prop_node_ptr, proc_node *node)
{
    int result = TD_FAILURE;
    int below_minimum;
    xmlChar *sz_attr;

    if (node == TD_NULL) {
        log_err("in process_time, node is null ptr.\n");
        return TD_FAILURE;
    }
    sz_attr = xmlGetProp(prop_node_ptr, BAD_CAST "time");
    // xmlGetProp returns NULL when the attribute is absent.
    if (sz_attr == TD_NULL) {
        log_err("in process_time, xmlGetProp return null ptr.\n");
        return TD_FAILURE;
    }
    if (is_float_digital((char *)sz_attr) == TD_FAILURE) {
        log_err("in process_time, is_float_digital failed\n");
        goto OUT;
    }
    below_minimum = (atoi((char *)sz_attr) < 1);
    // strcpy_s returns 0 on success and a positive error code on failure.
    if (strcpy_s(node->time, MAX_LEN, below_minimum ? "1" : (char *)sz_attr) != 0) {
        log_err("in process_time, strcpy_s failed\n");
        goto OUT;
    }
    if (below_minimum) {
        log_info("cycle time is less 1 second, auto fix to 1 second.\n");
    }
    result = TD_SUCCESS;
OUT:
    xmlFree(sz_attr);
    return result;
}
// Parse the "mode" and "time" attributes of an XML node into
// g_nodes[node_idx] and validate the combination:
//   - a node without a recognised mode is rejected;
//   - variation mode always ends with time "-1";
//   - cycle mode must carry a time.
// Returns TD_SUCCESS for a valid node, TD_FAILURE otherwise.
static int process_node_properties(xmlNodePtr prop_node_ptr, int node_idx)
{
    xmlAttrPtr attr_ptr;
    proc_node *node;

    if (prop_node_ptr == TD_NULL) {
        log_err("in process_node_properties, prop_node_ptr is null ptr.\n");
        return TD_FAILURE;
    }
    // Validate the index before forming a pointer into g_nodes.
    if (node_idx >= MAX_LEN || node_idx < 0) {
        log_err("in process_node_properties, node_idx is overflow.\n");
        return TD_FAILURE;
    }
    node = &g_nodes[node_idx];
    // Start from "nothing configured": the same slot is parsed again after
    // a rejected node, so leftover values must not satisfy the checks below.
    node->mode = UNDEFINED_MODE;
    if (strcpy_s(node->time, MAX_LEN, "-1") != 0) {
        log_err("in process_node_properties, strcpy_s failed\n");
        return TD_FAILURE;
    }
    for (attr_ptr = prop_node_ptr->properties; attr_ptr != NULL; attr_ptr = attr_ptr->next) {
        if (!xmlStrcmp(attr_ptr->name, BAD_CAST "mode")) {
            if (process_mode(prop_node_ptr, node) == TD_FAILURE) {
                log_err("in process_node_properties, process_mode failed\n");
                return TD_FAILURE;
            }
        } else if (!xmlStrcmp(attr_ptr->name, BAD_CAST "time")) {
            if (process_time(prop_node_ptr, node) == TD_FAILURE) {
                log_err("in process_node_properties, process_time failed\n");
                return TD_FAILURE;
            }
        }
    }
    // Validate once every attribute has been seen. Doing this inside the
    // loop made the outcome depend on attribute order and skipped the
    // checks for a node without attributes.
    if (node->mode == UNDEFINED_MODE) {
        log_err("node does not have 'mode' property.\n");
        return TD_FAILURE;
    }
    if (node->mode == VARIATION_MODE) {
        // Variation mode has no cycle time, whatever the XML said.
        if (strcpy_s(node->time, MAX_LEN, "-1") != 0) {
            log_err("in process_node_properties, strcpy_s failed\n");
            return TD_FAILURE;
        }
    } else if (node->mode == CYCLE_MODE && strcmp(node->time, "-1") == 0) {
        log_err("cycle mode needs to set archive cycle time.\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// Read the text content of an XML node as a proc path and store it in
// g_nodes[node_idx].path. Only paths that start with PROC_PATH_PREFIX are
// accepted.
// Returns TD_SUCCESS, or TD_FAILURE on a NULL node, an out-of-range index,
// missing content, a path outside the prefix, or a path that does not fit.
static int get_safe_path_inner_node(xmlNodePtr cur_node, int node_idx)
{
    xmlChar *sz_key;
    int result = TD_FAILURE;

    if (cur_node == TD_NULL) {
        log_err("in get_safe_path_inner_node, cur_node is null ptr.\n");
        return TD_FAILURE;
    }
    // Validate the index before it is used to address g_nodes.
    if (node_idx >= MAX_LEN || node_idx < 0) {
        log_err("in get_safe_path_inner_node, node_idx is overflow.\n");
        return TD_FAILURE;
    }
    sz_key = xmlNodeGetContent(cur_node);
    if (sz_key == TD_NULL) {
        log_err("in get_safe_path_inner_node, xmlNodeGetContent return null ptr.\n");
        return TD_FAILURE;
    }
    if (starts_with((char *)sz_key, PROC_PATH_PREFIX) == TD_FAILURE) {
        log_info("path not start with %s.\n", PROC_PATH_PREFIX);
        goto OUT;
    }
    // strcpy_s returns 0 on success and a positive error code on failure.
    if (strcpy_s(g_nodes[node_idx].path, MAX_LEN, (char *)sz_key) != 0) {
        log_err("in get_safe_path_inner_node, strcpy_s failed\n");
        goto OUT;
    }
    result = TD_SUCCESS;
OUT:
    xmlFree(sz_key);
    return result;
}
// Walk a chain of sibling XML nodes and start archiving for each valid
// <snapshot> entry. A node is skipped when it is not <snapshot>, when its
// path is rejected, or when its attributes are rejected. Accepted nodes
// fill g_nodes slots in order; at most MAX_LEN slots are used. The result
// of proc_archive is not examined.
static void parse_list_xml_node(xmlNodePtr cur_node)
{
    pthread_t cycle_archive_tid_idx = 0;
    pthread_t variation_archive_tid_idx = 0;
    int node_idx = 0;
    int ret = TD_FAILURE;

    for (; cur_node != TD_NULL && node_idx < MAX_LEN; cur_node = cur_node->next) {
        // 1. Only <snapshot> nodes are of interest.
        if (xmlStrcmp(cur_node->name, (const xmlChar *) "snapshot")) {
            log_err("cur_node is not <snapshot>, skip & process next node.\n");
            continue;
        }
        // 2. The node's content must be an acceptable proc path.
        if (get_safe_path_inner_node(cur_node, node_idx) == TD_FAILURE) {
            log_err("in parse_list_xml_node, get_safe_path_inner_node failed.\n");
            continue;
        }
        // 3. The node's attributes must be valid.
        if (process_node_properties(cur_node, node_idx) == TD_FAILURE) {
            log_err("in parse_list_xml_node, process_node_properties failed\n");
            continue;
        }
        // 4. Start archiving for this entry, then move on to the next slot.
        ret = proc_archive(node_idx, &cycle_archive_tid_idx, &variation_archive_tid_idx);
        node_idx++;
    }
}
// Parse the <max_cpu_utility> node into g_max_cpu_utility.
//
// cur_node: the <max_cpu_utility> element.
//
// The text must pass is_float_digital(). A value outside
// (MIN_MAX_CPU_UTILITY, MAX_MAX_CPU_UTILITY] is replaced by
// DEFAULT_MAX_CPU_UTILITY. Returns TD_SUCCESS when a value (parsed or
// default) was stored, TD_FAILURE when the node has no content or the text is
// not accepted by is_float_digital().
static int parse_common_xml_node_max_cpu_utility(xmlNodePtr cur_node)
{
    int ret = TD_FAILURE;
    xmlChar *sz_key = xmlNodeGetContent(cur_node); // maximum CPU usage, as text

    // xmlNodeGetContent may return NULL; do not hand that to the validator.
    if (sz_key == TD_NULL) {
        log_err("in parse_common_xml_node_max_cpu_utility, xmlNodeGetContent return null ptr.\n");
        return TD_FAILURE;
    }
    ret = is_float_digital((char *)sz_key);
    if (ret == TD_FAILURE) {
        log_err("in parse_common_xml_node_max_cpu_utility, is_float_digital failed\n");
        goto OUT;
    }
    g_max_cpu_utility = (float)strtod((char *)sz_key, NULL);
    if (g_max_cpu_utility > MAX_MAX_CPU_UTILITY || g_max_cpu_utility <= MIN_MAX_CPU_UTILITY) {
        g_max_cpu_utility = DEFAULT_MAX_CPU_UTILITY;
        log_info("got a wrong g_max_cpu_utility, reset to %.1f\n", DEFAULT_MAX_CPU_UTILITY);
    }
    xmlFree(sz_key);
    return TD_SUCCESS;
OUT:
    xmlFree(sz_key);
    return TD_FAILURE;
}
// Parse the <max_rotation_count> node into g_max_rotation_count.
//
// cur_node: the <max_rotation_count> element.
//
// The text must pass is_int_digital(). A value outside
// (MIN_MAX_ROTATION_COUNT, MAX_MAX_ROTATION_COUNT] is replaced by
// DEFAULT_MAX_ROTATION_COUNT. Returns TD_SUCCESS when a value (parsed or
// default) was stored, TD_FAILURE when the node has no content or the text is
// not accepted by is_int_digital().
static int parse_common_xml_node_max_rotation_count(xmlNodePtr cur_node)
{
    int ret = TD_FAILURE;
    long parsed = 0;
    xmlChar *sz_key = xmlNodeGetContent(cur_node); // maximum number of archive files, as text

    // xmlNodeGetContent may return NULL; do not hand that to the validator.
    if (sz_key == TD_NULL) {
        log_err("in parse_common_xml_node_max_rotation_count, xmlNodeGetContent return null ptr.\n");
        return TD_FAILURE;
    }
    ret = is_int_digital((char *)sz_key);
    if (ret == TD_FAILURE) {
        log_err("in parse_common_xml_node_max_rotation_count, is_int_digital failed\n");
        goto OUT;
    }
    // strtol saturates on overflow where atoi would be undefined; the range
    // test is done on the long value before it is narrowed to int.
    parsed = strtol((char *)sz_key, NULL, 10);
    if (parsed <= MIN_MAX_ROTATION_COUNT || parsed > MAX_MAX_ROTATION_COUNT) {
        g_max_rotation_count = DEFAULT_MAX_ROTATION_COUNT;
        log_info("got a wrong g_max_rotation_count, reset to %d\n", DEFAULT_MAX_ROTATION_COUNT);
    } else {
        g_max_rotation_count = (int)parsed;
    }
    xmlFree(sz_key);
    return TD_SUCCESS;
OUT:
    xmlFree(sz_key);
    return TD_FAILURE;
}
// Parse the <max_rotation_size> node into g_max_rotation_size.
//
// cur_node: the <max_rotation_size> element.
//
// The text must pass is_int_digital(). A value outside
// (MIN_MAX_ROTATION_SIZE, MAX_MAX_ROTATION_SIZE] is replaced by
// DEFAULT_MAX_ROTATION_SIZE. Returns TD_SUCCESS when a value (parsed or
// default) was stored, TD_FAILURE when the node has no content or the text is
// not accepted by is_int_digital().
static int parse_common_xml_node_max_rotation_size(xmlNodePtr cur_node)
{
    int ret = TD_FAILURE;
    long parsed = 0;
    xmlChar *sz_key = xmlNodeGetContent(cur_node); // maximum archive file size, as text

    // xmlNodeGetContent may return NULL; do not hand that to the validator.
    if (sz_key == TD_NULL) {
        log_err("in parse_common_xml_node_max_rotation_size, xmlNodeGetContent return null ptr.\n");
        return TD_FAILURE;
    }
    ret = is_int_digital((char *)sz_key);
    if (ret == TD_FAILURE) {
        log_err("in parse_common_xml_node_max_rotation_size, is_int_digital failed\n");
        goto OUT;
    }
    // strtol saturates on overflow where atoi would be undefined; the range
    // test is done on the long value before it is narrowed to int.
    parsed = strtol((char *)sz_key, NULL, 10);
    if (parsed <= MIN_MAX_ROTATION_SIZE || parsed > MAX_MAX_ROTATION_SIZE) {
        g_max_rotation_size = DEFAULT_MAX_ROTATION_SIZE;
        log_info("got a wrong g_max_rotation_size, reset to %d\n", DEFAULT_MAX_ROTATION_SIZE);
    } else {
        g_max_rotation_size = (int)parsed;
    }
    xmlFree(sz_key);
    return TD_SUCCESS;
OUT:
    xmlFree(sz_key);
    return TD_FAILURE;
}
// Parse the children of the <common> element to obtain the global settings.
//
// cur_node: first child of <common> (may be NULL).
//
// Each sibling named max_cpu_utility, max_rotation_count or max_rotation_size
// is handed to its dedicated parser; other siblings are ignored. Returns
// TD_FAILURE as soon as one parser fails, TD_SUCCESS after the whole sibling
// list has been visited.
static int parse_common_xml_node(xmlNodePtr cur_node)
{
    xmlNodePtr child;

    for (child = cur_node; child != NULL; child = child->next) {
        if (xmlStrcmp(child->name, (const xmlChar *) "max_cpu_utility") == 0) {
            if (parse_common_xml_node_max_cpu_utility(child) == TD_FAILURE) {
                log_err("in parse_common_xml_node, parse_common_xml_node_max_cpu_utility failed\n");
                return TD_FAILURE;
            }
            continue; // a node has exactly one name, the other tests cannot match
        }
        if (xmlStrcmp(child->name, (const xmlChar *) "max_rotation_count") == 0) {
            if (parse_common_xml_node_max_rotation_count(child) == TD_FAILURE) {
                log_err("in parse_common_xml_node, parse_common_xml_node_max_rotation_count failed\n");
                return TD_FAILURE;
            }
            continue;
        }
        if (xmlStrcmp(child->name, (const xmlChar *) "max_rotation_size") == 0) {
            if (parse_common_xml_node_max_rotation_size(child) == TD_FAILURE) {
                log_err("in parse_common_xml_node, parse_common_xml_node_max_rotation_size failed\n");
                return TD_FAILURE;
            }
        }
    }
    return TD_SUCCESS;
}
// Parse the XML configuration file and act on its contents.
//
// xml_file_path: path of the configuration file handed to xmlReadFile().
//
// The root element must be <procd>. Every <common> child is parsed for the
// global settings, and every <list> child is walked by parse_list_xml_node(),
// which starts the archive threads while it iterates. Returns TD_SUCCESS after
// all children were visited, TD_FAILURE when the document cannot be read, is
// empty, has an unexpected root, or a <common> section fails to parse.
static int procd_main(char *xml_file_path)
{
    xmlDocPtr doc = TD_NULL;
    xmlNodePtr cur_node = TD_NULL;
    int ret = TD_FAILURE;

    xmlKeepBlanksDefault(0); // drop whitespace-only text nodes
    doc = xmlReadFile(xml_file_path, "UTF-8", XML_PARSE_RECOVER);
    if (doc == TD_NULL) {
        fprintf(stderr, "Document not parsed successfully.\n");
        goto OUT1;
    }
    // Fetch the root element of the document.
    cur_node = xmlDocGetRootElement(doc);
    if (cur_node == TD_NULL) {
        fprintf(stderr, "empty document\n");
        goto OUT;
    }
    // Check that the root element is the expected one. The diagnostic names
    // the element that is actually compared against.
    if (xmlStrcmp(cur_node->name, BAD_CAST "procd")) {
        fprintf(stderr, "document of the wrong type, root node != procd\n");
        goto OUT;
    }
    // Walk the children of the root element.
    for (cur_node = cur_node->xmlChildrenNode; cur_node != TD_NULL; cur_node = cur_node->next) {
        // Children of <common>: global settings.
        if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "common"))) {
            ret = parse_common_xml_node(cur_node->xmlChildrenNode);
            if (ret == TD_FAILURE) {
                log_err("in procd_main, parse_common_xml_node failed\n");
                goto OUT;
            }
        }
        // Children of <list>: archive threads are created during the walk.
        if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "list"))) {
            parse_list_xml_node(cur_node->xmlChildrenNode);
        }
    }
    (void)xmlFreeDoc(doc);
    return TD_SUCCESS;
OUT:
    xmlFreeDoc(doc);
OUT1:
    return TD_FAILURE;
}
// Variation-mode archive: write the current content of a proc file to a
// snapshot file.
//
// proc:    node describing the proc entry being archived.
// from_fd: open descriptor of the proc file; it is rewound to offset 0 first.
//
// open_or_remove_snapshot_file() fills snapshot_path and returns the target
// descriptor (-1 on failure); write_snapshot() is then called with both
// descriptors. Returns TD_SUCCESS after the snapshot file has been closed,
// TD_FAILURE when the buffer cannot be cleared or the snapshot file cannot be
// opened.
static int save_proc_file_variation(proc_node *proc, int from_fd)
{
    int to_fd = -1;
    int ret = TD_FAILURE;
    char snapshot_path[MAX_LEN] = {'\0'};

    // securec's memset_s returns 0 (EOK) on success and a non-zero errno_t on
    // failure, so the result must be compared against 0 rather than TD_FAILURE.
    ret = memset_s(snapshot_path, MAX_LEN, '\0', MAX_LEN);
    if (ret != 0) {
        log_err("in save_proc_file_variation, memset_s failed\n");
        goto OUT;
    }
    // Best effort rewind; the result is deliberately not treated as fatal.
    (void)lseek(from_fd, 0, SEEK_SET);
    to_fd = open_or_remove_snapshot_file(snapshot_path, proc); // open the archive file, returns a valid descriptor
    if (to_fd == -1) {
        log_err("in save_proc_file_variation, open_or_remove_snapshot_file failed\n");
        goto OUT;
    }
    log_dbg("variation archive into %s\n", snapshot_path);
    write_snapshot(to_fd, from_fd, proc);
    close(to_fd);
    return TD_SUCCESS;
OUT:
    return TD_FAILURE;
}
// Variation-mode worker thread: poll a proc file and archive it whenever it
// becomes readable.
//
// arg: the proc_node to watch (passed through pthread_create).
//
// Each iteration lets try_close_and_open_proc() decide whether the proc file
// has to be (re)opened, verifies the snapshot directory through
// check_dir_exits(), then blocks in poll() without timeout. When POLLIN is
// reported the file is archived and the archive counter advances modulo
// MAX_TIME_TO_ARCHIVE. The loop only ends on an error, so the thread always
// returns (void *)TD_FAILURE.
//
// NOTE(review): the descriptor starts at 0 and is passed to
// try_close_and_open_proc() on the first iteration; confirm that helper does
// not close descriptor 0 in that case.
static void *proc_variation_poll(void *arg)
{
    int ret = TD_FAILURE;
    proc_node *proc = (proc_node *)arg;
    // A single pollfd lives on the thread stack; no heap allocation is needed.
    struct pollfd pfd = { .fd = 0, .events = POLLIN, .revents = 0 };
    int ready = 0;
    int loop_time = 0;

    set_thread_name_from_proc_path(proc->path);
    // Keep checking the poll result to detect a trigger.
    while (1) {
        // Once the archive count passes the threshold the helper tries to close
        // and reopen the proc file, retrying when the open fails.
        pfd.fd = try_close_and_open_proc(pfd.fd, proc->path, loop_time);
        ret = check_dir_exits(proc);
        if (ret == TD_FAILURE) {
            log_err("in proc_variation_poll, check_dir_exits failed.\n");
            break;
        }
        ready = poll(&pfd, 1, -1);
        if (ready < 0) {
            log_err("in proc_variation_poll, poll failed.\n");
            break;
        }
        if ((ready > 0) && (pfd.revents & POLLIN)) {
            log_dbg("mention proc changed.\n");
            ret = save_proc_file_variation(proc, pfd.fd);
            if (ret == TD_FAILURE) {
                log_err("in proc_variation_poll, save_proc_file_variation failed.\n");
                break;
            }
            loop_time = (loop_time + 1) % MAX_TIME_TO_ARCHIVE;
        }
    }
    // Reached only through one of the error exits above.
    close(pfd.fd);
    return (void *)TD_FAILURE;
}
// Cycle-mode archive entry: start the periodic archive thread for a node.
//
// node: proc node to archive; must not be null.
// tid:  receives the id of the created thread.
//
// Returns TD_SUCCESS when the thread running save_proc_file_cycle() was
// created, TD_FAILURE when node is null, check_dir_exits() fails, or the
// thread cannot be created.
static int cycle_archive(proc_node *node, pthread_t *tid)
{
    int ret = TD_FAILURE;

    if (node == TD_NULL) {
        log_err("in cycle_archive, node is null ptr.\n");
        goto OUT;
    }
    ret = check_dir_exits(node);
    if (ret == TD_FAILURE) {
        log_err("check_dir_exits failed\n");
        goto OUT;
    }
    // Create a new thread running save_proc_file_cycle to write the snapshots.
    // pthread_create returns 0 on success and a positive error number on
    // failure (never -1), so the result is compared against 0.
    ret = pthread_create(tid, TD_NULL, save_proc_file_cycle, node);
    if (ret != 0) {
        log_err("in cycle_archive, pthread_create failed\n");
        goto OUT;
    }
    return TD_SUCCESS;
OUT:
    return TD_FAILURE;
}
// Variation-mode archive entry: start the poll-driven archive thread for a node.
//
// node: proc node to archive; must not be null.
// tid:  receives the id of the created thread.
//
// Returns TD_SUCCESS when the thread running proc_variation_poll() was
// created, TD_FAILURE when node is null, check_dir_exits() fails, or the
// thread cannot be created.
static int variation_archive(proc_node *node, pthread_t *tid)
{
    int ret = TD_FAILURE;

    if (node == TD_NULL) {
        log_err("in variation_archive, node is null ptr.\n");
        goto OUT;
    }
    ret = check_dir_exits(node);
    if (ret == TD_FAILURE) {
        log_err("check_dir_exits failed.\n");
        goto OUT;
    }
    // pthread_create returns 0 on success and a positive error number on
    // failure (never -1), so the result is compared against 0.
    ret = pthread_create(tid, TD_NULL, proc_variation_poll, node);
    if (ret != 0) {
        log_err("in variation_archive, pthread_create failed\n");
        goto OUT;
    }
    return TD_SUCCESS;
OUT:
    return TD_FAILURE;
}
// Thread body that arms the CPU-usage check timer.
//
// Builds a zeroed sigevent requesting SIGEV_THREAD notification with
// check_cpu_usage as the callback and hands it to init_timer() with
// CPU_TIMER_SEC for both time arguments (presumably initial delay and
// interval; verify against init_timer). Returns (void *)TD_SUCCESS when the
// timer was set up, (void *)TD_FAILURE otherwise.
//
// NOTE(review): this function is used as a pthread start routine but is
// declared without a parameter list; consider giving it a `void *` parameter
// once all call sites are confirmed.
static void *thread_watch_cpu_usage()
{
    int ret = TD_FAILURE;
    struct sigevent event;

    // securec's memset_s returns 0 (EOK) on success and a non-zero errno_t on
    // failure, so the result must be compared against 0 rather than TD_FAILURE.
    ret = memset_s(&event, sizeof(struct sigevent), 0, sizeof(struct sigevent));
    if (ret != 0) {
        log_err("in thread_watch_cpu_usage, memset_s failed\n");
        return (void *)TD_FAILURE;
    }
    event.sigev_value.sival_ptr = TD_NULL;
    event.sigev_notify = SIGEV_THREAD;
    event.sigev_notify_function = check_cpu_usage;
    ret = init_timer(CPU_TIMER_SEC, CPU_TIMER_SEC, event);
    if (ret == TD_FAILURE) {
        log_err("in thread_watch_cpu_usage, init_timer failed\n");
        return (void *)TD_FAILURE;
    }
    return (void *)TD_SUCCESS;
}
// Start CPU-usage monitoring.
//
// Runs thread_watch_cpu_usage() in a helper thread and waits for it to
// finish. Returns TD_SUCCESS only when the thread was created, joined, and
// reported (void *)TD_SUCCESS; otherwise TD_FAILURE.
static int cpu_monitor()
{
    int ret = TD_FAILURE;
    pthread_t tid;
    void *thread_ret = NULL;

    // pthread_create / pthread_join return 0 on success and a positive error
    // number on failure (never -1), so their results are compared against 0.
    ret = pthread_create(&tid, NULL, thread_watch_cpu_usage, NULL);
    if (ret != 0) {
        log_err("in cpu_monitor, pthread_creat failed\n");
        return TD_FAILURE;
    }
    ret = pthread_join(tid, &thread_ret);
    if (ret != 0) {
        log_err("in cpu_monitor, pthread_join failed\n");
        return TD_FAILURE;
    }
    // Propagate the status the thread computed instead of discarding it.
    if (thread_ret != (void *)TD_SUCCESS) {
        log_err("in cpu_monitor, thread_watch_cpu_usage failed\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
// 落盘逻辑
static int proc_archive(int node_idx, pthread_t *cycle_archive_tid_idx, pthread_t *variation_archive_tid_idx)
{
int ret = TD_FAILURE;
proc_node *node = &g_nodes[node_idx];
if (node_idx >= MAX_LEN || node_idx < 0) {
log_err("node_idx is overflow.\n");
goto OUT;
}
if (node->mode == CYCLE_MODE) {
ret = cycle_archive(node, &(g_cycle_archive_tids[*cycle_archive_tid_idx])); // 周期落盘
if (ret == TD_FAILURE) {
log_err("in proc_archive, cycle_archive failed.\n");
goto OUT;
}
*cycle_archive_tid_idx = *cycle_archive_tid_idx + 1;
} else if (node->mode == VARIATION_MODE) {
ret = variation_archive(node, &(g_variation_archive_tids[*variation_archive_tid_idx])); // 变化落盘
if (ret == TD_FAILURE) {
log_err("in proc_archive, cycle_archive failed.\n");
goto OUT;
}
*variation_archive_tid_idx = *variation_archive_tid_idx + 1;
} else {
log_err("archive mode is error, please check\n");
}
return TD_SUCCESS;
OUT:
return TD_FAILURE;
}
// Request a non-blocking write lock covering the whole file behind fd.
//
// fd: open file descriptor to lock.
//
// The lock starts at offset 0 relative to SEEK_SET with length 0 and is
// requested with F_SETLK. Returns the fcntl() result unchanged.
static int lock_file(int fd)
{
    struct flock whole_file_lock = {
        .l_type = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start = 0,
        .l_len = 0,
        .l_pid = -1,
    };

    return fcntl(fd, F_SETLK, &whole_file_lock);
}
// Single-instance guard based on a locked pid file.
//
// process_name: base name used for the pid file
//               "<PROC_SNAP_ROOT_DIR>/<LOG_FILE_PATH>/<process_name>.pid".
//
// Opens (creating if needed) the pid file, takes a write lock on it through
// lock_file(), and records the current pid in it. Returns TD_SUCCESS when this
// process obtained the lock; TD_FAILURE when the path cannot be built, the
// file cannot be opened, the lock is refused, or the pid cannot be formatted.
//
// On success the descriptor is intentionally left open for the lifetime of
// the process: fcntl record locks are dropped as soon as the owning process
// closes any descriptor for the file, so closing it here would release the
// lock immediately and let a second instance start.
static int is_running(const char *process_name)
{
    int ret = TD_FAILURE;
    int fd = -1;
    char buf[MAX_LEN] = {'\0'};
    char file_name[MAX_LEN] = {'\0'};

    ret = sprintf_s(file_name, MAX_LEN, "%s/%s/%s.pid", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH, process_name);
    if (ret < 0) {
        log_err("in is_running, sprintf_s failed\n");
        goto OUT;
    }
    fd = open(file_name, O_CREAT | O_RDWR, FILE_PERM);
    if (fd == TD_FAILURE) {
        log_err("in is_running, open %s failed\n", file_name);
        goto OUT;
    }
    ret = lock_file(fd);
    if (ret == TD_FAILURE) {
        write_log("[Start Error] procdameon is already running, single instance mode refuse start another procdaemon");
        goto OUT1;
    }
    // Recording the pid is informational; failures are reported but do not
    // invalidate the lock that was just obtained.
    if (ftruncate(fd, 0) != 0) {
        log_err("in is_running, ftruncate %s failed\n", file_name);
    }
    ret = sprintf_s(buf, MAX_LEN, "%ld\n", (long)getpid());
    if (ret < 0) {
        log_err("in is_running, sprintf_s failed\n");
        goto OUT1;
    }
    // Write only the formatted text, not the NUL padding of the whole buffer.
    if (write(fd, buf, (size_t)ret) != (ssize_t)ret) {
        log_err("in is_running, write %s failed\n", file_name);
    }
    // fd stays open on purpose, see the function comment.
    return TD_SUCCESS;
OUT1:
    close(fd);
OUT:
    return TD_FAILURE;
}
// Write the daemon start marker to the log.
//
// Formats "[Start] procdaemon" into a local buffer and passes it to
// write_log(). Returns TD_SUCCESS after logging, TD_FAILURE when formatting
// fails.
static int start_log()
{
    char message[MAX_LEN] = {'\0'};

    if (sprintf_s(message, MAX_LEN, "[Start] procdaemon") == TD_FAILURE) {
        log_err("in start_log, sprintf_s failed\n");
        return TD_FAILURE;
    }
    write_log(message);
    return TD_SUCCESS;
}
// Initialization performed before the configuration is parsed.
//
// Steps, in order: create the snapshot root directory, create the log
// directory, wait until is_system_timer_init() stops reporting failure
// (sleeping one second between attempts), write the start log entry, enforce
// the single-instance check, and start CPU-usage monitoring. Returns
// TD_FAILURE at the first failing step, TD_SUCCESS when all of them succeed.
static int procd_preparation()
{
    // Create the procsnap archive root directory /data/vendor/procsnap.
    if (make_snapshot_root_dir() == TD_FAILURE) {
        log_err("in procd_preparation, make_snapshot_root_dir failed\n");
        return TD_FAILURE;
    }

    // Create the procsnap log directory /data/vendor/procsnap/procd_log.
    if (make_log_dir() == TD_FAILURE) {
        log_err("in procd_preparation make_log_dir failed\n");
        return TD_FAILURE;
    }

    // Wait for the system clock initialization to complete.
    for (;;) {
        if (is_system_timer_init() != TD_FAILURE) {
            break;
        }
        sleep(1);
    }

    // Write the start-up log entry.
    if (start_log() == TD_FAILURE) {
        log_err("in main, start_log failed\n");
        return TD_FAILURE;
    }

    // Enforce a single running instance.
    if (is_running(PROCESS_NAME) == TD_FAILURE) {
        log_info("procd is running...\n");
        return TD_FAILURE;
    }

    // Start CPU-usage monitoring.
    if (cpu_monitor() == TD_FAILURE) {
        log_err("in main, cpu_monitor failed\n");
        return TD_FAILURE;
    }

    return TD_SUCCESS;
}
// Join the thread stored in one tid slot.
//
// Slots holding (pthread_t)-1 or 0 are treated as unused: the tid arrays are
// defined with the initializer {-1}, which sets only the first element to -1
// and leaves the remaining ones zero.
// NOTE(review): this assumes 0 is never a valid pthread_t on the target libc.
// Returns TD_SUCCESS for an unused slot or a successful join, else TD_FAILURE.
static int procd_clean_join_slot(pthread_t tid)
{
    int ret;

    if (tid == (pthread_t)-1 || tid == (pthread_t)0) {
        return TD_SUCCESS;
    }
    // pthread_join returns 0 on success and a positive error number on failure.
    ret = pthread_join(tid, TD_NULL);
    if (ret != 0) {
        log_err("pthread_join failed for tid=%ld\n", (long)tid);
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}

// Final step: wait for the archive threads to end and reclaim them.
//
// Visits every slot of g_cycle_archive_tids and g_variation_archive_tids in
// index order, cycle slot first. Returns TD_FAILURE at the first join that
// fails, TD_SUCCESS once all slots have been handled.
static int procd_clean()
{
    int i;

    log_info("start procd_clean\n");
    // Reclaim the thread ids.
    for (i = 0; i < MAX_LEN; i++) {
        if (procd_clean_join_slot(g_cycle_archive_tids[i]) == TD_FAILURE) {
            return TD_FAILURE;
        }
        if (procd_clean_join_slot(g_variation_archive_tids[i]) == TD_FAILURE) {
            return TD_FAILURE;
        }
    }
    return TD_SUCCESS;
}
// Program entry: prepare the daemon, parse CONF_PATH and start archiving, then
// wait for the archive threads. Returns 0 when every stage succeeds and
// TD_FAILURE as soon as one stage fails.
int main()
{
    // Preparation.
    if (procd_preparation() == TD_FAILURE) {
        log_err("procd_preparation failed.\n");
        return TD_FAILURE;
    }

    // Main logic.
    if (procd_main(CONF_PATH) == TD_FAILURE) {
        log_err("procd_main failed.\n");
        return TD_FAILURE;
    }

    // Wrap-up.
    if (procd_clean() == TD_FAILURE) {
        log_err("procd_clean failed.\n");
        return TD_FAILURE;
    }

    return 0;
}