|
|
/*
 * Copyright (c) Hisilicon Technologies Co., Ltd.. 2021-2021. All rights reserved.
 * Description: part [procdaemon] for archiving proc information.
 * Create: 2022-09-20
 */
|
|
|
|
|
|
#include <stdio.h>
|
|
|
#include <stdlib.h>
|
|
|
#include <fcntl.h>
|
|
|
#include <pthread.h>
|
|
|
#include <assert.h>
|
|
|
#include <time.h>
|
|
|
#include <string.h>
|
|
|
#include <dirent.h>
|
|
|
#include <poll.h>
|
|
|
#include <sys/stat.h>
|
|
|
#include <sys/types.h>
|
|
|
#include <sys/time.h>
|
|
|
#include <sys/select.h>
|
|
|
#include <sys/inotify.h>
|
|
|
#include <sys/prctl.h>
|
|
|
#include <libxml/parser.h>
|
|
|
#include <libxml/tree.h>
|
|
|
#include <securec.h>
|
|
|
#include "dft_event.h"
|
|
|
#include "proc_node.h"
|
|
|
#include "procd_base.h"
|
|
|
|
|
|
static int g_max_rotation_count = -1;
|
|
|
static int g_max_rotation_size = -1;
|
|
|
static float g_max_cpu_utility = 0.0f;
|
|
|
static pthread_t g_cycle_archive_tids[MAX_LEN] = {-1};
|
|
|
static pthread_t g_variation_archive_tids[MAX_LEN] = {-1};
|
|
|
static proc_node g_nodes[MAX_LEN];
|
|
|
|
|
|
typedef struct timer_val {
|
|
|
long *timestamp;
|
|
|
proc_node *proc;
|
|
|
} timer_val;
|
|
|
|
|
|
// Forward declaration of proc_archive(); its definition is not in this part of the file.
static int proc_archive(int node_idx, pthread_t *g_cycle_archive_tid, pthread_t *g_variation_archive_tid);
|
|
|
|
|
|
static void inline set_thread_name_from_proc_path(char *proc_path)
|
|
|
{
|
|
|
char *proc_name = TD_NULL;
|
|
|
|
|
|
proc_name = strrchr(proc_path, '/');
|
|
|
prctl(PR_SET_NAME, proc_name);
|
|
|
}
|
|
|
|
|
|
// 获得时间戳(用于落盘内容分隔)
|
|
|
static int get_timestamp_for_archive_content(char *timestamp)
|
|
|
{
|
|
|
time_t t;
|
|
|
struct tm *st = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
(void)time(&t);
|
|
|
st = localtime(&t);
|
|
|
ret = sprintf_s(timestamp, MAX_LEN, ">>> %04d-%02d-%02d %02d:%02d:%02d\n",
|
|
|
st->tm_year + START_YEAR, st->tm_mon + START_MONTH, st->tm_mday, st->tm_hour, st->tm_min, st->tm_sec);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_timestamp_for_archive_content, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 返回时间戳(用于计时器判断时间)
|
|
|
static inline long get_timestamp_for_timer()
|
|
|
{
|
|
|
struct timeval tv;
|
|
|
gettimeofday(&tv, NULL);
|
|
|
return tv.tv_sec * SEC_TO_NSEC + tv.tv_usec;
|
|
|
}
|
|
|
|
|
|
static int get_timestamp_for_log(char *timestamp)
|
|
|
{
|
|
|
time_t t;
|
|
|
struct tm *st = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
(void)time(&t);
|
|
|
st = localtime(&t);
|
|
|
ret = sprintf_s(timestamp, MAX_LEN, "%04d-%02d-%02d %02d:%02d:%02d",
|
|
|
st->tm_year + START_YEAR, st->tm_mon + START_MONTH, st->tm_mday, st->tm_hour, st->tm_min, st->tm_sec);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_timestamp_for_log, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 获得指定路径文件的大小,单位KB
|
|
|
static inline int get_file_size(char *file_path)
|
|
|
{
|
|
|
struct stat stat_buf;
|
|
|
stat(file_path, &stat_buf);
|
|
|
int size = stat_buf.st_size;
|
|
|
return size / BYTE_KB;
|
|
|
}
|
|
|
|
|
|
// 检测字符串str是不是以字符串prefix开头
|
|
|
static int starts_with(const char *str, const char *prefix)
|
|
|
{
|
|
|
if (!str || !prefix) {
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
if (strncmp(str, prefix, strlen(prefix)) == 0) {
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 拼接源文件的全路径
|
|
|
static int get_proc_archive_path(proc_node *proc, char *archive_dir_path)
|
|
|
{
|
|
|
char *proc_name = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
char proc_path[MAX_LEN] = {'\0'};
|
|
|
|
|
|
if (proc == TD_NULL) {
|
|
|
log_err("in get_proc_archive_path, proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = strcpy_s(proc_path, MAX_LEN, proc->path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_proc_archive_path, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 拼接proc落盘文件夹路径
|
|
|
proc_name = strrchr(proc_path, '/'); // proc_name = /klad
|
|
|
ret = strcpy_s(archive_dir_path, MAX_LEN, PROC_SNAP_ROOT_DIR);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_proc_archive_path, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = strcat_s(archive_dir_path, MAX_LEN, proc_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_proc_archive_path, strcat_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 创建procsnap落盘的根目录
|
|
|
static int make_snapshot_root_dir()
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
DIR *dir = TD_NULL;
|
|
|
|
|
|
dir = opendir(PROC_SNAP_ROOT_DIR);
|
|
|
if (dir == TD_NULL) {
|
|
|
ret = mkdir(PROC_SNAP_ROOT_DIR, DIR_PERM);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in make_snapshot_root_dir, mkdir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
} else {
|
|
|
closedir(dir);
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 为每个落盘proc创建相应的文件夹
|
|
|
static int make_snapshot_dir(proc_node *proc)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
char archive_dir_path[MAX_LEN] = {'\0'};
|
|
|
DIR *dir = TD_NULL;
|
|
|
|
|
|
if (proc == TD_NULL) {
|
|
|
log_err("in make_snapshot_dir, proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = get_proc_archive_path(proc, archive_dir_path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in make_snapshot_dir, get_proc_archive_path failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 如果文件夹不存在,则创建proc落盘文件夹
|
|
|
dir = opendir(archive_dir_path);
|
|
|
if (dir == TD_NULL) {
|
|
|
ret = mkdir(archive_dir_path, DIR_PERM);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in make_snapshot_dir, mkdir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
} else {
|
|
|
if (closedir(dir) == TD_FAILURE) {
|
|
|
log_err("closedir %s failed\n", archive_dir_path);
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
ret = strcpy_s(proc->archive_path, MAX_LEN, archive_dir_path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in make_snapshot_dir, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
static int make_log_dir()
|
|
|
{
|
|
|
DIR *dir = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
char path[MAX_LEN] = {'\0'};
|
|
|
|
|
|
// 创建procd_log文件夹
|
|
|
ret = sprintf_s(path, MAX_LEN, "%s/%s", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH);
|
|
|
dir = opendir(path);
|
|
|
if (dir == TD_NULL) {
|
|
|
ret = mkdir(path, DIR_PERM);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in make_log_dir, mkdir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
} else {
|
|
|
if (closedir(dir) == TD_FAILURE) {
|
|
|
log_err("closedir %s failed\n", path);
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 检查落盘根文件夹及落盘子文件夹是否存在,如果不存在,则重新新建
|
|
|
static int check_dir_exits(proc_node *proc)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
// 检查根文件夹 /data/vendor/procsnap
|
|
|
ret = make_snapshot_root_dir();
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in check_dir_exits, make_snapshot_root_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 检查日志文件夹 /data/vendor/procsnap/procd_log
|
|
|
ret = make_log_dir();
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in check_dir_exits, make_log_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 检查子文件夹 /data/vendor/procsnap/xxx
|
|
|
if (proc != TD_NULL) {
|
|
|
ret = make_snapshot_dir(proc);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in check_dir_exits, make_snapshot_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 输出关键日志
|
|
|
static int write_log(const char *log_content)
|
|
|
{
|
|
|
int fd = -1;
|
|
|
int ret = TD_FAILURE;
|
|
|
char log_file_name[BUF_SIZE] = {'\0'};
|
|
|
char buf[BUF_SIZE] = {'\0'};
|
|
|
char time_stamp[MAX_LEN] = {'\0'};
|
|
|
|
|
|
if (check_dir_exits(TD_NULL) == TD_FAILURE) {
|
|
|
log_err("in write_log, make_log_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = sprintf_s(log_file_name, BUF_SIZE, "%s/%s/%s.log", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH, LOG_NAME);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in write_log, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
if (get_file_size(log_file_name) >= MAX_LOG_FILE_SIZE) {
|
|
|
fd = open(log_file_name, O_CREAT | O_WRONLY | O_TRUNC, FILE_PERM);
|
|
|
} else {
|
|
|
fd = open(log_file_name, O_CREAT | O_WRONLY | O_APPEND, FILE_PERM);
|
|
|
}
|
|
|
if (fd == TD_FAILURE) {
|
|
|
log_err("in write_log, open %s failed\n", log_file_name);
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
ret = get_timestamp_for_log(time_stamp);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in write_log, get_timestamp_for_log failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
ret = sprintf_s(buf, BUF_SIZE, "%s[%s] [pid:%d]: %s\n",
|
|
|
starts_with(log_content, "[Start]") ? "\n" : "", time_stamp, gettid(), log_content);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("sprintf_s failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
ret = write(fd, buf, BUF_SIZE);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in write_log, write failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
close(fd);
|
|
|
return TD_SUCCESS;
|
|
|
OUT1:
|
|
|
close(fd);
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 获得时间戳,用于判断系统时间是否初始化完成
|
|
|
static int is_system_timer_init()
|
|
|
{
|
|
|
time_t t;
|
|
|
struct tm *st = TD_NULL;
|
|
|
|
|
|
(void)time(&t);
|
|
|
st = localtime(&t);
|
|
|
if (st->tm_year + START_YEAR == SYSTEM_INIT_YEAR) {
|
|
|
log_dbg("system timer not init.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
static int get_proc_stack()
|
|
|
{
|
|
|
char proc_stack_path[MAX_LEN];
|
|
|
char buf[MAX_LEN];
|
|
|
int ret = TD_FAILURE;
|
|
|
int from_stack_fd = -1, to_stack_fd = -1;
|
|
|
int nread, nwrite;
|
|
|
|
|
|
ret = sprintf_s(proc_stack_path, MAX_LEN, "/proc/%d/stack", getpid());
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_proc_stack, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
from_stack_fd = open(proc_stack_path, O_RDONLY);
|
|
|
if (from_stack_fd == TD_FAILURE) {
|
|
|
log_err("in get_proc_stack, open %s failed\n", proc_stack_path);
|
|
|
goto OUT;
|
|
|
}
|
|
|
to_stack_fd = open(STACK_FILE_PATH, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM);
|
|
|
if (to_stack_fd == TD_FAILURE) {
|
|
|
log_err("in get_proc_stack, open %s failed\n", STACK_FILE_PATH);
|
|
|
goto OUT1;
|
|
|
}
|
|
|
while ((nread = read(from_stack_fd, buf, MAX_LEN)) > 0) {
|
|
|
nwrite = 0;
|
|
|
do {
|
|
|
ret = write(to_stack_fd, &buf[nwrite], nread - nwrite);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_proc_stack, write failed\n");
|
|
|
goto OUT2;
|
|
|
}
|
|
|
nwrite += nread;
|
|
|
} while (nwrite < nread);
|
|
|
}
|
|
|
if (nread == TD_FAILURE) {
|
|
|
log_err("in get_proc_stack, read failed\n");
|
|
|
goto OUT2;
|
|
|
}
|
|
|
close(from_stack_fd);
|
|
|
close(to_stack_fd);
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT2:
|
|
|
close(to_stack_fd);
|
|
|
OUT1:
|
|
|
close(from_stack_fd);
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 判断字符串是否为整数
|
|
|
static int is_int_digital(char *num)
|
|
|
{
|
|
|
int len = strlen(num);
|
|
|
int i;
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
if ('0' <= num[i] && num[i] <= '9') {
|
|
|
continue;
|
|
|
} else {
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
// 判断字符串是否为浮点数
|
|
|
static int is_float_digital(char *num)
|
|
|
{
|
|
|
int len = strlen(num);
|
|
|
int i;
|
|
|
int dot_num = 0;
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
if (num[i] == '.') {
|
|
|
dot_num++;
|
|
|
if (dot_num > 1) {
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
} else if ('0' <= num[i] && num[i] <= '9') {
|
|
|
continue;
|
|
|
} else {
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
// 计算cpu占用率
|
|
|
static int get_cpu_usage(float *usage)
|
|
|
{
|
|
|
int fd = -1;
|
|
|
char filename[BUF_SIZE] = {'\0'};
|
|
|
char buf[BUF_SIZE] = {'\0'};
|
|
|
int ret = TD_FAILURE;
|
|
|
int nread;
|
|
|
char *data = TD_NULL;
|
|
|
int user_time = 0;
|
|
|
int kernel_time = 0;
|
|
|
static int last_user_time = 0;
|
|
|
static int last_kernel_time = 0;
|
|
|
int counter = 0;
|
|
|
|
|
|
ret = sprintf_s(filename, BUF_SIZE, "/proc/%d/stat", getpid());
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_cpu_usage, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
fd = open(filename, O_RDONLY);
|
|
|
if (fd == TD_FAILURE) {
|
|
|
log_err("in get_cpu_usage, open %s failed\n", filename);
|
|
|
goto OUT;
|
|
|
}
|
|
|
nread = read(fd, buf, BUF_SIZE);
|
|
|
data = strtok(buf, " ");
|
|
|
while (data != NULL) {
|
|
|
if (counter == UTIME_IDX) {
|
|
|
user_time = atoi(data); // 获取用户态占用时间
|
|
|
} else if (counter == CTIME_IDX) {
|
|
|
kernel_time = atoi(data); // 获取内核态占用时间
|
|
|
break;
|
|
|
}
|
|
|
counter++;
|
|
|
data = strtok(NULL, " ");
|
|
|
}
|
|
|
*usage = 1.0 * (user_time - last_user_time + kernel_time - last_kernel_time)
|
|
|
/ (sysconf(_SC_CLK_TCK) * CPU_TIMER_SEC);
|
|
|
|
|
|
last_user_time = user_time;
|
|
|
last_kernel_time = kernel_time;
|
|
|
|
|
|
close(fd);
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
static int fault_report_cpu_overuse(float usage)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
char usage_str[MAX_LEN] = {'\0'};
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
unsigned int handle;
|
|
|
|
|
|
ret = get_proc_stack();
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_cpu_overuse, get_proc_stack failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = sprintf_s(usage_str, MAX_LEN, "%.2f", usage);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_cpu_overuse, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
log_dbg("fault report for overuse cpu.\n");
|
|
|
ret = dft_event_create(FAULT_NO_OVERUSE_CPU, &handle);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_cpu_overuse, dft_event_create failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
dft_event_put_string(handle, "PNAME", "procdaemon");
|
|
|
dft_event_put_string(handle, "F1NAME", "fault_report_cpu_overuse");
|
|
|
dft_event_put_string(handle, "F2NAME", "thread_watch_cpu_usage");
|
|
|
dft_event_put_string(handle, "CPU_USAGE", usage_str);
|
|
|
ret = dft_event_report(handle);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_cpu_overuse, dft_event_report failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
dft_event_destroy(handle);
|
|
|
|
|
|
ret = sprintf_s(buf, MAX_LEN, "[Fault Report] cpu overuse: %.2f", usage);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_cpu_overuse, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
write_log(buf);
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 检测cpu占用率,超过阈值则故障上报
|
|
|
static void check_cpu_usage()
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
float usage;
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
|
|
|
ret = get_cpu_usage(&usage);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in check_cpu_usage, get_cpu_usage failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = sprintf_s(buf, MAX_LEN, "[CPU Usage] %.2f", usage);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in check_cpu_usage, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
write_log(buf);
|
|
|
|
|
|
if (usage > g_max_cpu_utility) {
|
|
|
ret = fault_report_cpu_overuse(usage);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in check_cpu_usage, fault_report_cpu_overuse failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
OUT:
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
// 获取系统时间作为落盘文件名,与落盘路径拼接成完整路径
|
|
|
static int get_snapshot_path_from_system_time(proc_node *proc, char *snapshot_path)
|
|
|
{
|
|
|
char snapshot_file_name[MAX_LEN] = {'\0'};
|
|
|
time_t t;
|
|
|
struct tm *st = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
(void)time(&t);
|
|
|
st = localtime(&t);
|
|
|
ret = sprintf_s(snapshot_file_name, MAX_LEN, "%04d%02d%02d%02d%02d%02d.txt",
|
|
|
st->tm_year + START_YEAR, st->tm_mon + START_MONTH, st->tm_mday, st->tm_hour, st->tm_min, st->tm_sec);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_snapshot_path_from_system_time, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
if (proc == TD_NULL) {
|
|
|
log_err("in get_snapshot_path_from_system_time, proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = strcpy_s(snapshot_path, MAX_LEN, proc->archive_path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_snapshot_path_from_system_time, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = strcat_s(snapshot_path, MAX_LEN, "/");
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_snapshot_path_from_system_time, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = strcat_s(snapshot_path, MAX_LEN, snapshot_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_snapshot_path_from_system_time, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS; // 落盘文件的完成路径
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 获取目标文件夹下的文件数量
|
|
|
static int get_dir_files_num(char *dir_path, int *file_num)
|
|
|
{
|
|
|
DIR *dir = TD_NULL;
|
|
|
struct dirent *ptr = TD_NULL;
|
|
|
int total = 0;
|
|
|
|
|
|
dir = opendir(dir_path);
|
|
|
if (dir == TD_NULL) {
|
|
|
log_err("in get_dir_files_num, opendir failed\n");
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
while ((ptr = readdir(dir)) != NULL) {
|
|
|
if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
|
|
|
continue;
|
|
|
}
|
|
|
if (ptr->d_type == DT_REG) {
|
|
|
total++;
|
|
|
}
|
|
|
}
|
|
|
closedir(dir);
|
|
|
*file_num = total;
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
static int get_file_names(char *dir_path, char file_names[][MAX_LEN])
|
|
|
{
|
|
|
DIR *dir = TD_NULL;
|
|
|
struct dirent *ptr = TD_NULL;
|
|
|
int total = 0;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
dir = opendir(dir_path);
|
|
|
if (dir == TD_NULL) {
|
|
|
log_err("in get_file_names, opendir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
while ((ptr = readdir(dir)) != NULL) {
|
|
|
if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0) {
|
|
|
continue;
|
|
|
}
|
|
|
if (ptr->d_type == DT_REG) {
|
|
|
ret = strcpy_s(file_names[total++], MAX_LEN, ptr->d_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_file_names, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
ret = closedir(dir);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_file_names, closedir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
static int get_max_file_name(int file_num, char file_names[][MAX_LEN], char *target_file_name)
|
|
|
{
|
|
|
int i;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
for (i = 0; i < file_num; i++) {
|
|
|
if (strcmp(file_names[i], target_file_name) > 0) {
|
|
|
ret = strcpy_s(target_file_name, MAX_LEN, file_names[i]);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_max_file_name, strcpy_s failed\n");
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
static int get_min_file_name(int file_num, char file_names[][MAX_LEN], char *target_file_name)
|
|
|
{
|
|
|
int i;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
for (i = 0; i < file_num; i++) {
|
|
|
if (strcmp(file_names[i], target_file_name) < 0) {
|
|
|
ret = strcpy_s(target_file_name, MAX_LEN, file_names[i]);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_min_file_name, strcpy_s failed\n");
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
// 获得指定文件夹中文件名最大或最小的文件,get_max==0时,获得最小文件;get_max==1时,获得最大文件
|
|
|
static int get_min_or_max_file_name(char *dir_path, int file_num, int get_max, char *target_file_path)
|
|
|
{
|
|
|
char file_names[file_num][MAX_LEN];
|
|
|
int ret = TD_FAILURE;
|
|
|
char target_file_name[MAX_LEN] = {'\0'};
|
|
|
|
|
|
ret = get_file_names(dir_path, file_names);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_min_or_max_file_name, get_file_names failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = strcpy_s(target_file_name, MAX_LEN, file_names[0]);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_min_or_max_file_name, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
if (get_max == 1) {
|
|
|
ret = get_max_file_name(file_num, file_names, target_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_min_or_max_file_name, get_max_file_name failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
} else if (get_max == 0) {
|
|
|
ret = get_min_file_name(file_num, file_names, target_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_min_or_max_file_name, get_min_file_name failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
ret = snprintf_s(target_file_path, MAX_LEN, MAX_LEN - 1, "%s/%s", dir_path, target_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("snprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 清理落盘文件夹,保持文件数小于等于阈值
|
|
|
static int clean_snapshot_dir(proc_node *proc)
|
|
|
{
|
|
|
int file_num;
|
|
|
char target_file_name[MAX_LEN] = {'\0'};
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
if (proc == TD_NULL) {
|
|
|
log_err("in clean_snapshot_dir, proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = get_dir_files_num(proc->archive_path, &file_num);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in clean_snapshot_dir, get_dir_files_num failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
while (file_num > g_max_rotation_count) { // 文件数 > 3时,删掉多余的小名文件
|
|
|
ret = get_min_or_max_file_name(proc->archive_path, file_num, 0, target_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in clean_snapshot_dir, get_dir_files_num failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = sprintf_s(buf, MAX_LEN, "[Remove] %s", target_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in clean_snapshot_dir, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
write_log(buf);
|
|
|
ret = remove(target_file_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in clean_snapshot_dir, remove failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
file_num--;
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 获取落盘目标文件的文件描述符,保证有效且发生错误时已关闭文件
|
|
|
static int get_target_file(int file_num, char *target_file_name, proc_node *proc, char *snapshot_path)
|
|
|
{
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
int fd = -1;
|
|
|
|
|
|
if (proc == TD_NULL) {
|
|
|
log_err("in get_target_file, proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
// 获取目标落盘文件路径
|
|
|
if (get_min_or_max_file_name(proc->archive_path, file_num, 1, target_file_name) == TD_FAILURE) {
|
|
|
log_err("in get_target_file, get_min_or_max_file_name failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
if (get_file_size(target_file_name) >= g_max_rotation_size) { // 目标文件大小超过阈值,则新建文件
|
|
|
if (get_snapshot_path_from_system_time(proc, snapshot_path) == TD_FAILURE) {
|
|
|
log_err("in get_target_file, get_snapshot_path_from_system_time\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
fd = open(snapshot_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM);
|
|
|
if (fd == -1) {
|
|
|
log_err("in get_target_file, open %s failed\n", snapshot_path);
|
|
|
goto OUT;
|
|
|
}
|
|
|
if (sprintf_s(buf, MAX_LEN, "[Create] %s", snapshot_path) == TD_FAILURE) {
|
|
|
log_err("in get_target_file, sprintf_s failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
if (write_log(buf) == TD_FAILURE) {
|
|
|
log_err("write_log failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
if (clean_snapshot_dir(proc) == TD_FAILURE) {
|
|
|
log_err("in get_target_file, clean_snapshot_dir failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
} else { // 目标文件大小在阈值范围内,往目标文件append内容
|
|
|
fd = open(target_file_name, O_WRONLY | O_APPEND);
|
|
|
if (fd == -1) {
|
|
|
log_err("in get_target_file, open %s failed\n", target_file_name);
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return fd;
|
|
|
OUT1:
|
|
|
close(fd);
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 创建落盘目标文件,返回文件描述符,保证有效且发生错误时已关闭文件
|
|
|
static int create_target_file(char *snapshot_path, proc_node *proc)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
int fd = -1;
|
|
|
|
|
|
ret = get_snapshot_path_from_system_time(proc, snapshot_path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in create_target_file, get_snapshot_path_from_system_time failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
fd = open(snapshot_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_PERM);
|
|
|
if (fd == -1) {
|
|
|
log_err("in create_target_file, open %s failed\n", snapshot_path);
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = sprintf_s(buf, MAX_LEN, "[Create] %s", snapshot_path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in create_target_file, sprintf_s failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
ret = write_log(buf);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("write_log failed");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
|
|
|
return fd;
|
|
|
OUT1:
|
|
|
close(fd);
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 打开落盘文件夹中的目标文件,返回其文件描述符 to_fd,保证有效且发生错误时已关闭文件
|
|
|
static int open_or_remove_snapshot_file(char *snapshot_path, proc_node *proc)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
int file_num;
|
|
|
char target_file_name[BUF_SIZE] = {'\0'};
|
|
|
int fd = -1;
|
|
|
|
|
|
if (proc == TD_NULL) {
|
|
|
log_err("in open_or_remove_snapshot_file, proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = clean_snapshot_dir(proc); // 保证目标文件夹中的文件数量在阈值范围内
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in open_or_remove_snapshot_file, clean_snapshot_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = get_dir_files_num(proc->archive_path, &file_num); // 获取目标文件夹中的文件数目
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in open_or_remove_snapshot_file, get_dir_files_num failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
if (file_num > 0) { // 有文件,获取最大名文件作为目标文件,保证文件描述符有效且发生错误时已关闭文件
|
|
|
fd = get_target_file(file_num, target_file_name, proc, snapshot_path);
|
|
|
if (fd == -1) {
|
|
|
log_err("in open_or_remove_snapshot_file, get_target_file failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
} else { // 没有文件,创建文件,保证文件描述符有效且发生错误时已关闭文件
|
|
|
fd = create_target_file(snapshot_path, proc);
|
|
|
if (fd == -1) {
|
|
|
log_err("in open_or_remove_snapshot_file, create_target_file failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return fd;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// proc存盘超时5秒,故障上报
|
|
|
static void fault_report_timeout(union sigval val)
|
|
|
{
|
|
|
long cur_timestamp = get_timestamp_for_timer();
|
|
|
static __thread int has_report = 0;
|
|
|
timer_val *tvalue = (timer_val *)(val.sival_ptr);
|
|
|
int ret = TD_FAILURE;
|
|
|
unsigned int handle;
|
|
|
int timeout = 0;
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
|
|
|
if (tvalue == TD_NULL || tvalue->proc == TD_NULL) {
|
|
|
log_err("in fault_report_timeout, tvalue or tvalue->proc is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
timeout = OVERTIME_CO * atoi(tvalue->proc->time);
|
|
|
if (has_report == 0 && cur_timestamp - *(tvalue->timestamp) > timeout * SEC_TO_NSEC) {
|
|
|
ret = get_proc_stack();
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_timeout, get_proc_stack failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
log_dbg("fault report from %s in %ld\n", tvalue->proc->path, cur_timestamp);
|
|
|
ret = dft_event_create(FAULT_NO_TIMEOUT, &handle);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_timeout, dft_event_create failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
dft_event_put_string(handle, "PNAME", "procdaemon");
|
|
|
dft_event_put_string(handle, "F1NAME", "fault_report_timeout");
|
|
|
dft_event_put_string(handle, "F2NAME", "save_proc_file_cycle");
|
|
|
dft_event_put_string(handle, "PROCNAME", tvalue->proc->path);
|
|
|
ret = dft_event_report(handle);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_timeout, dft_event_report failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
dft_event_destroy(handle);
|
|
|
has_report = 1;
|
|
|
ret = sprintf_s(buf, MAX_LEN, "[Fault Report] proc %s archive timeout", tvalue->proc->path);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in fault_report_timeout, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
write_log(buf);
|
|
|
}
|
|
|
OUT:
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
// 初始化计时器
|
|
|
static int init_timer(int init_sec, int sec, struct sigevent event)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
timer_t timerid;
|
|
|
|
|
|
ret = timer_create(CLOCK_REALTIME, &event, &timerid);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in init_timer, memset_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
struct itimerspec it;
|
|
|
it.it_interval.tv_sec = sec;
|
|
|
it.it_interval.tv_nsec = TIMER_NSEC;
|
|
|
it.it_value.tv_sec = init_sec;
|
|
|
it.it_value.tv_nsec = TIMER_NSEC;
|
|
|
ret = timer_settime(timerid, 0, &it, NULL);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in init_timer, timer_settime failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// Append one snapshot to the archive file: a timestamp separator line, the
// complete content readable from from_fd, and a trailing newline.
//   to_fd:   descriptor of the archive file (written)
//   from_fd: descriptor of the proc file (read from its current offset)
//   proc:    node being archived; used for log messages only
// Returns TD_SUCCESS on success, TD_FAILURE on a null proc or any read/write error.
static int write_snapshot(int to_fd, int from_fd, proc_node *proc)
{
    char chunk[MAX_LEN] = {'\0'};
    char separator[MAX_LEN] = {'\0'};
    int got;
    int done;
    int rc;

    if (proc == TD_NULL) {
        log_err("in write_snapshot, proc is null ptr.\n");
        return TD_FAILURE;
    }

    // Separator line carrying the current time.
    if (get_timestamp_for_archive_content(separator) == TD_FAILURE) {
        log_err("in write_snapshot, get_timestamp_for_archive_content failed\n");
        return TD_FAILURE;
    }
    rc = write(to_fd, separator, strlen(separator));
    if (rc == TD_FAILURE) {
        log_err("write time stamp to file %s failed.\n", proc->archive_path);
        return TD_FAILURE;
    }

    // Copy the proc content chunk by chunk, finishing partial writes.
    for (got = read(from_fd, chunk, sizeof(chunk)); got > 0; got = read(from_fd, chunk, sizeof(chunk))) {
        for (done = 0; done < got; done += rc) {
            rc = write(to_fd, &chunk[done], got - done);
            if (rc == TD_FAILURE) {
                log_err("write proc content to file %s failed.\n", proc->archive_path);
                return TD_FAILURE;
            }
        }
    }
    if (got == TD_FAILURE) {
        log_err("in write_snapshot, read %s failed.\n", proc->path);
        return TD_FAILURE;
    }

    // Terminating newline after the snapshot body.
    rc = write(to_fd, "\n", 1);
    if (rc == TD_FAILURE) {
        log_err("write \\n failed.\n");
        return TD_FAILURE;
    }
    return TD_SUCCESS;
}
|
|
|
|
|
|
// 创建计时器检测read操作响应
|
|
|
static int init_timer_check_response(long *timestamp, proc_node *proc)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
timer_val val;
|
|
|
struct sigevent event;
|
|
|
|
|
|
*timestamp = get_timestamp_for_timer();
|
|
|
val.timestamp = timestamp;
|
|
|
val.proc = proc;
|
|
|
|
|
|
ret = memset_s(&event, sizeof(struct sigevent), 0, sizeof(struct sigevent));
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in init_timer_check_response, memset_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
event.sigev_value.sival_ptr = &val;
|
|
|
event.sigev_notify = SIGEV_THREAD;
|
|
|
event.sigev_notify_function = fault_report_timeout;
|
|
|
|
|
|
ret = init_timer(TIMER_SEC, TIMER_NSEC, event);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in init_timer_check_response, init_timer failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 打开proc,打开失败时重试
|
|
|
static int try_open_proc(char *proc_path)
|
|
|
{
|
|
|
int fd = -1;
|
|
|
int has_print_info = TD_FAILURE;
|
|
|
|
|
|
fd = open(proc_path, O_RDONLY);
|
|
|
while (fd == -1) {
|
|
|
if (has_print_info == TD_FAILURE) {
|
|
|
log_info("open %s failed, reopen after each %d seconds\n", proc_path, PROC_REOPEN_INTERVAL);
|
|
|
has_print_info = TD_SUCCESS;
|
|
|
}
|
|
|
sleep(PROC_REOPEN_INTERVAL);
|
|
|
fd = open(proc_path, O_RDONLY);
|
|
|
}
|
|
|
if (has_print_info == TD_SUCCESS) {
|
|
|
log_info("open %s success\n", proc_path);
|
|
|
}
|
|
|
|
|
|
return fd;
|
|
|
}
|
|
|
|
|
|
// 每当落盘次数达到xx次,就会尝试关闭proc并重新打开,打开失败时会重试
|
|
|
static int try_close_and_open_proc(int from_fd, char *path, int loop_time)
|
|
|
{
|
|
|
if (loop_time % MAX_TIME_TO_ARCHIVE != 0) {
|
|
|
return from_fd;
|
|
|
}
|
|
|
|
|
|
// 如果落盘次数到一定量,则关闭文件后重新尝试打开
|
|
|
if (from_fd != -1) {
|
|
|
close(from_fd);
|
|
|
}
|
|
|
return try_open_proc(path);
|
|
|
}
|
|
|
|
|
|
// 读取proc文件,并落盘
|
|
|
static void *save_proc_file_cycle(void *args)
|
|
|
{
|
|
|
int from_fd = -1, to_fd = -1;
|
|
|
long timestamp;
|
|
|
char snapshot_path[MAX_LEN] = {'\0'}; // 落盘文件名
|
|
|
proc_node *proc = (proc_node *)args;
|
|
|
int loop_time = 0;
|
|
|
|
|
|
set_thread_name_from_proc_path(proc->path);
|
|
|
if (init_timer_check_response(×tamp, proc) == TD_FAILURE) { // 初始化时钟,检测落盘超时
|
|
|
log_err("in save_proc_file_cycle, init_timer_check_response failed.\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
|
|
|
// 循环周期性落盘
|
|
|
while (1) {
|
|
|
// 落盘一定次数后,先关闭再尝试打开proc,打开失败时重复尝试
|
|
|
from_fd = try_close_and_open_proc(from_fd, proc->path, loop_time);
|
|
|
if (check_dir_exits(proc) == TD_FAILURE) {
|
|
|
log_err("in save_proc_file_cycle, make_snapshot_dir failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
lseek(from_fd, 0, SEEK_SET); // 将文件指针移动到文件开头
|
|
|
// 打开落盘目标文件,返回有效文件描述符
|
|
|
to_fd = open_or_remove_snapshot_file(snapshot_path, proc);
|
|
|
if (to_fd == -1) {
|
|
|
log_err("in save_proc_file_cycle, open_or_remove_snapshot_file failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
timestamp = get_timestamp_for_timer();
|
|
|
if (write_snapshot(to_fd, from_fd, proc) == TD_FAILURE) { // 往目标文件中写入proc内容
|
|
|
log_err("write_snapshot from %s failed\n", proc->path);
|
|
|
goto OUT;
|
|
|
}
|
|
|
loop_time = (loop_time + 1) % MAX_TIME_TO_ARCHIVE;
|
|
|
close(to_fd);
|
|
|
sleep(atoi(proc->time));
|
|
|
}
|
|
|
close(from_fd);
|
|
|
return (void *)TD_SUCCESS;
|
|
|
OUT:
|
|
|
close(from_fd);
|
|
|
OUT1:
|
|
|
return (void *)TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
/*
 * Read the "mode" attribute of an XML node and store it in node->mode.
 *
 * "cycle" maps to CYCLE_MODE, "variation" to VARIATION_MODE, and any other
 * value to UNDEFINED_MODE. Returns TD_FAILURE when node is NULL or the
 * attribute cannot be read, TD_SUCCESS otherwise.
 */
static int process_mode(xmlNodePtr prop_node_ptr, proc_node *node)
{
    xmlChar *sz_attr = TD_NULL;

    if (node == TD_NULL) {
        log_err("in process_mode, node is null ptr.\n");
        goto OUT;
    }

    sz_attr = xmlGetProp(prop_node_ptr, BAD_CAST "mode");
    if (sz_attr == TD_NULL) {
        // xmlGetProp returns NULL when the attribute is absent; strcmp must
        // not be called on it.
        log_err("in process_mode, xmlGetProp return null ptr.\n");
        goto OUT;
    }

    if (strcmp("cycle", (char *)sz_attr) == 0) {
        node->mode = CYCLE_MODE;
    } else if (strcmp("variation", (char *)sz_attr) == 0) {
        node->mode = VARIATION_MODE;
    } else {
        node->mode = UNDEFINED_MODE;
    }

    xmlFree(sz_attr);
    return TD_SUCCESS;
OUT:
    return TD_FAILURE;
}
|
|
|
|
|
|
/*
 * Read the "time" attribute of an XML node and store it in node->time.
 *
 * The value must pass is_float_digital. A value whose integer part is below
 * 1 is replaced by "1". Returns TD_SUCCESS on success, TD_FAILURE when node
 * is NULL, the attribute is missing or invalid, or the copy fails.
 */
static int process_time(xmlNodePtr prop_node_ptr, proc_node *node)
{
    int ret = TD_FAILURE;
    int clamped = 0;
    xmlChar *sz_attr = TD_NULL;
    const char *cycle_time = TD_NULL;

    if (node == TD_NULL) {
        log_err("in process_time, node is null ptr.\n");
        goto OUT;
    }

    sz_attr = xmlGetProp(prop_node_ptr, BAD_CAST "time");
    if (sz_attr == TD_NULL) {
        // xmlGetProp returns NULL when the attribute is absent.
        log_err("in process_time, xmlGetProp return null ptr.\n");
        goto OUT;
    }

    ret = is_float_digital((char *)sz_attr);
    if (ret == TD_FAILURE) {
        log_err("in process_time, is_float_digital failed\n");
        goto OUT1;
    }

    cycle_time = (const char *)sz_attr;
    if (atoi(cycle_time) < 1) {
        cycle_time = "1";
        clamped = 1;
    }

    // strcpy_s reports success as 0 and failure as a non-zero error code.
    ret = strcpy_s(node->time, MAX_LEN, cycle_time);
    if (ret != 0) {
        log_err("in process_time, strcpy_s failed\n");
        goto OUT1;
    }
    if (clamped) {
        log_info("cycle time is less 1 second, auto fix to 1 second.\n");
    }

    xmlFree(sz_attr);
    return TD_SUCCESS;
OUT1:
    xmlFree(sz_attr);
OUT:
    return TD_FAILURE;
}
|
|
|
|
|
|
// 检测xml的node内部属性
|
|
|
static int process_node_properties(xmlNodePtr prop_node_ptr, int node_idx)
|
|
|
{
|
|
|
xmlAttrPtr attr_ptr = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
proc_node *node = &g_nodes[node_idx];
|
|
|
|
|
|
if (node_idx >= MAX_LEN || node_idx < 0) {
|
|
|
log_err("in process_node_properties, node_idx is overflow.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
attr_ptr = prop_node_ptr->properties;
|
|
|
while (attr_ptr != NULL) {
|
|
|
// 获取mode属性
|
|
|
if (!xmlStrcmp(attr_ptr->name, BAD_CAST "mode")) {
|
|
|
ret = process_mode(prop_node_ptr, node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in process_node_properties, process_mode failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// 获取time属性
|
|
|
if (!xmlStrcmp(attr_ptr->name, BAD_CAST "time")) {
|
|
|
ret = process_time(prop_node_ptr, node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in process_node_properties, process_time failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
ret = strcpy_s(node->time, MAX_LEN, node->mode == VARIATION_MODE ? "-1" : node->time);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in process_node_properties, strcpy_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
attr_ptr = attr_ptr->next;
|
|
|
if (node->mode == UNDEFINED_MODE) {
|
|
|
log_err("node does not have 'mode' property.\n");
|
|
|
return TD_FAILURE;
|
|
|
} else if (node->mode == CYCLE_MODE && strcmp(node->time, "-1") == 0) {
|
|
|
log_err("cycle mode needs to set archive cycle time.\n");
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 获取节点中包裹的合法路径
|
|
|
static int get_safe_path_inner_node(xmlNodePtr cur_node, int node_idx)
|
|
|
{
|
|
|
xmlChar *sz_key = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
proc_node *node = &g_nodes[node_idx];
|
|
|
|
|
|
if (cur_node == TD_NULL) {
|
|
|
log_err("in get_safe_path_inner_node, cur_node is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
if (node_idx >= MAX_LEN || node_idx < 0) {
|
|
|
log_err("in get_safe_path_inner_node, node_idx is overflow.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
sz_key = xmlNodeGetContent(cur_node);
|
|
|
if (sz_key == TD_NULL) {
|
|
|
log_err("in get_safe_path_inner_node, xmlNodeGetContent return null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
ret = starts_with((char *)sz_key, PROC_PATH_PREFIX);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_info("path not start with %s.\n", PROC_PATH_PREFIX);
|
|
|
goto OUT1;
|
|
|
}
|
|
|
|
|
|
ret = strcpy_s(node->path, MAX_LEN, (char *)sz_key);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in get_safe_path_inner_node, strcpy_s failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
|
|
|
xmlFree(sz_key);
|
|
|
return TD_SUCCESS;
|
|
|
OUT1:
|
|
|
xmlFree(sz_key);
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 解析list节点,获取落盘proc文件列表及相应配置
|
|
|
static void parse_list_xml_node(xmlNodePtr cur_node)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
pthread_t cycle_archive_tid_idx = 0;
|
|
|
pthread_t variation_archive_tid_idx = 0;
|
|
|
int node_idx = 0;
|
|
|
|
|
|
while (cur_node != TD_NULL && node_idx < MAX_LEN) { // 解析xml配置文件
|
|
|
// 1. 是否为snapshot节点(如果不是,则跳过,处理下一个xml节点)
|
|
|
if (xmlStrcmp(cur_node->name, (const xmlChar *) "snapshot")) {
|
|
|
log_err("cur_node is not <snapshot>, skip & process next node.\n");
|
|
|
cur_node = cur_node->next;
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
// 2. 获取节点路径,并判断是否合法(如果不合法,则跳过,处理下一个xml节点)
|
|
|
ret = get_safe_path_inner_node(cur_node, node_idx);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in parse_list_xml_node, get_safe_path_inner_node failed.\n");
|
|
|
cur_node = cur_node->next;
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
// 3. 获取节点属性(如果获取失败,处理下一个xml节点)
|
|
|
ret = process_node_properties(cur_node, node_idx);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in parse_list_xml_node, process_node_properties failed\n");
|
|
|
cur_node = cur_node->next;
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
// 4. 开始根据node内容落盘
|
|
|
ret = proc_archive(node_idx, &cycle_archive_tid_idx, &variation_archive_tid_idx);
|
|
|
|
|
|
// 5. 处理下一个xml节点
|
|
|
cur_node = cur_node->next;
|
|
|
node_idx++;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
/*
 * Read the maximum CPU utility from the node content into g_max_cpu_utility.
 *
 * The content must pass is_float_digital. A value outside
 * (MIN_MAX_CPU_UTILITY, MAX_MAX_CPU_UTILITY] is replaced by
 * DEFAULT_MAX_CPU_UTILITY. Returns TD_SUCCESS or TD_FAILURE.
 */
static int parse_common_xml_node_max_cpu_utility(xmlNodePtr cur_node)
{
    int status = TD_FAILURE;
    xmlChar *content = xmlNodeGetContent(cur_node); // maximum CPU utility as text

    if (is_float_digital((char *)content) == TD_FAILURE) {
        log_err("in parse_common_xml_node_max_cpu_utility, is_float_digital failed\n");
    } else {
        g_max_cpu_utility = strtod((char *)content, NULL);
        if (g_max_cpu_utility <= MIN_MAX_CPU_UTILITY || g_max_cpu_utility > MAX_MAX_CPU_UTILITY) {
            g_max_cpu_utility = DEFAULT_MAX_CPU_UTILITY;
            log_info("got a wrong g_max_cpu_utility, reset to %.1f\n", DEFAULT_MAX_CPU_UTILITY);
        }
        status = TD_SUCCESS;
    }

    // The content string is released on both outcomes.
    xmlFree(content);
    return status;
}
|
|
|
|
|
|
/*
 * Read the maximum number of archive files from the node content into
 * g_max_rotation_count.
 *
 * The content must pass is_int_digital. A value outside
 * (MIN_MAX_ROTATION_COUNT, MAX_MAX_ROTATION_COUNT] is replaced by
 * DEFAULT_MAX_ROTATION_COUNT. Returns TD_SUCCESS or TD_FAILURE.
 */
static int parse_common_xml_node_max_rotation_count(xmlNodePtr cur_node)
{
    int status = TD_FAILURE;
    xmlChar *content = xmlNodeGetContent(cur_node); // maximum archive file count as text

    if (is_int_digital((char *)content) == TD_FAILURE) {
        log_err("in parse_common_xml_node_max_rotation_count, is_int_digital failed\n");
    } else {
        g_max_rotation_count = atoi((char *)content);
        if (g_max_rotation_count > MAX_MAX_ROTATION_COUNT || g_max_rotation_count <= MIN_MAX_ROTATION_COUNT) {
            g_max_rotation_count = DEFAULT_MAX_ROTATION_COUNT;
            log_info("got a wrong g_max_rotation_count, reset to %d\n", DEFAULT_MAX_ROTATION_COUNT);
        }
        status = TD_SUCCESS;
    }

    // The content string is released on both outcomes.
    xmlFree(content);
    return status;
}
|
|
|
|
|
|
/*
 * Read the maximum archive file size from the node content into
 * g_max_rotation_size.
 *
 * The content must pass is_int_digital. A value outside
 * (MIN_MAX_ROTATION_SIZE, MAX_MAX_ROTATION_SIZE] is replaced by
 * DEFAULT_MAX_ROTATION_SIZE. Returns TD_SUCCESS or TD_FAILURE.
 */
static int parse_common_xml_node_max_rotation_size(xmlNodePtr cur_node)
{
    int status = TD_FAILURE;
    xmlChar *content = xmlNodeGetContent(cur_node); // maximum archive file size as text

    if (is_int_digital((char *)content) == TD_FAILURE) {
        log_err("in parse_common_xml_node_max_rotation_size, is_int_digital failed\n");
    } else {
        g_max_rotation_size = atoi((char *)content);
        if (g_max_rotation_size > MAX_MAX_ROTATION_SIZE || g_max_rotation_size <= MIN_MAX_ROTATION_SIZE) {
            g_max_rotation_size = DEFAULT_MAX_ROTATION_SIZE;
            log_info("got a wrong g_max_rotation_size, reset to %d\n", DEFAULT_MAX_ROTATION_SIZE);
        }
        status = TD_SUCCESS;
    }

    // The content string is released on both outcomes.
    xmlFree(content);
    return status;
}
|
|
|
|
|
|
// 解析common节点,获取通用配置信息
|
|
|
static int parse_common_xml_node(xmlNodePtr cur_node)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
while (cur_node != NULL) {
|
|
|
if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "max_cpu_utility"))) {
|
|
|
ret = parse_common_xml_node_max_cpu_utility(cur_node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in parse_common_xml_node, parse_common_xml_node_max_cpu_utility failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "max_rotation_count"))) {
|
|
|
ret = parse_common_xml_node_max_rotation_count(cur_node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in parse_common_xml_node, parse_common_xml_node_max_rotation_count failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "max_rotation_size"))) {
|
|
|
ret = parse_common_xml_node_max_rotation_size(cur_node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in parse_common_xml_node, parse_common_xml_node_max_rotation_size failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
cur_node = cur_node->next;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 解析xml文件,获得配置信息
|
|
|
static int procd_main(char *xml_file_path)
|
|
|
{
|
|
|
xmlDocPtr doc = TD_NULL;
|
|
|
xmlNodePtr cur_node = TD_NULL;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
xmlKeepBlanksDefault(0); // 过滤空白text节点
|
|
|
doc = xmlReadFile(xml_file_path, "UTF-8", XML_PARSE_RECOVER);
|
|
|
if (doc == TD_NULL) {
|
|
|
fprintf(stderr, "Document not parsed successfully.\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
|
|
|
// 获取xml的根节点
|
|
|
cur_node = xmlDocGetRootElement(doc);
|
|
|
if (cur_node == TD_NULL) {
|
|
|
fprintf(stderr, "empty document\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 判断根节点是否符合预期
|
|
|
if (xmlStrcmp(cur_node->name, BAD_CAST "procd")) {
|
|
|
fprintf(stderr, "document of the wrong type, root node != proc_list\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 遍历xml文件
|
|
|
cur_node = cur_node->xmlChildrenNode;
|
|
|
while (cur_node != TD_NULL) {
|
|
|
// 处理common的子节点内容
|
|
|
if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "common"))) {
|
|
|
ret = parse_common_xml_node(cur_node->xmlChildrenNode);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in procd_main, parse_common_xml_node failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// 处理list的子节点内容,遍历的同事创建线程进行落盘
|
|
|
if ((!xmlStrcmp(cur_node->name, (const xmlChar *) "list"))) {
|
|
|
parse_list_xml_node(cur_node->xmlChildrenNode);
|
|
|
}
|
|
|
cur_node = cur_node->next;
|
|
|
}
|
|
|
(void)xmlFreeDoc(doc);
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
xmlFreeDoc(doc);
|
|
|
OUT1:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 变化落盘存文件
|
|
|
static int save_proc_file_variation(proc_node *proc, int from_fd)
|
|
|
{
|
|
|
int to_fd = -1;
|
|
|
int ret = TD_FAILURE;
|
|
|
char snapshot_path[MAX_LEN] = {'\0'};
|
|
|
|
|
|
ret = memset_s(snapshot_path, MAX_LEN, '\0', MAX_LEN);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in save_proc_file_variation, memset_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
lseek(from_fd, 0, SEEK_SET);
|
|
|
to_fd = open_or_remove_snapshot_file(snapshot_path, proc); // 打开落盘文件,返回有效文件描述符
|
|
|
if (to_fd == -1) {
|
|
|
log_err("in save_proc_file_variation, open_or_remove_snapshot_file failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
log_dbg("variation archive into %s\n", snapshot_path);
|
|
|
write_snapshot(to_fd, from_fd, proc);
|
|
|
|
|
|
close(to_fd);
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 变化落盘poll检测
|
|
|
static void *proc_variation_poll(void *arg)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
proc_node *proc = (proc_node *)arg;
|
|
|
struct pollfd *pfd = TD_NULL;
|
|
|
int ready = 0;
|
|
|
int loop_time = 0;
|
|
|
pfd = (struct pollfd *)calloc(1, sizeof(struct pollfd));
|
|
|
if (pfd == TD_NULL) {
|
|
|
log_err("in proc_variation_poll, calloc failed, pfd is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
set_thread_name_from_proc_path(proc->path);
|
|
|
|
|
|
pfd->events = POLLIN;
|
|
|
pfd->revents = 0;
|
|
|
|
|
|
// 循环检测poll的返回值,检测是否有被触发
|
|
|
while (1) {
|
|
|
// 落盘次数超过阈值后,会尝试关闭并重新打开proc,打开失败时重复尝试
|
|
|
pfd->fd = try_close_and_open_proc(pfd->fd, proc->path, loop_time);
|
|
|
ret = check_dir_exits(proc);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in proc_variation_poll, make_snapshot_dir failed.\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
ready = poll(pfd, 1, -1);
|
|
|
if (ready < 0) {
|
|
|
log_err("in proc_variation_poll, poll failed.\n");
|
|
|
goto OUT1;
|
|
|
} else if ((ready > 0) && ((pfd->revents != 0) && (pfd->revents & POLLIN))) {
|
|
|
log_dbg("mention proc changed.\n");
|
|
|
ret = save_proc_file_variation(proc, pfd->fd);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in proc_variation_poll, save_proc_file_variation failed.\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
loop_time = (loop_time + 1) % MAX_TIME_TO_ARCHIVE;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
close(pfd->fd);
|
|
|
free(pfd);
|
|
|
return (void *)TD_SUCCESS;
|
|
|
OUT1:
|
|
|
close(pfd->fd);
|
|
|
free(pfd);
|
|
|
OUT:
|
|
|
return (void *)TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 周期落盘主逻辑
|
|
|
static int cycle_archive(proc_node *node, pthread_t *tid)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
if (node == TD_NULL) {
|
|
|
log_err("in cycle_archive, node is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
ret = check_dir_exits(node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("check_dir_exits failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 创建新线程,执行save_proc_file函数,实现snap落盘
|
|
|
ret = pthread_create(tid, TD_NULL, save_proc_file_cycle, node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in cycle_archive, pthread_create failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 变化落盘主逻辑
|
|
|
static int variation_archive(proc_node *node, pthread_t *tid)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
if (node == TD_NULL) {
|
|
|
log_err("in variation_archive, node is null ptr.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
ret = check_dir_exits(node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("check_dir_exits failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = pthread_create(tid, TD_NULL, proc_variation_poll, node);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in variation_archive, pthread_create failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 设定cpu占用率检测时钟
|
|
|
static void *thread_watch_cpu_usage()
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
struct sigevent event;
|
|
|
ret = memset_s(&event, sizeof(struct sigevent), 0, sizeof(struct sigevent));
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in thread_watch_cpu_usage, memset_s failed\n");
|
|
|
return (void *)TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
event.sigev_value.sival_ptr = TD_NULL;
|
|
|
event.sigev_notify = SIGEV_THREAD;
|
|
|
event.sigev_notify_function = check_cpu_usage;
|
|
|
ret = init_timer(CPU_TIMER_SEC, CPU_TIMER_SEC, event);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in thread_watch_cpu_usage, init_timer failed\n");
|
|
|
return (void *)TD_FAILURE;
|
|
|
}
|
|
|
return (void *)TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
// cpu占用率检测
|
|
|
static int cpu_monitor()
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
pthread_t pid;
|
|
|
ret = pthread_create(&pid, NULL, thread_watch_cpu_usage, NULL);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in cpu_monitor, pthread_creat failed\n");
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
pthread_join(pid, NULL);
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
// 落盘逻辑
|
|
|
static int proc_archive(int node_idx, pthread_t *cycle_archive_tid_idx, pthread_t *variation_archive_tid_idx)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
proc_node *node = &g_nodes[node_idx];
|
|
|
|
|
|
if (node_idx >= MAX_LEN || node_idx < 0) {
|
|
|
log_err("node_idx is overflow.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
if (node->mode == CYCLE_MODE) {
|
|
|
ret = cycle_archive(node, &(g_cycle_archive_tids[*cycle_archive_tid_idx])); // 周期落盘
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in proc_archive, cycle_archive failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
*cycle_archive_tid_idx = *cycle_archive_tid_idx + 1;
|
|
|
} else if (node->mode == VARIATION_MODE) {
|
|
|
ret = variation_archive(node, &(g_variation_archive_tids[*variation_archive_tid_idx])); // 变化落盘
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in proc_archive, cycle_archive failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
*variation_archive_tid_idx = *variation_archive_tid_idx + 1;
|
|
|
} else {
|
|
|
log_err("archive mode is error, please check\n");
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
/*
 * Request a write lock covering the whole file behind fd, without waiting.
 *
 * Returns the result of fcntl(F_SETLK): -1 on failure.
 */
static int lock_file(int fd)
{
    // l_start 0 from SEEK_SET with l_len 0 describes the whole file.
    struct flock whole_file = {
        .l_type = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start = 0,
        .l_len = 0,
        .l_pid = -1,
    };

    return fcntl(fd, F_SETLK, &whole_file);
}
|
|
|
|
|
|
static int is_running(const char *process_name)
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
int fd = -1;
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
char file_name[MAX_LEN] = {'\0'};
|
|
|
|
|
|
ret = sprintf_s(file_name, MAX_LEN, "%s/%s/%s.pid", PROC_SNAP_ROOT_DIR, LOG_FILE_PATH, process_name);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in is_running, sprintf_s failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
fd = open(file_name, O_CREAT | O_RDWR, FILE_PERM);
|
|
|
if (fd == TD_FAILURE) {
|
|
|
log_err("in is_running, open %s failed\n", file_name);
|
|
|
goto OUT;
|
|
|
}
|
|
|
ret = lock_file(fd);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
write_log("[Start Error] procdameon is already running, single instance mode refuse start another procdaemon");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
ftruncate(fd, 0);
|
|
|
ret = sprintf_s(buf, MAX_LEN, "%ld\n", (long)getpid());
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in is_running, sprintf_s failed\n");
|
|
|
goto OUT1;
|
|
|
}
|
|
|
write(fd, buf, MAX_LEN);
|
|
|
close(fd);
|
|
|
return TD_SUCCESS;
|
|
|
OUT1:
|
|
|
close(fd);
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
static int start_log()
|
|
|
{
|
|
|
char buf[MAX_LEN] = {'\0'};
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
ret = sprintf_s(buf, MAX_LEN, "[Start] procdaemon");
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("in start_log, sprintf_s failed\n");
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
write_log(buf);
|
|
|
return TD_SUCCESS;
|
|
|
}
|
|
|
|
|
|
// 初始化
|
|
|
static int procd_preparation()
|
|
|
{
|
|
|
// 创建procsnap落盘根目录,/data/vendor/procsnap
|
|
|
if (make_snapshot_root_dir() == TD_FAILURE) {
|
|
|
log_err("in procd_preparation, make_snapshot_root_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 创建procsnap日志文件夹,/data/vendor/procsnap/procd_log
|
|
|
if (make_log_dir() == TD_FAILURE) {
|
|
|
log_err("in procd_preparation make_log_dir failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 等待系统时钟初始化完成
|
|
|
while (is_system_timer_init() == TD_FAILURE) {
|
|
|
sleep(1);
|
|
|
}
|
|
|
|
|
|
// 写入启动日志
|
|
|
if (start_log() == TD_FAILURE) {
|
|
|
log_err("in main, start_log failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 检查进程单例
|
|
|
if (is_running(PROCESS_NAME) == TD_FAILURE) {
|
|
|
log_info("procd is running...\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 检测cpu占用率
|
|
|
if (cpu_monitor() == TD_FAILURE) {
|
|
|
log_err("in main, cpu_monitor failed\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
// 后续工作,等待线程结束并回收资源
|
|
|
static int procd_clean()
|
|
|
{
|
|
|
int i;
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
log_info("start procd_clean\n");
|
|
|
|
|
|
// 回收线程的tid
|
|
|
for (i = 0; i < MAX_LEN; i++) {
|
|
|
if (g_cycle_archive_tids[i] != -1) {
|
|
|
ret = pthread_join(g_cycle_archive_tids[i], TD_NULL);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("pthread_join failed for tid=%ld\n", g_cycle_archive_tids[i]);
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
if (g_variation_archive_tids[i] != -1) {
|
|
|
ret = pthread_join(g_variation_archive_tids[i], TD_NULL);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("pthread_join failed for tid=%ld\n", g_variation_archive_tids[i]);
|
|
|
goto OUT;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return TD_SUCCESS;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|
|
|
|
|
|
int main()
|
|
|
{
|
|
|
int ret = TD_FAILURE;
|
|
|
|
|
|
// 预备动作
|
|
|
ret = procd_preparation();
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("procd_preparation failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 主逻辑
|
|
|
ret = procd_main(CONF_PATH);
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("procd_main failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
// 收尾动作
|
|
|
ret = procd_clean();
|
|
|
if (ret == TD_FAILURE) {
|
|
|
log_err("procd_clean failed.\n");
|
|
|
goto OUT;
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
OUT:
|
|
|
return TD_FAILURE;
|
|
|
}
|