目的:在计算资源匮乏的服务器上,通过监控磁盘 I/O 负载情况,不受脚本执行时间影响,记录对磁盘读写超过设定阈值的程序的详细信息。
需求:
- 监控磁盘 I/O 负载情况,使用 /proc 文件系统直接读取进程 I/O 数据,避免使用资源消耗大的工具,实现轻量化监控;
- 该脚本运行时,每两秒采集一次所有程序对磁盘 I/O 的负载情况,脚本运行期间总共采集5次;
- 记录每一个程序对磁盘的 I/O 使用情况并计算平均使用情况,将磁盘平均读取速度大于50MiB/s的程序的详细信息记录到 log 文件中,并记录时间,以及对磁盘的使用情况;
- 记录每一个程序对磁盘的 I/O 使用情况并计算平均使用情况,将磁盘平均写入速度大于50MiB/s的程序的详细信息记录到 log 文件中,并记录时间,以及对磁盘的使用情况;
- 对于日志文件,采用在末尾追加的模式;
使用方法:使用一个定时程序(例如 cron),实现每隔一段时间执行一次该脚本。
#!/bin/bash
# Disk I/O monitoring script - precise-timing version
# Author: operations engineer
# Purpose: monitor per-process disk I/O load and record heavy I/O usage

# ---- Tunables ----
# Number of samples taken during one run of the script.
SAMPLE_COUNT=5
# Pause between two consecutive samples, in seconds (passed to sleep).
SAMPLE_INTERVAL=2
# Average read/write speed, in MiB/s, above which a process is reported.
THRESHOLD_MB=50
# Log file that receives the report lines (written in append mode).
LOG_FILE=/var/log/disk_io_monitor.log

# Create the directory holding the log file if it does not exist yet.
mkdir -p "$(dirname "$LOG_FILE")"
# Function: append one timestamped line to the log file.
# Globals:   LOG_FILE (read) - path of the log file, opened in append mode
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" appended to LOG_FILE
log_message() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
# Function: capture one sample of the per-process disk I/O counters.
# Arguments:
#   $1 - sample sequence number (accepted for caller compatibility; not used)
#   $2 - output file; one record per process is appended in the form
#        "sample_time|pid|name|read_bytes|write_bytes"
#   $3 - optional root of the proc filesystem (default: /proc)
# Notes:
#   Files are read with shell builtins only, so sampling costs one fork
#   (date) per sample instead of several forks per process.
get_single_sample() {
  local temp_file="$2"
  local proc_root="${3:-/proc}"
  local sample_time
  sample_time=$(date +%s.%N)
  local proc_dir pid proc_name read_bytes write_bytes field value

  # Walk every numeric directory under the proc root (one per process).
  for proc_dir in "$proc_root"/[0-9]*; do
    pid=${proc_dir##*/}
    [[ "$pid" =~ ^[0-9]+$ ]] || continue
    [[ -d "$proc_dir" && -r "$proc_dir/io" && -r "$proc_dir/comm" ]] || continue

    # Process name: first line of comm. The process may exit between the
    # readability check and the read, so errors are silenced on purpose.
    proc_name=""
    { IFS= read -r proc_name < "$proc_dir/comm"; } 2>/dev/null
    [[ -n "$proc_name" ]] || proc_name="unknown"
    # '|' is the record separator; a name containing it would shift fields.
    proc_name=${proc_name//|/_}

    # Match the field names exactly: a substring match on "write_bytes"
    # would also pick up the "cancelled_write_bytes" line.
    read_bytes=""
    write_bytes=""
    {
      while IFS=': ' read -r field value; do
        case "$field" in
          read_bytes) read_bytes=$value ;;
          write_bytes) write_bytes=$value ;;
        esac
      done < "$proc_dir/io"
    } 2>/dev/null

    # Skip processes whose counters could not be read (vanished / denied)
    # rather than recording empty values.
    [[ "$read_bytes" =~ ^[0-9]+$ && "$write_bytes" =~ ^[0-9]+$ ]] || continue

    printf '%s|%s|%s|%s|%s\n' \
      "$sample_time" "$pid" "$proc_name" "$read_bytes" "$write_bytes" \
      >> "$temp_file"
  done
}
# Function: take SAMPLE_COUNT samples and hand them over for evaluation.
# Globals:   SAMPLE_COUNT (read), SAMPLE_INTERVAL (read)
# Arguments: none
# Returns:   1 if the temporary directory cannot be created; otherwise the
#            status of the final cleanup.
# Flow:      sample_1.txt .. sample_N.txt are written into a private
#            temporary directory by get_single_sample, evaluated by
#            process_io_data, and the directory is removed afterwards.
get_process_io_stats() {
  local temp_dir sample

  # Without this check a failed mktemp leaves temp_dir empty and the samples
  # would be written to "/sample_N.txt".
  if ! temp_dir=$(mktemp -d) || [[ -z "$temp_dir" || ! -d "$temp_dir" ]]; then
    log_message "ERROR: failed to create temporary directory for samples"
    return 1
  fi

  for ((sample = 1; sample <= SAMPLE_COUNT; sample++)); do
    get_single_sample "$sample" "$temp_dir/sample_${sample}.txt"
    # Wait between samples, but not after the last one.
    if ((sample < SAMPLE_COUNT)); then
      sleep "$SAMPLE_INTERVAL"
    fi
  done

  # Compute the average of the per-interval speeds and log the offenders.
  process_io_data "$temp_dir"

  # Remove the sample files.
  rm -rf -- "${temp_dir:?}"
}
# Function: evaluate the collected samples and log processes with heavy I/O.
# Globals:   SAMPLE_COUNT, SAMPLE_INTERVAL, THRESHOLD_MB (read)
# Arguments: $1 - directory holding sample_1.txt .. sample_<SAMPLE_COUNT>.txt,
#                 each line "time|pid|name|read_bytes|write_bytes"
# Outputs:   report lines through log_message
# Method:    for every pair of consecutive samples the per-process byte
#            deltas are divided by the elapsed time; the resulting speeds
#            are averaged per process and compared with THRESHOLD_MB (MiB/s).
process_io_data() {
  local temp_dir=$1
  # key "<pid>_<name>" -> "sum_read_speed,sum_write_speed,interval_count"
  local -A process_speeds=()
  # key "<pid>_<name>" -> "time|read_bytes|write_bytes" of the previous sample
  local -A previous_data=()
  local -r uint_re='^[0-9]+$'
  local -r time_re='^[0-9]+([.][0-9]+)?$'
  local sample current_sample previous_sample key
  local prev_time curr_time pid proc_name
  local prev_read prev_write current_read current_write
  local read_diff write_diff time_diff read_speed write_speed
  local total_read_speed total_write_speed count
  local avg_read_speed avg_write_speed avg_read_mib avg_write_mib
  local read_over_threshold write_over_threshold cmd_line user

  # Speeds need a predecessor, so start with the second sample.
  for ((sample = 2; sample <= SAMPLE_COUNT; sample++)); do
    current_sample="$temp_dir/sample_${sample}.txt"
    previous_sample="$temp_dir/sample_$((sample - 1)).txt"
    [[ -f "$current_sample" ]] || continue
    [[ -f "$previous_sample" ]] || continue

    # Start from an empty map: entries left over from older samples would
    # otherwise be paired with the current sample.
    previous_data=()
    while IFS='|' read -r prev_time pid proc_name prev_read prev_write; do
      # Only strictly numeric fields are accepted. These values are later
      # used in shell arithmetic, which evaluates variable contents as
      # expressions, so a malformed record must never get that far.
      [[ "$prev_time" =~ $time_re ]] || continue
      [[ "$pid" =~ $uint_re ]] || continue
      [[ "$prev_read" =~ $uint_re ]] || continue
      [[ "$prev_write" =~ $uint_re ]] || continue
      previous_data["${pid}_${proc_name}"]="$prev_time|$prev_read|$prev_write"
    done < "$previous_sample"

    # Speed of every process during this interval.
    while IFS='|' read -r curr_time pid proc_name current_read current_write; do
      [[ "$curr_time" =~ $time_re ]] || continue
      [[ "$pid" =~ $uint_re ]] || continue
      [[ "$current_read" =~ $uint_re ]] || continue
      [[ "$current_write" =~ $uint_re ]] || continue

      key="${pid}_${proc_name}"
      [[ -n "${previous_data[$key]:-}" ]] || continue
      IFS='|' read -r prev_time prev_read prev_write <<< "${previous_data[$key]}"

      # Byte deltas over the interval (forced to base 10).
      read_diff=$((10#$current_read - 10#$prev_read))
      write_diff=$((10#$current_write - 10#$prev_write))
      # Negative deltas (e.g. the pid was reused) count as no I/O.
      if ((read_diff < 0)); then read_diff=0; fi
      if ((write_diff < 0)); then write_diff=0; fi

      # Most processes are idle; bc is only started when bytes actually moved.
      read_speed="0"
      write_speed="0"
      if ((read_diff > 0 || write_diff > 0)); then
        # Real elapsed time between the two samples, in seconds.
        time_diff=$(echo "scale=6; ($curr_time - $prev_time)" | bc 2>/dev/null || echo "$SAMPLE_INTERVAL")
        if [ "$(echo "$time_diff > 0" | bc 2>/dev/null || echo "1")" = "1" ]; then
          # Speed in bytes per second.
          read_speed=$(echo "scale=2; $read_diff / $time_diff" | bc 2>/dev/null || echo "0")
          write_speed=$(echo "scale=2; $write_diff / $time_diff" | bc 2>/dev/null || echo "0")
        fi
      fi

      # Accumulate the speeds; adding a literal "0" is skipped because it
      # would not change the running sum.
      if [[ -z "${process_speeds[$key]:-}" ]]; then
        process_speeds["$key"]="$read_speed,$write_speed,1"
      else
        IFS=',' read -r total_read_speed total_write_speed count <<< "${process_speeds[$key]}"
        if [[ "$read_speed" != "0" ]]; then
          total_read_speed=$(echo "scale=2; $total_read_speed + $read_speed" | bc 2>/dev/null || echo "$total_read_speed")
        fi
        if [[ "$write_speed" != "0" ]]; then
          total_write_speed=$(echo "scale=2; $total_write_speed + $write_speed" | bc 2>/dev/null || echo "$total_write_speed")
        fi
        process_speeds["$key"]="$total_read_speed,$total_write_speed,$((count + 1))"
      fi
    done < "$current_sample"
  done

  # Would a speed of exactly 0 exceed the threshold? Only when it would not
  # may processes without any I/O be skipped below.
  local idle_is_over
  idle_is_over=$(echo "0 > $THRESHOLD_MB" | bc 2>/dev/null || echo "0")

  # Average the speeds and report processes above the threshold.
  local high_io_found=false
  for key in "${!process_speeds[@]}"; do
    IFS='_' read -r pid proc_name <<< "$key"
    IFS=',' read -r total_read_speed total_write_speed count <<< "${process_speeds[$key]}"

    if [[ "$total_read_speed" == "0" && "$total_write_speed" == "0" && "$idle_is_over" != "1" ]]; then
      continue
    fi

    # Average speed in bytes per second.
    avg_read_speed=$(echo "scale=2; $total_read_speed / $count" | bc 2>/dev/null || echo "0")
    avg_write_speed=$(echo "scale=2; $total_write_speed / $count" | bc 2>/dev/null || echo "0")
    # Convert to MiB/s (1 MiB = 1024*1024 bytes).
    avg_read_mib=$(echo "scale=2; $avg_read_speed / 1024 / 1024" | bc 2>/dev/null || echo "0")
    avg_write_mib=$(echo "scale=2; $avg_write_speed / 1024 / 1024" | bc 2>/dev/null || echo "0")
    # Threshold check.
    read_over_threshold=$(echo "$avg_read_mib > $THRESHOLD_MB" | bc 2>/dev/null || echo "0")
    write_over_threshold=$(echo "$avg_write_mib > $THRESHOLD_MB" | bc 2>/dev/null || echo "0")

    if [ "$read_over_threshold" = "1" ] || [ "$write_over_threshold" = "1" ]; then
      high_io_found=true
      # Extra details; the process may be gone by now, so both are optional.
      cmd_line="unknown"
      user="unknown"
      if [ -r "/proc/$pid/cmdline" ]; then
        # cmdline is NUL-separated; turn it into a space-separated string.
        cmd_line=$({ tr '\0' ' ' < "/proc/$pid/cmdline" | sed 's/ $//'; } 2>/dev/null || echo "unknown")
      fi
      if [ -r "/proc/$pid/status" ]; then
        # Second column of the "Uid:" line (a numeric uid, not a user name).
        user=$(grep "Uid:" "/proc/$pid/status" 2>/dev/null | awk '{print $2}' || echo "unknown")
      fi
      log_message "HIGH_IO_PROCESS pid=$pid, name=$proc_name, user=$user"
      log_message " Read: ${avg_read_mib} MiB/s (threshold: ${THRESHOLD_MB} MiB/s)"
      log_message " Write: ${avg_write_mib} MiB/s (threshold: ${THRESHOLD_MB} MiB/s)"
      log_message " Sample count: $count"
      log_message " Command: $cmd_line"
      log_message "---"
    fi
  done

  if [ "$high_io_found" = "false" ]; then
    log_message "No processes found with I/O above ${THRESHOLD_MB} MiB/s"
  fi
  log_message "=================================================="
}
# Function: verify that the external tools the script needs are installed.
# Arguments: none
# Outputs:   on failure, an error message and an install hint on stderr
# Returns:   0 when nothing is missing; otherwise exits the script with 1
check_dependencies() {
  local missing_deps=()

  # bc performs the floating-point speed calculations.
  if ! command -v bc &> /dev/null; then
    missing_deps+=("bc")
  fi

  if ((${#missing_deps[@]} > 0)); then
    # Diagnostics belong on stderr so they never mix with regular output.
    {
      echo "错误: 缺少必要的依赖包: ${missing_deps[*]}"
      echo "请运行: sudo apt-get install bc"
    } >&2
    exit 1
  fi
}
# Main routine: check prerequisites, then run one monitoring pass that is
# bracketed by a start and a completion entry in the log.
main() {
  # Abort early when a required tool is missing.
  check_dependencies

  # Start marker, including the sampling parameters in effect.
  local start_msg="开始磁盘I/O监控 - 采集${SAMPLE_COUNT}次数据,每次间隔${SAMPLE_INTERVAL}秒"
  log_message "$start_msg"

  # Sample, evaluate and report.
  get_process_io_stats

  # Completion marker.
  log_message "磁盘I/O监控完成"
}
# Run the main function, forwarding the script's command-line arguments
main "$@"
运行之后的 Log 文件格式如下:
[2025-10-01 14:11:50] 开始磁盘I/O监控 - 采集5次数据,每次间隔2秒
[2025-10-01 14:12:29] No processes found with I/O above 50 MiB/s
[2025-10-01 14:12:29] ==================================================
[2025-10-01 14:12:29] 磁盘I/O监控完成
[2025-10-01 14:12:50] 开始磁盘I/O监控 - 采集5次数据,每次间隔2秒
[2025-10-01 14:15:39] HIGH_IO_PROCESS pid=147318, name=fail2ban-server, user=0
[2025-10-01 14:15:41] Read: 83.81 MiB/s (threshold: 50 MiB/s)
[2025-10-01 14:15:42] Write: 0 MiB/s (threshold: 50 MiB/s)
[2025-10-01 14:15:42] Sample count: 4
[2025-10-01 14:15:42] Command: /usr/bin/python3 /usr/bin/fail2ban-server -xf start
[2025-10-01 14:15:42] ---
[2025-10-01 14:16:40] ==================================================
[2025-10-01 14:16:40] 磁盘I/O监控完成
评论区