目 录CONTENT

文章目录

Ubuntu 服务器轻量级磁盘 I/O 负载监控程序

KD Mercury
2025-10-01 / 0 评论 / 0 点赞 / 9 阅读 / 0 字
温馨提示:
本文最后更新于2025-10-01,若内容或图片失效,请留言反馈。 多喝热水。想想有多久没有仔细瞭望星空了(

目的:在计算资源匮乏的服务器上监控磁盘 I/O 负载情况,按相邻两次采样的实际时间间隔计算读写速度(因此结果不受脚本自身执行耗时的影响),并记录磁盘读写速度超过设定阈值的程序的详细信息。

需求:

  1. 监控磁盘 I/O 负载情况,使用/proc文件系统直接读取进程I/O数据,避免使用资源消耗大的工具,实现轻量化监控;
  2. 该脚本运行时,每两秒采集一次所有程序对磁盘 I/O 的负载情况,脚本运行期间总共采集5次;
  3. 记录每一个程序对磁盘的 I/O 使用情况并计算平均使用情况,将磁盘平均读取速度大于50MiB/s的程序的详细信息记录到 log 文件中,并记录时间,以及对磁盘的使用情况;
  4. 记录每一个程序对磁盘的 I/O 使用情况并计算平均使用情况,将磁盘平均写入速度大于50MiB/s的程序的详细信息记录到 log 文件中,并记录时间,以及对磁盘的使用情况;
  5. 对于日志文件,采用在末尾追加的模式;

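使用方法:使用一个定时程序(例如 cron),实现每隔一段时间执行一次该脚本,例如 `*/5 * * * * /usr/local/bin/disk_io_monitor.sh`(脚本路径仅为示例)。执行间隔应大于脚本单次运行所需的时间,避免多个实例同时运行。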

#!/bin/bash

# Disk I/O monitoring script - precise timing version
# Author: operations engineer
# Purpose: monitor per-process disk I/O load and record heavy I/O usage

# Configuration parameters
# Number of snapshots taken during one run of the script.
SAMPLE_COUNT=5
# Pause between two snapshots; passed directly to `sleep` (seconds).
SAMPLE_INTERVAL=2
# Average read or write speed, in MiB/s, above which a process is logged.
THRESHOLD_MB=50
# Log file; log_message appends to it (never truncates).
LOG_FILE="/var/log/disk_io_monitor.log"

# Make sure the directory that holds the log file exists
mkdir -p "$(dirname "$LOG_FILE")"

# Function: append one timestamped line to the log file.
# Arguments: $1 - message text
# Globals:   LOG_FILE (read) - file the line is appended to
# Output format: "[YYYY-MM-DD HH:MM:SS] <message>"
log_message() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$1" >> "$LOG_FILE"
}

# Function: take one snapshot of the cumulative disk I/O counters of every process.
# Arguments:
#   $1 - sample number (accepted for interface compatibility; not used here)
#   $2 - file the snapshot is appended to
#   $3 - optional proc filesystem root to scan (default: /proc)
# Output: one record per process appended to $2, in the form
#   <sample_time>|<pid>|<process name>|<read_bytes>|<write_bytes>
# where <sample_time> is `date +%s.%N` taken once at the start of the snapshot.
#
# Only shell builtins are used inside the per-process loop, so a snapshot
# does not fork helper processes for every PID.
get_single_sample() {
    local temp_file="$2"
    local proc_root="${3:-/proc}"
    local sample_time
    sample_time=$(date +%s.%N)

    local entry pid proc_name read_bytes write_bytes field value
    for entry in "$proc_root"/[0-9]*; do
        pid=${entry##*/}
        # Keep purely numeric names only; this also rejects the literal
        # pattern that is left behind when the glob matches nothing.
        [[ "$pid" =~ ^[0-9]+$ ]] || continue
        [[ -r "$entry/io" && -r "$entry/comm" ]] || continue

        # Process name; falls back to "unknown" when it cannot be read.
        proc_name=""
        { IFS= read -r proc_name < "$entry/comm"; } 2>/dev/null
        [[ -n "$proc_name" ]] || proc_name="unknown"
        # '|' is the record separator, so it must not appear inside the name.
        proc_name=${proc_name//|/_}

        # Match the field names exactly: a substring match on "write_bytes"
        # would also pick up the "cancelled_write_bytes" line.
        read_bytes=""
        write_bytes=""
        {
            while IFS=': ' read -r field value; do
                case "$field" in
                    read_bytes) read_bytes=$value ;;
                    write_bytes) write_bytes=$value ;;
                esac
            done < "$entry/io"
        } 2>/dev/null

        # The process may have exited while it was being read; an incomplete
        # record would corrupt the rate calculation, so drop it.
        [[ "$read_bytes" =~ ^[0-9]+$ && "$write_bytes" =~ ^[0-9]+$ ]] || continue

        printf '%s|%s|%s|%s|%s\n' \
            "$sample_time" "$pid" "$proc_name" "$read_bytes" "$write_bytes"
    done >> "$temp_file"
}

# Function: collect SAMPLE_COUNT snapshots and hand them over for analysis.
# Globals:   SAMPLE_COUNT (read), SAMPLE_INTERVAL (read)
# Calls:     get_single_sample, process_io_data
# Returns:   1 when the temporary directory cannot be created, otherwise the
#            status of the final cleanup.
# Snapshots are stored as <tmpdir>/sample_<n>.txt; the directory is removed
# once process_io_data has returned.
get_process_io_stats() {
    # Abort when mktemp fails: with an empty path the sample files would be
    # written as /sample_<n>.txt in the filesystem root.
    local temp_dir
    if ! temp_dir=$(mktemp -d) || [[ -z "$temp_dir" || ! -d "$temp_dir" ]]; then
        printf 'disk_io_monitor: failed to create temporary directory\n' >&2
        return 1
    fi

    local sample temp_file
    for (( sample = 1; sample <= SAMPLE_COUNT; sample++ )); do
        temp_file="$temp_dir/sample_${sample}.txt"
        get_single_sample "$sample" "$temp_file"

        # Wait between snapshots, but not after the last one.
        if (( sample < SAMPLE_COUNT )); then
            sleep "$SAMPLE_INTERVAL"
        fi
    done

    # Compute the averaged speeds and write the log entries.
    process_io_data "$temp_dir"

    # Remove the snapshots.
    rm -rf -- "$temp_dir"
}

# Function: compute the average instantaneous I/O speed of every process and
# log the processes that exceed the threshold.
# Arguments: $1 - directory holding sample_1.txt .. sample_<SAMPLE_COUNT>.txt,
#                 each line being <time>|<pid>|<name>|<read_bytes>|<write_bytes>
# Globals:   SAMPLE_COUNT (read), THRESHOLD_MB (read)
# Calls:     log_message
#
# For each process, every snapshot is compared with the most recent earlier
# snapshot of the same pid and name. The byte difference divided by the real
# time difference gives one instantaneous speed, and the speeds are averaged.
# A process is logged when its average read or write speed is above
# THRESHOLD_MB MiB/s. Speeds are printed with two decimals.
#
# All arithmetic runs in a single awk process rather than in one helper
# process per calculation.
process_io_data() {
    local temp_dir=$1

    # Collect the snapshot files that exist, in sampling order.
    local -a sample_files=()
    local idx candidate
    for (( idx = 1; idx <= SAMPLE_COUNT; idx++ )); do
        candidate="$temp_dir/sample_${idx}.txt"
        if [[ -f "$candidate" ]]; then
            sample_files+=("$candidate")
        fi
    done

    # awk prints one line per offending process:
    #   <pid>|<avg read MiB/s>|<avg write MiB/s>|<interval count>|<name>
    # LC_ALL=C keeps the decimal separator a dot.
    local report=""
    if (( ${#sample_files[@]} > 0 )); then
        report=$(LC_ALL=C awk -F'|' -v limit="$THRESHOLD_MB" '
            # Ignore records that are incomplete or not numeric where required.
            NF < 5 { next }
            {
                pid = $2
                rd = $(NF - 1)
                wr = $NF
                if ($1 !~ /^[0-9]+(\.[0-9]+)?$/ || pid !~ /^[0-9]+$/ ||
                    rd !~ /^[0-9]+$/ || wr !~ /^[0-9]+$/) {
                    next
                }
                ts = $1 + 0
                rd = rd + 0
                wr = wr + 0

                # The name is everything between the pid and the two counters,
                # so a separator inside the name does not shift the counters.
                name = $3
                for (i = 4; i <= NF - 2; i++) {
                    name = name "|" $i
                }
                key = pid SUBSEP name

                if (key in seen_ts) {
                    dt = ts - seen_ts[key]
                    dr = rd - seen_rd[key]
                    dw = wr - seen_wr[key]
                    # A counter that went backwards contributes no traffic.
                    if (dr < 0) dr = 0
                    if (dw < 0) dw = 0
                    if (dt > 0) {
                        sum_rd[key] += dr / dt
                        sum_wr[key] += dw / dt
                    }
                    if (!(key in samples)) {
                        order[++total] = key
                        pid_of[key] = pid
                        name_of[key] = name
                    }
                    samples[key]++
                }

                seen_ts[key] = ts
                seen_rd[key] = rd
                seen_wr[key] = wr
            }
            END {
                mib = 1024 * 1024
                for (i = 1; i <= total; i++) {
                    key = order[i]
                    avg_rd = sum_rd[key] / samples[key] / mib
                    avg_wr = sum_wr[key] / samples[key] / mib
                    if (avg_rd > limit + 0 || avg_wr > limit + 0) {
                        printf "%s|%.2f|%.2f|%d|%s\n", pid_of[key], avg_rd, avg_wr, samples[key], name_of[key]
                    }
                }
            }
        ' "${sample_files[@]}")
    fi

    # Log every offending process together with its command line and UID.
    local high_io_found=false
    local pid avg_read_mib avg_write_mib count proc_name cmd_line user
    while IFS='|' read -r pid avg_read_mib avg_write_mib count proc_name; do
        # An empty report still yields one empty line from the here-string.
        [[ -n "$pid" ]] || continue
        high_io_found=true

        # Extra details are best effort: the process may already be gone.
        cmd_line=""
        user=""
        if [[ -r "/proc/$pid/cmdline" ]]; then
            cmd_line=$( { tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null )
            cmd_line=${cmd_line% }
        fi
        if [[ -r "/proc/$pid/status" ]]; then
            # Second column of the "Uid:" line (a numeric UID).
            user=$( { awk '/^Uid:/ { print $2; exit }' "/proc/$pid/status"; } 2>/dev/null )
        fi
        [[ -n "$cmd_line" ]] || cmd_line="unknown"
        [[ -n "$user" ]] || user="unknown"

        log_message "HIGH_IO_PROCESS pid=$pid, name=$proc_name, user=$user"
        log_message "  Read: ${avg_read_mib} MiB/s (threshold: ${THRESHOLD_MB} MiB/s)"
        log_message "  Write: ${avg_write_mib} MiB/s (threshold: ${THRESHOLD_MB} MiB/s)"
        log_message "  Sample count: $count"
        log_message "  Command: $cmd_line"
        log_message "---"
    done <<< "$report"

    if [[ "$high_io_found" == "false" ]]; then
        log_message "No processes found with I/O above ${THRESHOLD_MB} MiB/s"
    fi

    log_message "=================================================="
}

# Function: verify that the external tools the script needs are installed.
# Prints an error plus an install hint and exits with status 1 when a tool
# is missing; returns 0 otherwise.
check_dependencies() {
    local -a absent=()
    local tool

    # bc is the only tool checked
    for tool in bc; do
        command -v "$tool" > /dev/null 2>&1 || absent+=("$tool")
    done

    # Nothing missing: carry on.
    (( ${#absent[@]} == 0 )) && return 0

    echo "错误: 缺少必要的依赖包: ${absent[*]}"
    echo "请运行: sudo apt-get install bc"
    exit 1
}

# Main entry point: check dependencies, then run one monitoring pass
# bracketed by a start line and a completion line in the log.
main() {
    # Exits the script when a required tool is missing.
    check_dependencies

    # Start marker; states the sampling plan for this run.
    local start_banner="开始磁盘I/O监控 - 采集${SAMPLE_COUNT}次数据,每次间隔${SAMPLE_INTERVAL}秒"
    log_message "$start_banner"

    # Take the snapshots and log any heavy I/O processes.
    get_process_io_stats

    # Completion marker.
    log_message "磁盘I/O监控完成"
}

# Run the main function, forwarding all command-line arguments to it
main "$@"

运行之后的 Log 文件格式如下:

[2025-10-01 14:11:50] 开始磁盘I/O监控 - 采集5次数据,每次间隔2秒
[2025-10-01 14:12:29] No processes found with I/O above 50 MiB/s
[2025-10-01 14:12:29] ==================================================
[2025-10-01 14:12:29] 磁盘I/O监控完成
[2025-10-01 14:12:50] 开始磁盘I/O监控 - 采集5次数据,每次间隔2秒
[2025-10-01 14:15:39] HIGH_IO_PROCESS pid=147318, name=fail2ban-server, user=0
[2025-10-01 14:15:41]   Read: 83.81 MiB/s (threshold: 50 MiB/s)
[2025-10-01 14:15:42]   Write: 0 MiB/s (threshold: 50 MiB/s)
[2025-10-01 14:15:42]   Sample count: 4
[2025-10-01 14:15:42]   Command: /usr/bin/python3 /usr/bin/fail2ban-server -xf start
[2025-10-01 14:15:42] ---
[2025-10-01 14:16:40] ==================================================
[2025-10-01 14:16:40] 磁盘I/O监控完成
0

评论区