PromQL 常用查询语句示例
· 阅读需 9 分钟
本文收集整理了在 Prometheus 监控中最常用的 PromQL 查询语句,方便日常查询和告警规则编写时参考。
系统监控
CPU 相关查询
# CPU 使用率
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# CPU 各个状态占比
rate(node_cpu_seconds_total[5m]) * 100
# 按核心统计 CPU 使用率
avg by (cpu) (rate(node_cpu_seconds_total{mode!="idle"}[5m]) * 100)
# CPU 负载
node_load1 # 1分钟负载
node_load5 # 5分钟负载
node_load15 # 15分钟负载
内存相关查询
# 内存使用率
(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) * 100
# 可用内存
node_memory_MemAvailable_bytes / 1024 / 1024 / 1024
# 内存使用量前五的进程
topk(5, sum by (name) (container_memory_usage_bytes{container!=""}))
# Swap 使用率
(1 - node_memory_SwapFree_bytes/node_memory_SwapTotal_bytes) * 100
磁盘相关查询
# 磁盘使用率
100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes)
# 磁盘读写速率
rate(node_disk_read_bytes_total[5m])
rate(node_disk_written_bytes_total[5m])
# 磁盘 IO 使用率
rate(node_disk_io_time_seconds_total[5m]) * 100
# 预测磁盘满的时间(小时)
predict_linear(node_filesystem_free_bytes[1h], 4 * 3600) < 0
网络相关查询
# 网络接口流量
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])
# 网络接口错误率
rate(node_network_receive_errs_total[5m])
rate(node_network_transmit_errs_total[5m])
# TCP 连接状态
node_netstat_Tcp_CurrEstab
容器监控
容器资源使用
# 容器 CPU 使用率
sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (container) * 100
# 容器内存使用量(GB)
sum(container_memory_usage_bytes{container!=""}) by (container) / 1024^3
# 容器网络 IO
sum(rate(container_network_receive_bytes_total[5m])) by (container)
sum(rate(container_network_transmit_bytes_total[5m])) by (container)
应用监控
HTTP 服务监控
# 请求速率(QPS)
sum(rate(http_requests_total[5m])) by (handler)
# 错误率
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m])) * 100
# 平均响应时间
rate(http_request_duration_seconds_sum[5m])
/
rate(http_request_duration_seconds_count[5m])
# P90/P95/P99 延迟
histogram_quantile(0.90, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
JVM 监控
# 堆内存使用率
jvm_memory_bytes_used{area="heap"}
/
jvm_memory_bytes_max{area="heap"} * 100
# GC 次数
rate(jvm_gc_collection_seconds_count[5m])
# GC 耗时
rate(jvm_gc_collection_seconds_sum[5m])
# 线程数
jvm_threads_current
告警规则示例
系统告警
# CPU 使用率过高
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
# 内存使用率过高
(1 - node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes) * 100 > 90
# 磁盘使用率过高
100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes) > 85
# 磁盘将在 4 小时内满
predict_linear(node_filesystem_free_bytes[1h], 4 * 3600) < 0
应用告警
# 服务实例不可用
up{job="my-service"} == 0
# 错误率过高
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m])) * 100 > 5
# 响应延迟过高
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 1
性能优化查询
# 查找高基数的指标
topk(10, count by (__name__) ({__name__=~".+"}))
# 查找最活跃的 targets
topk(10, count by (job) (up))
# 查找采 集速率最高的指标
topk(10, rate(prometheus_tsdb_head_samples_appended_total[5m]))
高级监控场景
Kubernetes 集群监控
# 节点 Ready 状态
kube_node_status_condition{condition="Ready",status="true"}
# Pod 运行状态统计
sum by (namespace) (kube_pod_status_phase{phase="Running"})
# 容器重启次数
sum by (namespace, pod) (kube_pod_container_status_restarts_total)
# 节点资源压力
sum by (node) (
kube_pod_container_resource_requests{resource="cpu"}
) / sum by (node) (
kube_node_status_allocatable{resource="cpu"}
) * 100
# 命名空间资源配额使用率
sum by (namespace) (
kube_resourcequota{type="used"}
) / sum by (namespace) (
kube_resourcequota{type="hard"}
) * 100
数据库监控
MySQL 监控
# 连接数使用率
mysql_global_status_threads_connected
/
mysql_global_variables_max_connections * 100
# 慢查询统计
rate(mysql_global_status_slow_queries[5m])
# InnoDB 缓冲池使用率
mysql_global_status_innodb_buffer_pool_pages_total
-
mysql_global_status_innodb_buffer_pool_pages_free
# 事务统计
rate(mysql_global_status_commands_total{command="commit"}[5m])
Redis 监控
# 内存使用率
redis_memory_used_bytes / redis_memory_max_bytes * 100
# 命令执行率
rate(redis_commands_total[5m])
# 键过期率
rate(redis_expired_keys_total[5m])
# 连接数
redis_connected_clients
消息队列监控
RabbitMQ 监控
# 队列消息堆积
rabbitmq_queue_messages_ready
# 消费者数量
rabbitmq_queue_consumers
# 消息处理率
rate(rabbitmq_queue_messages_delivered_total[5m])
# 未确认消息数
rabbitmq_queue_messages_unacknowledged
Kafka 监控
# 主题消息率
rate(kafka_topic_partition_current_offset[5m])
# 消费组延迟
sum by (topic) (
kafka_consumergroup_lag
)
# Broker 活跃连接数
kafka_server_socket_server_metrics_connection_count
# 副本同步延迟
kafka_replica_lag
网关和代理监控
Nginx 监控
# 请求处理率
rate(nginx_http_requests_total[5m])
# 活跃连接数
nginx_connections_active
# 错误率
sum(rate(nginx_http_requests_total{status=~"5.."}[5m]))
/
sum(rate(nginx_http_requests_total[5m])) * 100
# 上游响应时间
histogram_quantile(0.95,
rate(nginx_upstream_response_time_seconds_bucket[5m])
)
应用性能监控
服务依赖监控
# 服务调用错误率
sum by (service) (
rate(service_calls_total{result="error"}[5m])
) / sum by (service) (
rate(service_calls_total[5m])
) * 100
# 服务依赖可用性
sum by (dependency) (
rate(dependency_up[5m])
)
# 外部服务调用延迟
histogram_quantile(0.95,
rate(external_service_response_time_bucket[5m])
)
缓存性能监控
# 缓存命中率
sum(rate(cache_hits_total[5m]))
/
sum(rate(cache_requests_total[5m])) * 100
# 缓存过期率
rate(cache_evictions_total[5m])
# 缓存延迟分布
histogram_quantile(0.99,
rate(cache_operation_duration_seconds_bucket[5m])
)
日志相关监控
# 错误日志率
rate(log_messages_total{level="error"}[5m])
# 按照服务统计错误数
sum by (service) (
increase(log_errors_total[1h])
)
# 日志写入延迟
histogram_quantile(0.95,
rate(log_write_duration_seconds_bucket[5m])
)
安全监控
# 认证失败次数
rate(auth_failures_total[5m])
# IP 封禁次数
increase(ip_blacklist_total[1h])
# HTTPS 证书过期时间(天)
(
ssl_certificate_expiry_timestamp_seconds
-
time()
) / 86400
# 异常登录尝试
sum by (user) (
rate(failed_login_attempts_total[5m])
)
高级告警规则
趋势预测告警
# 预测 4 小时后的值是否超过阈值
predict_linear(
http_requests_total[1h],
4 * 3600
) > 1000
# 异常值检测
abs(
rate(http_requests_total[5m])
-
avg_over_time(rate(http_requests_total[5m])[1h:5m])
) > 2
复合告警条件
# CPU 和内存同时高负载
(
instance:cpu_usage:rate5m > 80
and
instance:memory_usage:ratio > 80
)
# 服务多项指标异常
(
service:error_rate:5m > 1
and
service:latency:p95_5m > 0.5
and
service:success_rate:5m < 99
)
最佳实践补充
查询优化进阶
- 使用子查询优化复杂计算
# 优化前
sum(rate(http_requests_total[5m])) / sum(rate(http_requests_total[5m]))
# 优化后
sum(rate(http_requests_total[5m]))
/
group(sum(rate(http_requests_total[5m])))
- 使用 recording rules 优化常用查询
# 记录规则示例
rules:
- record: job:http_errors:rate5m
expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
微服务监控
服务网格监控
# 服务间调用延迟
histogram_quantile(0.95,
sum(rate(istio_request_duration_milliseconds_bucket[5m])) by (source_app, destination_app, le)
)
# 服务错误率
sum(rate(istio_requests_total{response_code=~"5.*"}[5m])) by (destination_service)
/
sum(rate(istio_requests_total[5m])) by (destination_service) * 100
# 服务重试率
sum(rate(istio_requests_total{response_flags="RR"}[5m])) by (destination_service)
/
sum(rate(istio_requests_total[5m])) by (destination_service) * 100
# 断路器开启次数
increase(istio_circuit_breaker_open_total[5m])
链路追踪相关
# 追踪采样率
sum(rate(spans_sampled_total[5m]))
/
sum(rate(spans_total[5m])) * 100
# 追踪延迟分布
histogram_quantile(0.95, sum(rate(trace_duration_seconds_bucket[5m])) by (service, le))
# 追踪错误率
sum(rate(spans_errors_total[5m])) by (service)
/
sum(rate(spans_total[5m])) by (service) * 100
云原生组件监控
Etcd 监控
# Leader 选举状态
etcd_server_is_leader
# 写入延迟
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
# 数据库大小
etcd_debugging_mvcc_db_total_size_in_bytes
# Raft 提案失败率
rate(etcd_server_proposals_failed_total[5m])
/
rate(etcd_server_proposals_committed_total[5m]) * 100
CoreDNS 监控
# DNS 查询率
sum(rate(coredns_dns_requests_total[5m])) by (zone)
# 查询延迟
histogram_quantile(0.95, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (zone, le))
# 错误率
sum(rate(coredns_dns_responses_total{rcode!="NOERROR"}[5m]))
/
sum(rate(coredns_dns_responses_total[5m])) * 100
存储系统监控
Elasticsearch 监控
# 集群状态
elasticsearch_cluster_health_status{color="green"}
# 索引写入延迟
rate(elasticsearch_index_stats_indexing_index_time_seconds_total[5m])
/
rate(elasticsearch_index_stats_indexing_index_total[5m])
# JVM 堆使用率
elasticsearch_jvm_memory_used_bytes{area="heap"}
/
elasticsearch_jvm_memory_max_bytes{area="heap"} * 100
# 搜索延迟
rate(elasticsearch_indices_search_fetch_time_seconds[5m])
/
rate(elasticsearch_indices_search_fetch_total[5m])
MongoDB 监控
# 连接数
mongodb_connections{state="current"}
# 操作延迟
rate(mongodb_op_latencies_latency_total[5m])
/
rate(mongodb_op_latencies_ops_total[5m])
# 复制延迟
mongodb_replset_member_optime_date{state="SECONDARY"}
-
mongodb_replset_member_optime_date{state="PRIMARY"}
# 慢查询数
rate(mongodb_mongod_metrics_query_executor_total{state="scanned_objects"}[5m])
网络监控进阶
网络质量监控
# 网络延迟
avg_over_time(ping_average_response_ms[5m])
# 丢包率
rate(ping_loss_count[5m])
/
rate(ping_count[5m]) * 100
# 网络抖动
stddev_over_time(ping_average_response_ms[5m])
协议监控
# TCP 重传率
rate(node_netstat_Tcp_RetransSegs[5m])
/
rate(node_netstat_Tcp_OutSegs[5m]) * 100
# TCP 连接状态分布
node_netstat_Tcp_CurrEstab
# UDP 缓冲区溢出
rate(node_netstat_Udp_RcvbufErrors[5m])
自定义业务监控
业务指标监控
# 订单处理速率
rate(business_orders_processed_total[5m])
# 支付成功率
sum(rate(payment_transactions_total{status="success"}[5m]))
/
sum(rate(payment_transactions_total[5m])) * 100
# 用户会话数
sum(rate(user_sessions_total{status="active"}[5m]))
# 业务错误分布
topk(10, sum by (error_type) (rate(business_errors_total[1h])))
用户体验监控
# 页面加载时间
histogram_quantile(0.95, sum(rate(page_load_time_seconds_bucket[5m])) by (page, le))
# JS 错误率
sum(rate(frontend_errors_total[5m])) by (error_type)
# API 响应时间
histogram_quantile(0.99, sum(rate(api_response_time_seconds_bucket[5m])) by (api, le))
高级告警场景
智能告警
# 动态阈值告警
abs(
rate(http_requests_total[5m])
-
avg_over_time(rate(http_requests_total[5m])[1d:5m])
) > stddev_over_time(rate(http_requests_total[5m])[1d:5m]) * 3
# 季节性感知告警
(
rate(http_requests_total[5m])
/
avg_over_time(rate(http_requests_total[5m] offset 7d)[1h:5m])
) > 2
多维度告警
# 服务质量综合告警
(
service:error_rate:5m > 1
and
service:latency:p95_5m > 0.5
and
service:success_rate:5m < 99
and
service:traffic:rate5m > 10
)
# 资源饱和度告警
(
instance:cpu_usage:rate5m > 80
or
instance:memory_usage:ratio > 80
or
instance:disk_usage:ratio > 85
)
and
instance:load1 > count(instance:cpu_cores) by (instance)