Goal: I want to be notified 5 minutes after an alert condition occurs, and then every 30 minutes after that.
I have played around with .count() and the time functions without success; I don't want to hand-roll that bookkeeping, and I can't come up with a way to make it user-friendly and reliable.
The solution I'm using right now is two streams with separate windows.
var initialData = stream
|from()
.database(db)
.measurement(metricType)
.retentionPolicy(rPolicy)
.groupBy(group)
.where(lambda: "cpu" == 'cpu-total')
.where(lambda: "host" =~ hostFilter)
|mean(metric)
.as('initialStat')
|window()
.period(10m)
.every(5m)
.align()
var continuousData = stream
|from()
.database(db)
.measurement(metricType)
.retentionPolicy(rPolicy)
.groupBy(group)
.where(lambda: "cpu" == 'cpu-total')
.where(lambda: "host" =~ hostFilter)
|mean(metric)
.as('continuousStat')
|window()
.period(10m)
.every(30m)
.align()
Besides the fact that this looks odd, I have to do the calculations for each stream, and I also need separate |alert() nodes. The first node can notify on state changes only, but the second one cannot, otherwise I would not get the alert reminder every N minutes. Another problem is that when the first |alert() node sends an OK notification, the second node sends a spurious OK N minutes later as well.
I feel there must be a better way to do this. I think I could use an if statement in the second |alert() node to suppress the OK notification, since the first window's alert already handles it. I haven't figured out how to do that yet, but I believe it is possible. I also don't want to fight TICKscript; I know it is not designed to be a full language, per Issue 741.
The full script follows:
// CONFIGURATION PARAMETERS
// Alerting
var emailAddress = '$EMAIL'
var pagerdutyKey = '$PD'
var slackChannel = '$SLACK'
// Static Thresholds in percent cpu steal used
var warn = 85
var crit = 95
// Dynamic thresholds in number of std deviations
var warnSig = 2.5
var critSig = 3.5
// Print INFO level (every result will be an alert)
// AlertNode.StateChangesOnly will also need to be disabled
// NOTE:
// INFO level alerts are disregarded by the pagerduty handler; this is not configurable.
var debug = FALSE
// Datastream
// Define the data that will be acted upon
var db = 'telegraf'
var group = 'host'
var metricType = 'cpu'
var metric = 'time_steal'
var rPolicy = 'default'
// Regex used to filter on a subset of hosts
var hostFilter = /.+/
// Window
var dataPeriod = 10m
var initialFrequency = 5m
var continuousFrequency = 30m
// DATAFRAME
var initialData = stream
|from()
.database(db)
.measurement(metricType)
.retentionPolicy(rPolicy)
.groupBy(group)
.where(lambda: "cpu" == 'cpu-total')
.where(lambda: "host" =~ hostFilter)
|mean(metric)
.as('initialStat')
|window()
.period(dataPeriod)
.every(initialFrequency)
.align()
var continuousData = stream
|from()
.database(db)
.measurement(metricType)
.retentionPolicy(rPolicy)
.groupBy(group)
.where(lambda: "cpu" == 'cpu-total')
.where(lambda: "host" =~ hostFilter)
|mean(metric)
.as('continuousStat')
|window()
.period(dataPeriod)
.every(continuousFrequency)
.align()
// Calculations
var initialCalculation = initialData
|eval(lambda: sigma("initialStat"))
.as('initialSigma')
.keep()
var continuousCalculation = continuousData
|eval(lambda: sigma("continuousStat"))
.as('continuousSigma')
.keep()
// ALERT CONDITIONS
var initialCondition = initialCalculation
|alert()
.id('{{ index .Tags "host" }}')
.message('{{ .ID }} is {{ .Level }}: CPU STEAL USAGE {{ index .Fields "initialStat" }}% SHORT')
.details('this is an alert')
.stateChangesOnly()
.info(lambda: debug)
.warn(lambda: "initialStat" > warn OR
"initialSigma" > warnSig)
.crit(lambda: "initialStat" > crit OR
"initialSigma" > critSig)
var continuousCondition = continuousCalculation
|alert()
.id('{{ index .Tags "host" }}')
.message('{{ .ID }} is {{ .Level }}: CPU STEAL USAGE {{ index .Fields "continuousStat" }}% LONG')
.details('this is an alert')
.info(lambda: debug)
.warn(lambda: "continuousStat" > warn OR
"continuousSigma" > warnSig)
.crit(lambda: "continuousStat" > crit OR
"continuousSigma" > critSig)
// ACTIONS
continuousCondition
// .log('/tmp/alerts/cpu_steal_usage_alerts')
// .slack()
// .channel(slackChannel)
.email(emailAddress)
.pagerDuty()
.serviceKey(pagerdutyKey)
initialCondition
// .log('/tmp/alerts/cpu_steal_usage_alerts')
// .slack()
// .channel(slackChannel)
.email(emailAddress)
.pagerDuty()
.serviceKey(pagerdutyKey)
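For reference, the script gets loaded as an ordinary stream task against telegraf.default, along these lines (the task and file names below are placeholders):
kapacitor define cpu_steal_alert -type stream -tick cpu_steal_alert.tick -dbrp telegraf.default
kapacitor enable cpu_steal_alert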