zabbix wait for 15s seconds 出現原因及調優建議
在監控裝置的時候,在server端的日誌中有時候會見到類似another network error, wait for 15 seconds的異常,今天我們看下這個問題的出現原因和解決方案:
問題定位到poller.c,看下下面兩份程式碼:
這個get_values的部分程式碼:
for (i = 0; i < num; i++) { switch (errcodes[i]) { case SUCCEED: case NOTSUPPORTED: case AGENT_ERROR: if (HOST_AVAILABLE_TRUE != last_available) { zbx_activate_item_host(&items[i], &timespec); last_available = HOST_AVAILABLE_TRUE; } break; case NETWORK_ERROR: case GATEWAY_ERROR: case TIMEOUT_ERROR: if (HOST_AVAILABLE_FALSE != last_available) { zbx_deactivate_item_host(&items[i], &timespec, results[i].msg); last_available = HOST_AVAILABLE_FALSE; } break; case CONFIG_ERROR: /* nothing to do */ break; default: zbx_error("unknown response code returned: %d", errcodes[i]); THIS_SHOULD_NEVER_HAPPEN; }
這裡是zbx_deactivate_item_host的程式碼:
voidzbx_deactivate_item_host(DC_ITEM *item, zbx_timespec_t *ts, const char *error)//#0 { const char*__function_name = "zbx_deactivate_item_host"; zbx_host_availability_tin, out;//#1 unsigned charagent_type;//#2 zabbix_log(LOG_LEVEL_DEBUG, "In %s() hostid:" ZBX_FS_UI64 " itemid:" ZBX_FS_UI64 " type:%d",//#3 __function_name, item->host.hostid, item->itemid, (int)item->type); zbx_host_availability_init(&in, item->host.hostid);//#4 zbx_host_availability_init(&out,item->host.hostid);//#5 if (ZBX_AGENT_UNKNOWN == (agent_type = host_availability_agent_by_item_type(item->type)))//#6 goto out; if (FAIL == host_get_availability(&item->host, agent_type, &in))//#7 goto out; if (FAIL == DChost_deactivate(item->host.hostid, agent_type, ts, &in.agents[agent_type],//#8 &out.agents[agent_type], error)) { goto out; } if (FAIL == db_host_update_availability(&out))//#9 goto out; host_set_availability(&item->host, agent_type, &out);//#10 if (0 == in.agents[agent_type].errors_from)//#11 { zabbix_log(LOG_LEVEL_WARNING, "%s item \"%s\" on host \"%s\" failed:"//#12 " first network error, wait for %d seconds", zbx_agent_type_string(item->type), item->key_orig, item->host.host, out.agents[agent_type].disable_until - ts->sec); } else { if (HOST_AVAILABLE_FALSE != in.agents[agent_type].available)//#13 { if (HOST_AVAILABLE_FALSE != out.agents[agent_type].available)//#14 { zabbix_log(LOG_LEVEL_WARNING, "%s item \"%s\" on host \"%s\" failed:"//#15 " another network error, wait for %d seconds", zbx_agent_type_string(item->type), item->key_orig, item->host.host, out.agents[agent_type].disable_until - ts->sec); } else { zabbix_log(LOG_LEVEL_WARNING, "temporarily disabling %s checks on host \"%s\":"//#16 " host unavailable", zbx_agent_type_string(item->type), item->host.host); } } } zabbix_log(LOG_LEVEL_DEBUG, "%s() errors_from:%d available:%d", __function_name, out.agents[agent_type].errors_from, out.agents[agent_type].available); out: zbx_host_availability_clean(&out); 
zbx_host_availability_clean(&in); zabbix_log(LOG_LEVEL_DEBUG, "End of %s()", __function_name); }
下面看下這裡是zbx_deactivate_item_host的程式碼的邏輯:
#0 zbx_deactivate_item_host函式接收三個引數
1 結構體指標,主機的一些綜合引數
/* dbcache.h */
/* Item descriptor passed to the poller functions; embeds the owning host and interface. */
typedef struct
{
	DC_HOST		host;
	DC_INTERFACE	interface;
	zbx_uint64_t	itemid;
	zbx_uint64_t	lastlogsize;
	zbx_uint64_t	valuemapid;
	unsigned char	type;
	unsigned char	value_type;
	unsigned char	state;
	unsigned char	snmpv3_securitylevel;
	unsigned char	authtype;
	unsigned char	flags;
	unsigned char	snmpv3_authprotocol;
	unsigned char	snmpv3_privprotocol;
	unsigned char	inventory_link;
	unsigned char	status;
	unsigned char	history;
	unsigned char	trends;
	unsigned char	follow_redirects;
	unsigned char	post_type;
	unsigned char	retrieve_mode;
	unsigned char	request_method;
	unsigned char	output_format;
	unsigned char	verify_peer;
	unsigned char	verify_host;
	unsigned char	allow_traps;
	char		key_orig[ITEM_KEY_LEN * ZBX_MAX_BYTES_IN_UTF8_CHAR + 1], *key;
	char		*units;
	char		*delay;
	int		history_sec;
	int		nextcheck;
	int		lastclock;
	int		mtime;
	char		trapper_hosts[ITEM_TRAPPER_HOSTS_LEN_MAX];
	char		logtimefmt[ITEM_LOGTIMEFMT_LEN_MAX];
	char		snmp_community_orig[ITEM_SNMP_COMMUNITY_LEN_MAX], *snmp_community;
	char		snmp_oid_orig[ITEM_SNMP_OID_LEN_MAX], *snmp_oid;
	char		snmpv3_securityname_orig[ITEM_SNMPV3_SECURITYNAME_LEN_MAX], *snmpv3_securityname;
	char		snmpv3_authpassphrase_orig[ITEM_SNMPV3_AUTHPASSPHRASE_LEN_MAX], *snmpv3_authpassphrase;
	char		snmpv3_privpassphrase_orig[ITEM_SNMPV3_PRIVPASSPHRASE_LEN_MAX], *snmpv3_privpassphrase;
	char		ipmi_sensor[ITEM_IPMI_SENSOR_LEN_MAX];
	char		*params;
	char		username_orig[ITEM_USERNAME_LEN_MAX], *username;
	char		publickey_orig[ITEM_PUBLICKEY_LEN_MAX], *publickey;
	char		privatekey_orig[ITEM_PRIVATEKEY_LEN_MAX], *privatekey;
	char		password_orig[ITEM_PASSWORD_LEN_MAX], *password;
	char		snmpv3_contextname_orig[ITEM_SNMPV3_CONTEXTNAME_LEN_MAX], *snmpv3_contextname;
	char		jmx_endpoint_orig[ITEM_JMX_ENDPOINT_LEN_MAX], *jmx_endpoint;
	char		timeout_orig[ITEM_TIMEOUT_LEN_MAX], *timeout;
	char		url_orig[ITEM_URL_LEN_MAX], *url;
	char		query_fields_orig[ITEM_QUERY_FIELDS_LEN_MAX], *query_fields;
	char		*posts;
	char		status_codes_orig[ITEM_STATUS_CODES_LEN_MAX], *status_codes;
	char		http_proxy_orig[ITEM_HTTP_PROXY_LEN_MAX], *http_proxy;
	char		*headers;
	char		ssl_cert_file_orig[ITEM_SSL_CERT_FILE_LEN_MAX], *ssl_cert_file;
	char		ssl_key_file_orig[ITEM_SSL_KEY_FILE_LEN_MAX], *ssl_key_file;
	char		ssl_key_password_orig[ITEM_SSL_KEY_PASSWORD_LEN_MAX], *ssl_key_password;
	char		*error;
}
DC_ITEM;
2 結構體指標
/* common.h */
/* Timestamp split into whole seconds and a nanosecond part. */
typedef struct
{
	int	sec;	/* seconds */
	int	ns;	/* nanoseconds */
}
zbx_timespec_t;
3 錯誤資訊
#1 定義了兩個 zbx_host_availability_t 結構體變數 in 和 out(每個結構體內含一個 agents 陣列)
//db.h typedef struct { /* flags specifying which fields are set, see ZBX_FLAGS_AGENT_STATUS_* defines */ unsigned charflags; /* agent availability fields */ unsigned charavailable; char*error; interrors_from; intdisable_until; } zbx_agent_availability_t; typedef struct { zbx_uint64_thostid; zbx_agent_availability_tagents[ZBX_AGENT_MAX];//這裡的ZBX_AGENT_MAX 為4 ,分別代表ZABBIX, SNMP, IPMI, JMX4種類型 } zbx_host_availability_t;
#2 宣告unsigned char agent_type,unsigned char和signed char的區別是signed char表示-128~127,unsigned char 表示0~255(單純的char是否帶符號由編譯器實作決定),這裡的255(ZBX_AGENT_UNKNOWN)會在後面遇到,所以需要能表示255的範圍
#3 記錄DEBUG 的log,如果需要顯示這份日誌,需要將server端配置檔案中的DebugLevel調整為4(debug)或以上,不過我不建議你這麼做
#4 初始化主機IN可用性資料
//dbconfig.c voidzbx_host_availability_init(zbx_host_availability_t *availability, zbx_uint64_t hostid) { memset(availability, 0, sizeof(zbx_host_availability_t)); availability->hostid = hostid; }
#5 同#4一樣,只不過是OUT
#6 為agent_type賦值,如果agent_type不屬於#1中的四種,跳至out處
1、host_availability_agent_by_item_type 位於poller.c,接收item的type欄位,用來判斷監控型別 //poller.c static unsigned charhost_availability_agent_by_item_type(unsigned char type) { switch (type) { case ITEM_TYPE_ZABBIX: return ZBX_AGENT_ZABBIX; break; case ITEM_TYPE_SNMPv1: case ITEM_TYPE_SNMPv2c: case ITEM_TYPE_SNMPv3: return ZBX_AGENT_SNMP; break; case ITEM_TYPE_IPMI: return ZBX_AGENT_IPMI; break; case ITEM_TYPE_JMX: return ZBX_AGENT_JMX; break; default: return ZBX_AGENT_UNKNOWN; } } 2、ZBX_AGENT_UNKNOWN 常量 為 255 對應之前的 #2
#7 根據agent_type來判斷主機的可用性,網路裝置會匹配到ZBX_AGENT_SNMP,四個值分別代表的意思是
//poller.c static inthost_get_availability(const DC_HOST *dc_host, unsigned char agent, zbx_host_availability_t *ha) { zbx_agent_availability_t*availability = &ha->agents[agent]; availability->flags = ZBX_FLAGS_AGENT_STATUS; switch (agent) { case ZBX_AGENT_ZABBIX: availability->available = dc_host->available; availability->error = zbx_strdup(NULL, dc_host->error); availability->errors_from = dc_host->errors_from; availability->disable_until = dc_host->disable_until; break; case ZBX_AGENT_SNMP: availability->available = dc_host->snmp_available;//主機的snmp可用狀態 availability->error = zbx_strdup(NULL, dc_host->snmp_error);//錯誤資訊 availability->errors_from = dc_host->snmp_errors_from;//錯誤發生時間 availability->disable_until = dc_host->snmp_disable_until;//下次延遲檢測時間 break; case ZBX_AGENT_IPMI: availability->available = dc_host->ipmi_available; availability->error = zbx_strdup(NULL, dc_host->ipmi_error); availability->errors_from = dc_host->ipmi_errors_from; availability->disable_until = dc_host->ipmi_disable_until; break; case ZBX_AGENT_JMX: availability->available = dc_host->jmx_available; availability->error = zbx_strdup(NULL, dc_host->jmx_error); availability->disable_until = dc_host->jmx_disable_until; availability->errors_from = dc_host->jmx_errors_from; break; default: return FAIL; } ha->hostid = dc_host->hostid; return SUCCEED; } //dbcache.h typedef struct { zbx_uint64_thostid; zbx_uint64_tproxy_hostid; charhost[HOST_HOST_LEN_MAX]; charname[HOST_NAME_LEN * ZBX_MAX_BYTES_IN_UTF8_CHAR + 1]; unsigned charmaintenance_status; unsigned charmaintenance_type; intmaintenance_from; interrors_from; unsigned charavailable; intdisable_until; intsnmp_errors_from; unsigned charsnmp_available; intsnmp_disable_until; intipmi_errors_from; unsigned charipmi_available; intipmi_disable_until; signed charipmi_authtype; unsigned charipmi_privilege; charipmi_username[HOST_IPMI_USERNAME_LEN_MAX]; charipmi_password[HOST_IPMI_PASSWORD_LEN_MAX]; intjmx_errors_from; unsigned charjmx_available; 
intjmx_disable_until; charinventory_mode; unsigned charstatus; unsigned chartls_connect; unsigned chartls_accept; #if defined(HAVE_POLARSSL) || defined(HAVE_GNUTLS) || defined(HAVE_OPENSSL) chartls_issuer[HOST_TLS_ISSUER_LEN_MAX]; chartls_subject[HOST_TLS_SUBJECT_LEN_MAX]; chartls_psk_identity[HOST_TLS_PSK_IDENTITY_LEN_MAX]; chartls_psk[HOST_TLS_PSK_LEN_MAX]; #endif charerror[HOST_ERROR_LEN_MAX]; charsnmp_error[HOST_ERROR_LEN_MAX]; charipmi_error[HOST_ERROR_LEN_MAX]; charjmx_error[HOST_ERROR_LEN_MAX]; } DC_HOST; //db.h #define ZBX_FLAGS_AGENT_STATUS_AVAILABLE0x00000001 #define ZBX_FLAGS_AGENT_STATUS_ERROR0x00000002 #define ZBX_FLAGS_AGENT_STATUS_ERRORS_FROM0x00000004 #define ZBX_FLAGS_AGENT_STATUS_DISABLE_UNTIL0x00000008 #define ZBX_FLAGS_AGENT_STATUS(ZBX_FLAGS_AGENT_STATUS_AVAILABLE |\ ZBX_FLAGS_AGENT_STATUS_ERROR |\ ZBX_FLAGS_AGENT_STATUS_ERRORS_FROM |\ ZBX_FLAGS_AGENT_STATUS_DISABLE_UNTIL) //common.h #defineFAIL-1
#8 根據agent_type 設定主機狀態
//dbconfig.c intDChost_deactivate(zbx_uint64_t hostid, unsigned char agent_type, const zbx_timespec_t *ts, zbx_agent_availability_t *in, zbx_agent_availability_t *out, const char *error_msg) { intret = FAIL, errors_from,disable_until; const char*error; unsigned charavailable; ZBX_DC_HOST*dc_host; /* don't try deactivating host if the unreachable delay has not passed since the first error */ if (CONFIG_UNREACHABLE_DELAY > ts->sec - in->errors_from) goto out; WRLOCK_CACHE; if (NULL == (dc_host = (ZBX_DC_HOST *)zbx_hashset_search(&config->hosts, &hostid))) goto unlock; /* Don't try deactivating host if:*/ /* - (server, proxy) it's not monitored any more; */ /* - (server) it's monitored by proxy.*/ if ((0 != (program_type & ZBX_PROGRAM_TYPE_SERVER) && 0 != dc_host->proxy_hostid) || HOST_STATUS_MONITORED != dc_host->status) { goto unlock; } DChost_get_agent_availability(dc_host, agent_type, in); available = in->available; error = in->error; if (0 == in->errors_from) { /* first error, schedule next unreachable check */ errors_from = ts->sec; disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY; } else { errors_from = in->errors_from; disable_until = in->disable_until; /* Check if other pollers haven't already attempted deactivating host. 
*/ /* In that case should wait the initial unreachable delay before*/ /* trying to make it unavailable.*/ if (CONFIG_UNREACHABLE_DELAY <= ts->sec - errors_from) { /* repeating error */ if (CONFIG_UNREACHABLE_PERIOD > ts->sec - errors_from) { /* leave host available, schedule next unreachable check */ disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY; } else { /* make host unavailable, schedule next unavailable check */ disable_until = ts->sec + CONFIG_UNAVAILABLE_DELAY; available = HOST_AVAILABLE_FALSE; error = error_msg; } } } zbx_agent_availability_init(out, available, error, errors_from, disable_until); DChost_set_agent_availability(dc_host, ts->sec, agent_type, out); if (ZBX_FLAGS_AGENT_STATUS_NONE != out->flags) ret = SUCCEED; unlock: UNLOCK_CACHE; out: return ret; }
主要看下這段:
if (0 == in->errors_from) { /* first error, schedule next unreachable check */ errors_from = ts->sec; disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY; } else { errors_from = in->errors_from; disable_until = in->disable_until; /* Check if other pollers haven't already attempted deactivating host. */ /* In that case should wait the initial unreachable delay before*/ /* trying to make it unavailable.*/ if (CONFIG_UNREACHABLE_DELAY <= ts->sec - errors_from) { /* repeating error */ if (CONFIG_UNREACHABLE_PERIOD > ts->sec - errors_from) { /* leave host available, schedule next unreachable check */ disable_until = ts->sec + CONFIG_UNREACHABLE_DELAY; } else { /* make host unavailable, schedule next unavailable check */ disable_until = ts->sec + CONFIG_UNAVAILABLE_DELAY; available = HOST_AVAILABLE_FALSE; error = error_msg; } } }
如果錯誤第一次出現: 錯誤發生時間=檢查的時間戳 下次的檢查時間 = 時間戳+UnreachableDelay(15s) 否則: 錯誤發生時間 = in->errors_from 下次檢查時間 = in->disable_until 如果 檢查的時間戳-錯誤發生時間>=UnreachableDelay(15s): 如果 檢查的時間戳-錯誤發生時間< UnreachablePeriod(45s): 下次的檢查時間 = 檢查的時間戳+UnreachableDelay(15s) 否則: 下一次檢查時間 =檢查的時間戳+UnavailableDelay(預設60s) 主機可用性為不可用
用配置檔案來解釋就是: 如果由於網路等原因沒有實現專案的及時監控,第一次的監控間隔為UnreachableDelay時間(15s),如果這次也失敗了,那麼從第一次失敗到本次檢查在UnreachablePeriod時間內,會再次在UnreachableDelay時間後監控;如果從第一次失敗起已超過UnreachablePeriod(預設45s),主機會被置為不可用,之後每隔UnavailableDelay(預設60s)再檢查一次
#9 更新資料庫中的主機可用性資訊
// poller.c static intdb_host_update_availability(const zbx_host_availability_t *ha) { char*sql = NULL; size_tsql_alloc = 0, sql_offset = 0; if (SUCCEED == zbx_sql_add_host_availability(&sql, &sql_alloc, &sql_offset, ha)) { DBbegin(); DBexecute("%s", sql); DBcommit(); zbx_free(sql); return SUCCEED; } return FAIL; }
#10 根據agent_type設定主機可用性資訊
//poller.c static inthost_set_availability(DC_HOST *dc_host, unsigned char agent, const zbx_host_availability_t *ha) { const zbx_agent_availability_t*availability = &ha->agents[agent]; unsigned char*pavailable; int*perrors_from, *pdisable_until; char*perror; switch (agent) { case ZBX_AGENT_ZABBIX: pavailable = &dc_host->available; perror = dc_host->error; perrors_from = &dc_host->errors_from; pdisable_until = &dc_host->disable_until; break; case ZBX_AGENT_SNMP: pavailable = &dc_host->snmp_available; perror = dc_host->snmp_error; perrors_from = &dc_host->snmp_errors_from; pdisable_until = &dc_host->snmp_disable_until; break; case ZBX_AGENT_IPMI: pavailable = &dc_host->ipmi_available; perror = dc_host->ipmi_error; perrors_from = &dc_host->ipmi_errors_from; pdisable_until = &dc_host->ipmi_disable_until; break; case ZBX_AGENT_JMX: pavailable = &dc_host->jmx_available; perror = dc_host->jmx_error; pdisable_until = &dc_host->jmx_disable_until; perrors_from = &dc_host->jmx_errors_from; break; default: return FAIL; } if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_AVAILABLE)) *pavailable = availability->available; if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_ERROR)) zbx_strlcpy(perror, availability->error, HOST_ERROR_LEN_MAX); if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_ERRORS_FROM)) *perrors_from = availability->errors_from; if (0 != (availability->flags & ZBX_FLAGS_AGENT_STATUS_DISABLE_UNTIL)) *pdisable_until = availability->disable_until; return SUCCEED; }
#11-16
如果是第一次檢查:
記錄日誌first network error, wait for 15 seconds
否則:
如果主機在本次檢查前的狀態(in)不是不可用:
如果本次處理後的狀態(out)仍不是不可用:記錄日誌another network error, wait for 15 seconds
否則:記錄日誌temporarily disabling(這時前端頁面的綠色圖示會變為紅色)
(如果主機在本次檢查前已經是不可用狀態,則不再記錄警告日誌)
從上面的程式碼可以看出,在三種情況下會產生network error, wait for 15 seconds的日誌,分別是在poller過程中產生的網路錯誤,閘道器問題,或者是檢查超時。總結下來就是:zabbix server 與zabbix agentd的連線和資料的收發不能成功或者在取得資料的一系列處理中花費的時間超過了zabbix server 的Timeout引數情況下發生。
從正常取值到出現異常的處理過程是這樣的:
正常取值 UnreachableDelay UnreachableDelay UnreachableDelay UnavailableDelay 恢復
| | |
| | |
-----------------------UnreachablePeriod------------
1 2 3 4 5
過程 日誌
-
1 獲取正常監控資料
2 發生錯誤 ------------>first network
3 再次發生錯誤 ------------>another network
4 置為不可用 ------------> temporarily disabling
5 恢復 ------------> resuming
日誌中的15s在配置檔案中對應的配置UnreachableDelay,預設為15s,在原始碼中的位置是server.c中的CONFIG_UNREACHABLE_DELAY,
但注意這個配置不會解決任何network error的問題 ,只是為計算下一個檢查時間提供時間依據。還有大家應該注意到了UnreachableDelay引數和UnreachablePeriod 是倍數關係。我們在調優的時候需要注意下。
從zabbix 1.8版使用至今,根據我這幾年的經驗分析產生此類日誌基本出現在網路裝置,伺服器很少出現,這與SNMP使用UDP協議有關係,但主要問題還是幾方面問題:
- 1、網路不穩定
- 2、裝置端問題
- 3、poller排隊了
- 4、Timeout超時了
這四種中的Timeout和poller又是有相互聯絡的,關於伺服器如何設定poller,我後面的文章再介紹,先暫時分別來看下這四種情況:
網路不穩定多出現於幾種情況:
- 1、使用公網實現和IDC互連,也就是被檢查裝置和server不在一個IDC,這種情況建議在另一端增加proxy,使對端裝置的檢測都在內網進行
- 2、使用雲端網路,使用雲端的網路互連方式打通雲端裝置和IDC的互連,這種情況的網路對於使用者來說就是一種黑盒,基本無法排障,如果你使用大廠的服務,會偶爾出現日誌報錯,但不會影響到使用體驗
網路裝置端問題的情況:
- 1、裝置效能:如何判斷網路裝置端問題呢?可以在網路裝置上debug snmp資訊,看每個包是否是都回了還是報錯了,這種情況可以將snmp的採取間隔加大,
- 2、對端和server連線的埠頻寬打滿了
poller排隊處理;
poller數量是由zabbix_server配置檔案中的StartPollers指定,poller.c主要做幾件事:1、從佇列中獲取item的資料 2、對item採集監控資料 3、把資料放入快取
poller只會處理被動狀態的監控項:
如果你是伺服器出現此類日誌:解決方法一種是增大poller的數量,一種是把被動模式改為主動模式,
如果你是網路裝置:改用指令碼實現,或者增大poller數量
關於Timeout ,這裡有同學可能會說將伺服器的檢查時間調長為30s,這種設定如果檢查裝置少沒關係,數量比較多我不建議這樣調整,超過2s的檢測項都改在agentd改用指令碼實現吧
以上,是我使用zabbix中關於日誌報警wait for 15s seconds 的一些理解和心得,如果文章內容對你有所幫助,請點個贊吧。如果你發現文中有錯誤的方面,也請留言給我,謝謝!