#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#

# Nagios service definitions for the Hadoop stack (ERB template).
#
# Every stanza inherits from the "hadoop-service" template defined below.
# Each group of stanzas is wrapped in an
# <%%if scope.function_hdp_nagios_members_exist('<hostgroup>')-%> guard, so a
# group is rendered only when that guard evaluates to true.
# Port numbers and paths are substituted through
# scope.function_hdp_template_var / scope.function_hdp_default.
#
# NOTE(review): the hostgroup / lookup key 'resorcemanager' is spelled that
# way consistently throughout this file; it presumably matches a definition
# outside this file, so it is intentionally left untouched - confirm.

# NAGIOS SERVER Check (status log update)
<%if scope.function_hdp_nagios_members_exist('nagios-server')-%>
# Base template for every service in this file.
define service {
        name                            hadoop-service
        use                             generic-service
        notification_options            w,u,c
        first_notification_delay        0
        notification_interval           0                 ; Send the notification once
}

define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     NAGIOS::Nagios status log staleness
        servicegroups           NAGIOS
        check_command           check_nagios!10!/var/nagios/status.dat!<%=nagios_lookup_daemon_str%>
        normal_check_interval   5
        retry_check_interval    0.5
        max_check_attempts      2
}

# NAGIOS SERVER HDFS Checks
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     HDFS::Percent DataNodes storage full
        servicegroups           HDFS
        check_command           check_aggregate!"DATANODE::DataNode storage full"!10%!30%
        normal_check_interval   2
        retry_check_interval    1
        max_check_attempts      1
}

define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     HDFS::Percent DataNodes down
        servicegroups           HDFS
        check_command           check_aggregate!"DATANODE::DataNode process down"!10%!30%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}

# NAGIOS SERVER MAPREDUCE Checks
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     MAPREDUCE::Percent TaskTrackers down
        servicegroups           MAPREDUCE
        check_command           check_aggregate!"TASKTRACKER::TaskTracker process down"!10%!30%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}

# AMBARI AGENT Checks
define service {
        hostgroup_name          agent-servers
        use                     hadoop-service
        service_description     AMBARI::Check ambari-agent process
        servicegroups           AMBARI
        check_command           check_ambari_agent_status
        normal_check_interval   5
        retry_check_interval    0.5
        max_check_attempts      2
}

# NAGIOS SERVER ZOOKEEPER Checks
<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     ZOOKEEPER::Percent ZooKeeper Servers down
        servicegroups           ZOOKEEPER
        check_command           check_aggregate!"ZOOKEEPER::ZooKeeper Server process down"!35%!70%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>

# NAGIOS SERVER HBASE Checks
<%if scope.function_hdp_nagios_members_exist('hbasemasters')-%>
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     HBASE::Percent RegionServers down
        servicegroups           HBASE
        check_command           check_aggregate!"REGIONSERVER::RegionServer process down"!10%!30%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>
<%end-%>

# GANGLIA SERVER Checks
<%if scope.function_hdp_nagios_members_exist('ganglia-server')-%>
define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia [gmetad] process down
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for slaves
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_slaves_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for NameNode
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_namenode_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for JobTracker
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_jobtracker_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

<%if scope.function_hdp_nagios_members_exist('hbasemasters')-%>
define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for HBase Master
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_hbase_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>

# NOTE(review): the three collector checks below target the component's own
# hostgroup rather than ganglia-server, unlike the collector checks above.
# Left as-is; confirm this is intentional.
<%if scope.function_hdp_nagios_members_exist('resorcemanager')-%>
define service {
        hostgroup_name          resorcemanager
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for Resource Manager
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_rm_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
define service {
        hostgroup_name          nodemanagers
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for Node Manager
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_nm_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
define service {
        hostgroup_name          historyserver2
        use                     hadoop-service
        service_description     GANGLIA::Ganglia Collector [gmond] process down alert for History Server 2
        servicegroups           GANGLIA
        check_command           check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_hs_port")%>!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>
<%end-%>

<%if scope.function_hdp_nagios_members_exist('snamenode')-%>
# Secondary namenode checks
define service {
        hostgroup_name          snamenode
        use                     hadoop-service
        service_description     NAMENODE::Secondary NameNode process down
        servicegroups           HDFS
        check_command           check_tcp!<%=scope.function_hdp_template_var("snamenode_port")%>!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('namenode')-%>
# HDFS Checks
define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::NameNode Web UI down
        servicegroups           HDFS
        check_command           check_webui!namenode!<%=scope.function_hdp_template_var("namenode_port")%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::NameNode edit logs directory status
        servicegroups           HDFS
        check_command           check_name_dir_status!<%=scope.function_hdp_template_var("namenode_port")%>
        normal_check_interval   0.5
        retry_check_interval    0.5
        max_check_attempts      3
}

# CPU utilization checks are not rendered when hdp_os_type is "suse".
<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::NameNode host CPU utilization
        servicegroups           HDFS
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}
<% end %>

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::NameNode process down
        servicegroups           HDFS
        check_command           check_tcp!<%=scope.function_hdp_template_var("namenode_metadata_port")%>!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     HDFS::Corrupt/Missing blocks
        servicegroups           HDFS
        check_command           check_hdfs_blocks!<%=scope.function_hdp_template_var("namenode_port")%>!0%!0%
        normal_check_interval   2
        retry_check_interval    1
        max_check_attempts      1
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     HDFS::HDFS capacity utilization
        servicegroups           HDFS
        check_command           check_hdfs_capacity!<%=scope.function_hdp_template_var("namenode_port")%>!80%!90%
        normal_check_interval   10
        retry_check_interval    1
        max_check_attempts      1
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     HDFS::NameNode RPC latency
        servicegroups           HDFS
        check_command           check_rpcq_latency!NameNode!<%=scope.function_hdp_template_var("namenode_port")%>!3000!5000
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      5
}
<%end-%>

# MAPREDUCE Checks
<%if scope.function_hdp_nagios_members_exist('jobtracker')-%>
define service {
        hostgroup_name          jobtracker
        use                     hadoop-service
        service_description     JOBTRACKER::JobTracker Web UI down
        servicegroups           MAPREDUCE
        check_command           check_webui!jobtracker!<%=scope.function_hdp_template_var("jtnode_port")%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

define service {
        hostgroup_name          jobtracker
        use                     hadoop-service
        service_description     JOBTRACKER::JobHistory Web UI down
        servicegroups           MAPREDUCE
        check_command           check_webui!jobhistory!<%=scope.function_hdp_template_var("jobhistory_port")%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
        hostgroup_name          jobtracker
        use                     hadoop-service
        service_description     JOBTRACKER::JobTracker CPU utilization
        servicegroups           MAPREDUCE
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}
<% end %>

define service {
        hostgroup_name          jobtracker
        use                     hadoop-service
        service_description     JOBTRACKER::JobTracker process down
        servicegroups           MAPREDUCE
        check_command           check_tcp!<%=scope.function_hdp_template_var("jtnode_port")%>!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          jobtracker
        use                     hadoop-service
        service_description     MAPREDUCE::JobTracker RPC latency
        servicegroups           MAPREDUCE
        check_command           check_rpcq_latency!JobTracker!<%=scope.function_hdp_template_var("jtnode_port")%>!3000!5000
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      5
}

# MAPREDUCE::TASKTRACKER Checks
define service {
        hostgroup_name          tasktracker-servers
        use                     hadoop-service
        service_description     TASKTRACKER::TaskTracker process down
        servicegroups           MAPREDUCE
        check_command           check_tcp!<%=scope.function_hdp_template_var("tasktracker_port")%>!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}

# MAPREDUCE::TASKTRACKER Mapreduce local dir used space
define service {
        hostgroup_name          tasktracker-servers
        use                     hadoop-service
        service_description     TASKTRACKER::Mapreduce local dir used space
        servicegroups           MAPREDUCE
        check_command           check_mapred_local_dir_used_space!<%=scope.function_hdp_default("mapred-site/mapred.local.dir")%>!85%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('resorcemanager')-%>
# YARN::RESOURCEMANAGER Checks
define service {
        hostgroup_name          resorcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::Resource Manager Web UI down
        servicegroups           YARN
        check_command           check_webui!resorcemanager!<%=scope.function_hdp_template_var("rm_port")%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

# Guarded like the other check_cpu stanzas in this file (not rendered on suse).
<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
        hostgroup_name          resorcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::Resource Manager CPU utilization
        servicegroups           YARN
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}
<% end %>

# The first check_rpcq_latency argument is spelled "ResourceManager", in line
# with the correctly spelled daemon names passed by the other RPC latency
# checks (NameNode, JobTracker, JobHistoryServer).
# NOTE(review): confirm against the check_rpcq_latency command definition.
define service {
        hostgroup_name          resorcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::Resource Manager RPC latency
        servicegroups           YARN
        check_command           check_rpcq_latency!ResourceManager!<%=scope.function_hdp_template_var("rm_port")%>!3000!5000
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      5
}
<% end %>

<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
# YARN::NODEMANAGER Checks
define service {
        hostgroup_name          nodemanagers
        use                     hadoop-service
        service_description     NODEMANAGER::Node Manager process down
        servicegroups           YARN
        check_command           check_tcp!<%=scope.function_hdp_template_var("nm_port")%>!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<% end %>

<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
# MAPREDUCE::HISTORYSERVER2 Checks
define service {
        hostgroup_name          historyserver2
        use                     hadoop-service
        service_description     HISTORYSERVER2::History Server 2 Web UI down
        servicegroups           MAPREDUCE
        check_command           check_webui!historyserver2!<%=scope.function_hdp_template_var("hs_port")%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

# Guarded like the other check_cpu stanzas in this file (not rendered on suse).
<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
        hostgroup_name          historyserver2
        use                     hadoop-service
        service_description     HISTORYSERVER::History Server 2 CPU utilization
        servicegroups           MAPREDUCE
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}
<% end %>

define service {
        hostgroup_name          historyserver2
        use                     hadoop-service
        service_description     HISTORYSERVER::History Server 2 RPC latency
        servicegroups           MAPREDUCE
        check_command           check_rpcq_latency!JobHistoryServer!<%=scope.function_hdp_template_var("hs_port")%>!3000!5000
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      5
}
<% end %>

<%if scope.function_hdp_nagios_members_exist('slaves')-%>
# HDFS::DATANODE Checks
define service {
        hostgroup_name          slaves
        use                     hadoop-service
        service_description     DATANODE::DataNode process down
        servicegroups           HDFS
        check_command           check_tcp!<%=scope.function_hdp_template_var("datanode_port")%>!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}

define service {
        hostgroup_name          slaves
        use                     hadoop-service
        service_description     DATANODE::DataNode storage full
        servicegroups           HDFS
        check_command           check_datanode_storage!<%=scope.function_hdp_template_var("datanode_port")%>!90%!90%
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      2
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('flume-servers')-%>
# FLUME Checks
define service {
        hostgroup_name          flume-servers
        use                     hadoop-service
        service_description     FLUME::Flume Agent process down
        servicegroups           FLUME
        check_command           check_tcp!<%=scope.function_hdp_template_var("flume_port")%>!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
# ZOOKEEPER Checks
define service {
        hostgroup_name          zookeeper-servers
        use                     hadoop-service
        service_description     ZOOKEEPER::ZooKeeper Server process down
        servicegroups           ZOOKEEPER
        check_command           check_tcp!<%=scope.function_hdp_template_var("clientPort")%>!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('hbasemasters')-%>
# HBASE::REGIONSERVER Checks
define service {
        hostgroup_name          region-servers
        use                     hadoop-service
        service_description     REGIONSERVER::RegionServer process down
        servicegroups           HBASE
        check_command           check_tcp!<%=scope.function_hdp_template_var("hbase_rs_port")%>!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}

# HBASE:: MASTER Checks
define service {
        hostgroup_name          hbasemasters
        use                     hadoop-service
        service_description     HBASEMASTER::HBase Master Web UI down
        servicegroups           HBASE
        check_command           check_webui!hbase!<%=scope.function_hdp_template_var("hbase_master_port")%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
        hostgroup_name          hbasemasters
        use                     hadoop-service
        service_description     HBASEMASTER::HBase Master CPU utilization
        servicegroups           HBASE
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}
<% end %>

define service {
        hostgroup_name          hbasemasters
        use                     hadoop-service
        service_description     HBASEMASTER::HBase Master process down
        servicegroups           HBASE
        check_command           check_tcp!<%=scope.function_hdp_template_var("hbase_master_port")%>!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('hiveserver')-%>
# HIVE Metastore check
# When security_enabled is truthy the command additionally receives the
# keytab path, principal name and kinit path.
define service {
        hostgroup_name          hiveserver
        use                     hadoop-service
        service_description     HIVE-METASTORE::Hive Metastore status check
        servicegroups           HIVE-METASTORE
<%if scope.function_hdp_template_var("security_enabled")-%>
        check_command           check_hive_metastore_status!<%=scope.function_hdp_template_var("hive_metastore_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%>
<%else-%>
        check_command           check_hive_metastore_status!<%=scope.function_hdp_template_var("hive_metastore_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!false
<%end-%>
        normal_check_interval   0.5
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('oozie-server')-%>
# Oozie check
define service {
        hostgroup_name          oozie-server
        use                     hadoop-service
        service_description     OOZIE::Oozie Server status check
        servicegroups           OOZIE
<%if scope.function_hdp_template_var("security_enabled")-%>
        check_command           check_oozie_status!<%=scope.function_hdp_template_var("oozie_server_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%>
<%else-%>
        check_command           check_oozie_status!<%=scope.function_hdp_template_var("oozie_server_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!false
<%end-%>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('webhcat-server')-%>
# WEBHCAT check
define service {
        hostgroup_name          webhcat-server
        use                     hadoop-service
        service_description     WEBHCAT::WebHCat Server status check
        servicegroups           WEBHCAT
<%if scope.function_hdp_template_var("security_enabled")-%>
        check_command           check_templeton_status!<%=scope.function_hdp_template_var("templeton_port")%>!v1!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%>
<%else-%>
        check_command           check_templeton_status!<%=scope.function_hdp_template_var("templeton_port")%>!v1!false
<%end-%>
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('hue-server')-%>
# HUE check
define service {
        hostgroup_name          hue-server
        use                     hadoop-service
        service_description     HUE::Hue Server status check
        servicegroups           HUE
        check_command           check_hue_status
        normal_check_interval   100
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>