#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#

# Puppet ERB template that renders the Nagios service definitions for a
# Hadoop cluster. Each group of checks is emitted only when
# hdp_nagios_members_exist reports members for the corresponding host group.
#
# Service descriptions follow the pattern "<COMPONENT>::<text>". The aggregate
# checks on the nagios-server host group reference other services by their
# exact description string, so a description must not be renamed without
# updating the matching check_aggregate argument.

# Template inherited by every service below through "use hadoop-service".
# It is defined outside all conditionals because the stanzas in every guarded
# section depend on it, whichever host groups happen to have members.
# "register 0" marks it as a template only; that directive is not inherited,
# so the concrete services remain registered.
define service {
        name                            hadoop-service
        use                             generic-service
        notification_options            w,u,c
        first_notification_delay        0
        notification_interval           0     ; Send the notification once
        register                        0
}

# NAGIOS SERVER Check (status log update)
<%if scope.function_hdp_nagios_members_exist('nagios-server')-%>
define service {
        hostgroup_name          all-servers
        use                     hadoop-service
        service_description     PUPPET::Puppet agent down
        servicegroups           PUPPET
        check_command           check_tcp!8139!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     NAGIOS::Nagios status log staleness
        servicegroups           NAGIOS
        check_command           check_nagios!10!/var/nagios/status.dat!/usr/bin/nagios
        normal_check_interval   5
        retry_check_interval    0.5
        max_check_attempts      2
}

# NAGIOS SERVER HDFS Checks
# NOTE(review): the HDFS and YARN aggregate checks below are not guarded by
# the 'slaves' host group, unlike the ZOOKEEPER and HBASE aggregates which are
# guarded by their own groups. Confirm whether that is intentional.
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     HDFS::Percent DataNodes storage full
        servicegroups           HDFS
        check_command           check_aggregate!"DATANODE::Storage full"!10%!30%
        normal_check_interval   2
        retry_check_interval    1
        max_check_attempts      1
}

define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     HDFS::Percent DataNodes down
        servicegroups           HDFS
        check_command           check_aggregate!"DATANODE::Process down"!10%!30%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}

# NAGIOS SERVER YARN Checks
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     YARN::Percent NodeManagers down
        servicegroups           YARN
        check_command           check_aggregate!"NODEMANAGER::Process down"!10%!30%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}

# NAGIOS SERVER ZOOKEEPER Checks
<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     ZOOKEEPER::Percent zookeeper servers down
        servicegroups           ZOOKEEPER
        check_command           check_aggregate!"ZKSERVERS::ZKSERVERS Process down"!35%!70%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>

# NAGIOS SERVER HBASE Checks
<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
define service {
        hostgroup_name          nagios-server
        use                     hadoop-service
        service_description     HBASE::Percent region servers down
        servicegroups           HBASE
        check_command           check_aggregate!"REGIONSERVER::Process down"!10%!30%
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>
<%end-%>

# GANGLIA SERVER Checks
<%if scope.function_hdp_nagios_members_exist('ganglia-server')-%>
define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia [gmetad] Process down
        servicegroups           GANGLIA
        check_command           check_tcp!8651!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia collector [gmond] Process down alert for slaves
        servicegroups           GANGLIA
        check_command           check_tcp!8660!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia collector [gmond] Process down alert for namenode
        servicegroups           GANGLIA
        check_command           check_tcp!8661!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia collector [gmond] Process down alert for resourcemanager
        servicegroups           GANGLIA
        check_command           check_tcp!8662!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}

<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
define service {
        hostgroup_name          ganglia-server
        use                     hadoop-service
        service_description     GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster
        servicegroups           GANGLIA
        check_command           check_tcp!8663!-w 1 -c 1
        normal_check_interval   0.25
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>
<%end-%>

<%if scope.function_hdp_nagios_members_exist('snamenode')-%>
# Secondary namenode checks
define service {
        hostgroup_name          snamenode
        use                     hadoop-service
        service_description     NAMENODE::Secondary Namenode Process down
        servicegroups           HDFS
        check_command           check_tcp!50090!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('namenode')-%>
# HDFS Checks
define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::Namenode Web UI down
        servicegroups           HDFS
        check_command           check_webui!namenode
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::Namenode Edit logs directory status
        servicegroups           HDFS
        check_command           check_name_dir_status!50070
        normal_check_interval   0.5
        retry_check_interval    0.5
        max_check_attempts      3
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::Namenode Host CPU utilization
        servicegroups           HDFS
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     NAMENODE::Namenode Process down
        servicegroups           HDFS
        check_command           check_tcp!8020!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      3
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     HDFS::Corrupt/Missing blocks
        servicegroups           HDFS
        check_command           check_hdfs_blocks!50070!0%!0%
        normal_check_interval   2
        retry_check_interval    1
        max_check_attempts      1
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     HDFS::HDFS Capacity utilization
        servicegroups           HDFS
        check_command           check_hdfs_capacity!50070!80%!90%
        normal_check_interval   10
        retry_check_interval    1
        max_check_attempts      1
}

define service {
        hostgroup_name          namenode
        use                     hadoop-service
        service_description     HDFS::Namenode RPC Latency
        servicegroups           HDFS
        check_command           check_rpcq_latency!NameNode!50070!3000!5000
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      5
}
<%end-%>

# YARN Checks
<%if scope.function_hdp_nagios_members_exist('resourcemanager')-%>
define service {
        hostgroup_name          resourcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::ResourceManager Web UI down
        servicegroups           YARN
        check_command           check_webui!resourcemanager
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

define service {
        hostgroup_name          resourcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::JobHistoryServer Web UI down
        servicegroups           YARN
        check_command           check_webui!jobhistoryserver2
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

define service {
        hostgroup_name          resourcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::ResourceManager CPU utilization
        servicegroups           YARN
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}

define service {
        hostgroup_name          resourcemanager
        use                     hadoop-service
        service_description     RESOURCEMANAGER::ResourceManager Process down
        servicegroups           YARN
        check_command           check_tcp!8088!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      4
}

define service {
        hostgroup_name          resourcemanager
        use                     hadoop-service
        service_description     YARN::ResourceManager RPC Latency
        servicegroups           YARN
        check_command           check_rpcq_latency!ResourceManager!8088!3000!5000
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      5
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('slaves')-%>
# HDFS::DATANODE Checks
define service {
        hostgroup_name          slaves
        use                     hadoop-service
        service_description     DATANODE::Process down
        servicegroups           HDFS
        check_command           check_tcp!50010!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}

# NOTE(review): both thresholds passed to check_datanode_storage are 90%.
# If they are warning/critical levels, as in the other checks, the warning
# level can never be reported on its own. Confirm this is intended.
define service {
        hostgroup_name          slaves
        use                     hadoop-service
        service_description     DATANODE::Storage full
        servicegroups           HDFS
        check_command           check_datanode_storage!50075!90%!90%
        normal_check_interval   5
        retry_check_interval    1
        max_check_attempts      2
}

# YARN::NODEMANAGER Checks
define service {
        hostgroup_name          slaves
        use                     hadoop-service
        service_description     NODEMANAGER::Process down
        servicegroups           YARN
        check_command           check_tcp!50060!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
# ZOOKEEPER Checks
define service {
        hostgroup_name          zookeeper-servers
        use                     hadoop-service
        service_description     ZKSERVERS::ZKSERVERS Process down
        servicegroups           ZOOKEEPER
        check_command           check_tcp!2181!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
# HBASE::REGIONSERVER Checks
define service {
        hostgroup_name          region-servers
        use                     hadoop-service
        service_description     REGIONSERVER::Process down
        servicegroups           HBASE
        check_command           check_tcp!60020!-w 1 -c 1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}

# HBASE::MASTER Checks
define service {
        hostgroup_name          hbasemaster
        use                     hadoop-service
        service_description     HBASEMASTER::HBase Web UI down
        servicegroups           HBASE
        check_command           check_webui!hbase
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}

define service {
        hostgroup_name          hbasemaster
        use                     hadoop-service
        service_description     HBASEMASTER::HBaseMaster CPU utilization
        servicegroups           HBASE
        check_command           check_cpu!200%!250%
        normal_check_interval   5
        retry_check_interval    2
        max_check_attempts      5
}

define service {
        hostgroup_name          hbasemaster
        use                     hadoop-service
        service_description     HBASEMASTER::HBaseMaster Process down
        servicegroups           HBASE
        check_command           check_tcp!60000!-w 1 -c 1
        normal_check_interval   0.5
        retry_check_interval    0.25
        max_check_attempts      4
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('hiveserver')-%>
# HIVE Metastore check
define service {
        hostgroup_name          hiveserver
        use                     hadoop-service
        service_description     HIVE-METASTORE::HIVE-METASTORE status check
        servicegroups           HIVE-METASTORE
        check_command           check_hive_metastore_status!9083
        normal_check_interval   0.5
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('oozie-server')-%>
# Oozie check
# The last check_command argument is filled in at render time from the
# "java32_home" template variable.
define service {
        hostgroup_name          oozie-server
        use                     hadoop-service
        service_description     OOZIE::Oozie status check
        servicegroups           OOZIE
        check_command           check_oozie_status!11000!<%=scope.function_hdp_template_var("java32_home") %>
        normal_check_interval   1
        retry_check_interval    1
        max_check_attempts      3
}
<%end-%>

<%if scope.function_hdp_nagios_members_exist('templeton-server')-%>
# Templeton check
define service {
        hostgroup_name          templeton-server
        use                     hadoop-service
        service_description     TEMPLETON::Templeton status check
        servicegroups           TEMPLETON
        check_command           check_templeton_status!50111!v1
        normal_check_interval   1
        retry_check_interval    0.5
        max_check_attempts      3
}
<%end-%>