#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Install CDH.
#

set -x
set -e

################################################################################
# Initialize variables
################################################################################

# Usage: <script> ROLES [-n NN_HOST] [-j JT_HOST] [-c CLOUD_PROVIDER]
# ROLES is a comma-separated list of: nn, snn, jt, dn, tt.
ROLES=$1
shift

NN_HOST=
JT_HOST=
CLOUD_PROVIDER=
while getopts "n:j:c:" OPTION; do
  case $OPTION in
  n)
    NN_HOST="$OPTARG"
    ;;
  j)
    JT_HOST="$OPTARG"
    ;;
  c)
    CLOUD_PROVIDER="$OPTARG"
    ;;
  esac
done

case $CLOUD_PROVIDER in
  ec2)
    # Use public hostname for EC2
    SELF_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
    ;;
  *)
    SELF_HOST=`/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}'`
    ;;
esac

REPO=${REPO:-cdh3}
HADOOP=hadoop-${HADOOP_VERSION:-0.20}
HADOOP_CONF_DIR=/etc/$HADOOP/conf.dist

# A node that hosts the nn or jt role refers to itself by its own hostname.
for role in $(echo "$ROLES" | tr "," "\n"); do
  case $role in
  nn)
    NN_HOST=$SELF_HOST
    ;;
  jt)
    JT_HOST=$SELF_HOST
    ;;
  esac
done

# Register the Cloudera package repository with apt or yum
function update_repo() {
  if which dpkg &> /dev/null; then
    cat > /etc/apt/sources.list.d/cloudera.list <<EOF
deb http://archive.cloudera.com/debian lenny-$REPO contrib
deb-src http://archive.cloudera.com/debian lenny-$REPO contrib
EOF
    curl -s http://archive.cloudera.com/debian/archive.key | apt-key add -
  elif which rpm &> /dev/null; then
    rm -f /etc/yum.repos.d/cloudera.repo
    REPO_NUMBER=`echo $REPO | sed -e 's/cdh\([0-9][0-9]*\)/\1/'`
    cat > /etc/yum.repos.d/cloudera-$REPO.repo <<EOF
[cloudera-$REPO]
name=Cloudera's Distribution for Hadoop, Version $REPO_NUMBER
mirrorlist=http://archive.cloudera.com/redhat/cdh/$REPO_NUMBER/mirrors
gpgkey = http://archive.cloudera.com/redhat/cdh/RPM-GPG-KEY-cloudera
gpgcheck = 0
EOF
  fi
}

# Install a list of packages with whichever package manager is available
function install_packages() {
  if which dpkg &> /dev/null; then
    apt-get update
    apt-get -y install $@
  elif which rpm &> /dev/null; then
    yum install -y $@
  else
    echo "No package manager found."
  fi
}

# Install Hadoop packages and dependencies
function install_hadoop() {
  if which dpkg &> /dev/null; then
    apt-get update
    apt-get -y install $HADOOP
    cp -r /etc/$HADOOP/conf.empty $HADOOP_CONF_DIR
    update-alternatives --install /etc/$HADOOP/conf $HADOOP-conf $HADOOP_CONF_DIR 90
  elif which rpm &> /dev/null; then
    yum install -y $HADOOP
    cp -r /etc/$HADOOP/conf.empty $HADOOP_CONF_DIR
    alternatives --install /etc/$HADOOP/conf $HADOOP-conf $HADOOP_CONF_DIR 90
  fi
}

# Format a device as XFS and mount it, optionally adding it to /etc/fstab
function prep_disk() {
  mount=$1
  device=$2
  automount=${3:-false}

  echo "warning: ERASING CONTENTS OF $device"
  mkfs.xfs -f $device
  if [ ! -e $mount ]; then
    mkdir $mount
  fi
  mount -o defaults,noatime $device $mount
  if $automount ; then
    echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab
  fi
}
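
# Note: prep_disk and install_packages are defined but not called from the main
# sequence at the bottom of this script. An illustrative call to prepare an
# additional data volume before creating the Hadoop directories (the device
# name /dev/sdb is an example only) would be:
#   prep_disk /data /dev/sdb true
#   make_hadoop_dirs /data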

# Create a hadoop-owned working directory under each of the given mount points
function make_hadoop_dirs {
  for mount in "$@"; do
    if [ ! -e $mount/hadoop ]; then
      mkdir -p $mount/hadoop
      chown hadoop:hadoop $mount/hadoop
    fi
  done
}

# Configure Hadoop by setting up disks and site file
function configure_hadoop() {

  MOUNT=/data
  FIRST_MOUNT=$MOUNT
  DFS_NAME_DIR=$MOUNT/hadoop/hdfs/name
  FS_CHECKPOINT_DIR=$MOUNT/hadoop/hdfs/secondary
  DFS_DATA_DIR=$MOUNT/hadoop/hdfs/data
  MAPRED_LOCAL_DIR=$MOUNT/hadoop/mapred/local
  MAX_MAP_TASKS=2
  MAX_REDUCE_TASKS=1
  CHILD_OPTS=-Xmx550m
  CHILD_ULIMIT=1126400
  TMP_DIR=$MOUNT/tmp/hadoop-\${user.name}

  mkdir -p $MOUNT/hadoop
  chown hadoop:hadoop $MOUNT/hadoop
  mkdir $MOUNT/tmp
  chmod a+rwxt $MOUNT/tmp

  ##############################################################################
  # Modify this section to customize your Hadoop cluster.
  ##############################################################################
  cat > $HADOOP_CONF_DIR/hadoop-site.xml <<EOF
<?xml version="1.0"?>
<configuration>
<property><name>dfs.block.size</name><value>134217728</value><final>true</final></property>
<property><name>dfs.data.dir</name><value>$DFS_DATA_DIR</value><final>true</final></property>
<property><name>dfs.datanode.du.reserved</name><value>1073741824</value><final>true</final></property>
<property><name>dfs.datanode.handler.count</name><value>3</value><final>true</final></property>
<property><name>dfs.name.dir</name><value>$DFS_NAME_DIR</value><final>true</final></property>
<property><name>dfs.namenode.handler.count</name><value>5</value><final>true</final></property>
<property><name>dfs.permissions</name><value>true</value><final>true</final></property>
<property><name>dfs.replication</name><value>$DFS_REPLICATION</value></property>
<property><name>fs.checkpoint.dir</name><value>$FS_CHECKPOINT_DIR</value><final>true</final></property>
<property><name>fs.default.name</name><value>hdfs://$NN_HOST:8020/</value></property>
<property><name>fs.trash.interval</name><value>1440</value><final>true</final></property>
<property><name>hadoop.tmp.dir</name><value>$MOUNT/tmp/hadoop-\${user.name}</value><final>true</final></property>
<property><name>io.file.buffer.size</name><value>65536</value></property>
<property><name>mapred.child.java.opts</name><value>$CHILD_OPTS</value></property>
<property><name>mapred.child.ulimit</name><value>$CHILD_ULIMIT</value><final>true</final></property>
<property><name>mapred.job.tracker</name><value>$JT_HOST:8021</value></property>
<property><name>mapred.job.tracker.handler.count</name><value>5</value><final>true</final></property>
<property><name>mapred.local.dir</name><value>$MAPRED_LOCAL_DIR</value><final>true</final></property>
<property><name>mapred.map.tasks.speculative.execution</name><value>true</value></property>
<property><name>mapred.reduce.parallel.copies</name><value>10</value></property>
<property><name>mapred.reduce.tasks</name><value>10</value></property>
<property><name>mapred.reduce.tasks.speculative.execution</name><value>false</value></property>
<property><name>mapred.submit.replication</name><value>10</value></property>
<property><name>mapred.system.dir</name><value>/hadoop/system/mapred</value></property>
<property><name>mapred.tasktracker.map.tasks.maximum</name><value>$MAX_MAP_TASKS</value><final>true</final></property>
<property><name>mapred.tasktracker.reduce.tasks.maximum</name><value>$MAX_REDUCE_TASKS</value><final>true</final></property>
<property><name>tasktracker.http.threads</name><value>46</value><final>true</final></property>
<property><name>mapred.compress.map.output</name><value>true</value></property>
<property><name>mapred.output.compression.type</name><value>BLOCK</value></property>
<property><name>hadoop.rpc.socket.factory.class.default</name><value>org.apache.hadoop.net.StandardSocketFactory</value><final>true</final></property>
<property><name>hadoop.rpc.socket.factory.class.ClientProtocol</name><value></value><final>true</final></property>
<property><name>hadoop.rpc.socket.factory.class.JobSubmissionProtocol</name><value></value><final>true</final></property>
<property><name>io.compression.codecs</name><value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec</value></property>
<property><name>dfs.namenode.plugins</name><value>org.apache.hadoop.thriftfs.NamenodePlugin</value><description>Comma-separated list of namenode plug-ins to be activated.</description></property>
<property><name>dfs.datanode.plugins</name><value>org.apache.hadoop.thriftfs.DatanodePlugin</value><description>Comma-separated list of datanode plug-ins to be activated.</description></property>
<property><name>dfs.thrift.address</name><value>0.0.0.0:9090</value></property>
<property><name>jobtracker.thrift.address</name><value>0.0.0.0:9290</value></property>
<property><name>mapred.jobtracker.plugins</name><value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value><description>Comma-separated list of jobtracker plug-ins to be activated.</description></property>
</configuration>
EOF
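
  # Properties not listed above fall back to the Hadoop defaults. Further
  # customization can be done by appending entries to the heredoc above, for
  # example (an illustrative property and value, not set by this script):
  #   <property><name>mapred.reduce.slowstart.completed.maps</name><value>0.80</value></property>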

  # Expose /metrics URL endpoint
  cat > $HADOOP_CONF_DIR/hadoop-metrics.properties <<EOF
# Exposes the /metrics URL endpoint only; metrics are not pushed anywhere.
dfs.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
mapred.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
jvm.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
rpc.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext
EOF
}

# Install and configure the Hue web UI (namenode only)
function install_hue() {
  if which dpkg &> /dev/null; then
    apt-get -y install hue-common
    apt-get -y install hue-useradmin hue-jobsub hue-beeswax
  elif which rpm &> /dev/null; then
    yum install -y hue-common
    yum install -y hue-useradmin hue-jobsub hue-beeswax
  fi

  # Configure hue
  sed -i -e "s|http_port=8088|http_port=80|" /etc/hue/hue.ini

  # Hue logs should be on the /mnt partition
  rm -rf /var/log/hue
  mkdir -p $MOUNT/hue/logs
  chown hue:hue $MOUNT/hue/logs
  ln -s $MOUNT/hue/logs /var/log/hue
  chown -R hue:hue /var/log/hue
}

function install_hue_plugins() {
  if which dpkg &> /dev/null; then
    apt-get -y install hue-plugins
  elif which rpm &> /dev/null; then
    yum install -y hue-plugins
  fi
}

function start_hue() {
  /etc/init.d/hue start
}

# Format HDFS on first boot, then start and bootstrap the namenode
function start_namenode() {
  if which dpkg &> /dev/null; then
    AS_HADOOP="su -s /bin/bash - hadoop -c"
    # Format HDFS
    [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP namenode -format"
    update-rc.d $HADOOP-namenode defaults
    update-rc.d $HADOOP-secondarynamenode defaults
  elif which rpm &> /dev/null; then
    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
    # Format HDFS
    [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP namenode -format"
    chkconfig --add $HADOOP-namenode
    chkconfig --add $HADOOP-secondarynamenode
  fi

  service $HADOOP-namenode start
  service $HADOOP-secondarynamenode start

  $AS_HADOOP "$HADOOP dfsadmin -safemode wait"
  $AS_HADOOP "/usr/bin/$HADOOP fs -mkdir /user"
  # The following is questionable, as it allows a user to delete another user.
  # It's needed to allow users to create their own user directories.
  $AS_HADOOP "/usr/bin/$HADOOP fs -chmod +w /user"

  # Create temporary directory for Pig and Hive in HDFS
  $AS_HADOOP "/usr/bin/$HADOOP fs -mkdir /tmp"
  $AS_HADOOP "/usr/bin/$HADOOP fs -chmod +w /tmp"
  $AS_HADOOP "/usr/bin/$HADOOP fs -mkdir /user/hive/warehouse"
  $AS_HADOOP "/usr/bin/$HADOOP fs -chmod +w /user/hive/warehouse"
}

# Register a Hadoop daemon with the init system and start it
function start_daemon() {
  daemon=$1
  if which dpkg &> /dev/null; then
    update-rc.d $HADOOP-$daemon defaults
  elif which rpm &> /dev/null; then
    chkconfig --add $HADOOP-$daemon
  fi
  service $HADOOP-$daemon start
}

update_repo
install_hadoop
configure_hadoop
install_hue_plugins

for role in $(echo "$ROLES" | tr "," "\n"); do
  case $role in
  nn)
    install_hue
    start_namenode
    start_hue
    ;;
  snn)
    start_daemon secondarynamenode
    ;;
  jt)
    start_daemon jobtracker
    ;;
  dn)
    start_daemon datanode
    ;;
  tt)
    start_daemon tasktracker
    ;;
  esac
done
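
# For a quick manual sanity check after the daemons come up, the following can
# be run on the namenode (illustrative commands, not executed by this script):
#   su -s /bin/bash - hadoop -c "$HADOOP dfsadmin -report"
#   su -s /bin/bash - hadoop -c "/usr/bin/$HADOOP fs -ls /"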