#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Install Apache Hadoop.
#

set -x
set -e

################################################################################
# Initialize variables
################################################################################

ROLES=$1
shift

NN_HOST=
JT_HOST=
CLOUD_PROVIDER=
while getopts "n:j:c:" OPTION; do
  case $OPTION in
  n)
    NN_HOST="$OPTARG"
    ;;
  j)
    JT_HOST="$OPTARG"
    ;;
  c)
    CLOUD_PROVIDER="$OPTARG"
    ;;
  esac
done

case $CLOUD_PROVIDER in
  ec2)
    # Use public hostname for EC2
    SELF_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
    ;;
  *)
    SELF_HOST=`/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}'`
    ;;
esac

HADOOP_VERSION=${HADOOP_VERSION:-0.20.2}
HADOOP_HOME=/usr/local/hadoop-$HADOOP_VERSION
HADOOP_CONF_DIR=$HADOOP_HOME/conf

for role in $(echo "$ROLES" | tr "," "\n"); do
  case $role in
  nn)
    NN_HOST=$SELF_HOST
    ;;
  jt)
    JT_HOST=$SELF_HOST
    ;;
  esac
done
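# Example invocations, for reference only (hostnames are hypothetical and the
# script name is assumed; the script is intended to be run as root on each
# node). The first argument is a comma-separated list of roles (nn, snn, jt,
# dn, tt); -n and -j pass the namenode and jobtracker hosts to worker nodes,
# and -c names the cloud provider.
#
#   bash install-hadoop.sh nn,jt -c ec2
#   bash install-hadoop.sh dn,tt -n nn.example.com -j jt.example.com -c ec2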
# Bring the package manager's metadata up to date
function update_repo() {
  if which dpkg &> /dev/null; then
    sudo apt-get update
  elif which rpm &> /dev/null; then
    yum update -y yum
  fi
}

# Install a list of packages on debian or redhat as appropriate
function install_packages() {
  if which dpkg &> /dev/null; then
    apt-get update
    apt-get -y install $@
  elif which rpm &> /dev/null; then
    yum install -y $@
  else
    echo "No package manager found."
  fi
}

# Download the Hadoop tarball, verify its MD5 checksum, and unpack it
function install_hadoop() {
  useradd hadoop

  hadoop_tar_url=http://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
  hadoop_tar_file=`basename $hadoop_tar_url`
  hadoop_tar_md5_file=`basename $hadoop_tar_url.md5`

  curl="curl --retry 3 --silent --show-error --fail"
  for i in `seq 1 3`; do
    $curl -O $hadoop_tar_url
    $curl -O $hadoop_tar_url.md5
    if md5sum -c $hadoop_tar_md5_file; then
      break;
    else
      rm -f $hadoop_tar_file $hadoop_tar_md5_file
    fi
  done

  if [ ! -e $hadoop_tar_file ]; then
    echo "Failed to download $hadoop_tar_url. Aborting."
    exit 1
  fi

  tar zxf $hadoop_tar_file -C /usr/local
  rm -f $hadoop_tar_file $hadoop_tar_md5_file

  echo "export HADOOP_HOME=$HADOOP_HOME" >> ~root/.bashrc
  echo 'export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH' >> ~root/.bashrc
}

# Format and mount a device, optionally adding it to /etc/fstab
function prep_disk() {
  mount=$1
  device=$2
  automount=${3:-false}

  echo "warning: ERASING CONTENTS OF $device"
  mkfs.xfs -f $device
  if [ ! -e $mount ]; then
    mkdir $mount
  fi
  mount -o defaults,noatime $device $mount
  if $automount ; then
    echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab
  fi
}

# Create a hadoop-owned directory on each of the given mount points
function make_hadoop_dirs {
  for mount in "$@"; do
    if [ ! -e $mount/hadoop ]; then
      mkdir -p $mount/hadoop
      chown hadoop:hadoop $mount/hadoop
    fi
  done
}

# Configure Hadoop by setting up disks and site file
function configure_hadoop() {

  case $CLOUD_PROVIDER in
  ec2)
    MOUNT=/mnt
    ;;
  *)
    MOUNT=/data
    ;;
  esac

  FIRST_MOUNT=$MOUNT
  DFS_NAME_DIR=$MOUNT/hadoop/hdfs/name
  FS_CHECKPOINT_DIR=$MOUNT/hadoop/hdfs/secondary
  DFS_DATA_DIR=$MOUNT/hadoop/hdfs/data
  MAPRED_LOCAL_DIR=$MOUNT/hadoop/mapred/local
  MAX_MAP_TASKS=2
  MAX_REDUCE_TASKS=1
  CHILD_OPTS=-Xmx550m
  CHILD_ULIMIT=1126400
  TMP_DIR=$MOUNT/tmp/hadoop-\${user.name}
  # Note: DFS_REPLICATION is referenced in the site file below but is not set
  # in this script; it must be provided in the environment.

  mkdir -p $MOUNT/hadoop
  chown hadoop:hadoop $MOUNT/hadoop
  mkdir $MOUNT/tmp
  chmod a+rwxt $MOUNT/tmp
  mkdir /etc/hadoop
  ln -s $HADOOP_CONF_DIR /etc/hadoop/conf

  ##############################################################################
  # Modify this section to customize your Hadoop cluster.
  ##############################################################################
  cat > $HADOOP_CONF_DIR/hadoop-site.xml <<EOF
<?xml version="1.0"?>
<configuration>
<property>
  <name>dfs.block.size</name>
  <value>134217728</value>
  <final>true</final>
</property>
<property>
  <name>dfs.data.dir</name>
  <value>$DFS_DATA_DIR</value>
  <final>true</final>
</property>
<property>
  <name>dfs.datanode.du.reserved</name>
  <value>1073741824</value>
  <final>true</final>
</property>
<property>
  <name>dfs.datanode.handler.count</name>
  <value>3</value>
  <final>true</final>
</property>
<property>
  <name>dfs.name.dir</name>
  <value>$DFS_NAME_DIR</value>
  <final>true</final>
</property>
<property>
  <name>dfs.namenode.handler.count</name>
  <value>5</value>
  <final>true</final>
</property>
<property>
  <name>dfs.permissions</name>
  <value>true</value>
  <final>true</final>
</property>
<property>
  <name>dfs.replication</name>
  <value>$DFS_REPLICATION</value>
</property>
<property>
  <name>fs.checkpoint.dir</name>
  <value>$FS_CHECKPOINT_DIR</value>
  <final>true</final>
</property>
<property>
  <name>fs.default.name</name>
  <value>hdfs://$NN_HOST:8020/</value>
</property>
<property>
  <name>fs.trash.interval</name>
  <value>1440</value>
  <final>true</final>
</property>
<property>
  <name>hadoop.tmp.dir</name>
  <value>$MOUNT/tmp/hadoop-\${user.name}</value>
  <final>true</final>
</property>
<property>
  <name>io.file.buffer.size</name>
  <value>65536</value>
</property>
<property>
  <name>mapred.child.java.opts</name>
  <value>$CHILD_OPTS</value>
</property>
<property>
  <name>mapred.child.ulimit</name>
  <value>$CHILD_ULIMIT</value>
  <final>true</final>
</property>
<property>
  <name>mapred.job.tracker</name>
  <value>$JT_HOST:8021</value>
</property>
<property>
  <name>mapred.job.tracker.handler.count</name>
  <value>5</value>
  <final>true</final>
</property>
<property>
  <name>mapred.local.dir</name>
  <value>$MAPRED_LOCAL_DIR</value>
  <final>true</final>
</property>
<property>
  <name>mapred.map.tasks.speculative.execution</name>
  <value>true</value>
</property>
<property>
  <name>mapred.reduce.parallel.copies</name>
  <value>10</value>
</property>
<property>
  <name>mapred.reduce.tasks</name>
  <value>10</value>
</property>
<property>
  <name>mapred.reduce.tasks.speculative.execution</name>
  <value>false</value>
</property>
<property>
  <name>mapred.submit.replication</name>
  <value>10</value>
</property>
<property>
  <name>mapred.system.dir</name>
  <value>/hadoop/system/mapred</value>
</property>
<property>
  <name>mapred.tasktracker.map.tasks.maximum</name>
  <value>$MAX_MAP_TASKS</value>
  <final>true</final>
</property>
<property>
  <name>mapred.tasktracker.reduce.tasks.maximum</name>
  <value>$MAX_REDUCE_TASKS</value>
  <final>true</final>
</property>
<property>
  <name>tasktracker.http.threads</name>
  <value>46</value>
  <final>true</final>
</property>
<property>
  <name>mapred.compress.map.output</name>
  <value>true</value>
</property>
<property>
  <name>mapred.output.compression.type</name>
  <value>BLOCK</value>
</property>
<property>
  <name>hadoop.rpc.socket.factory.class.default</name>
  <value>org.apache.hadoop.net.StandardSocketFactory</value>
  <final>true</final>
</property>
<property>
  <name>hadoop.rpc.socket.factory.class.ClientProtocol</name>
  <value></value>
  <final>true</final>
</property>
<property>
  <name>hadoop.rpc.socket.factory.class.JobSubmissionProtocol</name>
  <value></value>
  <final>true</final>
</property>
<property>
  <name>io.compression.codecs</name>
  <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec</value>
</property>
</configuration>
EOF

  # Keep PID files in a non-temporary directory
  sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \
    $HADOOP_CONF_DIR/hadoop-env.sh
  mkdir -p /var/run/hadoop
  chown -R hadoop:hadoop /var/run/hadoop

  # Set SSH options within the cluster
  sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \
    $HADOOP_CONF_DIR/hadoop-env.sh

  # Disable IPv6
  sed -i -e 's|# export HADOOP_OPTS=.*|export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true"|' \
    $HADOOP_CONF_DIR/hadoop-env.sh

  # Hadoop logs should be on the /mnt partition
  sed -i -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/var/log/hadoop/logs|' \
    $HADOOP_CONF_DIR/hadoop-env.sh
  rm -rf /var/log/hadoop
  mkdir $MOUNT/hadoop/logs
  chown hadoop:hadoop $MOUNT/hadoop/logs
  ln -s $MOUNT/hadoop/logs /var/log/hadoop
  chown -R hadoop:hadoop /var/log/hadoop
}

# Sets up small website on cluster.
function setup_web() {

  if which dpkg &> /dev/null; then
    apt-get -y install thttpd
    WWW_BASE=/var/www
  elif which rpm &> /dev/null; then
    # enable EPEL for thttpd package
    rpm -Uvh http://download.fedora.redhat.com/pub/epel/5/i386/epel-release-5-4.noarch.rpm
    sed -i -e 's/enabled=1/enabled=0/' /etc/yum.repos.d/epel.repo
    yum install -y --enablerepo=epel thttpd
    chkconfig --add thttpd
    WWW_BASE=/var/www/thttpd/html
  fi

  cat > $WWW_BASE/index.html << END
<html>
<head>
<title>Hadoop Cloud Cluster</title>
</head>
<body>
<h1>Hadoop Cloud Cluster</h1>
<p>
To browse the cluster you need to have a proxy configured.
Start the proxy with <tt>hadoop-cloud proxy &lt;cluster_name&gt;</tt>,
and point your browser to this Proxy Auto-Configuration (PAC) file.
To manage multiple proxy configurations, you may wish to use FoxyProxy.
</p>
</body>
</html>
END

  service thttpd restart
}
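# Format HDFS on first use, start the namenode, and create a world-writable
# /user directory so users can create their own home directories.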
function start_namenode() {
  if which dpkg &> /dev/null; then
    AS_HADOOP="su -s /bin/bash - hadoop -c"
  elif which rpm &> /dev/null; then
    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
  fi

  # Format HDFS
  [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP_HOME/bin/hadoop namenode -format"

  $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start namenode"

  $AS_HADOOP "$HADOOP_HOME/bin/hadoop dfsadmin -safemode wait"
  $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -mkdir /user"
  # The following is questionable, as it allows a user to delete another user
  # It's needed to allow users to create their own user directories
  $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -chmod +w /user"
}

# Start a single Hadoop daemon (e.g. datanode, tasktracker) as the hadoop user
function start_daemon() {
  if which dpkg &> /dev/null; then
    AS_HADOOP="su -s /bin/bash - hadoop -c"
  elif which rpm &> /dev/null; then
    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
  fi
  $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start $1"
}

update_repo
install_hadoop
configure_hadoop

for role in $(echo "$ROLES" | tr "," "\n"); do
  case $role in
  nn)
    setup_web
    start_namenode
    ;;
  snn)
    start_daemon secondarynamenode
    ;;
  jt)
    start_daemon jobtracker
    ;;
  dn)
    start_daemon datanode
    ;;
  tt)
    start_daemon tasktracker
    ;;
  esac
done
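
# Optional sanity checks once the daemons are up (illustrative only; run on
# the namenode host as the hadoop user):
#
#   $HADOOP_HOME/bin/hadoop dfsadmin -report   # datanodes and HDFS capacity
#   $HADOOP_HOME/bin/hadoop fs -ls /user       # directory created above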