#!/bin/bash
#
# Mesos master setup for an EC2 cluster (first half: prepare all nodes).
#
# Positional arguments:
#   $1  OS name          (stored in OS_NAME; not referenced again in the
#                         visible part of this script)
#   $2  download method  ("git" triggers a fresh clone and build of Mesos)
#   $3  git branch       (checked out when the download method is "git")
#
# Node lists are read from the files "masters", "slaves" and "zoo" in
# /root/mesos-ec2, one host per line. A "zoo" file containing NONE means
# that no ZooKeeper servers are configured.
#
# The list variables (MASTERS, SLAVES, ZOOS, OTHER_MASTERS, SCRIPTS) and
# SSH_OPTS are deliberately expanded unquoted below so that the shell
# word-splits them into individual hosts / options.

# Make sure we are in the mesos-ec2 directory; everything below uses
# paths relative to it, so there is no point continuing if this fails.
cd /root/mesos-ec2 || {
  echo "Cannot change directory to /root/mesos-ec2" >&2
  exit 1
}

# Set hostname based on EC2 private DNS name, so that it is set correctly
# even if the instance is restarted with a different private DNS name
PRIVATE_DNS=$(wget -q -O - http://instance-data.ec2.internal/latest/meta-data/local-hostname)
if [[ -n "$PRIVATE_DNS" ]]; then
  hostname "$PRIVATE_DNS"
  echo "$PRIVATE_DNS" > /etc/hostname
  export HOSTNAME=$PRIVATE_DNS  # Fix the bash built-in hostname variable too
else
  # Do not overwrite /etc/hostname and HOSTNAME with an empty string when
  # the metadata lookup returns nothing; keep the current hostname.
  echo "Warning: could not fetch EC2 private DNS name; keeping current hostname" >&2
  export HOSTNAME
fi

echo "Setting up Mesos master on $(hostname)..."

# Read command-line arguments
OS_NAME=$1
DOWNLOAD_METHOD=$2
BRANCH=$3

MASTERS_FILE="masters"
MASTERS=$(cat "$MASTERS_FILE")
NUM_MASTERS=$(wc -l < "$MASTERS_FILE")
# Every master except the first line of the masters file.
OTHER_MASTERS=$(sed '1d' "$MASTERS_FILE")
SLAVES=$(cat slaves)
ZOOS=$(cat zoo)

if [[ $ZOOS = *NONE* ]]; then
  NUM_ZOOS=0
  ZOOS=""
else
  NUM_ZOOS=$(wc -l < zoo)
fi

# Scripts that get used for/while running Mesos.
SCRIPTS="copy-dir
         mesos-daemon
         redeploy-mesos
         setup-slave
         ssh-no-keychecking
         start-mesos
         stop-mesos"

HADOOP_HOME=/root/hadoop-0.20.2

#TODO(*): update config scripts to have conditionals for handling different
#         platforms
JAVA_HOME=/usr/lib/jvm/java-6-openjdk
#JAVA_HOME=/usr/lib/jvm/java-6-sun #works for karmic, this is lucid

SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5"

if [[ $(tty) == "not a tty" ]]; then
  echo "Expecting a tty or pty! (use the ssh -t option)."
  exit 1
fi

echo "Setting executable permissions on scripts..."
for s in $SCRIPTS; do
  chmod u+x "$s"
done

echo "Running setup-slave on master to mount filesystems, etc..."
./setup-slave

echo "SSH'ing to master machine(s) to approve key(s)..."
for master in $MASTERS; do
  echo "$master"
  ssh $SSH_OPTS "$master" echo -n &
  sleep 0.3
done
ssh $SSH_OPTS localhost echo -n &
ssh $SSH_OPTS "$(hostname)" echo -n &
wait

if [[ $NUM_ZOOS != 0 ]]; then
  echo "SSH'ing to ZooKeeper server(s) to approve keys..."
  # Each ZooKeeper server gets a distinct id (1, 2, ...) written to
  # /tmp/zookeeper/myid. The escaped ';' and '>' are passed through to the
  # remote shell so that the whole command sequence runs on the server.
  zid=1
  for zoo in $ZOOS; do
    echo "$zoo"
    ssh $SSH_OPTS "$zoo" echo -n \; mkdir -p /tmp/zookeeper \; echo $zid \> /tmp/zookeeper/myid &
    zid=$((zid + 1))
    sleep 0.3
  done
fi

# Try to SSH to each cluster node to approve their key. Since some nodes may
# be slow in starting, we retry failed nodes up to 3 times (4 attempts in
# total), waiting 15 seconds between rounds.
TODO="$SLAVES $ZOOS $OTHER_MASTERS"  # List of nodes to try (initially all)
TRIES=0                              # Number of times we've tried so far
echo "SSH'ing to other cluster nodes to approve keys..."
while [[ -n "$TODO" ]] && ((TRIES < 4)); do
  NEW_TODO=
  for node in $TODO; do
    echo "$node"
    if ! ssh $SSH_OPTS "$node" echo -n; then
      NEW_TODO="$NEW_TODO $node"
    fi
  done
  TRIES=$((TRIES + 1))
  if [[ -n "$NEW_TODO" ]] && ((TRIES < 4)); then
    sleep 15
    TODO="$NEW_TODO"
    echo "Re-attempting SSH to cluster nodes to approve keys..."
  else
    break
  fi
done

echo "RSYNC'ing /root/mesos-ec2 to other cluster nodes..."
for node in $SLAVES $ZOOS $OTHER_MASTERS; do
  echo "$node"
  rsync -e "ssh $SSH_OPTS" -az /root/mesos-ec2 "$node":/root &
  scp $SSH_OPTS ~/.ssh/id_rsa "$node":.ssh &
  sleep 0.3
done
wait

echo "Running slave setup script on other cluster nodes..."
for node in $SLAVES $ZOOS $OTHER_MASTERS; do
  echo "$node"
  ssh -t $SSH_OPTS root@"$node" "mesos-ec2/setup-slave" &
  sleep 0.3
done
wait

echo "RSYNC'ing HDFS config files to other cluster nodes..."
# Push the Hadoop configuration directory to every non-local node in
# parallel. SLAVES, ZOOS, OTHER_MASTERS and SSH_OPTS are expanded unquoted
# on purpose so that they word-split into individual hosts / options.
for node in $SLAVES $ZOOS $OTHER_MASTERS; do
  echo "$node"
  rsync -e "ssh $SSH_OPTS" -az "$HADOOP_HOME/conf" "$node:$HADOOP_HOME" &
  sleep 0.3
done
wait

# DOWNLOADED becomes 1 only when a fresh Mesos tree was actually cloned;
# the build steps below are skipped otherwise.
DOWNLOADED=0

if [[ "$DOWNLOAD_METHOD" == "git" ]]; then
  # Change git's ssh command to a script that uses
  # -o StrictHostKeyChecking=no, so it does not ask to accept keys
  export GIT_SSH=/root/mesos-ec2/ssh-no-keychecking
  REPOSITORY=git://github.com/mesos/mesos.git
  echo "Checking out Mesos from $REPOSITORY"
  if pushd /root > /dev/null 2>&1; then
    rm -rf mesos mesos.tgz
    if git clone "$REPOSITORY" mesos && pushd mesos > /dev/null 2>&1; then
      # A failed checkout is not treated as fatal: the clone already has a
      # default branch checked out and can still be built.
      git checkout -b "$BRANCH" --track "origin/$BRANCH"
      popd > /dev/null 2>&1
      DOWNLOADED=1
    else
      echo "Failed to clone Mesos from $REPOSITORY; skipping build" >&2
    fi
    popd > /dev/null 2>&1
  else
    echo "Cannot change directory to /root; skipping Mesos download" >&2
  fi
fi

# Build Mesos if we downloaded it. Each build runs in a subshell so that a
# failed cd cannot cause the build commands to run in the wrong directory.
if [[ "$DOWNLOADED" == "1" ]]; then
  echo "Building Mesos..."
  (
    cd /root/mesos || exit 1
    ./configure.ubuntu-lucid-64
    make clean
    make
  )
  if [[ -d /root/spark ]]; then
    echo "Building Spark..."
    (
      cd /root/spark || exit 1
      MESOS_HOME=/root/mesos make all native
    )
  fi
  echo "Building Hadoop framework..."
  (
    cd /root/mesos/frameworks/hadoop-0.20.2 || exit 1
    ant
    ant examples
  )
fi

echo "Setting up Hadoop framework config files..."
cp hadoop-framework-conf/* /root/mesos/frameworks/hadoop-0.20.2/conf

echo "Setting up haproxy+apache framework config files..."
cp haproxy+apache/* /root/mesos/frameworks/haproxy+apache

echo "Setting up Spark config files..."
# TODO: This currently overwrites whatever the user wrote there; on
# the other hand, we also don't want to leave an old file created by
# us because it would have the wrong hostname for HDFS etc
mkdir -p /root/spark/conf
echo "-Dspark.dfs=hdfs://$HOSTNAME:9000 -Dspark.repl.classdir=/nfs" \
  > /root/spark/conf/java-opts

echo "Redeploying /root/mesos..."
./redeploy-mesos

echo "Setting up NFS..."
if [[ ! -e /nfs ]]; then
  mkdir -p /mnt/nfs
  # -e follows symlinks, so a dangling /nfs link also lands here; remove
  # it before re-creating the link.
  rm -fr /nfs
  ln -s /mnt/nfs /nfs
fi
# Only the exit status matters here, so keep grep quiet.
if ! grep -q -e '^/nfs ' /etc/exports; then
  echo "/nfs 10.0.0.0/8(ro,async,no_subtree_check)" >> /etc/exports
fi
# Unexport and re-export everything in /etc/exports because, if we are
# restarting a stopped EC2 instance, we might have had an entry for /nfs in
# /etc/exports before we created /mnt/nfs.
exportfs -ua
exportfs -a

echo "Mounting NFS on slaves..."
for slave in $SLAVES; do
  echo "$slave"
  ssh -t $SSH_OPTS root@"$slave" "mkdir -p /nfs; mount $HOSTNAME:/nfs /nfs" &
  sleep 0.3
done
wait

echo "Formatting HDFS namenode..."
"$HADOOP_HOME/bin/hadoop" namenode -format

echo "Starting HDFS..."
"$HADOOP_HOME/bin/start-dfs.sh"

sleep 1

if [[ $NUM_ZOOS != 0 ]]; then
  echo "Starting ZooKeeper quorum..."
  for zoo in $ZOOS; do
    # NOTE(review): "/dev/null" is passed to zkServer.sh as a plain
    # argument here; this looks like it may originally have been stdin /
    # stdout redirections -- confirm against the upstream script.
    ssh $SSH_OPTS "$zoo" "/root/mesos/third_party/zookeeper-*/bin/zkServer.sh start /dev/null" &
    sleep 0.1
  done
  wait
  sleep 5
fi

echo "Stopping any existing Mesos cluster..."
./stop-mesos
sleep 2

echo "Starting Mesos cluster..."
./start-mesos