#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Pig command script
#
# Environment Variables
#
#     JAVA_HOME                  The java implementation to use.
#
#     PIG_CLASSPATH              Extra Java CLASSPATH entries.
#
#     PIG_USER_CLASSPATH_FIRST   If set, prepend user-provided classpath entries
#                                instead of appending them. Default is unset,
#                                i.e. the entries are placed at the end of the
#                                pre-defined classpath.
#
#     HADOOP_HOME/HADOOP_PREFIX  Hadoop installation directory
#                                (HADOOP_PREFIX as of Hadoop 0.20.205).
#
#     HADOOP_CONF_DIR            Hadoop conf dir.
#
#     PIG_HEAPSIZE               The maximum amount of heap to use, in MB.
#                                Default is 1000.
#
#     PIG_OPTS                   Extra Java runtime options.
#
#     PIG_CONF_DIR               Alternate conf dir. Default is ${PIG_HOME}/conf.
#
#     HBASE_HOME                 Optionally, the HBase installation directory.
#                                Defaults to ${PIG_HOME}/share/hbase.
#
#     HBASE_CONF_DIR             Optionally, the HBase configuration to run against
#                                when using HBaseStorage. Defaults to ${HBASE_HOME}/conf.

cygwin=false
case "`uname`" in
CYGWIN*) cygwin=true;;
esac

debug=false
remaining=()
includeHCatalog=""
addJarString=-Dpig.additional.jars.uris\=
additionalJars=""
prevArgExecType=false
isSparkMode=false
isSparkLocalMode=false
sparkversion=2

# verify whether the execType is SPARK or SPARK_LOCAL
function processExecType(){
    execType=$1
    execTypeUpperCase=$(echo $execType | tr '[a-z]' '[A-Z]')
    if [[ "$execTypeUpperCase" == "SPARK" ]]; then
        isSparkMode=true
    elif [[ "$execTypeUpperCase" == "SPARK_LOCAL" ]]; then
        isSparkLocalMode=true
    fi
}

# filter command line parameters
for f in "$@"; do
    if [[ $f == "-secretDebugCmd" || $f == "-printCmdDebug" ]]; then
        debug=true
    elif [[ $f == "-useHCatalog" ]]; then
        # to use hcatalog, we need to add the hcatalog and hive jars
        # to the classpath and also include the hive configuration xml file
        # for pig to work correctly with hcatalog;
        # because of PIG-2532, including the jars in the classpath is
        # sufficient to ensure that they are registered as well
        includeHCatalog=true
    elif [[ "$includeHCatalog" == "true" && $f == $addJarString* ]]; then
        additionalJars=`echo $f | sed s/$addJarString//`
    elif [[ "$f" == "-x" || "$f" == "-exectype" ]]; then
        prevArgExecType=true
        remaining[${#remaining[@]}]="$f"
    elif [[ "$prevArgExecType" == "true" ]]; then
        prevArgExecType=false
        processExecType $f
        remaining[${#remaining[@]}]="$f"
    else
        remaining[${#remaining[@]}]="$f"
    fi
done

# resolve links - $0 may be a softlink
this="${BASH_SOURCE-$0}"

# convert relative path to absolute path
bin=$(cd -P -- "$(dirname -- "$this")" >/dev/null && pwd -P)
script="$(basename -- "$this")"
this="$bin/$script"

# the root of the Pig installation
if [ -z "$PIG_HOME" ]; then
    export PIG_HOME=`dirname "$this"`/..
fi
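
# For example (illustrative layout): if this script is installed at
# /opt/pig/bin/pig, PIG_HOME resolves to /opt/pig/bin/.. unless it was
# already set in the environment.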
if [ -z "$PIG_CONF_DIR" ]; then
    if [ -f ${PIG_HOME}/conf/pig.properties ]; then
        PIG_CONF_DIR=${PIG_HOME}/conf
    fi
fi

if [ -z "$PIG_CONF_DIR" ]; then
    if [ -d /etc/pig ]; then
        # if installed with rpm/deb package
        PIG_CONF_DIR="/etc/pig"
    fi
fi

if [ -f "${PIG_CONF_DIR}/pig-env.sh" ]; then
    . "${PIG_CONF_DIR}/pig-env.sh"
fi

# some Java parameters
if [ "$JAVA_HOME" = "" ]; then
    echo "Error: JAVA_HOME is not set."
    exit 1
fi

JAVA=$JAVA_HOME/bin/java
JAVA_HEAP_MAX=-Xmx1000m

# check envvars which might override default args
if [ "$PIG_HEAPSIZE" != "" ]; then
    JAVA_HEAP_MAX="-Xmx${PIG_HEAPSIZE}m"
fi

# CLASSPATH initially contains $PIG_CONF_DIR
CLASSPATH="${PIG_CONF_DIR}"
CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar

if [ "$includeHCatalog" == "true" ]; then
    # need to provide the hcatalog jar file path as well as
    # the location of the hive jars on which hcatalog depends
    hiveMetaStoreJar=hive-metastore-*.jar
    thriftJar=libthrift-*.jar
    hiveExecJar=hive-exec-*.jar
    fbJar=libfb303-*.jar
    jdoECJar=jdo*-api-*.jar
    slfJar=slf4j-api-*.jar
    hbaseHiveJar=hive-hbase-handler-*.jar
    if [ "$HIVE_HOME" == "" ]; then
        if [ -d "/usr/lib/hive" ]; then
            HIVE_HOME=/usr/lib/hive
        else
            echo "Please initialize HIVE_HOME"
            exit 1
        fi
    fi
    hiveMetaStoreVersion=`ls $HIVE_HOME/lib/$hiveMetaStoreJar`
    thriftVersion=`ls $HIVE_HOME/lib/$thriftJar`
    hiveExecVersion=`ls $HIVE_HOME/lib/$hiveExecJar`
    fbJarVersion=`ls $HIVE_HOME/lib/$fbJar`
    jdoECJarVersion=`ls $HIVE_HOME/lib/$jdoECJar`
    slfJarVersion=`ls $HIVE_HOME/lib/$slfJar`
    hbaseHiveVersion=`ls $HIVE_HOME/lib/$hbaseHiveJar`

    # hcatalog jar name for 0.4 and earlier
    hcatJarOld=hcatalog-*.jar
    # hcatalog jar name for 0.5 and newer
    hcatJar=*hcatalog-core-*.jar
    hbaseHCatJar=*hbase-storage-handler-*.jar
    pigHCatJar=*hcatalog-pig-adapter-*.jar
    if [ "$HCAT_HOME" == "" ]; then
        if [ -d "/usr/lib/hcatalog" ]; then
            HCAT_HOME=/usr/lib/hcatalog
        elif [ -d "/usr/lib/hive-hcatalog" ]; then
            HCAT_HOME=/usr/lib/hive-hcatalog
        else
            echo "Please initialize HCAT_HOME"
            exit 1
        fi
    fi
    hcatJarPath=`ls $HCAT_HOME/share/hcatalog/$hcatJar`
    # if the hcat jar is not found, we may be on hcatalog 0.4 or older
    if [ 'xx' == "x${hcatJarPath}x" ]; then
        hcatJarPath=`ls $HCAT_HOME/share/hcatalog/$hcatJarOld | grep -v server`
    fi
    # if we are using an older hcatalog version, the jar is on a different path
    if [ -d "$HCAT_HOME/share/hcatalog/storage-handlers/hbase/lib" ]; then
        # in 0.5 and newer we need to add multiple jars to the classpath
        hbaseHCatJarPath="$HCAT_HOME/share/hcatalog/storage-handlers/hbase/lib/*"
    else
        hbaseHCatJarPath=`ls $HCAT_HOME/lib/$hbaseHCatJar`
    fi
    # get the pig storage handler jar
    pigHCatJarPath=`ls $HCAT_HOME/share/hcatalog/${pigHCatJar}`
    HCAT_CLASSPATHS=$hiveMetaStoreVersion:$thriftVersion:$hiveExecVersion:$fbJarVersion:$jdoECJarVersion:$slfJarVersion:$hbaseHiveVersion:$hcatJarPath:$hbaseHCatJarPath:$pigHCatJarPath
    ADDITIONAL_CLASSPATHS=file://$hiveMetaStoreVersion,file://$thriftVersion,file://$hiveExecVersion,file://$fbJarVersion,file://$jdoECJarVersion,file://$slfJarVersion,file://$hbaseHiveVersion,file://$hcatJarPath,file://$hbaseHCatJarPath,file://$pigHCatJarPath
    if [ "$additionalJars" != "" ]; then
        ADDITIONAL_CLASSPATHS=$ADDITIONAL_CLASSPATHS,$additionalJars
    fi
    CLASSPATH=${CLASSPATH}:$HCAT_CLASSPATHS:$HIVE_HOME/conf
fi
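
# Example invocation (illustrative; assumes local Hive/HCatalog installs at
# the paths probed above):
#   HIVE_HOME=/usr/lib/hive HCAT_HOME=/usr/lib/hive-hcatalog pig -useHCatalog script.pig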
# Add user-specified CLASSPATH entries via PIG_CLASSPATH.
# If PIG_USER_CLASSPATH_FIRST is set, prepend the entries.
if [ "$PIG_CLASSPATH" != "" ]; then
    if [ "$PIG_USER_CLASSPATH_FIRST" == "" ]; then
        CLASSPATH=${CLASSPATH}:${PIG_CLASSPATH}
    else
        CLASSPATH=${PIG_CLASSPATH}:${CLASSPATH}
    fi
fi

# add HADOOP_CONF_DIR
if [ "$HADOOP_CONF_DIR" != "" ]; then
    CLASSPATH=${CLASSPATH}:${HADOOP_CONF_DIR}
fi

# so that filenames w/ spaces are handled correctly in loops below
IFS=
shopt -s extglob
shopt -s nullglob

for f in $PIG_HOME/lib/*.jar; do
    CLASSPATH=${CLASSPATH}:$f
done

JYTHON_JAR=`echo ${PIG_HOME}/lib/jython*.jar`
if [ -z "$JYTHON_JAR" ]; then
    JYTHON_JAR=`echo $PIG_HOME/build/ivy/lib/Pig/jython*.jar`
    if [ -n "$JYTHON_JAR" ]; then
        CLASSPATH=${CLASSPATH}:$JYTHON_JAR
    fi
fi

JRUBY_JAR=`echo ${PIG_HOME}/lib/jruby-complete-*.jar`
if [ -z "$JRUBY_JAR" ]; then
    JRUBY_JAR=`echo $PIG_HOME/build/ivy/lib/Pig/jruby-complete-*.jar`
    if [ -n "$JRUBY_JAR" ]; then
        CLASSPATH=${CLASSPATH}:$JRUBY_JAR
    fi
fi

for f in $PIG_HOME/share/pig/lib/*.jar; do
    CLASSPATH=${CLASSPATH}:$f
done

# For Hadoop 0.23.0+
#
#if [ -d "${PIG_HOME}/share/hadoop/common" ]; then
#    for f in ${PIG_HOME}/share/hadoop/common/hadoop*.jar; do
#        CLASSPATH=${CLASSPATH}:$f
#    done
#fi
#
#if [ -d "${PIG_HOME}/share/hadoop/hdfs" ]; then
#    for f in ${PIG_HOME}/share/hadoop/hdfs/hadoop*.jar; do
#        CLASSPATH=${CLASSPATH}:$f
#    done
#fi
#
#if [ -d "${PIG_HOME}/share/hadoop/mapreduce" ]; then
#    for f in ${PIG_HOME}/share/hadoop/mapreduce/hadoop*.jar; do
#        CLASSPATH=${CLASSPATH}:$f
#    done
#fi

if which hadoop >/dev/null; then
    HADOOP_BIN=`which hadoop`
fi

if [[ -z "$HADOOP_BIN" && -n "$HADOOP_PREFIX" ]]; then
    if [ -f $HADOOP_PREFIX/bin/hadoop ]; then
        HADOOP_BIN=$HADOOP_PREFIX/bin/hadoop
    fi
fi

if [[ -z "$HADOOP_BIN" && -n "$HADOOP_HOME" && -d "$HADOOP_HOME" ]]; then
    if [ -f $HADOOP_HOME/bin/hadoop ]; then
        HADOOP_BIN=$HADOOP_HOME/bin/hadoop
    fi
fi

if [ -z "$HADOOP_BIN" ]; then
    # if installed with rpm/deb package
    if [ -f /usr/bin/hadoop ]; then
        HADOOP_BIN=/usr/bin/hadoop
    fi
fi

# find out HADOOP_HOME in order to find the hadoop jar;
# we use the name of the hadoop jar to decide whether the user
# is running hadoop 1 or hadoop 2
if [[ -z "$HADOOP_HOME" && -n "$HADOOP_PREFIX" ]]; then
    HADOOP_HOME=$HADOOP_PREFIX
fi

if [[ -z "$HADOOP_HOME" && -n "$HADOOP_BIN" ]]; then
    HADOOP_HOME=`dirname $HADOOP_BIN`/..
fi
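
# For example (illustrative): with hadoop on the PATH at /opt/hadoop/bin/hadoop
# and HADOOP_HOME/HADOOP_PREFIX unset, HADOOP_BIN becomes /opt/hadoop/bin/hadoop
# and HADOOP_HOME resolves to /opt/hadoop/bin/.. .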
# if using HBase, likely want to include HBase jars and config
HBH=${HBASE_HOME:-"${PIG_HOME}/share/hbase"}
if [ -d "${HBH}" ]; then
    for f in ${HBH}/hbase-*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done
    for f in ${HBH}/lib/*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done
    HBASE_CONF_DIR=${HBASE_CONF_DIR:-"${HBH}/conf"}
fi

if [ -n "$HBASE_CONF_DIR" ] && [ -d "$HBASE_CONF_DIR" ]; then
    CLASSPATH=$HBASE_CONF_DIR:$CLASSPATH
fi

if [ -d "${PIG_HOME}/etc/hadoop" ]; then
    CLASSPATH=${CLASSPATH}:${PIG_HOME}/etc/hadoop
fi

# locate ZooKeeper
ZKH=${ZOOKEEPER_HOME:-"${PIG_HOME}/share/zookeeper"}
if [ -d "$ZKH" ]; then
    for f in ${ZKH}/zookeeper-*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done
fi

# default log directory & file
if [ "$PIG_LOG_DIR" = "" ]; then
    PIG_LOG_DIR="$PIG_HOME/logs"
fi
if [ "$PIG_LOGFILE" = "" ]; then
    PIG_LOGFILE='pig.log'
fi

# cygwin path translation
if $cygwin; then
    CLASSPATH=`cygpath -p -w "$CLASSPATH"`
    PIG_HOME=`cygpath -d "$PIG_HOME"`
    PIG_LOG_DIR=`cygpath -d "$PIG_LOG_DIR"`
fi

# restore ordinary behaviour
unset IFS

PIG_OPTS="$PIG_OPTS -Dpig.log.dir=$PIG_LOG_DIR"
PIG_OPTS="$PIG_OPTS -Dpig.log.file=$PIG_LOGFILE"
PIG_OPTS="$PIG_OPTS -Dpig.home.dir=$PIG_HOME"
if [ "$includeHCatalog" == "true" ]; then
    addJars=`echo $PIG_OPTS | awk '{ for (i=1; i<=NF; i++) print $i; }' | grep "\-Dpig.additional.jars.uris=" | sed s/-Dpig.additional.jars.uris=//`
    if [ "$addJars" != "" ]; then
        ADDITIONAL_CLASSPATHS=$addJars,$ADDITIONAL_CLASSPATHS
        PIG_OPTS=`echo $PIG_OPTS | sed 's/-Dpig.additional.jars.uris=[^ ]*//'`
    fi
    PIG_OPTS="$PIG_OPTS -Dpig.additional.jars.uris=$ADDITIONAL_CLASSPATHS"
fi

################# ADDING SPARK DEPENDENCIES ##################
# For spark_local mode:
if [ "$isSparkLocalMode" == "true" ]; then
    # SPARK_MASTER is forced to be "local" in spark_local mode
    SPARK_MASTER="local"
    for f in $PIG_HOME/lib/spark/*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done
fi

# For spark mode:
# Please specify SPARK_HOME first so that we can locate
# $SPARK_HOME/lib/spark-assembly*.jar and add it to the classpath.
if [ "$isSparkMode" == "true" ]; then
    if [ -z "$SPARK_HOME" ]; then
        echo "Error: SPARK_HOME is not set!"
        exit 1
    fi

    # spark-tags*.jar appears only in Spark 2 (Spark 1 does not include it),
    # so we use this jar to decide whether the current Spark is 1 or 2
    SPARK_TAG_JAR=`find $SPARK_HOME -name 'spark-tags*.jar' | wc -l`
    if [ "$SPARK_TAG_JAR" -eq 0 ]; then
        sparkversion="1"
    fi

    if [ "$sparkversion" == "1" ]; then
        # Please specify SPARK_JAR, the hdfs path of spark-assembly*.jar. This
        # allows YARN to cache spark-assembly*.jar on nodes so that it doesn't
        # need to be distributed each time an application runs.
        if [ -z "$SPARK_JAR" ]; then
            echo "Error: SPARK_JAR is not set. SPARK_JAR is the hdfs location of spark-assembly*.jar, which allows YARN to cache it on nodes so that it doesn't need to be distributed each time an application runs."
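            # For example (illustrative path and version):
            #   export SPARK_JAR=hdfs:///user/spark/spark-assembly-1.6.3-hadoop2.6.0.jar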
            exit 1
        fi
        if [ -n "$SPARK_HOME" ]; then
            echo "Using Spark Home: ${SPARK_HOME}"
            SPARK_ASSEMBLY_JAR=`ls ${SPARK_HOME}/lib/spark-assembly*`
            CLASSPATH=${CLASSPATH}:$SPARK_ASSEMBLY_JAR
        fi
    fi

    if [ "$sparkversion" == "2" ]; then
        if [ -n "$SPARK_HOME" ]; then
            echo "Using Spark Home: ${SPARK_HOME}"
            for f in $SPARK_HOME/jars/*.jar; do
                CLASSPATH=${CLASSPATH}:$f
            done
        fi
    fi
fi

# spark-assembly.jar contains jcl-over-slf4j, which would create an
# incompatible LogFactory implementation, so force the standard one
if [ "$isSparkMode" == "true" ]; then
    PIG_OPTS="$PIG_OPTS -Dorg.apache.commons.logging.LogFactory=org.apache.commons.logging.impl.LogFactoryImpl"
fi
################# ADDING SPARK DEPENDENCIES ##################

# run it
if [ -n "$HADOOP_BIN" ]; then
    if [ "$debug" == "true" ]; then
        echo "Found hadoop at $HADOOP_BIN"
    fi

    HADOOP_VERSION_LONG=`hadoop version 2>/dev/null | head -1 | sed -e 's/Hadoop //g'`
    HADOOP_VERSION=`echo "$HADOOP_VERSION_LONG" | cut -c 1`

    PIG_JAR=`echo $PIG_HOME/pig*-core-h${HADOOP_VERSION}.jar`

    # for deb/rpm package, add pig jar in /usr/share/pig
    if [ -z "$PIG_JAR" ]; then
        PIG_JAR=`echo $PIG_HOME/share/pig/pig*-core-h${HADOOP_VERSION}.jar`
    fi

    if [ -n "$PIG_JAR" ]; then
        CLASSPATH=${CLASSPATH}:$PIG_JAR
    else
        if [ "$HADOOP_VERSION" == "2" ]; then
            echo "Cannot locate pig-core-h${HADOOP_VERSION}.jar (found Hadoop $HADOOP_VERSION_LONG). Do 'ant clean jar', and try again"
        else
            echo "Cannot locate pig-core-h${HADOOP_VERSION}.jar (found Hadoop $HADOOP_VERSION_LONG). Do 'ant -Dhadoopversion=3 clean jar', and try again"
        fi
        exit 1
    fi

    for f in $PIG_HOME/lib/h${HADOOP_VERSION}/*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done

    export HADOOP_CLASSPATH=$CLASSPATH:$HADOOP_CLASSPATH
    export HADOOP_CLIENT_OPTS="$JAVA_HEAP_MAX $PIG_OPTS $HADOOP_CLIENT_OPTS"
    if [ "$debug" == "true" ]; then
        echo "dry run:"
        echo "HADOOP_CLASSPATH: $HADOOP_CLASSPATH"
        echo "HADOOP_OPTS: $HADOOP_OPTS"
        echo "HADOOP_CLIENT_OPTS: $HADOOP_CLIENT_OPTS"
        echo "$HADOOP_BIN" jar "$PIG_JAR" "${remaining[@]}"
        echo
    else
        exec "$HADOOP_BIN" jar "$PIG_JAR" "${remaining[@]}"
    fi
else
    # use bundled hadoop to run local mode
    PIG_JAR=`echo $PIG_HOME/pig*-core-h*.jar`
    # extract the hadoop major version from the jar name (the digit before ".jar")
    HADOOP_VERSION=`echo "$PIG_JAR" | rev | cut -c -5 | rev | cut -c 1`
    if [ -n "$PIG_JAR" ]; then
        CLASSPATH="${CLASSPATH}:$PIG_JAR"
    else
        if [ "$HADOOP_VERSION" == "2" ]; then
            echo "Cannot locate pig-core-h${HADOOP_VERSION}.jar (found no Hadoop installation). Do 'ant clean jar', and try again"
        else
            echo "Cannot locate pig-core-h${HADOOP_VERSION}.jar (found no Hadoop installation). Do 'ant -Dhadoopversion=3 clean jar', and try again"
        fi
        exit 1
    fi

    for f in $PIG_HOME/lib/h${HADOOP_VERSION}/*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done

    # Add bundled hadoop jars
    for f in $PIG_HOME/lib/hadoop${HADOOP_VERSION}-runtime/*.jar; do
        CLASSPATH=${CLASSPATH}:$f
    done

    if [ "$debug" == "true" ]; then
        echo "Cannot find local hadoop installation, using bundled `java -cp $CLASSPATH org.apache.hadoop.util.VersionInfo | head -1`"
    fi

    CLASS=org.apache.pig.Main
    if [ "$debug" == "true" ]; then
        echo "dry run:"
        echo "$JAVA" $JAVA_HEAP_MAX $PIG_OPTS -classpath "$CLASSPATH" $CLASS "${remaining[@]}"
        echo
    else
        exec "$JAVA" $JAVA_HEAP_MAX $PIG_OPTS -classpath "$CLASSPATH" $CLASS "${remaining[@]}"
    fi
fi

shopt -u nullglob
shopt -u extglob
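
# Example invocations (illustrative):
#   pig script.pig                  # run with the hadoop found on PATH / HADOOP_HOME
#   pig -x local script.pig         # local mode; uses the bundled hadoop when none is installed
#   pig -x spark script.pig         # Spark mode; requires SPARK_HOME (and SPARK_JAR on Spark 1)
#   pig -printCmdDebug script.pig   # dry run: print the launch command instead of executing it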