#!/bin/sh
#
# The Nutch command script
#
# Usage: nutch COMMAND [ARGS...]
#
# Maps COMMAND to a Java class (or uses COMMAND itself as the class name),
# builds a CLASSPATH from the Nutch installation this script lives in, and
# replaces this shell with the JVM running that class.
#
# Environment Variables
#
#   NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.
#
#   JAVA_HOME       The java implementation to use when NUTCH_JAVA_HOME
#                   is not set.  One of the two is required.
#
#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB.
#                   Default is 1000.
#
#   NUTCH_OPTS      Extra Java runtime options.
#
#   NUTCH_CONF_DIR  Directory placed first on the CLASSPATH.
#                   Default is $NUTCH_HOME/conf.
#

# Print the list of supported commands to stdout.
print_usage() {
  echo "Usage: nutch COMMAND"
  echo "where COMMAND is one of:"
  echo " crawl one-step crawler for intranets"
  echo " admin database administration, including creation"
  echo " inject inject new urls into the database"
  echo " generate generate new segments to fetch"
  echo " fetchlist print the fetchlist of a segment"
  echo " fetch fetch a segment's pages"
  echo " parse parse a segment's pages"
  echo " index run the indexer on a segment's fetcher output"
  echo " merge merge several segment indexes"
  echo " dedup remove duplicates from a set of segment indexes"
  echo " updatedb update db from segments after fetching"
  echo " updatesegs update segments with link data from the db"
  echo " mergesegs merge multiple segments into a single segment"
  echo " readdb examine arbitrary fields of the database"
  echo " analyze adjust database link-analysis scoring"
  echo " prune prune segment index(es) of unwanted content"
  echo " segread read, fix and dump segment data"
  echo " segslice append, join and slice segment data"
  echo " server run a search server"
  echo " namenode run the NDFS namenode"
  echo " datanode run an NDFS datanode"
  echo " ndfs run an NDFS admin client"
  echo " jobtracker run the MapReduce job Tracker node"
  echo " tasktracker run a MapReduce task Tracker node"
  echo " or"
  echo " CLASSNAME run the class named CLASSNAME"
  echo "Most commands print help when invoked w/o parameters."
}

# resolve links - $0 may be a softlink
# POSIX sh has no readlink, so the link target is extracted from `ls -ld`.
THIS="$0"
while [ -h "$THIS" ]; do
  ls_out=$(ls -ld "$THIS")
  link=$(expr "$ls_out" : '.*-> \(.*\)$')
  # Only a target starting with "/" is absolute.  A relative target, even
  # one containing slashes such as "../nutch/bin/nutch", is relative to the
  # directory that holds the link, not to the caller's working directory.
  if expr "$link" : '/.*' > /dev/null; then
    THIS="$link"
  else
    THIS=$(dirname "$THIS")/"$link"
  fi
done

# if no args specified, show usage
if [ $# -eq 0 ]; then
  print_usage
  exit 1
fi

# get arguments
COMMAND=$1
shift

# some directories
THIS_DIR=$(dirname "$THIS")
# `&&` so that a failed cd aborts instead of silently yielding the cwd.
NUTCH_HOME=$(cd "$THIS_DIR/.." && pwd) || exit 1

# some Java parameters
if [ -n "$NUTCH_JAVA_HOME" ]; then
  echo "run java in $NUTCH_JAVA_HOME"
  JAVA_HOME=$NUTCH_JAVA_HOME
fi

if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set." >&2
  exit 1
fi

JAVA=$JAVA_HOME/bin/java
JAVA_HEAP_MAX=-Xmx1000m

# check envvars which might override default args
if [ -n "$NUTCH_HEAPSIZE" ]; then
  echo "run with heapsize $NUTCH_HEAPSIZE"
  JAVA_HEAP_MAX="-Xmx${NUTCH_HEAPSIZE}m"
  echo "$JAVA_HEAP_MAX"
fi

# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
# (the := form also assigns the default to NUTCH_CONF_DIR itself)
CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}

# for developers, add Nutch classes to CLASSPATH
if [ -d "$NUTCH_HOME/build/classes" ]; then
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes
fi
if [ -d "$NUTCH_HOME/build/plugins" ]; then
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
fi
if [ -d "$NUTCH_HOME/build/test/classes" ]; then
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
fi

# In the loops below the variable part of each pattern is quoted, so paths
# containing spaces survive without touching IFS.  An unmatched pattern is
# left as a literal string by the shell; the -e test skips it so it never
# ends up on the CLASSPATH.

# for releases, add Nutch jar to CLASSPATH
for f in "$NUTCH_HOME"/nutch-*.jar; do
  [ -e "$f" ] || continue
  CLASSPATH=${CLASSPATH}:$f
done

# add plugins to classpath
if [ -d "$NUTCH_HOME/plugins" ]; then
  CLASSPATH=${CLASSPATH}:$NUTCH_HOME
fi

# add libs to CLASSPATH
for f in "$NUTCH_HOME"/lib/*.jar; do
  [ -e "$f" ] || continue
  CLASSPATH=${CLASSPATH}:$f
done

for f in "$NUTCH_HOME"/lib/jettyext/*.jar; do
  [ -e "$f" ] || continue
  CLASSPATH=${CLASSPATH}:$f
done

# figure out which class to run
case "$COMMAND" in
  crawl)       CLASS=org.apache.nutch.tools.CrawlTool ;;
  admin)       CLASS=org.apache.nutch.tools.WebDBAdminTool ;;
  inject)      CLASS=org.apache.nutch.db.WebDBInjector ;;
  generate)    CLASS=org.apache.nutch.tools.FetchListTool ;;
  fetchlist)   CLASS=org.apache.nutch.pagedb.FetchListEntry ;;
  fetch)       CLASS=org.apache.nutch.fetcher.Fetcher ;;
  parse)       CLASS=org.apache.nutch.tools.ParseSegment ;;
  index)       CLASS=org.apache.nutch.indexer.IndexSegment ;;
  merge)       CLASS=org.apache.nutch.indexer.IndexMerger ;;
  dedup)       CLASS=org.apache.nutch.indexer.DeleteDuplicates ;;
  updatedb)    CLASS=org.apache.nutch.tools.UpdateDatabaseTool ;;
  updatesegs)  CLASS=org.apache.nutch.tools.UpdateSegmentsFromDb ;;
  mergesegs)   CLASS=org.apache.nutch.tools.SegmentMergeTool ;;
  readdb)      CLASS=org.apache.nutch.db.WebDBReader ;;
  prune)       CLASS=org.apache.nutch.tools.PruneIndexTool ;;
  segread)     CLASS=org.apache.nutch.segment.SegmentReader ;;
  segslice)    CLASS=org.apache.nutch.segment.SegmentSlicer ;;
  analyze)     CLASS=org.apache.nutch.tools.LinkAnalysisTool ;;
  # single quotes keep the inner-class "$" literal
  server)      CLASS='org.apache.nutch.searcher.DistributedSearch$Server' ;;
  namenode)    CLASS='org.apache.nutch.ndfs.NDFS$NameNode' ;;
  datanode)    CLASS='org.apache.nutch.ndfs.NDFS$DataNode' ;;
  ndfs)        CLASS=org.apache.nutch.fs.TestClient ;;
  jobtracker)  CLASS=org.apache.nutch.mapReduce.JobTracker ;;
  tasktracker) CLASS=org.apache.nutch.mapReduce.TaskTracker ;;
  # anything else is taken to be a class name
  *)           CLASS=$COMMAND ;;
esac

# cygwin path translation
case "$(uname)" in
  CYGWIN*) CLASSPATH=$(cygpath -p -w "$CLASSPATH") ;;
esac

# run it
# NUTCH_OPTS is deliberately left unquoted so that it splits into separate
# JVM options.
# shellcheck disable=SC2086
exec "$JAVA" "$JAVA_HEAP_MAX" $NUTCH_OPTS -classpath "$CLASSPATH" "$CLASS" "$@"