#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
#   -i|--index    Indexes crawl results into a configured indexer
#   -w|--wait     NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
#                 are scheduled for fetching. Suffix can be: s for second,
#                 m for minute, h for hour and d for day. If no suffix is
#                 specified second is used by default.
#   -D            A Java property to pass to Nutch calls
#   Seed Dir      Directory in which to look for a seeds file
#   Crawl Dir     Directory where the crawl/link/segments dirs are saved
#   Num Rounds    The number of rounds to run this crawl for
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
# INDEXING FOR EACH SEGMENT

INDEXFLAG=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch

function __to_seconds() {
  NUMBER=$(echo "$1" | tr -dc '0-9')
  MODIFIER=$(echo "$1" | tr -dc 'smhdSMHD')

  case $MODIFIER in
      m|M) SECONDS=`expr $NUMBER \* 60` ;;
      h|H) SECONDS=`expr $NUMBER \* 3600` ;;
      d|D) SECONDS=`expr $NUMBER \* 86400` ;;
      s|S|*) SECONDS=$NUMBER ;;
  esac

  echo $SECONDS
}

while [[ $# -gt 0 ]]
do
    case $1 in
        -i|--index)
            INDEXFLAG=true
            shift
            ;;
        -D)
            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
            shift 2
            ;;
        -w|--wait)
            WAIT="${2}"
            shift 2
            ;;
        *)
            break
            ;;
    esac
done

if [[ $# != 3 ]]; then
    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
    echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
    echo -e "\t-D\t\tA Java property to pass to Nutch calls"
    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
    echo -e "\t\t\tspecified second is used by default."
    echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
    exit 1
fi

SEEDDIR="$1"
CRAWL_PATH="$2"
LIMIT="$3"

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  WAIT=$( __to_seconds "$WAIT" )
  echo "Time to wait (--wait) = $WAIT sec."
fi
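
# Illustrative __to_seconds conversions (example values only, given the suffix
# handling above):
#   __to_seconds 30  -> 30     (no suffix defaults to seconds)
#   __to_seconds 5m  -> 300
#   __to_seconds 2h  -> 7200
#   __to_seconds 1d  -> 86400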

#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################

# set the number of slave nodes
numSlaves=1

# and the total number of available tasks
# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`

# number of URLs to fetch in one iteration (50K per slave node)
sizeFetchlist=`expr $numSlaves \* 50000`

# time limit for fetching (in minutes)
timeLimitFetch=180

# num threads for fetching
numThreads=50

#############################################

bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"

# determine the crawl mode (local or distributed) based on the presence of a Nutch job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
    mode=distributed
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"

# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
    if [ $(which hadoop | wc -l ) -eq 0 ]; then
        echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
        exit 1
    fi
fi

function __bin_nutch {
    # run $bin/nutch, exit if exit value indicates error

    echo "$bin/nutch $@" ;# echo command and arguments
    "$bin/nutch" "$@"

    RETCODE=$?
    if [ $RETCODE -ne 0 ]
    then
        echo "Error running:"
        echo "  $bin/nutch $@"
        echo "Failed with exit value $RETCODE."
        exit $RETCODE
    fi
}

# initial injection
echo "Injecting seed URLs"
__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"

# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
  if [ -e ".STOP" ]
  then
    echo "STOP file found - escaping loop"
    break
  fi

  if [ $LIMIT -ne -1 ]; then
    if [ $a -gt $LIMIT ]; then
      echo `date` ": Finished loop with $LIMIT iterations"
      break
    fi
    echo `date` ": Iteration $a of $LIMIT"
  else
    echo `date` ": Iteration $a"
  fi

  echo "Generating a new segment"
  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
  echo "$bin/nutch generate ${generate_args[@]}"
  $bin/nutch generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep $WAIT
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi
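
  # Segment directories are named after their generation timestamp
  # (e.g. 20230101123045), so the numeric sort of the listing below
  # reliably picks the newest segment.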

  # capture the name of the segment
  # call hadoop in distributed mode
  # or use ls in local mode

  if [ $mode = "local" ]; then
    SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
  else
    SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep "20[0-9]+" | sort -n | tail -n 1`
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable record skipping during parsing so that a single dodgy document
  # does not fail the whole task
  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT

  # note that the link inversion - indexing routine can be done within the main loop
  # on a per segment basis
  echo "Link inversion"
  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

  echo "Dedup on crawldb"
  __bin_nutch dedup "$CRAWL_PATH"/crawldb

  if $INDEXFLAG; then
    echo "Indexing $SEGMENT to index"
    __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

    echo "Cleaning up index if possible"
    __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
  else
    echo "Skipping indexing ..."
  fi

  #######################################################
  # The following commands fall into WebGraph territory
  # and should be uncommented based on your requirements
  #######################################################

  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

done

exit 0
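
# Example invocation (a minimal sketch; the paths, wait time, and Solr URL
# below are placeholders, adjust them to your own setup):
#
#   bin/crawl -i -D "solr.server.url=http://localhost:8983/solr/nutch" -w 5m urls/ crawl/ 2
#
# This injects the seeds found in urls/, runs two generate/fetch/parse/update
# rounds under crawl/, indexes each segment, and waits 5 minutes before
# retrying whenever no URLs are due for fetching.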