#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
#   -i|--index    Indexes crawl results into a configured indexer
#   -w|--wait     NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
#                 are scheduled for fetching. Suffix can be: s for second,
#                 m for minute, h for hour and d for day. If no suffix is
#                 specified second is used by default.
#   -D            A Java property to pass to Nutch calls
#   Seed Dir      Directory in which to look for a seeds file
#   Crawl Dir     Directory where the crawl/link/segments dirs are saved
#   Num Rounds    The number of rounds to run this crawl for
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
# INDEXING FOR EACH SEGMENT

INDEXFLAG=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch

function __to_seconds() {
  NUMBER=$(echo "$1" | tr -dc '0-9')
  MODIFIER=$(echo "$1" | tr -dc 'smhdSMHD')

  case $MODIFIER in
      m|M) SECONDS=`expr $NUMBER \* 60` ;;
      h|H) SECONDS=`expr $NUMBER \* 3600` ;;
      d|D) SECONDS=`expr $NUMBER \* 86400` ;;
      s|S|*) SECONDS=$NUMBER ;;
  esac

  echo $SECONDS
}

while [[ $# -gt 0 ]]
do
    case $1 in
        -i|--index)
            INDEXFLAG=true
            shift
            ;;
        -D)
            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
            shift 2
            ;;
        -w|--wait)
            WAIT="${2}"
            shift 2
            ;;
        *)
            break
            ;;
    esac
done

if [[ $# != 3 ]]; then
    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
    echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
    echo -e "\t-D\t\tA Java property to pass to Nutch calls"
    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
    echo -e "\t\t\tspecified second is used by default."
    echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
    exit 1
fi

SEEDDIR="$1"
CRAWL_PATH="$2"
LIMIT="$3"

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  WAIT=$( __to_seconds "$WAIT" )
  echo "Time to wait (--wait) = $WAIT sec."
fi
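
# Illustrative __to_seconds conversions (example values only, given the suffix
# handling above):
#   __to_seconds 30  -> 30     (no suffix defaults to seconds)
#   __to_seconds 5m  -> 300
#   __to_seconds 2h  -> 7200
#   __to_seconds 1d  -> 86400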

#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################

# set the number of slave nodes
numSlaves=1

# and the total number of available tasks
# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`

# number of URLs to fetch in one iteration (50K per slave node)
sizeFetchlist=`expr $numSlaves \* 50000`

# time limit for fetching (in minutes)
timeLimitFetch=180

# num threads for fetching
numThreads=50

#############################################

bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"

# determine the crawl mode (local or distributed) based on the presence of a Nutch job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
    mode=distributed
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"

# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
    if [ $(which hadoop | wc -l ) -eq 0 ]; then
        echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
        exit 1
    fi
fi

function __bin_nutch {
    # run $bin/nutch, exit if exit value indicates error

    echo "$bin/nutch $@" ;# echo command and arguments
    "$bin/nutch" "$@"

    RETCODE=$?
    if [ $RETCODE -ne 0 ]
    then
        echo "Error running:"
        echo "  $bin/nutch $@"
        echo "Failed with exit value $RETCODE."
        exit $RETCODE
    fi
}

# initial injection
echo "Injecting seed URLs"
__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"

# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
  if [ -e ".STOP" ]
  then
    echo "STOP file found - escaping loop"
    break
  fi

  if [ $LIMIT -ne -1 ]; then
    if [ $a -gt $LIMIT ]; then
      echo `date` ": Finished loop with $LIMIT iterations"
      break
    fi
    echo `date` ": Iteration $a of $LIMIT"
  else
    echo `date` ": Iteration $a"
  fi

  echo "Generating a new segment"
  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
  echo "$bin/nutch generate ${generate_args[@]}"
  $bin/nutch generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep $WAIT
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi
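
  # Segment directories are named after their generation timestamp
  # (e.g. 20230101123045), so the numeric sort of the listing below
  # reliably picks the newest segment.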

  # capture the name of the segment
  # call hadoop in distributed mode
  # or use ls in local mode

  if [ $mode = "local" ]; then
    SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
  else
    SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep "20[0-9]+" | sort -n | tail -n 1`
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable record skipping during parsing so that a single dodgy document
  # does not fail the whole task
  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT

  # note that the link inversion - indexing routine can be done within the main loop
  # on a per segment basis
  echo "Link inversion"
  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

  echo "Dedup on crawldb"
  __bin_nutch dedup "$CRAWL_PATH"/crawldb

  if $INDEXFLAG; then
    echo "Indexing $SEGMENT to index"
    __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

    echo "Cleaning up index if possible"
    __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
  else
    echo "Skipping indexing ..."
  fi

  #######################################################
  # The following commands fall into WebGraph territory
  # and should be uncommented based on your requirements
  #######################################################

  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

done

exit 0
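
# Example invocation (a minimal sketch; the paths, wait time, and Solr URL
# below are placeholders, adjust them to your own setup):
#
#   bin/crawl -i -D "solr.server.url=http://localhost:8983/solr/nutch" -w 5m urls/ crawl/ 2
#
# This injects the seeds found in urls/, runs two generate/fetch/parse/update
# rounds under crawl/, indexes each segment, and waits 5 minutes before
# retrying whenever no URLs are due for fetching.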