/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| 17 |
|
| 18 |
package org.apache.nutch.indexer;
|
| 19 |
|
| 20 |
import java.io.IOException;
|
| 21 |
import java.util.ArrayList;
|
| 22 |
import java.util.List;
|
| 23 |
|
| 24 |
import org.apache.commons.logging.Log;
|
| 25 |
import org.apache.commons.logging.LogFactory;
|
| 26 |
import org.apache.hadoop.conf.Configuration;
|
| 27 |
import org.apache.hadoop.conf.Configured;
|
| 28 |
import org.apache.hadoop.fs.Path;
|
| 29 |
import org.apache.hadoop.mapred.FileOutputFormat;
|
| 30 |
import org.apache.hadoop.mapred.JobClient;
|
| 31 |
import org.apache.hadoop.mapred.JobConf;
|
| 32 |
import org.apache.hadoop.util.StringUtils;
|
| 33 |
import org.apache.hadoop.util.Tool;
|
| 34 |
import org.apache.hadoop.util.ToolRunner;
|
| 35 |
import org.apache.nutch.indexer.lucene.LuceneWriter;
|
| 36 |
import org.apache.nutch.util.NutchConfiguration;
|
| 37 |
import org.apache.nutch.util.NutchJob;
|
| 38 |
|
| 39 |
/** Create indexes for segments. */
|
| 40 |
public class Indexer extends Configured implements Tool {
|
| 41 |
|
| 42 |
public static final String DONE_NAME = "index.done";
|
| 43 |
|
| 44 |
public static final Log LOG = LogFactory.getLog(Indexer.class);
|
| 45 |
|
| 46 |
public Indexer() {
|
| 47 |
super(null);
|
| 48 |
}
|
| 49 |
|
| 50 |
public Indexer(Configuration conf) {
|
| 51 |
super(conf);
|
| 52 |
}
|
| 53 |
|
| 54 |
public void index(Path luceneDir, Path crawlDb,
|
| 55 |
Path linkDb, List<Path> segments)
|
| 56 |
throws IOException {
|
| 57 |
LOG.info("Indexer: starting");
|
| 58 |
|
| 59 |
final JobConf job = new NutchJob(getConf());
|
| 60 |
job.setJobName("index-lucene " + luceneDir);
|
| 61 |
|
| 62 |
IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
|
| 63 |
|
| 64 |
FileOutputFormat.setOutputPath(job, luceneDir);
|
| 65 |
|
| 66 |
LuceneWriter.addFieldOptions("segment", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
|
| 67 |
LuceneWriter.addFieldOptions("digest", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
|
| 68 |
LuceneWriter.addFieldOptions("boost", LuceneWriter.STORE.YES, LuceneWriter.INDEX.NO, job);
|
| 69 |
|
| 70 |
NutchIndexWriterFactory.addClassToConf(job, LuceneWriter.class);
|
| 71 |
|
| 72 |
JobClient.runJob(job);
|
| 73 |
LOG.info("Indexer: done");
|
| 74 |
}
|
| 75 |
|
| 76 |
public int run(String[] args) throws Exception {
|
| 77 |
if (args.length < 4) {
|
| 78 |
System.err.println("Usage: Indexer <index> <crawldb> <linkdb> <segment> ...");
|
| 79 |
return -1;
|
| 80 |
}
|
| 81 |
|
| 82 |
final Path luceneDir = new Path(args[0]);
|
| 83 |
final Path crawlDb = new Path(args[1]);
|
| 84 |
final Path linkDb = new Path(args[2]);
|
| 85 |
|
| 86 |
final List<Path> segments = new ArrayList<Path>();
|
| 87 |
for (int i = 3; i < args.length; i++) {
|
| 88 |
segments.add(new Path(args[i]));
|
| 89 |
}
|
| 90 |
|
| 91 |
try {
|
| 92 |
index(luceneDir, crawlDb, linkDb, segments);
|
| 93 |
return 0;
|
| 94 |
} catch (final Exception e) {
|
| 95 |
LOG.fatal("Indexer: " + StringUtils.stringifyException(e));
|
| 96 |
return -1;
|
| 97 |
}
|
| 98 |
}
|
| 99 |
|
| 100 |
public static void main(String[] args) throws Exception {
|
| 101 |
final int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args);
|
| 102 |
System.exit(res);
|
| 103 |
}
|
| 104 |
}
|