--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java	2006/05/08 21:03:46	405164
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java	2006/05/08 21:04:01	405165
@@ -13,319 +13,30 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.searcher;
 
-import java.io.*;
-import java.util.*;
-
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Summary.*;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.util.NutchConfiguration;
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
 
-/** Implements hit summarization. */
-public class Summarizer {
-
-  /** Converts text to tokens. */
-  private Analyzer ANALYZER;
-  private Configuration conf;
-  /**
-   * The constructor.
-   * @param conf
-   */
-  public Summarizer(Configuration conf) {
-    this.conf = conf;
-    this.ANALYZER = new NutchDocumentAnalyzer(conf);
-  }
+
+/**
+ * Extension point for summarizer.
+ *
+ * @author Jérôme Charron
+ */
+public interface Summarizer extends Configurable, Pluggable {
+
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = Summarizer.class.getName();
 
   /**
-   * Class Excerpt represents a single passage found in the document, with some
-   * appropriate regions highlit.
+   * Get a summary for a specified text.
+   * @param text is the text to summarize.
+   * @param query is the query for which the text is a hit.
    */
-  class Excerpt {
-    Vector passages = new Vector();
-    SortedSet tokenSet = new TreeSet();
-    int numTerms = 0;
-
-    /**
-     */
-    public Excerpt() {
-    }
-
-    /**
-     */
-    public void addToken(String token) {
-      tokenSet.add(token);
-    }
-
-    /**
-     * Return how many unique toks we have
-     */
-    public int numUniqueTokens() {
-      return tokenSet.size();
-    }
-
-    /**
-     * How many fragments we have.
-     */
-    public int numFragments() {
-      return passages.size();
-    }
-
-    public void setNumTerms(int numTerms) {
-      this.numTerms = numTerms;
-    }
-
-    public int getNumTerms() {
-      return numTerms;
-    }
-
-    /**
-     * Add a frag to the list.
-     */
-    public void add(Fragment fragment) {
-      passages.add(fragment);
-    }
-
-    /**
-     * Return an Enum for all the fragments
-     */
-    public Enumeration elements() {
-      return passages.elements();
-    }
-  }
-
-  /** Returns a summary for the given pre-tokenized text. */
-  public Summary getSummary(String text, Query query, int sumContext, int sumLength) throws IOException {
-
-    // Simplistic implementation.  Finds the first fragments in the document
-    // containing any query terms.
-    //
-    // TODO: check that phrases in the query are matched in the fragment
-
-    Token[] tokens = getTokens(text);             // parse text to token array
-
-    if (tokens.length == 0)
-      return new Summary();
-
-    String[] terms = query.getTerms();
-    HashSet highlight = new HashSet();            // put query terms in table
-    for (int i = 0; i < terms.length; i++)
-      highlight.add(terms[i]);
-
-    //
-    // Create a SortedSet that ranks excerpts according to
-    // how many query terms are present.  An excerpt is
-    // a Vector full of Fragments and Highlights
-    //
-    SortedSet excerptSet = new TreeSet(new Comparator() {
-      public int compare(Object o1, Object o2) {
-        Excerpt excerpt1 = (Excerpt) o1;
-        Excerpt excerpt2 = (Excerpt) o2;
-
-        if (excerpt1 == null && excerpt2 != null) {
-          return -1;
-        } else if (excerpt1 != null && excerpt2 == null) {
-          return 1;
-        } else if (excerpt1 == null && excerpt2 == null) {
-          return 0;
-        }
-
-        int numToks1 = excerpt1.numUniqueTokens();
-        int numToks2 = excerpt2.numUniqueTokens();
-
-        if (numToks1 < numToks2) {
-          return -1;
-        } else if (numToks1 == numToks2) {
-          return excerpt1.numFragments() - excerpt2.numFragments();
-        } else {
-          return 1;
-        }
-      }
-    }
-    );
-
-    //
-    // Iterate through all terms in the document
-    //
-    int lastExcerptPos = 0;
-    for (int i = 0; i < tokens.length; i++) {
-      //
-      // If we find a term that's in the query...
-      //
-      if (highlight.contains(tokens[i].termText())) {
-        //
-        // Start searching at a point SUM_CONTEXT terms back,
-        // and move SUM_CONTEXT terms into the future.
-        //
-        int startToken = (i > sumContext) ? i - sumContext : 0;
-        int endToken = Math.min(i + sumContext, tokens.length);
-        int offset = tokens[startToken].startOffset();
-        int j = startToken;
-
-        //
-        // Iterate from the start point to the finish, adding
-        // terms all the way.  The end of the passage is always
-        // SUM_CONTEXT beyond the last query-term.
-        //
-        Excerpt excerpt = new Excerpt();
-        if (i != 0) {
-          excerpt.add(new Summary.Ellipsis());
-        }
-
-        //
-        // Iterate through as long as we're before the end of
-        // the document and we haven't hit the max-number-of-items
-        // -in-a-summary.
-        //
-        while ((j < endToken) && (j - startToken < sumLength)) {
-          //
-          // Now grab the hit-element, if present
-          //
-          Token t = tokens[j];
-          if (highlight.contains(t.termText())) {
-            excerpt.addToken(t.termText());
-            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
-            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
-            offset = t.endOffset();
-            endToken = Math.min(j + sumContext, tokens.length);
-          }
-
-          j++;
-        }
-
-        lastExcerptPos = endToken;
-
-        //
-        // We found the series of search-term hits and added
-        // them (with intervening text) to the excerpt.  Now
-        // we need to add the trailing edge of text.
-        //
-        // So if (j < tokens.length) then there is still trailing
-        // text to add.  (We haven't hit the end of the source doc.)
-        // Add the words since the last hit-term insert.
-        //
-        if (j < tokens.length) {
-          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
-        }
-
-        //
-        // Remember how many terms are in this excerpt
-        //
-        excerpt.setNumTerms(j - startToken);
-
-        //
-        // Store the excerpt for later sorting
-        //
-        excerptSet.add(excerpt);
-
-        //
-        // Start SUM_CONTEXT places away.  The next
-        // search for relevant excerpts begins at i-SUM_CONTEXT
-        //
-        i = j + sumContext;
-      }
-    }
-
-    //
-    // If the target text doesn't appear, then we just
-    // excerpt the first SUM_LENGTH words from the document.
-    //
-    if (excerptSet.size() == 0) {
-      Excerpt excerpt = new Excerpt();
-      int excerptLen = Math.min(sumLength, tokens.length);
-      lastExcerptPos = excerptLen;
-
-      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
-      excerpt.setNumTerms(excerptLen);
-      excerptSet.add(excerpt);
-    }
-
-    //
-    // Now choose the best items from the excerpt set.
-    // Stop when our Summary grows too large.
-    //
-    double tokenCount = 0;
-    Summary s = new Summary();
-    while (tokenCount <= sumLength && excerptSet.size() > 0) {
-      Excerpt excerpt = (Excerpt) excerptSet.last();
-      excerptSet.remove(excerpt);
-
-      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
-      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
-        Fragment f = (Fragment) e.nextElement();
-        // Don't add fragments if it takes us over the max-limit
-        if (tokenCount + tokenFraction <= sumLength) {
-          s.add(f);
-        }
-        tokenCount += tokenFraction;
-      }
-    }
-
-    if (tokenCount > 0 && lastExcerptPos < tokens.length)
-      s.add(new Ellipsis());
-    return s;
-  }
-
-  private Token[] getTokens(String text) throws IOException {
-    ArrayList result = new ArrayList();
-    TokenStream ts = ANALYZER.tokenStream("content", new StringReader(text));
-    for (Token token = ts.next(); token != null; token = ts.next()) {
-      result.add(token);
-    }
-    return (Token[])result.toArray(new Token[result.size()]);
-  }
-
-  /**
-   * Tests Summary-generation.  User inputs the name of a
-   * text file and a query string
-   */
-  public static void main(String argv[]) throws IOException {
-    // Test arglist
-    if (argv.length < 2) {
-      System.out.println("Usage: java org.apache.nutch.searcher.Summarizer <textfile> <queryStr>");
-      return;
-    }
-
-    Summarizer s = new Summarizer(NutchConfiguration.create());
-
-    //
-    // Parse the args
-    //
-    File textFile = new File(argv[0]);
-    StringBuffer queryBuf = new StringBuffer();
-    for (int i = 1; i < argv.length; i++) {
-      queryBuf.append(argv[i]);
-      queryBuf.append(" ");
-    }
-
-    //
-    // Load the text file into a single string.
-    //
-    StringBuffer body = new StringBuffer();
-    BufferedReader in = new BufferedReader(new FileReader(textFile));
-    try {
-      System.out.println("About to read " + textFile + " from " + in);
-      String str = in.readLine();
-      while (str != null) {
-        body.append(str);
-        str = in.readLine();
-      }
-    } finally {
-      in.close();
-    }
+  public Summary getSummary(String text, Query query);
 
-    Configuration conf = NutchConfiguration.create();
-    int sumContext = conf.getInt("searcher.summary.context", 5);
-    int sumLength = conf.getInt("searcher.summary.length", 20);
-    // Convert the query string into a proper Query
-    Query query = Query.parse(queryBuf.toString(), conf);
-    System.out.println("Summary: '" + s.getSummary(body.toString(), query, sumContext, sumLength) + "'");
-  }
 }
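The net effect of this change: Summarizer is no longer a concrete class but a plugin extension point. An implementation now receives its Configuration through the Configurable contract (setConf/getConf) and produces a Summary from the simpler getSummary(String, Query) signature, with tuning such as searcher.summary.length presumably read from the configuration rather than passed in as arguments. For illustration only, a minimal implementation might look like the sketch below. It is not part of this commit: the package and class names are invented, while Summary, Summary.Fragment, Summary.Ellipsis, and the "searcher.summary.length" property are taken from the code removed above.

// Hypothetical sketch of a plugin implementing the Summarizer extension point.
package org.apache.nutch.summary.basic;          // invented package name

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;

public class BasicSummarizer implements Summarizer {

  private Configuration conf = null;

  // Configurable contract: the plugin framework injects the configuration
  // through this setter instead of a constructor argument.
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return conf;
  }

  // Summarizer contract: build a Summary for the text of one hit.  This
  // naive version keeps roughly the first "searcher.summary.length" words
  // and ignores the query; a real plugin would highlight the query terms.
  public Summary getSummary(String text, Query query) {
    int sumLength = conf.getInt("searcher.summary.length", 20);
    String[] words = text.split("\\s+");
    int n = Math.min(sumLength, words.length);

    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < n; i++) {
      if (i > 0) {
        buf.append(' ');
      }
      buf.append(words[i]);
    }

    Summary summary = new Summary();
    summary.add(new Summary.Fragment(buf.toString()));
    if (n < words.length) {
      summary.add(new Summary.Ellipsis());    // signal truncated text
    }
    return summary;
  }
}

A concrete plugin would also have to be registered with the Nutch plugin system (a plugin.xml declaring an extension of Summarizer.X_POINT_ID) so the searcher can discover it at runtime.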