--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java	2006/05/08 21:03:46	405164
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java	2006/05/08 21:04:01	405165
@@ -13,319 +13,30 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.searcher;
 
-import java.io.*;
-import java.util.*;
-
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.searcher.Summary.*;
-import org.apache.nutch.analysis.NutchDocumentAnalyzer;
-import org.apache.nutch.util.NutchConfiguration;
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
 
-/** Implements hit summarization. */
-public class Summarizer {
-
-  /** Converts text to tokens. */
-  private Analyzer ANALYZER;
-  private Configuration conf;
-  /**
-   * The constructor.
-   * @param conf
-   */
-  public Summarizer(Configuration conf) {
-    this.conf = conf;
-    this.ANALYZER = new NutchDocumentAnalyzer(conf);
-  }
+
+/**
+ * Extension point for summarizer.
+ *
+ * @author Jérôme Charron
+ */
+public interface Summarizer extends Configurable, Pluggable {
+
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = Summarizer.class.getName();
 
   /**
-   * Class Excerpt represents a single passage found in the document, with some
-   * appropriate regions highlit.
+   * Get a summary for a specified text.
+   * @param text is the text to summarize.
+   * @param query is the query for which the text is a hit.
    */
-  class Excerpt {
-    Vector passages = new Vector();
-    SortedSet tokenSet = new TreeSet();
-    int numTerms = 0;
-
-    /**
-     */
-    public Excerpt() {
-    }
-
-    /**
-     */
-    public void addToken(String token) {
-      tokenSet.add(token);
-    }
-
-    /**
-     * Return how many unique toks we have
-     */
-    public int numUniqueTokens() {
-      return tokenSet.size();
-    }
-
-    /**
-     * How many fragments we have.
-     */
-    public int numFragments() {
-      return passages.size();
-    }
-
-    public void setNumTerms(int numTerms) {
-      this.numTerms = numTerms;
-    }
-
-    public int getNumTerms() {
-      return numTerms;
-    }
-
-    /**
-     * Add a frag to the list.
-     */
-    public void add(Fragment fragment) {
-      passages.add(fragment);
-    }
-
-    /**
-     * Return an Enum for all the fragments
-     */
-    public Enumeration elements() {
-      return passages.elements();
-    }
-  }
-
-  /** Returns a summary for the given pre-tokenized text. */
-  public Summary getSummary(String text, Query query, int sumContext, int sumLength) throws IOException {
-
-    // Simplistic implementation.  Finds the first fragments in the document
-    // containing any query terms.
-    //
-    // TODO: check that phrases in the query are matched in the fragment
-
-    Token[] tokens = getTokens(text);             // parse text to token array
-
-    if (tokens.length == 0)
-      return new Summary();
-
-    String[] terms = query.getTerms();
-    HashSet highlight = new HashSet();            // put query terms in table
-    for (int i = 0; i < terms.length; i++)
-      highlight.add(terms[i]);
-
-    //
-    // Create a SortedSet that ranks excerpts according to
-    // how many query terms are present.  An excerpt is
-    // a Vector full of Fragments and Highlights
-    //
-    SortedSet excerptSet = new TreeSet(new Comparator() {
-      public int compare(Object o1, Object o2) {
-        Excerpt excerpt1 = (Excerpt) o1;
-        Excerpt excerpt2 = (Excerpt) o2;
-
-        if (excerpt1 == null && excerpt2 != null) {
-          return -1;
-        } else if (excerpt1 != null && excerpt2 == null) {
-          return 1;
-        } else if (excerpt1 == null && excerpt2 == null) {
-          return 0;
-        }
-
-        int numToks1 = excerpt1.numUniqueTokens();
-        int numToks2 = excerpt2.numUniqueTokens();
-
-        if (numToks1 < numToks2) {
-          return -1;
-        } else if (numToks1 == numToks2) {
-          return excerpt1.numFragments() - excerpt2.numFragments();
-        } else {
-          return 1;
-        }
-      }
-    }
-    );
-
-    //
-    // Iterate through all terms in the document
-    //
-    int lastExcerptPos = 0;
-    for (int i = 0; i < tokens.length; i++) {
-      //
-      // If we find a term that's in the query...
-      //
-      if (highlight.contains(tokens[i].termText())) {
-        //
-        // Start searching at a point SUM_CONTEXT terms back,
-        // and move SUM_CONTEXT terms into the future.
-        //
-        int startToken = (i > sumContext) ? i - sumContext : 0;
-        int endToken = Math.min(i + sumContext, tokens.length);
-        int offset = tokens[startToken].startOffset();
-        int j = startToken;
-
-        //
-        // Iterate from the start point to the finish, adding
-        // terms all the way.  The end of the passage is always
-        // SUM_CONTEXT beyond the last query-term.
-        //
-        Excerpt excerpt = new Excerpt();
-        if (i != 0) {
-          excerpt.add(new Summary.Ellipsis());
-        }
-
-        //
-        // Iterate through as long as we're before the end of
-        // the document and we haven't hit the max-number-of-items
-        // -in-a-summary.
-        //
-        while ((j < endToken) && (j - startToken < sumLength)) {
-          //
-          // Now grab the hit-element, if present
-          //
-          Token t = tokens[j];
-          if (highlight.contains(t.termText())) {
-            excerpt.addToken(t.termText());
-            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
-            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
-            offset = t.endOffset();
-            endToken = Math.min(j + sumContext, tokens.length);
-          }
-
-          j++;
-        }
-
-        lastExcerptPos = endToken;
-
-        //
-        // We found the series of search-term hits and added
-        // them (with intervening text) to the excerpt.  Now
-        // we need to add the trailing edge of text.
-        //
-        // So if (j < tokens.length) then there is still trailing
-        // text to add.  (We haven't hit the end of the source doc.)
-        // Add the words since the last hit-term insert.
-        //
-        if (j < tokens.length) {
-          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
-        }
-
-        //
-        // Remember how many terms are in this excerpt
-        //
-        excerpt.setNumTerms(j - startToken);
-
-        //
-        // Store the excerpt for later sorting
-        //
-        excerptSet.add(excerpt);
-
-        //
-        // Start SUM_CONTEXT places away.  The next
-        // search for relevant excerpts begins at i-SUM_CONTEXT
-        //
-        i = j + sumContext;
-      }
-    }
-
-    //
-    // If the target text doesn't appear, then we just
-    // excerpt the first SUM_LENGTH words from the document.
-    //
-    if (excerptSet.size() == 0) {
-      Excerpt excerpt = new Excerpt();
-      int excerptLen = Math.min(sumLength, tokens.length);
-      lastExcerptPos = excerptLen;
-
-      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
-      excerpt.setNumTerms(excerptLen);
-      excerptSet.add(excerpt);
-    }
-
-    //
-    // Now choose the best items from the excerpt set.
-    // Stop when our Summary grows too large.
-    //
-    double tokenCount = 0;
-    Summary s = new Summary();
-    while (tokenCount <= sumLength && excerptSet.size() > 0) {
-      Excerpt excerpt = (Excerpt) excerptSet.last();
-      excerptSet.remove(excerpt);
-
-      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
-      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
-        Fragment f = (Fragment) e.nextElement();
-        // Don't add fragments if it takes us over the max-limit
-        if (tokenCount + tokenFraction <= sumLength) {
-          s.add(f);
-        }
-        tokenCount += tokenFraction;
-      }
-    }
-
-    if (tokenCount > 0 && lastExcerptPos < tokens.length)
-      s.add(new Ellipsis());
-    return s;
-  }
-
-  private Token[] getTokens(String text) throws IOException {
-    ArrayList result = new ArrayList();
-    TokenStream ts = ANALYZER.tokenStream("content", new StringReader(text));
-    for (Token token = ts.next(); token != null; token = ts.next()) {
-      result.add(token);
-    }
-    return (Token[])result.toArray(new Token[result.size()]);
-  }
-
-  /**
-   * Tests Summary-generation.  User inputs the name of a
-   * text file and a query string
-   */
-  public static void main(String argv[]) throws IOException {
-    // Test arglist
-    if (argv.length < 2) {
-      System.out.println("Usage: java org.apache.nutch.searcher.Summarizer <textfile> <queryStr>");
-      return;
-    }
-
-    Summarizer s = new Summarizer(NutchConfiguration.create());
-
-    //
-    // Parse the args
-    //
-    File textFile = new File(argv[0]);
-    StringBuffer queryBuf = new StringBuffer();
-    for (int i = 1; i < argv.length; i++) {
-      queryBuf.append(argv[i]);
-      queryBuf.append(" ");
-    }
-
-    //
-    // Load the text file into a single string.
-    //
-    StringBuffer body = new StringBuffer();
-    BufferedReader in = new BufferedReader(new FileReader(textFile));
-    try {
-      System.out.println("About to read " + textFile + " from " + in);
-      String str = in.readLine();
-      while (str != null) {
-        body.append(str);
-        str = in.readLine();
-      }
-    } finally {
-      in.close();
-    }
+  public Summary getSummary(String text, Query query);
 
-    Configuration conf = NutchConfiguration.create();
-    int sumContext = conf.getInt("searcher.summary.context", 5);
-    int sumLength = conf.getInt("searcher.summary.length", 20);
-    // Convert the query string into a proper Query
-    Query query = Query.parse(queryBuf.toString(), conf);
-    System.out.println("Summary: '" + s.getSummary(body.toString(), query, sumContext, sumLength) + "'");
-  }
 }
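The net effect of this change: Summarizer is no longer a concrete class but a plugin extension point. An implementation now receives its Configuration through the Configurable contract (setConf/getConf) and produces a Summary from the simpler getSummary(String, Query) signature, with tuning such as searcher.summary.length presumably read from the configuration rather than passed in as arguments. For illustration only, a minimal implementation might look like the sketch below. It is not part of this commit: the package and class names are invented, while Summary, Summary.Fragment, Summary.Ellipsis, and the "searcher.summary.length" property are taken from the code removed above.

// Hypothetical sketch of a plugin implementing the Summarizer extension point.
package org.apache.nutch.summary.basic;          // invented package name

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;

public class BasicSummarizer implements Summarizer {

  private Configuration conf = null;

  // Configurable contract: the plugin framework injects the configuration
  // through this setter instead of a constructor argument.
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return conf;
  }

  // Summarizer contract: build a Summary for the text of one hit.  This
  // naive version keeps roughly the first "searcher.summary.length" words
  // and ignores the query; a real plugin would highlight the query terms.
  public Summary getSummary(String text, Query query) {
    int sumLength = conf.getInt("searcher.summary.length", 20);
    String[] words = text.split("\\s+");
    int n = Math.min(sumLength, words.length);

    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < n; i++) {
      if (i > 0) {
        buf.append(' ');
      }
      buf.append(words[i]);
    }

    Summary summary = new Summary();
    summary.add(new Summary.Fragment(buf.toString()));
    if (n < words.length) {
      summary.add(new Summary.Ellipsis());    // signal truncated text
    }
    return summary;
  }
}

A concrete plugin would also have to be registered with the Nutch plugin system (a plugin.xml declaring an extension of Summarizer.X_POINT_ID) so the searcher can discover it at runtime.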