/* * (c) Copyright 2008, 2009 Hewlett-Packard Development Company, LP * All rights reserved. * [See end of file] */ package search; import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; public class SearchWikipedia extends SearchBase { // Unfinished. static String site = "en.wikipedia.org" ; // http://en.wikipedia.org/wiki/API static String serviceURL = "http://%s/wiki/Special:Search?search=%s&fulltext=Search" ; // Non-greedy .* // FIXME (this is Google) for the search results page. private static Pattern pattern1 = Pattern.compile(".*?
  • ") ; @Override protected Iterator parse(InputStream in) { // Get bored - read it all in. String content = readAll(in, "UTF-8") ; return parse(content) ; } public static Iterator parse(String content) { Matcher matcher = pattern1.matcher(content) ; ArrayList links = new ArrayList(20) ; while(matcher.find()) { String link = matcher.group(1) ; link = "http://"+site+link ; String title = matcher.group(2) ; if ( ! links.contains(link) ) links.add(link) ; if ( links.size() > 9 ) break ; } return links.iterator() ; } private String url(String searchExpr) { return String.format(serviceURL, site, searchExpr) ; } @Override protected InputStream execGet(String str) { URL target = null ; try { String qs = url(str) ; target = new URL(qs) ; } catch (MalformedURLException malEx) { throw new SearchException("Malformed URL: "+malEx) ; } try { HttpURLConnection httpConnection = (HttpURLConnection) target.openConnection(); // By default, following 3xx redirects is true //conn.setFollowRedirects(true) ; httpConnection.setRequestProperty("Accept", "text/html") ; httpConnection.setRequestProperty("Accept-Charset", "utf-8") ; httpConnection.setRequestProperty("User-Agent", "Mozilla 9 // ARQ SPARQL Query Engine") ; httpConnection.setDoInput(true); httpConnection.connect(); return execCommon(httpConnection); } catch (java.net.ConnectException connEx) { throw new SearchException("Failed to connect to remote server"); } catch (IOException ioEx) { throw new SearchException(ioEx); } } } /* * (c) Copyright 2008, 2009 Hewlett-Packard Development Company, LP * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */