/*
* (c) Copyright 2008, 2009 Hewlett-Packard Development Company, LP
* All rights reserved.
* [See end of file]
*/
package search;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SearchWikipedia extends SearchBase
{
// Unfinished.
static String site = "en.wikipedia.org" ;
// http://en.wikipedia.org/wiki/API
static String serviceURL = "http://%s/wiki/Special:Search?search=%s&fulltext=Search" ;
// Non-greedy .*
// FIXME (this is Google) for the search results page.
private static Pattern pattern1 = Pattern.compile(".*?
") ;
@Override
protected Iterator parse(InputStream in)
{
// Get bored - read it all in.
String content = readAll(in, "UTF-8") ;
return parse(content) ;
}
public static Iterator parse(String content)
{
Matcher matcher = pattern1.matcher(content) ;
ArrayList links = new ArrayList(20) ;
while(matcher.find())
{
String link = matcher.group(1) ;
link = "http://"+site+link ;
String title = matcher.group(2) ;
if ( ! links.contains(link) )
links.add(link) ;
if ( links.size() > 9 )
break ;
}
return links.iterator() ;
}
private String url(String searchExpr)
{
return String.format(serviceURL, site, searchExpr) ;
}
@Override
protected InputStream execGet(String str)
{
URL target = null ;
try {
String qs = url(str) ;
target = new URL(qs) ;
}
catch (MalformedURLException malEx)
{ throw new SearchException("Malformed URL: "+malEx) ; }
try
{
HttpURLConnection httpConnection = (HttpURLConnection) target.openConnection();
// By default, following 3xx redirects is true
//conn.setFollowRedirects(true) ;
httpConnection.setRequestProperty("Accept", "text/html") ;
httpConnection.setRequestProperty("Accept-Charset", "utf-8") ;
httpConnection.setRequestProperty("User-Agent", "Mozilla 9 // ARQ SPARQL Query Engine") ;
httpConnection.setDoInput(true);
httpConnection.connect();
return execCommon(httpConnection);
}
catch (java.net.ConnectException connEx)
{ throw new SearchException("Failed to connect to remote server"); }
catch (IOException ioEx)
{ throw new SearchException(ioEx); }
}
}
/*
* (c) Copyright 2008, 2009 Hewlett-Packard Development Company, LP
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/