View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.jetspeed.search.handlers;
18  
19  // Java imports
20  import java.io.IOException;
21  import java.net.URL;
22  
23  import org.apache.commons.httpclient.HttpClient;
24  import org.apache.commons.httpclient.HttpException;
25  import org.apache.commons.httpclient.methods.GetMethod;
26  import org.apache.jetspeed.search.AbstractObjectHandler;
27  import org.apache.jetspeed.search.BaseParsedObject;
28  
29  /***
30   * This object handler deals with URLs.
31   * 
32   * @author <a href="mailto:morciuch@apache.org">Mark Orciuch</a>
33   * @version $Id: URLToDocHandler.java 516448 2007-03-09 16:25:47Z ate $
34   */
35  public class URLToDocHandler extends AbstractObjectHandler
36  {
37      /***
38       * Static initialization of the logger for this class
39       */    
40      //private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLToDocHandler.class.getName());
41      
42      /***
43       * Parses a specific object into a document suitable for index placement
44       * 
45       * @param o
46       * @return 
47       */
48      public org.apache.jetspeed.search.ParsedObject parseObject(Object o)
49      {
50          org.apache.jetspeed.search.ParsedObject result = new BaseParsedObject();
51  
52          if ((o instanceof URL) == false)
53          {
54              //logger.error("URLToDocHandler: invalid object type: " + o);
55              return null;
56          }
57  
58          URL pageToAdd = (URL) o;
59  
60          HttpClient client = new HttpClient();
61          GetMethod method = new GetMethod(pageToAdd.toString());
62          method.setFollowRedirects(true);
63          int statusCode = -1;
64          int attempt = 0;
65  
66          try
67          {
68              // We will retry up to 3 times.
69              while (statusCode == -1 && attempt < 3)
70              {
71                  try
72                  {
73                      // execute the method.
74                      client.executeMethod(method);
75                      statusCode = method.getStatusCode();
76                      //if (logger.isDebugEnabled())
77                      {
78                          //logger.debug("URL = " + pageToAdd.toString() + "Status code = " + statusCode);
79                      }
80                  }
81                  catch (HttpException e)
82                  {
83                      // We will retry
84                      attempt++;
85                  }
86                  catch (IOException e)
87                  {
88                      return null;
89                  }
90              }
91              // Check that we didn't run out of retries.
92              if (statusCode != -1)
93              {
94                  String content = null;
95                  try
96                  {
97                      content = method.getResponseBodyAsString();
98                  }
99                  catch (Exception ioe)
100                 {
101                     //logger.error("Getting content for " + pageToAdd.toString(), ioe);
102                 }
103 
104                 if (content != null)
105                 {
106                     try
107                     {
108                         result.setKey(java.net.URLEncoder.encode(pageToAdd.toString(),"UTF-8"));
109                         result.setType(org.apache.jetspeed.search.ParsedObject.OBJECT_TYPE_URL);
110                         // TODO: We should extract the <title> tag here.
111                         result.setTitle(pageToAdd.toString());
112                         result.setContent(content);
113                         result.setDescription("");
114                         result.setLanguage("");
115                         result.setURL(pageToAdd);
116                         result.setClassName(o.getClass().getName());
117                         //logger.info("Parsed '" + pageToAdd.toString() + "'");
118                     }
119                     catch (Exception e)
120                     {
121                         e.printStackTrace();
122                         //logger.error("Adding document to index", e);
123                     }
124                 }
125             }
126         }
127         finally
128         {
129             method.releaseConnection();
130         }
131 
132         return result;
133 
134     }
135 }
136