View Javadoc

1   package org.apache.maven.doxia.linkcheck.validation;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.IOException;
23  
24  import java.net.URL;
25  import java.util.Iterator;
26  import java.util.Map;
27  
28  import org.apache.commons.httpclient.Credentials;
29  import org.apache.commons.httpclient.Header;
30  import org.apache.commons.httpclient.HostConfiguration;
31  import org.apache.commons.httpclient.HttpClient;
32  import org.apache.commons.httpclient.HttpException;
33  import org.apache.commons.httpclient.HttpMethod;
34  import org.apache.commons.httpclient.HttpState;
35  import org.apache.commons.httpclient.HttpStatus;
36  import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
37  import org.apache.commons.httpclient.NTCredentials;
38  import org.apache.commons.httpclient.StatusLine;
39  import org.apache.commons.httpclient.UsernamePasswordCredentials;
40  import org.apache.commons.httpclient.auth.AuthScope;
41  import org.apache.commons.httpclient.methods.GetMethod;
42  import org.apache.commons.httpclient.methods.HeadMethod;
43  import org.apache.commons.httpclient.params.HttpClientParams;
44  import org.apache.commons.httpclient.params.HttpMethodParams;
45  
46  import org.apache.commons.logging.Log;
47  import org.apache.commons.logging.LogFactory;
48  import org.apache.maven.doxia.linkcheck.HttpBean;
49  import org.apache.maven.doxia.linkcheck.model.LinkcheckFileResult;
50  import org.codehaus.plexus.util.StringUtils;
51  
52  /**
53   * Checks links which are normal URLs
54   *
55   * @author <a href="mailto:bwalding@apache.org">Ben Walding</a>
56   * @author <a href="mailto:aheritier@apache.org">Arnaud Heritier</a>
57   * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
58   * @version $Id: OnlineHTTPLinkValidator.java 1030039 2010-11-02 13:33:03Z ltheussl $
59   */
60  public final class OnlineHTTPLinkValidator
61      extends HTTPLinkValidator
62  {
63      /** Log for debug output. */
64      private static final Log LOG = LogFactory.getLog( OnlineHTTPLinkValidator.class );
65  
66      /** The maximum number of redirections for a link. */
67      private static final int MAX_NB_REDIRECT = 10;
68  
69      /** Use the get method to test pages. */
70      private static final String GET_METHOD = "get";
71  
72      /** Use the head method to test pages. */
73      private static final String HEAD_METHOD = "head";
74  
75      /** The http bean encapsuling all http parameters supported. */
76      private HttpBean http;
77  
78      /** The base URL for links that start with '/'. */
79      private String baseURL;
80  
81      /** The HttpClient. */
82      private transient HttpClient cl;
83  
84      /**
85       * Constructor: initialize settings, use "head" method.
86       */
87      public OnlineHTTPLinkValidator()
88      {
89          this( new HttpBean() );
90      }
91  
92      /**
93       * Constructor: initialize settings.
94       *
95       * @param bean The http bean encapsuling all HTTP parameters supported.
96       */
97      public OnlineHTTPLinkValidator( HttpBean bean )
98      {
99          if ( bean == null )
100         {
101             bean = new HttpBean();
102         }
103 
104         if ( LOG.isDebugEnabled() )
105         {
106             LOG.debug( "Will use method : [" + bean.getMethod() + "]" );
107         }
108 
109         this.http = bean;
110 
111         initHttpClient();
112     }
113 
114     /**
115      * The base URL.
116      *
117      * @return the base URL.
118      */
119     public String getBaseURL()
120     {
121         return this.baseURL;
122     }
123 
124     /**
125      * Sets the base URL. This is pre-pended to links that start with '/'.
126      *
127      * @param url the base URL.
128      */
129     public void setBaseURL( String url )
130     {
131         this.baseURL = url;
132     }
133 
134     /** {@inheritDoc} */
135     public LinkValidationResult validateLink( LinkValidationItem lvi )
136     {
137         if ( this.cl == null )
138         {
139             initHttpClient();
140         }
141 
142         if ( this.http.getHttpClientParameters() != null )
143         {
144             for ( Iterator it = this.http.getHttpClientParameters().entrySet().iterator(); it.hasNext(); )
145             {
146                 Map.Entry entry = (Map.Entry) it.next();
147 
148                 if ( entry.getValue() != null )
149                 {
150                     System.setProperty( entry.getKey().toString(), entry.getValue().toString() );
151                 }
152             }
153         }
154 
155         // Some web servers don't allow the default user-agent sent by httpClient
156         System.setProperty( HttpMethodParams.USER_AGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)" );
157         this.cl.getParams().setParameter( HttpMethodParams.USER_AGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)" );
158 
159         String link = lvi.getLink();
160         String anchor = "";
161         int idx = link.indexOf( '#' );
162         if ( idx != -1 )
163         {
164             anchor = link.substring( idx + 1 );
165             link = link.substring( 0, idx );
166         }
167 
168         try
169         {
170             if ( link.startsWith( "/" ) )
171             {
172                 if ( getBaseURL() == null )
173                 {
174                     if ( LOG.isWarnEnabled() )
175                     {
176                         LOG.warn( "Cannot check link [" + link + "] in page [" + lvi.getSource()
177                             + "], as no base URL has been set!" );
178                     }
179 
180                     return new LinkValidationResult( LinkcheckFileResult.WARNING_LEVEL, false,
181                                                      "No base URL specified" );
182                 }
183 
184                 link = getBaseURL() + link;
185             }
186 
187             HttpMethod hm = null;
188             try
189             {
190                 hm = checkLink( link, 0 );
191             }
192             catch ( Throwable t )
193             {
194                 if ( LOG.isDebugEnabled() )
195                 {
196                     LOG.debug( "Received: [" + t + "] for [" + link + "] in page [" + lvi.getSource() + "]", t );
197                 }
198 
199                 return new LinkValidationResult( LinkcheckFileResult.ERROR_LEVEL, false, t.getClass().getName()
200                     + " : " + t.getMessage() );
201             }
202 
203             if ( hm == null )
204             {
205                 return new LinkValidationResult( LinkcheckFileResult.ERROR_LEVEL, false,
206                                                  "Cannot retreive HTTP Status" );
207             }
208 
209             if ( hm.getStatusCode() == HttpStatus.SC_OK )
210             {
211                 // lets check if the anchor is present
212                 if ( anchor.length() > 0 )
213                 {
214                     String content = hm.getResponseBodyAsString();
215 
216                     if ( !Anchors.matchesAnchor( content, anchor ) )
217                     {
218                         return new HTTPLinkValidationResult( LinkcheckFileResult.VALID_LEVEL, false,
219                             "Missing anchor '" + anchor + "'" );
220                     }
221                 }
222                 return new HTTPLinkValidationResult( LinkcheckFileResult.VALID_LEVEL, true, hm.getStatusCode(),
223                                                      hm.getStatusText() );
224             }
225 
226             String msg =
227                 "Received: [" + hm.getStatusCode() + "] for [" + link + "] in page [" + lvi.getSource() + "]";
228             // If there's a redirection ... add a warning
229             if ( hm.getStatusCode() == HttpStatus.SC_MOVED_PERMANENTLY
230                 || hm.getStatusCode() == HttpStatus.SC_MOVED_TEMPORARILY
231                 || hm.getStatusCode() == HttpStatus.SC_TEMPORARY_REDIRECT )
232             {
233                 LOG.warn( msg );
234 
235                 return new HTTPLinkValidationResult( LinkcheckFileResult.WARNING_LEVEL, true, hm.getStatusCode(),
236                                                      hm.getStatusText() );
237             }
238 
239             LOG.debug( msg );
240 
241             return new HTTPLinkValidationResult( LinkcheckFileResult.ERROR_LEVEL, false, hm.getStatusCode(),
242                                                  hm.getStatusText() );
243         }
244         catch ( Throwable t )
245         {
246             String msg = "Received: [" + t + "] for [" + link + "] in page [" + lvi.getSource() + "]";
247             if ( LOG.isDebugEnabled() )
248             {
249                 LOG.debug( msg, t );
250             }
251             else
252             {
253                 LOG.error( msg );
254             }
255 
256             return new LinkValidationResult( LinkcheckFileResult.ERROR_LEVEL, false, t.getMessage() );
257         }
258         finally
259         {
260             System.getProperties().remove( HttpMethodParams.USER_AGENT );
261 
262             if ( this.http.getHttpClientParameters() != null )
263             {
264                 for ( Iterator it = this.http.getHttpClientParameters().entrySet().iterator(); it.hasNext(); )
265                 {
266                     Map.Entry entry = (Map.Entry) it.next();
267 
268                     if ( entry.getValue() != null )
269                     {
270                         System.getProperties().remove( entry.getKey().toString() );
271                     }
272                 }
273             }
274         }
275     }
276 
277     /** Initialize the HttpClient. */
278     private void initHttpClient()
279     {
280         LOG.debug( "A new HttpClient instance is needed ..." );
281 
282         this.cl = new HttpClient( new MultiThreadedHttpConnectionManager() );
283 
284         // Default params
285         if ( this.http.getTimeout() != 0 )
286         {
287             this.cl.getHttpConnectionManager().getParams().setConnectionTimeout( this.http.getTimeout() );
288             this.cl.getHttpConnectionManager().getParams().setSoTimeout( this.http.getTimeout() );
289         }
290         this.cl.getParams().setBooleanParameter( HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, true );
291 
292         HostConfiguration hc = new HostConfiguration();
293 
294         HttpState state = new HttpState();
295         if ( StringUtils.isNotEmpty( this.http.getProxyHost() ) )
296         {
297             hc.setProxy( this.http.getProxyHost(), this.http.getProxyPort() );
298 
299             if ( LOG.isDebugEnabled() )
300             {
301                 LOG.debug( "Proxy Host:" + this.http.getProxyHost() );
302                 LOG.debug( "Proxy Port:" + this.http.getProxyPort() );
303             }
304 
305             if ( StringUtils.isNotEmpty( this.http.getProxyUser() ) && this.http.getProxyPassword() != null )
306             {
307                 if ( LOG.isDebugEnabled() )
308                 {
309                     LOG.debug( "Proxy User:" + this.http.getProxyUser() );
310                 }
311 
312                 Credentials credentials;
313                 if ( StringUtils.isNotEmpty( this.http.getProxyNtlmHost() ) )
314                 {
315                     credentials =
316                         new NTCredentials( this.http.getProxyUser(), this.http.getProxyPassword(),
317                                            this.http.getProxyNtlmHost(), this.http.getProxyNtlmDomain() );
318                 }
319                 else
320                 {
321                     credentials =
322                         new UsernamePasswordCredentials( this.http.getProxyUser(), this.http.getProxyPassword() );
323                 }
324 
325                 state.setProxyCredentials( AuthScope.ANY, credentials );
326             }
327         }
328         else
329         {
330             LOG.debug( "Not using a proxy" );
331         }
332 
333         this.cl.setHostConfiguration( hc );
334         this.cl.setState( state );
335 
336         LOG.debug( "New HttpClient instance created." );
337     }
338 
339     /**
340      * Checks the given link.
341      *
342      * @param link the link to check.
343      * @param nbRedirect the number of current redirects.
344      * @return HttpMethod
345      * @throws IOException if something goes wrong.
346      */
347     private HttpMethod checkLink( String link, int nbRedirect )
348         throws IOException
349     {
350         int max = MAX_NB_REDIRECT;
351         if ( this.http.getHttpClientParameters() != null
352             && this.http.getHttpClientParameters().get( HttpClientParams.MAX_REDIRECTS ) != null )
353         {
354             try
355             {
356                 max =
357                     Integer.valueOf(
358                                      this.http.getHttpClientParameters().get( HttpClientParams.MAX_REDIRECTS )
359                                               .toString() ).intValue();
360             }
361             catch ( NumberFormatException e )
362             {
363                 if ( LOG.isWarnEnabled() )
364                 {
365                     LOG.warn( "HttpClient parameter '" + HttpClientParams.MAX_REDIRECTS
366                         + "' is not a number. Ignoring!" );
367                 }
368             }
369         }
370         if ( nbRedirect > max )
371         {
372             throw new HttpException( "Maximum number of redirections (" + max + ") exceeded" );
373         }
374 
375         HttpMethod hm;
376         if ( HEAD_METHOD.equalsIgnoreCase( this.http.getMethod() ) )
377         {
378             hm = new HeadMethod( link );
379         }
380         else if ( GET_METHOD.equalsIgnoreCase( this.http.getMethod() ) )
381         {
382             hm = new GetMethod( link );
383         }
384         else
385         {
386             if ( LOG.isErrorEnabled() )
387             {
388                 LOG.error( "Unsupported method: " + this.http.getMethod() + ", using 'get'." );
389             }
390             hm = new GetMethod( link );
391         }
392 
393         // Default
394         hm.setFollowRedirects( this.http.isFollowRedirects() );
395 
396         try
397         {
398             URL url = new URL( link );
399 
400             cl.getHostConfiguration().setHost( url.getHost(), url.getPort(), url.getProtocol() );
401 
402             cl.executeMethod( hm );
403 
404             StatusLine sl = hm.getStatusLine();
405             if ( sl == null )
406             {
407                 if ( LOG.isErrorEnabled() )
408                 {
409                     LOG.error( "Unknown error validating link : " + link );
410                 }
411 
412                 return null;
413             }
414 
415             if ( hm.getStatusCode() == HttpStatus.SC_MOVED_PERMANENTLY
416                 || hm.getStatusCode() == HttpStatus.SC_MOVED_TEMPORARILY
417                 || hm.getStatusCode() == HttpStatus.SC_TEMPORARY_REDIRECT )
418             {
419                 Header locationHeader = hm.getResponseHeader( "location" );
420 
421                 if ( locationHeader == null )
422                 {
423                     LOG.error( "Site sent redirect, but did not set Location header" );
424 
425                     return hm;
426                 }
427 
428                 String newLink = locationHeader.getValue();
429 
430                 // Be careful to absolute/relative links
431                 if ( !newLink.startsWith( "http://" ) && !newLink.startsWith( "https://" ) )
432                 {
433                     if ( newLink.startsWith( "/" ) )
434                     {
435                         URL oldUrl = new URL( link );
436 
437                         newLink =
438                             oldUrl.getProtocol() + "://" + oldUrl.getHost()
439                                 + ( oldUrl.getPort() > 0 ? ":" + oldUrl.getPort() : "" ) + newLink;
440                     }
441                     else
442                     {
443                         newLink = link + newLink;
444                     }
445                 }
446 
447                 HttpMethod oldHm = hm;
448 
449                 if ( LOG.isDebugEnabled() )
450                 {
451                     LOG.debug( "[" + link + "] is redirected to [" + newLink + "]" );
452                 }
453 
454                 oldHm.releaseConnection();
455 
456                 hm = checkLink( newLink, nbRedirect + 1 );
457 
458                 // Restore the hm to "Moved permanently" | "Moved temporarily" | "Temporary redirect"
459                 // if the new location is found to allow us to report it
460                 if ( hm.getStatusCode() == HttpStatus.SC_OK && nbRedirect == 0 )
461                 {
462                     return oldHm;
463                 }
464             }
465 
466         }
467         finally
468         {
469             hm.releaseConnection();
470         }
471 
472         return hm;
473     }
474 }