View Javadoc

1   package org.apache.maven.doxia.linkcheck;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.File;
23  import java.io.IOException;
24  import java.io.Reader;
25  import java.util.Locale;
26  import java.util.Set;
27  import java.util.TreeSet;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.codehaus.plexus.util.IOUtil;
32  import org.codehaus.plexus.util.ReaderFactory;
33  
34  /**
35   * Link matcher. Reads the contents of a file and tries to match the following:
36   * <pre>
37   * &lt;a href="".../&gt;
38   * &lt;link href="".../&gt;
39   * &lt;img src="".../&gt;
40   * &lt;script src="".../&gt;
41   * </pre>
42   *
43   * @author <a href="mailto:mac@apache.org">Ignacio G. Mac Dowell </a>
44   * @version $Id: LinkMatcher.java 800044 2009-08-02 12:28:50Z vsiveton $
45   */
46  class LinkMatcher
47  {
48      /** Regexp for link matching. */
49      private static final Pattern MATCH_PATTERN =
50          Pattern.compile( "<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?",
51                           Pattern.CASE_INSENSITIVE );
52  
53      /** No need to create a new object each time a file is processed. Just clear it. */
54      private static final Set LINK_LIST = new TreeSet();
55  
56      private LinkMatcher()
57      {
58          // nop
59      }
60  
61      /**
62       * Reads a file and returns its contents without any XML comments.
63       *
64       * @param file the file we are reading
65       * @param encoding the encoding file used
66       * @return a StringBuffer with file's contents.
67       * @throws IOException if something goes wrong.
68       * @see ReaderFactory#newReader(File, String)
69       * @see IOUtil#toString(Reader)
70       */
71      private static String toString( File file, String encoding )
72          throws IOException
73      {
74          String content;
75          Reader reader = null;
76          try
77          {
78              reader = ReaderFactory.newReader( file, encoding );
79  
80              content = IOUtil.toString( reader );
81          }
82          finally
83          {
84              IOUtil.close( reader );
85          }
86  
87          // some link could be in comments, remove them
88          return content.replaceAll( "(?s)<!--.*?-->", "" );
89      }
90  
91      /**
92       * Performs the actual matching.
93       *
94       * @param file the file to check
95       * @param encoding the encoding file used
96       * @return a set with all links to check
97       * @throws IOException if something goes wrong
98       */
99      static Set match( File file, String encoding )
100         throws IOException
101     {
102         LINK_LIST.clear();
103 
104         final Matcher m = MATCH_PATTERN.matcher( toString( file, encoding ) );
105 
106         String link;
107 
108         while ( m.find() )
109         {
110             link = m.group( 1 ).trim();
111 
112             if ( link.length() < 1 )
113             {
114                 continue;
115             }
116             else if ( link.toLowerCase( Locale.ENGLISH ).indexOf( "javascript" ) != -1 )
117             {
118                 continue;
119             }
120             // TODO: Review dead code and delete if not needed
121             // else if (link.toLowerCase( Locale.ENGLISH ).indexOf("mailto:") != -1) {
122             // continue;
123             // }
124 
125             LINK_LIST.add( link );
126         }
127 
128         return LINK_LIST;
129     }
130 }