1 package org.apache.maven.doxia.linkcheck; 2 3 /* 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, 15 * software distributed under the License is distributed on an 16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 * KIND, either express or implied. See the License for the 18 * specific language governing permissions and limitations 19 * under the License. 20 */ 21 22 import java.io.File; 23 import java.io.IOException; 24 import java.io.Reader; 25 import java.util.Locale; 26 import java.util.Set; 27 import java.util.TreeSet; 28 import java.util.regex.Matcher; 29 import java.util.regex.Pattern; 30 31 import org.codehaus.plexus.util.IOUtil; 32 import org.codehaus.plexus.util.ReaderFactory; 33 34 /** 35 * Link matcher. Reads the contents of a file and tries to match the following: 36 * <pre> 37 * <a href="".../> 38 * <link href="".../> 39 * <img src="".../> 40 * <script src="".../> 41 * </pre> 42 * 43 * @author <a href="mailto:mac@apache.org">Ignacio G. Mac Dowell </a> 44 * @version $Id: LinkMatcher.java 800044 2009-08-02 12:28:50Z vsiveton $ 45 */ 46 class LinkMatcher 47 { 48 /** Regexp for link matching. */ 49 private static final Pattern MATCH_PATTERN = 50 Pattern.compile( "<(?>link|a|img|script)[^>]*?(?>href|src)\\s*?=\\s*?[\\\"'](.*?)[\\\"'][^>]*?", 51 Pattern.CASE_INSENSITIVE ); 52 53 /** No need to create a new object each time a file is processed. Just clear it. */ 54 private static final Set LINK_LIST = new TreeSet(); 55 56 private LinkMatcher() 57 { 58 // nop 59 } 60 61 /** 62 * Reads a file and returns its contents without any XML comments. 63 * 64 * @param file the file we are reading 65 * @param encoding the encoding file used 66 * @return a StringBuffer with file's contents. 67 * @throws IOException if something goes wrong. 68 * @see ReaderFactory#newReader(File, String) 69 * @see IOUtil#toString(Reader) 70 */ 71 private static String toString( File file, String encoding ) 72 throws IOException 73 { 74 String content; 75 Reader reader = null; 76 try 77 { 78 reader = ReaderFactory.newReader( file, encoding ); 79 80 content = IOUtil.toString( reader ); 81 } 82 finally 83 { 84 IOUtil.close( reader ); 85 } 86 87 // some link could be in comments, remove them 88 return content.replaceAll( "(?s)<!--.*?-->", "" ); 89 } 90 91 /** 92 * Performs the actual matching. 93 * 94 * @param file the file to check 95 * @param encoding the encoding file used 96 * @return a set with all links to check 97 * @throws IOException if something goes wrong 98 */ 99 static Set match( File file, String encoding ) 100 throws IOException 101 { 102 LINK_LIST.clear(); 103 104 final Matcher m = MATCH_PATTERN.matcher( toString( file, encoding ) ); 105 106 String link; 107 108 while ( m.find() ) 109 { 110 link = m.group( 1 ).trim(); 111 112 if ( link.length() < 1 ) 113 { 114 continue; 115 } 116 else if ( link.toLowerCase( Locale.ENGLISH ).indexOf( "javascript" ) != -1 ) 117 { 118 continue; 119 } 120 // TODO: Review dead code and delete if not needed 121 // else if (link.toLowerCase( Locale.ENGLISH ).indexOf("mailto:") != -1) { 122 // continue; 123 // } 124 125 LINK_LIST.add( link ); 126 } 127 128 return LINK_LIST; 129 } 130 }