001package org.apache.maven.doxia.module.twiki.parser;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
027
028/**
029 * Parse almost plain text in search of WikiWords, links, ...
030 *
031 * @author Juan F. Codagnone
032 * @version $Id$
033 */
034public class TextParser
035{
036    /**
037     * pattern to detect WikiWords
038     */
039    private static final Pattern WIKIWORD_PATTERN =
040        Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" );
041
042    /**
043     * pattern to detect SpecificLinks links [[reference][text]]
044     */
045    private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" );
046
047    /**
048     * pattern to detect ForcedLinks links [[reference asd]]
049     */
050    private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" );
051
052    /**
053     * anchor name
054     */
055    private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" );
056
057    /**
058     * url word
059     */
060    private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" );
061
062    /**
063     *  image pattern specification
064     */
065    private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" );
066
067    /**
068     *  image tag pattern specification (used for images at relative URLs)
069     */
070    private static final Pattern IMAGE_TAG_PATTERN =
071        Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE );
072
073    /** HTML tag pattern */
074    private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL );
075
076    /**
077     * resolves wikiWordLinks
078     */
079    private final WikiWordLinkResolver wikiWordLinkResolver;
080
081    /** resolves noautolink tag */
082    private boolean noautolink;
083
084    /**
085     * Creates the TextParser.
086     *
087     * @param resolver resolver for wikiWord links
088     */
089    public TextParser( final WikiWordLinkResolver resolver )
090    {
091        this.wikiWordLinkResolver = resolver;
092    }
093
094    /**
095     * <p>parse.</p>
096     *
097     * @param line line to parse
098     * @return a list of block that represents the input
099     */
100    public final List<Block> parse( final String line )
101    {
102        final List<Block> ret = new ArrayList<Block>();
103
104        final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line );
105        final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line );
106        final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line );
107        final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line );
108        final Matcher urlMatcher = URL_PATTERN.matcher( line );
109        final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line );
110
111        final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line );
112        Matcher xhtmlMatcher = null;
113        if ( tagMatcher.find() )
114        {
115            String tag = tagMatcher.group( 2 );
116
117            Pattern pattern =
118                Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL );
119            xhtmlMatcher = pattern.matcher( line );
120        }
121
122        if ( xhtmlMatcher != null && xhtmlMatcher.find() )
123        {
124            parseXHTML( line, ret, xhtmlMatcher );
125        }
126        else if ( linkMatcher.find() )
127        {
128            parseLink( line, ret, linkMatcher );
129        }
130        else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink )
131        {
132            parseWiki( line, ret, wikiMatcher );
133        }
134        else if ( forcedLinkMatcher.find() )
135        {
136            parseForcedLink( line, ret, forcedLinkMatcher );
137        }
138        else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) )
139        {
140            parseAnchor( line, ret, anchorMatcher );
141        }
142        else if ( urlMatcher.find() && isAWord( urlMatcher, line ) )
143        {
144            parseUrl( line, ret, urlMatcher );
145        }
146        else if ( imageTagMatcher.find() )
147        {
148            parseImage( line, ret, imageTagMatcher );
149        }
150        else
151        {
152            if ( line.length() != 0 )
153            {
154                ret.add( new TextBlock( line ) );
155            }
156        }
157
158        return ret;
159    }
160
161    /**
162     * Parses the image tag
163     * @param line the line to parse
164     * @param ret where the results live
165     * @param imageTagMatcher image tag matcher
166     */
167    private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher )
168    {
169        ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) );
170        final String src = imageTagMatcher.group( 2 );
171        ret.add( new ImageBlock( src ) );
172        ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) );
173    }
174
175    /**
176     * Parses the url
177     * @param line the line to parse
178     * @param ret where the results live
179     * @param urlMatcher url matcher
180     */
181    private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher )
182    {
183        ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) );
184        final String url = urlMatcher.group( 0 );
185        final Matcher imageMatcher = IMAGE_PATTERN.matcher( url );
186        if ( imageMatcher.matches() )
187        {
188            ret.add( new ImageBlock( url ) );
189        }
190        else
191        {
192            ret.add( new LinkBlock( url, new TextBlock( url ) ) );
193        }
194        ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) );
195    }
196
197    /**
198     * Parses the anchor
199     * @param line the line to parse
200     * @param ret where the results live
201     * @param anchorMatcher anchor matcher
202     */
203    private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher )
204    {
205        ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) );
206        ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) );
207        ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) );
208    }
209
210    /**
211     * Parses the link
212     * @param line line to parse
213     * @param ret where the results live
214     * @param forcedLinkMatcher forced link matcher
215     */
216    private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher )
217    {
218        if ( forcedLinkMatcher.group( 1 ) != null )
219        {
220            ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) );
221        }
222        else
223        {
224            final String showText = forcedLinkMatcher.group( 3 );
225            // mailto link:
226            if ( showText.trim().startsWith( "mailto:" ) )
227            {
228                String s = showText.trim();
229                int i = s.indexOf( ' ' );
230                if ( i == -1 )
231                {
232                    ret.add( new TextBlock( s ) );
233                }
234                else
235                {
236                    ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) );
237                }
238            }
239            else
240            {
241                ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) );
242                ret.add( createLink( showText, showText ) );
243                ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) );
244            }
245        }
246    }
247
248    /**
249     * Decides between a WikiWordBlock or a a LinkBlock
250     * @param link the link text
251     * @param showText the show text.
252     * @return either a WikiWordBlock or a LinkBlock
253     */
254    private Block createLink( final String link, final String showText )
255    {
256        final Block content;
257        if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() )
258        {
259            content = new ImageBlock( showText );
260        }
261        else
262        {
263            content = new TextBlock( showText );
264        }
265
266        if ( URL_PATTERN.matcher( link ).matches() )
267        {
268            return new LinkBlock( link, content );
269        }
270
271        final StringTokenizer tokenizer = new StringTokenizer( link );
272        final StringBuilder sb = new StringBuilder();
273
274        while ( tokenizer.hasMoreElements() )
275        {
276            final String s = tokenizer.nextToken();
277            sb.append( s.substring( 0, 1 ).toUpperCase() );
278            sb.append( s.substring( 1 ) );
279        }
280        return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver );
281    }
282
283    /**
284     * Parses a wiki word
285     * @param line the line to parse
286     * @param ret where the results live
287     * @param wikiMatcher wiki matcher
288     */
289    private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher )
290    {
291        final String wikiWord = wikiMatcher.group();
292        ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) );
293        if ( wikiWord.startsWith( "!" ) )
294        { // link prevention
295            ret.add( new TextBlock( wikiWord.substring( 1 ) ) );
296        }
297        else
298        {
299            ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) );
300        }
301        ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) );
302    }
303
304    /**
305     * Parses a link
306     * @param line the line to parse
307     * @param ret where the results live
308     * @param linkMatcher link matcher
309     */
310    private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher )
311    {
312        ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) );
313        if ( line.charAt( linkMatcher.start() ) == '!' )
314        {
315            ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) );
316        }
317        else
318        {
319            ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) );
320        }
321        ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) );
322    }
323
324    /**
325     * Parses xhtml.
326     *
327     * @param line the line to parse
328     * @param ret where the results live
329     * @param xhtmlMatcher xhtml matcher
330     */
331    private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher )
332    {
333        ret.addAll( parse( line.substring( 0, xhtmlMatcher.start() ) ) );
334        if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
335        {
336            noautolink = true;
337        }
338        else
339        {
340            ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) );
341        }
342
343        ret.addAll( parse( xhtmlMatcher.group( 2 ) ) );
344
345        if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
346        {
347            noautolink = false;
348        }
349        else
350        {
351            ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) );
352        }
353
354        ret.addAll( parse( xhtmlMatcher.group( 4 ) ) );
355    }
356
357    /**
358     * @param m    matcher to test
359     * @param line line to test
360     * @return <code>true</code> if the match on m represent a word (must be
361     *         a space before the word or must be the beginning of the line)
362     */
363    private boolean isAWord( final Matcher m, final String line )
364    {
365        return startLikeWord( m, line ) && endLikeWord( m, line );
366    }
367
368    /**
369     * @param m matcher to test
370     * @param line line to test
371     * @return true if it is the beginning of a word
372     */
373    private boolean startLikeWord( final Matcher m, final String line )
374    {
375        final int start = m.start();
376
377        boolean ret = false;
378        if ( start == 0 )
379        {
380            ret = true;
381        }
382        else if ( start > 0 )
383        {
384            if ( isSpace( line.charAt( start - 1 ) ) )
385            {
386                ret = true;
387            }
388        }
389
390        return ret;
391    }
392
393    /**
394     * @param m matcher to test
395     * @param line line to test
396     * @return true if it is the end of a word
397     */
398    private boolean endLikeWord( final Matcher m, final String line )
399    {
400        final int end = m.end();
401
402        boolean ret = true;
403        if ( end < line.length() )
404        {
405            ret = isSpace( line.charAt( end ) );
406        }
407
408        return ret;
409    }
410
411    /**
412     * @param c char to test
413     * @return <code>true</code> if c is a space char
414     */
415    private boolean isSpace( final char c )
416    {
417        return c == ' ' || c == '\t';
418    }
419}