001package org.apache.maven.doxia.module.twiki.parser;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import java.util.ArrayList;
023import java.util.List;
024import java.util.StringTokenizer;
025import java.util.regex.Matcher;
026import java.util.regex.Pattern;
027
028/**
029 * Parse almost plain text in search of WikiWords, links, ...
030 *
031 * @author Juan F. Codagnone
032 */
033public class TextParser
034{
035    /**
036     * pattern to detect WikiWords
037     */
038    private static final Pattern WIKIWORD_PATTERN =
039        Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" );
040
041    /**
042     * pattern to detect SpecificLinks links [[reference][text]]
043     */
044    private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" );
045
046    /**
047     * pattern to detect ForcedLinks links [[reference asd]]
048     */
049    private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" );
050
051    /**
052     * anchor name
053     */
054    private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" );
055
056    /**
057     * url word
058     */
059    private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" );
060
061    /**
062     *  image pattern specification
063     */
064    private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" );
065
066    /**
067     *  image tag pattern specification (used for images at relative URLs)
068     */
069    private static final Pattern IMAGE_TAG_PATTERN =
070        Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE );
071
072    /** HTML tag pattern */
073    private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL );
074
075    /**
076     * resolves wikiWordLinks
077     */
078    private final WikiWordLinkResolver wikiWordLinkResolver;
079
080    /** resolves noautolink tag */
081    private boolean noautolink;
082
083    /**
084     * Creates the TextParser.
085     *
086     * @param resolver resolver for wikiWord links
087     */
088    public TextParser( final WikiWordLinkResolver resolver )
089    {
090        this.wikiWordLinkResolver = resolver;
091    }
092
093    /**
094     * <p>parse.</p>
095     *
096     * @param line line to parse
097     * @return a list of block that represents the input
098     */
099    public final List<Block> parse( final String line )
100    {
101        final List<Block> ret = new ArrayList<>();
102
103        final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line );
104        final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line );
105        final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line );
106        final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line );
107        final Matcher urlMatcher = URL_PATTERN.matcher( line );
108        final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line );
109
110        final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line );
111        Matcher xhtmlMatcher = null;
112        if ( tagMatcher.find() )
113        {
114            String tag = tagMatcher.group( 2 );
115
116            Pattern pattern =
117                Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL );
118            xhtmlMatcher = pattern.matcher( line );
119        }
120
121        if ( xhtmlMatcher != null && xhtmlMatcher.find() )
122        {
123            parseXHTML( line, ret, xhtmlMatcher );
124        }
125        else if ( linkMatcher.find() )
126        {
127            parseLink( line, ret, linkMatcher );
128        }
129        else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink )
130        {
131            parseWiki( line, ret, wikiMatcher );
132        }
133        else if ( forcedLinkMatcher.find() )
134        {
135            parseForcedLink( line, ret, forcedLinkMatcher );
136        }
137        else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) )
138        {
139            parseAnchor( line, ret, anchorMatcher );
140        }
141        else if ( urlMatcher.find() && isAWord( urlMatcher, line ) )
142        {
143            parseUrl( line, ret, urlMatcher );
144        }
145        else if ( imageTagMatcher.find() )
146        {
147            parseImage( line, ret, imageTagMatcher );
148        }
149        else
150        {
151            if ( line.length() != 0 )
152            {
153                ret.add( new TextBlock( line ) );
154            }
155        }
156
157        return ret;
158    }
159
160    /**
161     * Parses the image tag
162     * @param line the line to parse
163     * @param ret where the results live
164     * @param imageTagMatcher image tag matcher
165     */
166    private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher )
167    {
168        ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) );
169        final String src = imageTagMatcher.group( 2 );
170        ret.add( new ImageBlock( src ) );
171        ret.addAll( parse( line.substring( imageTagMatcher.end() ) ) );
172    }
173
174    /**
175     * Parses the url
176     * @param line the line to parse
177     * @param ret where the results live
178     * @param urlMatcher url matcher
179     */
180    private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher )
181    {
182        ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) );
183        final String url = urlMatcher.group( 0 );
184        final Matcher imageMatcher = IMAGE_PATTERN.matcher( url );
185        if ( imageMatcher.matches() )
186        {
187            ret.add( new ImageBlock( url ) );
188        }
189        else
190        {
191            ret.add( new LinkBlock( url, new TextBlock( url ) ) );
192        }
193        ret.addAll( parse( line.substring( urlMatcher.end() ) ) );
194    }
195
196    /**
197     * Parses the anchor
198     * @param line the line to parse
199     * @param ret where the results live
200     * @param anchorMatcher anchor matcher
201     */
202    private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher )
203    {
204        ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) );
205        ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) );
206        ret.addAll( parse( line.substring( anchorMatcher.end() ) ) );
207    }
208
209    /**
210     * Parses the link
211     * @param line line to parse
212     * @param ret where the results live
213     * @param forcedLinkMatcher forced link matcher
214     */
215    private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher )
216    {
217        if ( forcedLinkMatcher.group( 1 ) != null )
218        {
219            ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) );
220        }
221        else
222        {
223            final String showText = forcedLinkMatcher.group( 3 );
224            // mailto link:
225            if ( showText.trim().startsWith( "mailto:" ) )
226            {
227                String s = showText.trim();
228                int i = s.indexOf( ' ' );
229                if ( i == -1 )
230                {
231                    ret.add( new TextBlock( s ) );
232                }
233                else
234                {
235                    ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) );
236                }
237            }
238            else
239            {
240                ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) );
241                ret.add( createLink( showText, showText ) );
242                ret.addAll( parse( line.substring( forcedLinkMatcher.end() ) ) );
243            }
244        }
245    }
246
247    /**
248     * Decides between a WikiWordBlock or a a LinkBlock
249     * @param link the link text
250     * @param showText the show text.
251     * @return either a WikiWordBlock or a LinkBlock
252     */
253    private Block createLink( final String link, final String showText )
254    {
255        final Block content;
256        if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() )
257        {
258            content = new ImageBlock( showText );
259        }
260        else
261        {
262            content = new TextBlock( showText );
263        }
264
265        if ( URL_PATTERN.matcher( link ).matches() )
266        {
267            return new LinkBlock( link, content );
268        }
269
270        final StringTokenizer tokenizer = new StringTokenizer( link );
271        final StringBuilder sb = new StringBuilder();
272
273        while ( tokenizer.hasMoreElements() )
274        {
275            final String s = tokenizer.nextToken();
276            sb.append( s.substring( 0, 1 ).toUpperCase() );
277            sb.append( s.substring( 1 ) );
278        }
279        return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver );
280    }
281
282    /**
283     * Parses a wiki word
284     * @param line the line to parse
285     * @param ret where the results live
286     * @param wikiMatcher wiki matcher
287     */
288    private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher )
289    {
290        final String wikiWord = wikiMatcher.group();
291        ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) );
292        if ( wikiWord.startsWith( "!" ) )
293        { // link prevention
294            ret.add( new TextBlock( wikiWord.substring( 1 ) ) );
295        }
296        else
297        {
298            ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) );
299        }
300        ret.addAll( parse( line.substring( wikiMatcher.end() ) ) );
301    }
302
303    /**
304     * Parses a link
305     * @param line the line to parse
306     * @param ret where the results live
307     * @param linkMatcher link matcher
308     */
309    private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher )
310    {
311        ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) );
312        if ( line.charAt( linkMatcher.start() ) == '!' )
313        {
314            ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) );
315        }
316        else
317        {
318            ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) );
319        }
320        ret.addAll( parse( line.substring( linkMatcher.end() ) ) );
321    }
322
323    /**
324     * Parses xhtml.
325     *
326     * @param line the line to parse
327     * @param ret where the results live
328     * @param xhtmlMatcher xhtml matcher
329     */
330    private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher )
331    {
332        ret.addAll( parse( line.substring( 0, xhtmlMatcher.start() ) ) );
333        if ( xhtmlMatcher.group( 1 ).contains( "noautolink" ) )
334        {
335            noautolink = true;
336        }
337        else
338        {
339            ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) );
340        }
341
342        ret.addAll( parse( xhtmlMatcher.group( 2 ) ) );
343
344        if ( xhtmlMatcher.group( 1 ).contains( "noautolink" ) )
345        {
346            noautolink = false;
347        }
348        else
349        {
350            ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) );
351        }
352
353        ret.addAll( parse( xhtmlMatcher.group( 4 ) ) );
354    }
355
356    /**
357     * @param m    matcher to test
358     * @param line line to test
359     * @return <code>true</code> if the match on m represent a word (must be
360     *         a space before the word or must be the beginning of the line)
361     */
362    private boolean isAWord( final Matcher m, final String line )
363    {
364        return startLikeWord( m, line ) && endLikeWord( m, line );
365    }
366
367    /**
368     * @param m matcher to test
369     * @param line line to test
370     * @return true if it is the beginning of a word
371     */
372    private boolean startLikeWord( final Matcher m, final String line )
373    {
374        final int start = m.start();
375
376        boolean ret = false;
377        if ( start == 0 )
378        {
379            ret = true;
380        }
381        else if ( start > 0 )
382        {
383            if ( isSpace( line.charAt( start - 1 ) ) )
384            {
385                ret = true;
386            }
387        }
388
389        return ret;
390    }
391
392    /**
393     * @param m matcher to test
394     * @param line line to test
395     * @return true if it is the end of a word
396     */
397    private boolean endLikeWord( final Matcher m, final String line )
398    {
399        final int end = m.end();
400
401        boolean ret = true;
402        if ( end < line.length() )
403        {
404            ret = isSpace( line.charAt( end ) );
405        }
406
407        return ret;
408    }
409
410    /**
411     * @param c char to test
412     * @return <code>true</code> if c is a space char
413     */
414    private boolean isSpace( final char c )
415    {
416        return c == ' ' || c == '\t';
417    }
418}