View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.myfaces.renderkit.html.util;
20  
21  import org.apache.commons.logging.Log;
22  import org.apache.commons.logging.LogFactory;
23  
24  /**
25   * A class which detects the open/close tags in an HTML document and reports
26   * them to a listener class.
27   * <p>
28   * This is unfortunately necessary when using JSF with JSP, as tags in the body
29   * of the document can need to output commands into the document at points
30   * earlier than the tag occurred (particularly into the document HEAD section).
31   * This can only be implemented by buffering the response and post-processing
32   * it to find the relevant HTML tags and modifying the buffer as needed.
33   * <p>
34   * This class tries to do the parsing as quickly as possible; many of the
35   * details of HTML are not relevant for the purposes this class is used for.
36   * 
37   * @version $Revision: 673833 $ $Date: 2008-07-03 16:58:05 -0500 (Thu, 03 Jul 2008) $
38   */
39  public class ReducedHTMLParser
40  {
41      // IMPLEMENTATION NOTE:
42      //
43      // Many of the methods on this class are package-scope. This is intended
44      // solely for the purpose of unit-testing. This class does not expect
45      // other classes in this package to access its methods.
46  
47      private static final Log log = LogFactory.getLog(ReducedHTMLParser.class);
48  
49      public static final int BODY_TAG = 0;
50      public static final int HEAD_TAG = 1;
51      public static final int SCRIPT_TAG = 2;
52  
53      private static final int STATE_READY = 0;
54      private static final int STATE_IN_COMMENT = 1;
55      private static final int STATE_IN_TAG = 2;
56      private static final int STATE_IN_MARKED_SECTION = 3;
57      private static final int STATE_EXPECTING_ETAGO = 4;
58  
59      private int _offset;
60      private int _lineNumber;
61      private CharSequence _seq;
62      private CallbackListener _listener;
63  
64      public static void parse(CharSequence seq, CallbackListener l)
65      {
66          new ReducedHTMLParser(seq, l).parse();
67      }
68  
69      /**
70       * Constructor, package-scope for unit testing.
71       *
72       * @param s is the sequence of chars to parse.
73       * @param l is the listener to invoke callbacks on.
74       */
75      ReducedHTMLParser(CharSequence s, CallbackListener l)
76      {
77          _seq = s;
78          _listener = l;
79      }
80  
81      /**
82       * Return true if there are no more characters to parse.
83       */
84      boolean isFinished()
85      {
86          return _offset >= _seq.length();
87      }
88  
89      int getCurrentLineNumber()
90      {
91           return _lineNumber;
92      }
93  
94      /**
95       * Advance the current parse position over any whitespace characters.
96       */
97      void consumeWhitespace()
98      {
99          boolean crSeen = false;
100 
101         while (_offset < _seq.length())
102         {
103             char c = _seq.charAt(_offset);
104             if (!Character.isWhitespace(c))
105             {
106                 break;
107             }
108 
109             // Track line number for error messages.
110             if (c == '\r')
111             {
112                 ++_lineNumber;
113                 crSeen = true;
114             }
115             else if ((c == '\n') && !crSeen)
116             {
117                 ++_lineNumber;
118             }
119             else
120             {
121                 crSeen = false;
122             }
123 
124             ++_offset;
125         }
126     }
127 
128     /**
129      * Eat up a sequence of non-whitespace characters and return them.
130      */
131     String consumeNonWhitespace()
132     {
133         int wordStart = _offset;
134         while (_offset < _seq.length())
135         {
136             char c = _seq.charAt(_offset);
137             if (Character.isWhitespace(c))
138             {
139                 break;
140             }
141             ++_offset;
142         }
143         if (wordStart == _offset)
144         {
145             return null;
146         }
147         else
148         {
149             return _seq.subSequence(wordStart, _offset).toString();
150         }
151     }
152 
153     /**
154      * If the next chars in the input sequence exactly match the specified
155      * string then skip over them and return true.
156      * <p>
157      * If there is not a match then leave the current parse position
158      * unchanged and return false.
159      *
160      * @param s is the exact string to match.
161      * @return true if the input contains exactly the param s
162      */
163     boolean consumeMatch(String s)
164     {
165         if (_offset + s.length() > _seq.length())
166         {
167             // seq isn't long enough to contain the specified string
168             return false;
169         }
170 
171         int i = 0;
172         while (i < s.length())
173         {
174             if (_seq.charAt(_offset+i) == s.charAt(i))
175             {
176                 ++i;
177             }
178             else
179             {
180                 return false;
181             }
182         }
183 
184         _offset += i;
185         return true;
186     }
187 
188     /**
189      * Eat up a sequence of chars which form a valid XML element name.
190      * <p>
191      * TODO: implement this properly in compliance with spec
192      */
193     String consumeElementName()
194     {
195         consumeWhitespace();
196         int nameStart = _offset;
197         while (!isFinished())
198         {
199             boolean ok = false;
200             char c = _seq.charAt(_offset);
201             if (Character.isLetterOrDigit(_seq.charAt(_offset)))
202             {
203                 ok = true;
204             }
205             else if (c == '_')
206             {
207                 ok = true;
208             }
209             else if (c == '-')
210             {
211                 ok = true;
212             }
213             else if (c == ':')
214             {
215                 ok = true;
216             }
217 
218             if (!ok)
219             {
220                 break;
221             }
222 
223             ++_offset;
224         }
225 
226         if (nameStart == _offset)
227         {
228             return null;
229         }
230         else
231         {
232             return _seq.subSequence(nameStart, _offset).toString();
233         }
234     }
235 
236     /**
237      * Eat up a sequence of chars which form a valid XML attribute name.
238      * <p>
239      * TODO: implement this properly in compliance with spec
240      */
241     String consumeAttrName()
242     {
243         // for now, assume elements and attributes have same rules
244         return consumeElementName();
245     }
246 
247     /**
248      * Eat up a string which is terminated with the specified quote
249      * character. This means handling escaped quote chars within the
250      * string.
251      * <p>
252      * This method assumes that the leading quote has already been
253      * consumed.
254      */
255     String consumeString(char quote)
256     {
257         // TODO: should we consider a string to be terminated by a newline?
258         // that would help with runaway strings but I think that multiline
259         // strings *are* allowed...
260         //
261         // TODO: detect newlines within strings and increment lineNumber.
262         // This isn't so important, though; they aren't common and being a
263         // few lines out in an error message isn't serious either.
264         StringBuffer stringBuf = new StringBuffer();
265         boolean escaping = false;
266         while (!isFinished())
267         {
268             char c = _seq.charAt(_offset);
269             ++_offset;
270             if (c == quote)
271             {
272                 if (!escaping)
273                 {
274                     break;
275                 }
276                 else
277                 {
278                     stringBuf.append(c);
279                     escaping = false;
280                 }
281             }
282             else if (c == '\\')
283             {
284                 if (escaping)
285                 {
286                     // append a real backslash
287                     stringBuf.append(c);
288                     escaping = false;
289                 }
290                 else
291                 {
292                     escaping = true;
293                 }
294             }
295             else
296             {
297                 if (escaping)
298                 {
299                     stringBuf.append('\\');
300                     escaping = false;                    
301                 }
302 
303                 stringBuf.append(c);
304             }
305         }
306         return stringBuf.toString();
307     }
308 
309     /**
310      * Assuming we have already encountered "attrname=", consume the
311      * value part of the attribute definition. Note that unlike XML,
312      * HTML doesn't have to quote its attribute values.
313      *
314      * @return the attribute value. If the attr-value was quoted,
315      * the returned value will not include the quote chars.
316      */
317     String consumeAttrValue()
318     {
319         consumeWhitespace();
320 
321         if (consumeMatch("'"))
322         {
323             return consumeString('\'');
324         }
325         else if (consumeMatch("\""))
326         {
327             return consumeString('"');
328         }
329         else
330         {
331             return consumeNonWhitespace();
332         }
333     }
334 
335     /**
336      * Discard all characters in the input until one in the specified
337      * string (character-set) is found.
338      *
339      * @param s is a set of characters that should not be discarded.
340      */
341     void consumeExcept(String s)
342     {
343         boolean crSeen = false;
344 
345         while (_offset < _seq.length())
346         {
347             char c = _seq.charAt(_offset);
348             if (s.indexOf(c) >= 0)
349             {
350                 // char is in the exception set
351                 return;
352             }
353 
354             // Track line number for error messages.
355             if (c == '\r')
356             {
357                 ++_lineNumber;
358                 crSeen = true;
359             }
360             else if ((c == '\n') && !crSeen)
361             {
362                 ++_lineNumber;
363             }
364             else
365             {
366                 crSeen = false;
367             }
368 
369             ++_offset;
370         }
371     }
372 
373     /**
374      * Process the entire input buffer, invoking callbacks on the listener
375      * object as appropriate.
376      */
377     void parse()
378     {
379         int state = STATE_READY;
380 
381         int currentTagStart = -1;
382         String currentTagName = null;
383 
384         _lineNumber = 1;
385         _offset = 0;
386         int lastOffset = _offset -1;
387         while (_offset < _seq.length())
388         {
389             // Sanity check; each pass through this loop must increase the offset.
390             // Failure to do this means a hang situation has occurred.
391             if (_offset <= lastOffset)
392             {
393                 // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser");
394                 log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped."+
395                           " Surroundings: '" + getTagSurroundings() +"'.");
396                 //return;
397             }
398             lastOffset = _offset;
399 
400             if (state == STATE_READY)
401             {
402                 // in this state, nothing but "<" has any significance
403                 consumeExcept("<");
404                 if (isFinished())
405                 {
406                     break;
407                 }
408 
409                 if (consumeMatch("<!--"))
410                 {
411                     // Note that whitespace is *not* permitted in <!--
412                     state = STATE_IN_COMMENT;
413                 }
414                 else if (consumeMatch("<!["))
415                 {
416                     // Start of a "marked section", eg "<![CDATA" or
417                     // "<![INCLUDE" or "<![IGNORE". These always terminate
418                     // with "]]>"
419                     log.debug("Marked section found at line " + getCurrentLineNumber()+". "+
420                               "Surroundings: '" + getTagSurroundings() +"'.");
421                     state = STATE_IN_MARKED_SECTION;
422                 }
423                 else if (consumeMatch("<!DOCTYPE"))
424                 {
425                     log.debug("DOCTYPE found at line " + getCurrentLineNumber());
426                     // we don't need to actually do anything here; the
427                     // tag can't contain a bare "<", so the first "<"
428                     // indicates the start of the next real tag.
429                     //
430                     // TODO: Handle case where the DOCTYPE includes an internal DTD. In
431                     // that case there *will* be embedded < chars in the document. However
432                     // that's very unlikely to be used in a JSF page, so this is pretty low
433                     // priority.
434                 }
435                 else if (consumeMatch("<?"))
436                 {
437                     // xml processing instruction or <!DOCTYPE> tag
438                     // we don't need to actually do anything here; the
439                     // tag can't contain a bare "<", so the first "<"
440                     // indicates the start of the next real tag.
441                     log.debug("PI found at line " + getCurrentLineNumber());
442                 }
443                 else if (consumeMatch("</"))
444                 {
445                     if (!processEndTag())
446                     {
447                         // message already logged
448                         return;
449                     }
450 
451                     // stay in state READY
452                     state = STATE_READY;
453                 }
454                 else if (consumeMatch("<"))
455                 {
456                     // We can't tell the user that the tag has closed until after we have
457                     // processed any attributes and found the real end of the tag. So save
458                     // the current info until the end of this tag.
459                     currentTagStart = _offset - 1;
460                     currentTagName = consumeElementName();
461                     if (currentTagName == null)
462                     {
463                         log.warn("Invalid HTML; bare lessthan sign found at line "
464                                  + getCurrentLineNumber() + ". "+
465                                  "Surroundings: '" + getTagSurroundings() +"'.");
466                         // remain in STATE_READY; this isn't really the start of
467                         // an xml element.
468                     }
469                     else
470                     {
471                         state = STATE_IN_TAG;
472                     }
473                 }
474                 else
475                 {
476                     // should never get here
477                     throw new Error("Internal error at line " + getCurrentLineNumber());
478                 }
479 
480                 continue;
481             }
482 
483             if (state == STATE_IN_COMMENT)
484             {
485                 // TODO: handle "--  >", which is a valid way to close a
486                 // comment according to the specs.
487 
488                 // in this state, nothing but "--" has any significance
489                 consumeExcept("-");
490                 if (isFinished())
491                 {
492                     break;
493                 }
494 
495                 if (consumeMatch("-->"))
496                 {
497                     state = STATE_READY;
498                 }
499                 else
500                 {
501                     // false call; hyphen is not end of comment
502                     consumeMatch("-");
503                 }
504 
505                 continue;
506             }
507 
508             if (state == STATE_IN_TAG)
509             {
510                 consumeWhitespace();
511 
512                 if (consumeMatch("/>"))
513                 {
514                     // ok, end of element
515                     state = STATE_READY;
516                     closedTag(currentTagStart, _offset, currentTagName);
517 
518                     // and reset vars just in case...
519                     currentTagStart = -1;
520                     currentTagName = null;
521                 }
522                 else if (consumeMatch(">"))
523                 {
524                     if (currentTagName.equalsIgnoreCase("script")
525                         || currentTagName.equalsIgnoreCase("style"))
526                     {
527                         // We've just started a special tag which can contain anything except
528                         // the ETAGO marker ("</"). See
529                         // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
530                         state = STATE_EXPECTING_ETAGO;
531                     }
532                     else
533                     {
534                         state = STATE_READY;
535                     }
536 
537                     // end of open tag, but not end of element
538                     openedTag(currentTagStart, _offset, currentTagName);
539 
540                     // and reset vars just in case...
541                     currentTagStart = -1;
542                     currentTagName = null;
543                 }
544                 else
545                 {
546                     // xml attribute
547                     String attrName = consumeAttrName();
548                     if (attrName == null)
549                     {
550                         // Oops, we found something quite unexpected in this tag.
551                         // The best we can do is probably to drop back to looking
552                         // for "/>", though that does risk us misinterpreting the
553                         // contents of an attribute's associated string value.
554                         log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'"
555                                  + " at line " + getCurrentLineNumber()+". "+
556                                  "Surroundings: '" + getTagSurroundings() +"'.");
557                         state = STATE_EXPECTING_ETAGO;
558                         // and consume one character
559                         ++_offset;
560                     }
561                     else
562                     {
563                         consumeWhitespace();
564 
565                         // html can have "stand-alone" attributes with no following equals sign
566                         if (consumeMatch("="))
567                         {
568                             consumeAttrValue();
569                         }
570                     }
571                 }
572 
573                 continue;
574             }
575 
576             if (state == STATE_IN_MARKED_SECTION)
577             {
578                 // in this state, nothing but "]]>" has any significance
579                 consumeExcept("]");
580                 if (isFinished())
581                 {
582                     break;
583                 }
584 
585                 if (consumeMatch("]]>"))
586                 {
587                     state = STATE_READY;
588                 }
589                 else
590                 {
591                     // false call; ] is not end of cdata section
592                     consumeMatch("]");
593                 }
594 
595                 continue;
596             }
597 
598             if (state == STATE_EXPECTING_ETAGO)
599             {
600                 // The term "ETAGO" is the official spec term for "</".
601                 consumeExcept("<");
602                 if (isFinished())
603                 {
604                     log.debug("Malformed input page; input terminated while tag not closed.");
605                     break;
606                 }
607 
608                 if (consumeMatch("</"))
609                 {
610                     if (!processEndTag())
611                     {
612                         return;
613                     }
614                     state = STATE_READY;
615                 }
616                 else
617                 {
618                     // false call; < does not start an ETAGO
619                     consumeMatch("<");
620                 }
621 
622                 continue;
623             }
624         }
625     }
626 
627     /**
628      * Get details about malformed HTML tag.
629      *
630      * @return Tag surroundings.
631      */
632     private String getTagSurroundings()
633     {
634         int maxLength = 30;
635         int end = _seq.length();
636         if (end - _offset > maxLength) {
637             end = _offset + maxLength;
638         }
639         return _seq.subSequence(_offset, end).toString();
640     }
641 
642     /**
643      * Invoked when "&lt;/" has been seen in the input, this method
644      * handles the parsing of the end tag and the invocation of the
645      * appropriate callback method.
646      *
647      * @return true if the tag was successfully parsed, and false
648      * if there was a fatal parsing error.
649      */
650     private boolean processEndTag()
651     {
652         int tagStart = _offset - 2;
653         String tagName = consumeElementName();
654         consumeWhitespace();
655         if (!consumeMatch(">"))
656         {
657             // log details about malformed end tag
658             log.error("Malformed end tag '" + tagName + "' at line " + getCurrentLineNumber()
659                       + "; skipping parsing. Surroundings: '" + getTagSurroundings() +"'.");
660             return false;
661         }
662 
663 
664         // inform user that the tag has been closed
665         closedTag(tagStart, _offset, tagName);
666 
667         // We can't verify that the tag names balance because this is HTML
668         // we are processing, not XML.
669         return true;
670     }
671 
672     /**
673      * Invoke a callback method to inform the listener that we have found a start tag.
674      *
675      * @param startOffset
676      * @param endOffset
677      * @param tagName
678      */
679     void openedTag(int startOffset, int endOffset, String tagName)
680     {
681         //log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
682 
683         if ("head".equalsIgnoreCase(tagName))
684         {
685             _listener.openedStartTag(startOffset, HEAD_TAG);
686             _listener.closedStartTag(endOffset, HEAD_TAG);
687         }
688         else if ("body".equalsIgnoreCase(tagName))
689         {
690             _listener.openedStartTag(startOffset, BODY_TAG);
691             _listener.closedStartTag(endOffset, BODY_TAG);
692         }
693         else if ("script".equalsIgnoreCase(tagName))
694         {
695             _listener.openedStartTag(startOffset, SCRIPT_TAG);
696             _listener.closedStartTag(endOffset, SCRIPT_TAG);
697         }
698     }
699 
700     void closedTag(int startOffset, int endOffset, String tagName)
701     {
702         //log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
703 
704         if ("head".equalsIgnoreCase(tagName))
705         {
706             _listener.openedEndTag(startOffset, HEAD_TAG);
707             _listener.closedEndTag(endOffset, HEAD_TAG);
708         }
709         else if ("body".equalsIgnoreCase(tagName))
710         {
711             _listener.openedEndTag(startOffset, BODY_TAG);
712             _listener.closedEndTag(endOffset, BODY_TAG);
713         }
714         else if ("script".equalsIgnoreCase(tagName))
715         {
716             _listener.openedEndTag(startOffset, SCRIPT_TAG);
717             _listener.closedEndTag(endOffset, SCRIPT_TAG);
718         }
719     }
720 }