1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.myfaces.renderkit.html.util;
20
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39 public class ReducedHTMLParser
40 {
41
42
43
44
45
46
47 private static final Log log = LogFactory.getLog(ReducedHTMLParser.class);
48
49 public static final int BODY_TAG = 0;
50 public static final int HEAD_TAG = 1;
51 public static final int SCRIPT_TAG = 2;
52
53 private static final int STATE_READY = 0;
54 private static final int STATE_IN_COMMENT = 1;
55 private static final int STATE_IN_TAG = 2;
56 private static final int STATE_IN_MARKED_SECTION = 3;
57 private static final int STATE_EXPECTING_ETAGO = 4;
58
59 private int _offset;
60 private int _lineNumber;
61 private CharSequence _seq;
62 private CallbackListener _listener;
63
64 public static void parse(CharSequence seq, CallbackListener l)
65 {
66 new ReducedHTMLParser(seq, l).parse();
67 }
68
69
70
71
72
73
74
75 ReducedHTMLParser(CharSequence s, CallbackListener l)
76 {
77 _seq = s;
78 _listener = l;
79 }
80
81
82
83
84 boolean isFinished()
85 {
86 return _offset >= _seq.length();
87 }
88
89 int getCurrentLineNumber()
90 {
91 return _lineNumber;
92 }
93
94
95
96
97 void consumeWhitespace()
98 {
99 boolean crSeen = false;
100
101 while (_offset < _seq.length())
102 {
103 char c = _seq.charAt(_offset);
104 if (!Character.isWhitespace(c))
105 {
106 break;
107 }
108
109
110 if (c == '\r')
111 {
112 ++_lineNumber;
113 crSeen = true;
114 }
115 else if ((c == '\n') && !crSeen)
116 {
117 ++_lineNumber;
118 }
119 else
120 {
121 crSeen = false;
122 }
123
124 ++_offset;
125 }
126 }
127
128
129
130
131 String consumeNonWhitespace()
132 {
133 int wordStart = _offset;
134 while (_offset < _seq.length())
135 {
136 char c = _seq.charAt(_offset);
137 if (Character.isWhitespace(c))
138 {
139 break;
140 }
141 ++_offset;
142 }
143 if (wordStart == _offset)
144 {
145 return null;
146 }
147 else
148 {
149 return _seq.subSequence(wordStart, _offset).toString();
150 }
151 }
152
153
154
155
156
157
158
159
160
161
162
163 boolean consumeMatch(String s)
164 {
165 if (_offset + s.length() > _seq.length())
166 {
167
168 return false;
169 }
170
171 int i = 0;
172 while (i < s.length())
173 {
174 if (_seq.charAt(_offset+i) == s.charAt(i))
175 {
176 ++i;
177 }
178 else
179 {
180 return false;
181 }
182 }
183
184 _offset += i;
185 return true;
186 }
187
188
189
190
191
192
193 String consumeElementName()
194 {
195 consumeWhitespace();
196 int nameStart = _offset;
197 while (!isFinished())
198 {
199 boolean ok = false;
200 char c = _seq.charAt(_offset);
201 if (Character.isLetterOrDigit(_seq.charAt(_offset)))
202 {
203 ok = true;
204 }
205 else if (c == '_')
206 {
207 ok = true;
208 }
209 else if (c == '-')
210 {
211 ok = true;
212 }
213 else if (c == ':')
214 {
215 ok = true;
216 }
217
218 if (!ok)
219 {
220 break;
221 }
222
223 ++_offset;
224 }
225
226 if (nameStart == _offset)
227 {
228 return null;
229 }
230 else
231 {
232 return _seq.subSequence(nameStart, _offset).toString();
233 }
234 }
235
236
237
238
239
240
241 String consumeAttrName()
242 {
243
244 return consumeElementName();
245 }
246
247
248
249
250
251
252
253
254
255 String consumeString(char quote)
256 {
257
258
259
260
261
262
263
264 StringBuffer stringBuf = new StringBuffer();
265 boolean escaping = false;
266 while (!isFinished())
267 {
268 char c = _seq.charAt(_offset);
269 ++_offset;
270 if (c == quote)
271 {
272 if (!escaping)
273 {
274 break;
275 }
276 else
277 {
278 stringBuf.append(c);
279 escaping = false;
280 }
281 }
282 else if (c == '\\')
283 {
284 if (escaping)
285 {
286
287 stringBuf.append(c);
288 escaping = false;
289 }
290 else
291 {
292 escaping = true;
293 }
294 }
295 else
296 {
297 if (escaping)
298 {
299 stringBuf.append('\\');
300 escaping = false;
301 }
302
303 stringBuf.append(c);
304 }
305 }
306 return stringBuf.toString();
307 }
308
309
310
311
312
313
314
315
316
317 String consumeAttrValue()
318 {
319 consumeWhitespace();
320
321 if (consumeMatch("'"))
322 {
323 return consumeString('\'');
324 }
325 else if (consumeMatch("\""))
326 {
327 return consumeString('"');
328 }
329 else
330 {
331 return consumeNonWhitespace();
332 }
333 }
334
335
336
337
338
339
340
341 void consumeExcept(String s)
342 {
343 boolean crSeen = false;
344
345 while (_offset < _seq.length())
346 {
347 char c = _seq.charAt(_offset);
348 if (s.indexOf(c) >= 0)
349 {
350
351 return;
352 }
353
354
355 if (c == '\r')
356 {
357 ++_lineNumber;
358 crSeen = true;
359 }
360 else if ((c == '\n') && !crSeen)
361 {
362 ++_lineNumber;
363 }
364 else
365 {
366 crSeen = false;
367 }
368
369 ++_offset;
370 }
371 }
372
373
374
375
376
377 void parse()
378 {
379 int state = STATE_READY;
380
381 int currentTagStart = -1;
382 String currentTagName = null;
383
384 _lineNumber = 1;
385 _offset = 0;
386 int lastOffset = _offset -1;
387 while (_offset < _seq.length())
388 {
389
390
391 if (_offset <= lastOffset)
392 {
393
394 log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped."+
395 " Surroundings: '" + getTagSurroundings() +"'.");
396
397 }
398 lastOffset = _offset;
399
400 if (state == STATE_READY)
401 {
402
403 consumeExcept("<");
404 if (isFinished())
405 {
406 break;
407 }
408
409 if (consumeMatch("<!--"))
410 {
411
412 state = STATE_IN_COMMENT;
413 }
414 else if (consumeMatch("<!["))
415 {
416
417
418
419 log.debug("Marked section found at line " + getCurrentLineNumber()+". "+
420 "Surroundings: '" + getTagSurroundings() +"'.");
421 state = STATE_IN_MARKED_SECTION;
422 }
423 else if (consumeMatch("<!DOCTYPE"))
424 {
425 log.debug("DOCTYPE found at line " + getCurrentLineNumber());
426
427
428
429
430
431
432
433
434 }
435 else if (consumeMatch("<?"))
436 {
437
438
439
440
441 log.debug("PI found at line " + getCurrentLineNumber());
442 }
443 else if (consumeMatch("</"))
444 {
445 if (!processEndTag())
446 {
447
448 return;
449 }
450
451
452 state = STATE_READY;
453 }
454 else if (consumeMatch("<"))
455 {
456
457
458
459 currentTagStart = _offset - 1;
460 currentTagName = consumeElementName();
461 if (currentTagName == null)
462 {
463 log.warn("Invalid HTML; bare lessthan sign found at line "
464 + getCurrentLineNumber() + ". "+
465 "Surroundings: '" + getTagSurroundings() +"'.");
466
467
468 }
469 else
470 {
471 state = STATE_IN_TAG;
472 }
473 }
474 else
475 {
476
477 throw new Error("Internal error at line " + getCurrentLineNumber());
478 }
479
480 continue;
481 }
482
483 if (state == STATE_IN_COMMENT)
484 {
485
486
487
488
489 consumeExcept("-");
490 if (isFinished())
491 {
492 break;
493 }
494
495 if (consumeMatch("-->"))
496 {
497 state = STATE_READY;
498 }
499 else
500 {
501
502 consumeMatch("-");
503 }
504
505 continue;
506 }
507
508 if (state == STATE_IN_TAG)
509 {
510 consumeWhitespace();
511
512 if (consumeMatch("/>"))
513 {
514
515 state = STATE_READY;
516 closedTag(currentTagStart, _offset, currentTagName);
517
518
519 currentTagStart = -1;
520 currentTagName = null;
521 }
522 else if (consumeMatch(">"))
523 {
524 if (currentTagName.equalsIgnoreCase("script")
525 || currentTagName.equalsIgnoreCase("style"))
526 {
527
528
529
530 state = STATE_EXPECTING_ETAGO;
531 }
532 else
533 {
534 state = STATE_READY;
535 }
536
537
538 openedTag(currentTagStart, _offset, currentTagName);
539
540
541 currentTagStart = -1;
542 currentTagName = null;
543 }
544 else
545 {
546
547 String attrName = consumeAttrName();
548 if (attrName == null)
549 {
550
551
552
553
554 log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'"
555 + " at line " + getCurrentLineNumber()+". "+
556 "Surroundings: '" + getTagSurroundings() +"'.");
557 state = STATE_EXPECTING_ETAGO;
558
559 ++_offset;
560 }
561 else
562 {
563 consumeWhitespace();
564
565
566 if (consumeMatch("="))
567 {
568 consumeAttrValue();
569 }
570 }
571 }
572
573 continue;
574 }
575
576 if (state == STATE_IN_MARKED_SECTION)
577 {
578
579 consumeExcept("]");
580 if (isFinished())
581 {
582 break;
583 }
584
585 if (consumeMatch("]]>"))
586 {
587 state = STATE_READY;
588 }
589 else
590 {
591
592 consumeMatch("]");
593 }
594
595 continue;
596 }
597
598 if (state == STATE_EXPECTING_ETAGO)
599 {
600
601 consumeExcept("<");
602 if (isFinished())
603 {
604 log.debug("Malformed input page; input terminated while tag not closed.");
605 break;
606 }
607
608 if (consumeMatch("</"))
609 {
610 if (!processEndTag())
611 {
612 return;
613 }
614 state = STATE_READY;
615 }
616 else
617 {
618
619 consumeMatch("<");
620 }
621
622 continue;
623 }
624 }
625 }
626
627
628
629
630
631
632 private String getTagSurroundings()
633 {
634 int maxLength = 30;
635 int end = _seq.length();
636 if (end - _offset > maxLength) {
637 end = _offset + maxLength;
638 }
639 return _seq.subSequence(_offset, end).toString();
640 }
641
642
643
644
645
646
647
648
649
650 private boolean processEndTag()
651 {
652 int tagStart = _offset - 2;
653 String tagName = consumeElementName();
654 consumeWhitespace();
655 if (!consumeMatch(">"))
656 {
657
658 log.error("Malformed end tag '" + tagName + "' at line " + getCurrentLineNumber()
659 + "; skipping parsing. Surroundings: '" + getTagSurroundings() +"'.");
660 return false;
661 }
662
663
664
665 closedTag(tagStart, _offset, tagName);
666
667
668
669 return true;
670 }
671
672
673
674
675
676
677
678
679 void openedTag(int startOffset, int endOffset, String tagName)
680 {
681
682
683 if ("head".equalsIgnoreCase(tagName))
684 {
685 _listener.openedStartTag(startOffset, HEAD_TAG);
686 _listener.closedStartTag(endOffset, HEAD_TAG);
687 }
688 else if ("body".equalsIgnoreCase(tagName))
689 {
690 _listener.openedStartTag(startOffset, BODY_TAG);
691 _listener.closedStartTag(endOffset, BODY_TAG);
692 }
693 else if ("script".equalsIgnoreCase(tagName))
694 {
695 _listener.openedStartTag(startOffset, SCRIPT_TAG);
696 _listener.closedStartTag(endOffset, SCRIPT_TAG);
697 }
698 }
699
700 void closedTag(int startOffset, int endOffset, String tagName)
701 {
702
703
704 if ("head".equalsIgnoreCase(tagName))
705 {
706 _listener.openedEndTag(startOffset, HEAD_TAG);
707 _listener.closedEndTag(endOffset, HEAD_TAG);
708 }
709 else if ("body".equalsIgnoreCase(tagName))
710 {
711 _listener.openedEndTag(startOffset, BODY_TAG);
712 _listener.closedEndTag(endOffset, BODY_TAG);
713 }
714 else if ("script".equalsIgnoreCase(tagName))
715 {
716 _listener.openedEndTag(startOffset, SCRIPT_TAG);
717 _listener.closedEndTag(endOffset, SCRIPT_TAG);
718 }
719 }
720 }