/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
using System;

namespace Lucene.Net.Demo.Html
{
    /// <summary>
    /// HTML parser (generated from a JavaCC grammar, per the header above) that
    /// collects a title, a summary and meta tags while writing the document
    /// text to <c>pipeOut</c>.
    /// </summary>
    public class HTMLParser : HTMLParserConstants
    {
        /// <summary>
        /// Allocates the two-slot <c>jj_2_rtns</c> array and the
        /// <c>LookaheadSuccess</c> instance stored in <c>jj_ls</c>.
        /// </summary>
        private void InitBlock()
        {
            jj_2_rtns = new JJCalls[2];
            jj_ls = new LookaheadSuccess();
        }

        /// <summary>Number of characters after which the summary is considered complete.</summary>
        public static int SUMMARY_LENGTH = 200;

        // Text appended by AddText/AddSpace while inTitle is set.
        internal System.Text.StringBuilder title = new System.Text.StringBuilder(SUMMARY_LENGTH);
        // Text appended by AddToSummary until SUMMARY_LENGTH is reached.
        internal System.Text.StringBuilder summary = new System.Text.StringBuilder(SUMMARY_LENGTH * 2);
        // Name/content pairs stored by AddMetaTag.
        internal System.Collections.Specialized.NameValueCollection metaTags = new System.Collections.Specialized.NameValueCollection();
        // Pending meta tag name/content; AddMetaTag stores them and resets both to null.
        internal System.String currentMetaTag = null;
        internal System.String currentMetaContent = null;
        // Incremented by AddText/AddSpace with the length of each string written to pipeOut.
        internal int length = 0;
        // Set (under lock) by AddText; polled by GetTitle and GetMetaTags.
        internal bool titleComplete = false;
        // Set (under lock) by AddToSummary; returned by MyPipedInputStream.Full().
        internal bool summaryComplete = false;
        // When set, AddText/AddSpace append to title instead of summary.
        internal bool inTitle = false;
        // NOTE(review): not referenced in the visible part of the file;
        // presumably set while inside a META tag - confirm in Tag().
        internal bool inMetaTag = false;
        // When set, AddText discards its argument.
        internal bool inStyle = false;
        // Updated by HTMLDocument after every token; AddSpace writes eol instead of " " when set.
        internal bool afterTag = false;
        // Set by AddSpace, cleared by AddText; suppresses consecutive spaces.
        internal bool afterSpace = false;
        internal System.String eol = System.Environment.NewLine;
        // Reader/writer pair created lazily by GetReader().
        internal System.IO.StreamReader pipeIn = null;
        internal System.IO.StreamWriter pipeOut;
        private MyPipedInputStream pipeInStream = null;
        private System.IO.StreamWriter pipeOutStream = null;

        /// <summary>
        /// StreamReader subclass that keeps a reference to the HTMLParser that
        /// created it.
        /// </summary>
        private class MyPipedInputStream : System.IO.StreamReader
        {
            /// <summary>Stores the owning parser.</summary>
            private void InitBlock(HTMLParser enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }

            // The owning parser (declaration continues on the next source line).
            private HTMLParser
enclosingInstance;

            /// <summary>The parser that owns this stream.</summary>
            public HTMLParser Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            /// <summary>
            /// Creates a reader over the base stream of <paramref name="src"/> and
            /// records the owning parser.
            /// </summary>
            /// <param name="enclosingInstance">The parser that owns this stream.</param>
            /// <param name="src">Reader whose <c>BaseStream</c> is wrapped.</param>
            public MyPipedInputStream(HTMLParser enclosingInstance, System.IO.StreamReader src)
                : base(src.BaseStream)
            {
                InitBlock(enclosingInstance);
            }

            /// <summary>
            /// Reports whether the owning parser has flagged its summary as complete.
            /// </summary>
            /// <returns>The owning parser's <c>summaryComplete</c> flag.</returns>
            public virtual bool Full()
            {
                return enclosingInstance.summaryComplete;
            }
        }

        /// <summary>
        /// Creates a parser over the contents of <paramref name="file"/>.
        /// </summary>
        /// <param name="file">The HTML file to parse.</param>
        public HTMLParser(System.IO.FileInfo file)
            // The parser only reads its input. Without an explicit FileAccess the
            // FileStream(path, mode) overload asks for read/write access, which
            // fails on read-only files.
            : this(new System.IO.FileStream(file.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read))
        {
        }

        /// <summary>
        /// Returns the trimmed document title, starting the parser on first use.
        /// Polls (10 ms waits on this object's monitor) until <c>titleComplete</c>
        /// is set or <c>pipeInStream.Full()</c> returns true.
        /// </summary>
        public virtual System.String GetTitle()
        {
            if (pipeIn == null)
            {
                GetReader(); // spawn parsing thread
            }

            while (true)
            {
                lock (this)
                {
                    if (titleComplete || pipeInStream.Full())
                    {
                        break;
                    }
                    System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
                }
            }

            return title.ToString().Trim();
        }

        /// <summary>
        /// Returns the collected meta tags, starting the parser on first use.
        /// Uses the same completion condition as <c>GetTitle</c>.
        /// </summary>
        public virtual System.Collections.Specialized.NameValueCollection GetMetaTags()
        {
            if (pipeIn == null)
            {
                GetReader(); // spawn parsing thread
            }

            while (true)
            {
                lock (this)
                {
                    if (titleComplete || pipeInStream.Full())
                    {
                        break;
                    }
                    System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
                }
            }

            return metaTags;
        }

        /// <summary>
        /// Returns a summary of at most <c>SUMMARY_LENGTH</c> characters, starting
        /// the parser on first use. Polls until the summary buffer holds at least
        /// <c>SUMMARY_LENGTH</c> characters or <c>pipeInStream.Full()</c> returns true.
        /// </summary>
        /// <returns>
        /// The title when the summary is empty or begins with a non-empty title;
        /// otherwise the trimmed summary.
        /// </returns>
        public virtual System.String GetSummary()
        {
            if (pipeIn == null)
            {
                GetReader(); // spawn parsing thread
            }

            while (true)
            {
                lock (this)
                {
                    if (summary.Length >= SUMMARY_LENGTH || pipeInStream.Full())
                    {
                        break;
                    }
                    System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
                }
            }

            if (summary.Length > SUMMARY_LENGTH)
            {
                summary.Length = SUMMARY_LENGTH;
            }

            System.String sum = summary.ToString().Trim();
            System.String tit = GetTitle();

            // StartsWith("") is always true, so an empty title must not be allowed
            // to mask a non-empty summary.
            if (sum.Length == 0 || (tit.Length > 0 && sum.StartsWith(tit)))
            {
                return tit;
            }
            return sum;
        }

        /// <summary>
        /// Returns the reader for the parsed text. On the first call it creates
        /// the pipe streams and starts the parser thread.
        /// </summary>
        public virtual System.IO.StreamReader GetReader()
        {
            if (pipeIn == null)
            {
                pipeInStream = new MyPipedInputStream(this, new System.IO.StreamReader(new System.IO.MemoryStream(1024)));
                pipeOutStream = new
System.IO.StreamWriter(pipeInStream.BaseStream);
                pipeIn = new System.IO.StreamReader(pipeInStream.BaseStream, System.Text.Encoding.Default);
                pipeOut = new System.IO.StreamWriter(pipeOutStream.BaseStream, System.Text.Encoding.Default);
                SupportClass.ThreadClass thread = new ParserThread(this);
                thread.Start(); // start parsing
            }
            return pipeIn;
        }

        /// <summary>
        /// Appends <paramref name="text"/> to the summary while it is shorter than
        /// <c>SUMMARY_LENGTH</c>. When the append reaches that length, sets
        /// <c>summaryComplete</c> and wakes all threads waiting on this object.
        /// </summary>
        /// <param name="text">Text to append.</param>
        internal virtual void AddToSummary(System.String text)
        {
            if (summary.Length >= SUMMARY_LENGTH)
            {
                return;
            }

            summary.Append(text);
            if (summary.Length >= SUMMARY_LENGTH)
            {
                lock (this)
                {
                    summaryComplete = true;
                    System.Threading.Monitor.PulseAll(this);
                }
            }
        }

        /// <summary>
        /// Routes a piece of document text: ignored while <c>inStyle</c> is set,
        /// appended to the title while <c>inTitle</c> is set, otherwise appended to
        /// the summary. Non-ignored text is also written to <c>pipeOut</c> and
        /// counted in <c>length</c>.
        /// </summary>
        /// <param name="text">Decoded text of the current token.</param>
        internal virtual void AddText(System.String text)
        {
            if (inStyle)
            {
                return;
            }

            if (inTitle)
            {
                title.Append(text);
            }
            else
            {
                AddToSummary(text);

                // First text outside the title after a title was collected: the
                // title is finished. The former test, !title.Equals(""), compared
                // a StringBuilder with a string; via Object.Equals(object) that is
                // never equal, so the flag was raised even with an empty title.
                // NOTE(review): documents without a title now rely on the parser
                // thread marking the title complete at end of input - confirm in
                // ParserThread.
                if (!titleComplete && title.Length > 0)
                {
                    lock (this)
                    {
                        titleComplete = true; // tell waiting threads
                        System.Threading.Monitor.PulseAll(this);
                    }
                }
            }

            length += text.Length;
            pipeOut.Write(text);
            afterSpace = false;
        }

        /// <summary>
        /// Stores the pending meta tag (<c>currentMetaTag</c> mapped to
        /// <c>currentMetaContent</c>) in <c>metaTags</c> and clears both fields.
        /// </summary>
        internal virtual void AddMetaTag()
        {
            metaTags[currentMetaTag] = currentMetaContent;
            currentMetaTag = null;
            currentMetaContent = null;
        }

        /// <summary>
        /// Emits a single separator unless the previous output was already one.
        /// The title or summary receives " "; <c>pipeOut</c> receives <c>eol</c>
        /// when the previous token was a tag and " " otherwise.
        /// </summary>
        internal virtual void AddSpace()
        {
            if (afterSpace)
            {
                return;
            }

            if (inTitle)
            {
                title.Append(" ");
            }
            else
            {
                AddToSummary(" ");
            }

            System.String space = afterTag ? eol : " ";
            length += space.Length;
            pipeOut.Write(space);
            afterSpace = true;
        }

        /// <summary>
        /// Grammar entry point: consumes tokens in a loop and dispatches each one
        /// to the matching handler.
        /// </summary>
        public void HTMLDocument()
        {
            Token t;
            while (true)
            {
                switch ((jj_ntk_Renamed_Field == -1) ?
jj_ntk() : jj_ntk_Renamed_Field) {
                    // Loop guard: continue only while the next token kind is one
                    // the document body accepts.
                    case Lucene.Net.Demo.Html.HTMLParserConstants.ScriptStart:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.TagName:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.DeclName:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Comment1:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Comment2:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Word:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Entity:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Space:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Punct:
                        ;
                        break;

                    default:
                        // Any other kind ends the loop; jj_gen is recorded in jj_la1[0].
                        jj_la1[0] = jj_gen;
                        goto label_1_brk;
                }

                // Dispatch on the next token kind (the cached value is used unless
                // it is -1, in which case jj_ntk() is called). Tag-like constructs
                // set afterTag; text tokens clear it.
                switch ((jj_ntk_Renamed_Field == -1) ? jj_ntk() : jj_ntk_Renamed_Field)
                {
                    case Lucene.Net.Demo.Html.HTMLParserConstants.TagName:
                        Tag();
                        afterTag = true;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.DeclName:
                        t = Decl();
                        afterTag = true;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.Comment1:
                    case Lucene.Net.Demo.Html.HTMLParserConstants.Comment2:
                        CommentTag();
                        afterTag = true;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.ScriptStart:
                        ScriptTag();
                        afterTag = true;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.Word:
                        t = jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Word);
                        AddText(t.image);
                        afterTag = false;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.Entity:
                        // Entities are decoded before being added as text.
                        t = jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Entity);
                        AddText(Entities.Decode(t.image));
                        afterTag = false;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.Punct:
                        t = jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Punct);
                        AddText(t.image);
                        afterTag = false;
                        break;

                    case Lucene.Net.Demo.Html.HTMLParserConstants.Space:
                        jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Space);
                        AddSpace();
                        afterTag = false;
                        break;

                    default:
                        // No alternative matched: record jj_gen in jj_la1[1] and
                        // fail with a ParseException.
                        jj_la1[1] = jj_gen;
                        jj_consume_token(- 1);
                        throw new ParseException();
                }
            }

            label_1_brk: ;

            // Token kind 0 must follow the last body token
            // (presumably end of input - confirm in HTMLParserConstants).
            jj_consume_token(0);
        }

        public void Tag()
        {
            Token t1, t2;
            bool inImg = false;
            t1 =
jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.TagName); System.String tagName = t1.image.ToLower(); if (Tags.WS_ELEMS.Contains(tagName)) { AddSpace(); } inTitle = tagName.ToUpper().Equals("