/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
using System;
namespace Lucene.Net.Demo.Html
{
public class HTMLParser : HTMLParserConstants
{
/// <summary>
/// Creates the LookaheadSuccess instance stored in jj_ls and the two-slot
/// JJCalls array stored in jj_2_rtns.
/// </summary>
private void InitBlock()
{
    // The two allocations do not depend on each other.
    jj_ls = new LookaheadSuccess();
    jj_2_rtns = new JJCalls[2];
}
/// <summary>Summary size limit in characters: AddToSummary stops appending once the buffer reaches it and GetSummary truncates the buffer to it.</summary>
public static int SUMMARY_LENGTH = 200;
/// <summary>Collects text seen while inTitle is set; GetTitle returns it trimmed.</summary>
internal System.Text.StringBuilder title = new System.Text.StringBuilder(SUMMARY_LENGTH);
/// <summary>Collects non-title text via AddToSummary; GetSummary reads it.</summary>
internal System.Text.StringBuilder summary = new System.Text.StringBuilder(SUMMARY_LENGTH * 2);
/// <summary>Meta tag name/content pairs stored by AddMetaTag and returned by GetMetaTags.</summary>
internal System.Collections.Specialized.NameValueCollection metaTags = new System.Collections.Specialized.NameValueCollection();
/// <summary>Key used by AddMetaTag for the next entry; reset to null after it is stored.</summary>
internal System.String currentMetaTag = null;
/// <summary>Value used by AddMetaTag for the next entry; reset to null after it is stored.</summary>
internal System.String currentMetaContent = null;
/// <summary>Running count of characters written to pipeOut by AddText and AddSpace.</summary>
internal int length = 0;
/// <summary>Set to true under lock(this) by AddText; GetTitle and GetMetaTags wait for it.</summary>
internal bool titleComplete = false;
/// <summary>Set to true under lock(this) by AddToSummary once the summary reaches SUMMARY_LENGTH; reported by MyPipedInputStream.Full().</summary>
internal bool summaryComplete = false;
/// <summary>While true, AddText and AddSpace append to title instead of the summary. Assigned in Tag().</summary>
internal bool inTitle = false;
/// <summary>Assigned in Tag() from the current tag name.</summary>
internal bool inMetaTag = false;
/// <summary>While true, AddText discards the text it is given. Assigned in Tag().</summary>
internal bool inStyle = false;
/// <summary>Set by HTMLDocument after a tag, declaration, comment or script and cleared after text or a space; AddSpace writes eol instead of a single space when it is set.</summary>
internal bool afterTag = false;
/// <summary>Set by AddSpace and cleared by AddText; while set, AddSpace does nothing, so consecutive spaces collapse into one.</summary>
internal bool afterSpace = false;
/// <summary>Platform line terminator written by AddSpace when afterTag is set.</summary>
internal System.String eol = System.Environment.NewLine;
/// <summary>Reader returned by GetReader; null until GetReader has run once.</summary>
internal System.IO.StreamReader pipeIn = null;
/// <summary>Writer that AddText and AddSpace write to; created in GetReader.</summary>
internal System.IO.StreamWriter pipeOut;
/// <summary>Created in GetReader; its Full() result is polled by GetTitle, GetMetaTags and GetSummary.</summary>
private MyPipedInputStream pipeInStream = null;
/// <summary>Writer over pipeInStream.BaseStream; created in GetReader.</summary>
private System.IO.StreamWriter pipeOutStream = null;
/// <summary>
/// StreamReader over the base stream of another reader that also keeps a
/// reference to the parser that created it, so callers can ask whether the
/// parser's summary has been completed.
/// </summary>
private class MyPipedInputStream : System.IO.StreamReader
{
    // Parser this reader belongs to; assigned once in the constructor.
    private HTMLParser enclosingInstance;

    /// <summary>Wraps src.BaseStream and remembers the owning parser.</summary>
    /// <param name="enclosingInstance">Parser whose summaryComplete flag Full() reports.</param>
    /// <param name="src">Reader whose BaseStream is handed to the StreamReader base constructor.</param>
    public MyPipedInputStream(HTMLParser enclosingInstance, System.IO.StreamReader src) : base(src.BaseStream)
    {
        this.enclosingInstance = enclosingInstance;
    }

    /// <summary>The parser passed to the constructor.</summary>
    public HTMLParser Enclosing_Instance
    {
        get
        {
            return this.enclosingInstance;
        }
    }

    /// <summary>Returns the owning parser's summaryComplete flag.</summary>
    public virtual bool Full()
    {
        HTMLParser owner = this.enclosingInstance;
        return owner.summaryComplete;
    }
}
/// <summary>
/// Creates a parser for the given file by opening file.FullName read-only as
/// a FileStream and delegating to the constructor that accepts that stream.
/// </summary>
/// <remarks>
/// Legacy note carried over from the original comment:
/// use HTMLParser(FileInputStream) instead.
/// </remarks>
/// <param name="file">File to parse; its FullName is the path that is opened.</param>
public HTMLParser(System.IO.FileInfo file) : this(new System.IO.FileStream(file.FullName, System.IO.FileMode.Open, System.IO.FileAccess.Read))
{
}
/// <summary>
/// Returns the collected title with surrounding whitespace trimmed.
/// Calls GetReader() first if no reader exists yet, then blocks, re-checking
/// at least every 10 ms, until titleComplete is set or pipeInStream.Full()
/// returns true.
/// </summary>
/// <returns>The trimmed contents of the title buffer.</returns>
public virtual System.String GetTitle()
{
    if (pipeIn == null)
    {
        GetReader(); // spawn parsing thread
    }

    bool ready = false;
    while (!ready)
    {
        lock (this)
        {
            ready = titleComplete || pipeInStream.Full();
            if (!ready)
            {
                // Releases the lock while waiting; wakes on PulseAll or after the timeout.
                System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
            }
        }
    }

    System.String rawTitle = title.ToString();
    return rawTitle.Trim();
}
/// <summary>
/// Returns the meta tag collection. Calls GetReader() first if no reader
/// exists yet, then blocks, re-checking at least every 10 ms, until
/// titleComplete is set or pipeInStream.Full() returns true.
/// </summary>
/// <returns>The live metaTags collection (not a copy).</returns>
public virtual System.Collections.Specialized.NameValueCollection GetMetaTags()
{
    if (pipeIn == null)
    {
        GetReader(); // spawn parsing thread
    }

    bool done;
    do
    {
        lock (this)
        {
            done = titleComplete || pipeInStream.Full();
            if (!done)
            {
                // Releases the lock while waiting; wakes on PulseAll or after the timeout.
                System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
            }
        }
    }
    while (!done);

    return metaTags;
}
/// <summary>
/// Returns a short summary of the document. Calls GetReader() first if no
/// reader exists yet, then blocks, re-checking at least every 10 ms, until
/// the summary buffer holds SUMMARY_LENGTH characters or
/// pipeInStream.Full() returns true.
/// </summary>
/// <returns>
/// The title when the trimmed summary is empty or begins with the title;
/// otherwise the trimmed summary, truncated to SUMMARY_LENGTH characters
/// before trimming.
/// </returns>
public virtual System.String GetSummary()
{
    if (pipeIn == null)
        GetReader(); // spawn parsing thread
    while (true)
    {
        lock (this)
        {
            if (summary.Length >= SUMMARY_LENGTH || pipeInStream.Full())
                break;
            System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
        }
    }
    if (summary.Length > SUMMARY_LENGTH)
        summary.Length = SUMMARY_LENGTH;
    System.String sum = summary.ToString().Trim();
    System.String tit = GetTitle();
    // Ordinal comparison: the prefix test must be an exact character match.
    // The single-argument StartsWith overload is culture-sensitive and can
    // give different answers depending on the current culture.
    // NOTE(review): an empty title is a prefix of every string, so a document
    // without a title yields an empty result here; confirm that is intended.
    if (sum.Length == 0 || sum.StartsWith(tit, System.StringComparison.Ordinal))
        return tit;
    return sum;
}
/// <summary>
/// Returns the reader for the parsed text. On the first call it builds the
/// reader/writer objects over a shared MemoryStream, starts a ParserThread
/// for this parser, and caches the reader; later calls return the cached
/// reader unchanged.
/// </summary>
/// <returns>The pipeIn reader.</returns>
public virtual System.IO.StreamReader GetReader()
{
    if (pipeIn != null)
    {
        return pipeIn;
    }

    // Backing store: a reader over a 1024-byte-capacity MemoryStream.
    System.IO.StreamReader backing = new System.IO.StreamReader(new System.IO.MemoryStream(1024));
    pipeInStream = new MyPipedInputStream(this, backing);
    pipeOutStream = new System.IO.StreamWriter(pipeInStream.BaseStream);

    // Both ends use the default encoding.
    System.Text.Encoding encoding = System.Text.Encoding.Default;
    pipeIn = new System.IO.StreamReader(pipeInStream.BaseStream, encoding);
    pipeOut = new System.IO.StreamWriter(pipeOutStream.BaseStream, encoding);

    SupportClass.ThreadClass worker = new ParserThread(this);
    worker.Start(); // start parsing
    return pipeIn;
}
/// <summary>
/// Appends text to the summary buffer unless the buffer already holds
/// SUMMARY_LENGTH characters. When the append brings the buffer to
/// SUMMARY_LENGTH or more, sets summaryComplete and wakes every thread
/// waiting on this parser's monitor.
/// </summary>
/// <param name="text">Text to append.</param>
internal virtual void AddToSummary(System.String text)
{
    if (summary.Length >= SUMMARY_LENGTH)
    {
        return; // summary already full
    }

    summary.Append(text);

    if (summary.Length < SUMMARY_LENGTH)
    {
        return; // still room; nothing to announce yet
    }

    lock (this)
    {
        summaryComplete = true;
        System.Threading.Monitor.PulseAll(this);
    }
}
/// <summary>
/// Records a piece of document text. Text is ignored while inStyle is set.
/// While inTitle is set the text is appended to the title; otherwise it is
/// added to the summary and, if a title has been collected and not yet
/// announced, titleComplete is set and waiting threads are woken. The text
/// is then counted in length and written to pipeOut, and afterSpace is
/// cleared.
/// </summary>
/// <param name="text">Text to record.</param>
internal virtual void AddText(System.String text)
{
    if (inStyle)
    {
        return;
    }

    if (inTitle)
    {
        title.Append(text);
    }
    else
    {
        AddToSummary(text);
        // title is a StringBuilder, so Equals("") is not a dependable
        // emptiness test: where it binds to Object.Equals it is a reference
        // comparison that is never true. Check the length instead so the
        // title is only announced once something has been collected.
        if (!titleComplete && title.Length != 0)
        {
            // finished title
            lock (this)
            {
                titleComplete = true; // tell waiting threads
                System.Threading.Monitor.PulseAll(this);
            }
        }
    }

    length += text.Length;
    pipeOut.Write(text);
    afterSpace = false;
}
/// <summary>
/// Stores currentMetaContent in metaTags under the key currentMetaTag,
/// replacing any existing value for that key, then resets both fields to null.
/// </summary>
internal virtual void AddMetaTag()
{
    metaTags.Set(currentMetaTag, currentMetaContent);

    // Clear the pending pair now that it has been stored.
    currentMetaContent = null;
    currentMetaTag = null;
}
/// <summary>
/// Records a single whitespace separator unless one was just recorded.
/// A space is appended to the title (while inTitle is set) or to the
/// summary; the output pipe receives eol when afterTag is set and a single
/// space otherwise. The written separator is counted in length and
/// afterSpace is set.
/// </summary>
internal virtual void AddSpace()
{
    if (afterSpace)
    {
        return; // collapse consecutive spaces
    }

    if (inTitle)
    {
        title.Append(" ");
    }
    else
    {
        AddToSummary(" ");
    }

    System.String separator;
    if (afterTag)
    {
        separator = eol;
    }
    else
    {
        separator = " ";
    }

    length += separator.Length;
    pipeOut.Write(separator);
    afterSpace = true;
}
/// <summary>
/// Top-level parse loop. Repeatedly looks at the kind of the next token and
/// dispatches: tags, declarations, comments and scripts go to their own
/// methods and set afterTag; words, entities and punctuation are passed to
/// AddText and spaces to AddSpace, each clearing afterTag. The loop ends at
/// the first token kind that is not in the accepted list, after which token
/// kind 0 is consumed.
/// </summary>
/// <exception cref="ParseException">
/// Thrown from the default branch of the dispatch switch when the token kind
/// matches none of its cases.
/// </exception>
public void HTMLDocument()
{
Token t;
while (true)
{
// First switch: decide whether to keep looping. jj_ntk == -1 means no
// token kind is cached, in which case Jj_ntk() is called to obtain one.
switch ((jj_ntk == - 1) ? Jj_ntk() : jj_ntk)
{
case Lucene.Net.Demo.Html.HTMLParserConstants.ScriptStart:
case Lucene.Net.Demo.Html.HTMLParserConstants.TagName:
case Lucene.Net.Demo.Html.HTMLParserConstants.DeclName:
case Lucene.Net.Demo.Html.HTMLParserConstants.Comment1:
case Lucene.Net.Demo.Html.HTMLParserConstants.Comment2:
case Lucene.Net.Demo.Html.HTMLParserConstants.Word:
case Lucene.Net.Demo.Html.HTMLParserConstants.Entity:
case Lucene.Net.Demo.Html.HTMLParserConstants.Space:
case Lucene.Net.Demo.Html.HTMLParserConstants.Punct:
;
break;
default:
// Any other kind ends the loop; jj_gen is recorded in slot 0 of jj_la1.
jj_la1[0] = jj_gen;
goto label_1_brk;
}
// Second switch: handle one element according to the next token kind.
switch ((jj_ntk == - 1) ? Jj_ntk() : jj_ntk)
{
case Lucene.Net.Demo.Html.HTMLParserConstants.TagName:
Tag();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.DeclName:
t = Decl();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.Comment1:
case Lucene.Net.Demo.Html.HTMLParserConstants.Comment2:
CommentTag();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.ScriptStart:
ScriptTag();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.Word:
t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Word);
AddText(t.image); afterTag = false;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.Entity:
t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Entity);
// Entities are decoded before being recorded as text.
AddText(Entities.Decode(t.image)); afterTag = false;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.Punct:
t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Punct);
AddText(t.image); afterTag = false;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants.Space:
Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.Space);
// AddSpace reads afterTag, so the flag is cleared only after the call.
AddSpace(); afterTag = false;
break;
default:
jj_la1[1] = jj_gen;
Jj_consume_token(- 1);
throw new ParseException();
}
}
label_1_brk: ;
// NOTE(review): token kind 0 is presumably end-of-file - confirm against HTMLParserConstants.
Jj_consume_token(0);
}
public void Tag()
{
Token t1, t2;
bool inImg = false;
t1 = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants.TagName);
System.String tagName = t1.image.ToLower();
if (Tags.WS_ELEMS.Contains(tagName))
{
AddSpace();
}
inTitle = tagName.ToUpper().Equals("
inMetaTag = tagName.ToUpper().Equals("
inStyle = tagName.ToUpper().Equals("