/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Generated By:JavaCC: Do not edit this line. HTMLParser.java */
using System;
namespace Lucene.Net.Demo.Html
{
public class HTMLParser : HTMLParserConstants_Fields
{
/// <summary>
/// Creates the objects held in <c>jj_2_rtns</c> and <c>jj_ls</c>:
/// a two-element <c>JJCalls</c> array and a <c>LookaheadSuccess</c> instance.
/// </summary>
private void InitBlock()
{
    // Number of JJCalls slots allocated for jj_2_rtns.
    const int slotCount = 2;

    jj_2_rtns = new JJCalls[slotCount];
    jj_ls = new LookaheadSuccess();
}
// Character budget for the summary: AddToSummary stops appending once the summary
// has reached this length and GetSummary truncates to it. Also used to size the
// title and summary builders below.
public static int SUMMARY_LENGTH = 200;
// Title text; AddText and AddSpace append here while inTitle is set.
internal System.Text.StringBuilder title = new System.Text.StringBuilder(SUMMARY_LENGTH);
// Summary text collected through AddToSummary (initial capacity is twice SUMMARY_LENGTH).
internal System.Text.StringBuilder summary = new System.Text.StringBuilder(SUMMARY_LENGTH * 2);
// Meta tag name/content pairs stored by AddMetaTag and returned by GetMetaTags.
internal System.Collections.Specialized.NameValueCollection metaTags = new System.Collections.Specialized.NameValueCollection();
// Name and content of the meta tag being assembled; AddMetaTag stores the pair
// in metaTags and resets both fields to null.
internal System.String currentMetaTag = null;
internal System.String currentMetaContent = null;
// Running total of the characters AddText and AddSpace have written to pipeOut.
internal int length = 0;
// Set by AddText (under lock, followed by PulseAll) the first time non-title text
// arrives after the title has content; GetTitle and GetMetaTags wait on it.
internal bool titleComplete = false;
// Set by AddToSummary once the summary reaches SUMMARY_LENGTH.
// Not read anywhere in the visible part of this file.
internal bool summaryComplete = false;
// Assigned in Tag() from the tag name. inTitle routes text to the title instead of
// the summary; inStyle makes AddText discard text.
internal bool inTitle = false;
internal bool inMetaTag = false;
internal bool inStyle = false;
// Set to true by HTMLDocument after a tag, declaration, comment or script element
// and to false after a word, entity, punctuation or space token. AddSpace emits
// eol instead of a single blank when it is set.
internal bool afterTag = false;
// True right after AddSpace has emitted a separator, false after AddText;
// AddSpace does nothing while it is set, so separators are not doubled.
internal bool afterSpace = false;
// Line terminator AddSpace writes when the separator follows a tag.
internal System.String eol = System.Environment.NewLine;
// Reader handed out by GetReader; stays null until GetReader has run once.
internal System.IO.StreamReader pipeIn = null;
// Writer that AddText and AddSpace send the extracted text to; created in GetReader.
internal System.IO.StreamWriter pipeOut;
// In-memory buffer shared by pipeIn and pipeOut; created in GetReader.
private MyPipedInputStream pipeInStream = null;
// Writer created in GetReader over pipeInStream; in the visible code only its
// BaseStream is used (to construct pipeOut).
private System.IO.StreamWriter pipeOutStream = null;
/// <summary>
/// A <see cref="System.IO.MemoryStream"/> that keeps separate read and write
/// offsets. Each Read or Write repositions the underlying stream to its own
/// offset first, under a lock on the instance, so written bytes are appended
/// and reads continue where the previous read stopped.
/// </summary>
private class MyPipedInputStream : System.IO.MemoryStream
{
    // Offset of the next byte Read will return.
    private long _consumed = 0;

    // Offset at which the next Write will store its data.
    private long _produced = 0;

    /// <summary>Gets this stream itself.</summary>
    public System.IO.Stream BaseStream
    {
        get { return this; }
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> bytes starting at the current read offset.
    /// </summary>
    /// <returns>The number of bytes actually read; the read offset advances by that amount.</returns>
    public override int Read(byte[] buffer, int offset, int count)
    {
        lock (this)
        {
            base.Position = _consumed;
            int bytesRead = base.Read(buffer, offset, count);
            _consumed += bytesRead;
            return bytesRead;
        }
    }

    /// <summary>
    /// Stores <paramref name="count"/> bytes at the current write offset and
    /// advances the write offset by <paramref name="count"/>.
    /// </summary>
    public override void Write(byte[] buffer, int offset, int count)
    {
        lock (this)
        {
            base.Position = _produced;
            base.Write(buffer, offset, count);
            _produced += count;
        }
    }

    /// <summary>
    /// Does nothing, so a call to Close leaves the buffered data in place
    /// (presumably deliberate so the other end can keep using it; confirm).
    /// </summary>
    public override void Close()
    {
    }

    /// <summary>Always reports <c>false</c>.</summary>
    public virtual bool Full()
    {
        return false;
    }
}
/// <summary>
/// Creates a parser over the given file, which is opened for reading and passed
/// to the stream-accepting constructor. Prefer calling that constructor directly
/// with an already opened stream.
/// </summary>
/// <param name="file">The file whose contents are parsed.</param>
public HTMLParser(System.IO.FileInfo file) : this(file.OpenRead())
{
}
/// <summary>
/// Returns the collected title text with leading and trailing whitespace removed.
/// </summary>
/// <remarks>
/// Calls <c>GetReader()</c> first when no reader exists yet, then polls in 10 ms
/// waits on this instance's monitor until <c>titleComplete</c> is set or the pipe
/// reports that it is full.
/// </remarks>
/// <returns>The trimmed title.</returns>
public virtual System.String GetTitle()
{
    if (pipeIn == null)
    {
        GetReader(); // spawn parsing thread
    }

    bool ready = false;
    while (!ready)
    {
        // The lock is taken and released on every pass, as Monitor.Wait requires ownership.
        lock (this)
        {
            ready = titleComplete || pipeInStream.Full();
            if (!ready)
            {
                System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
            }
        }
    }

    return title.ToString().Trim();
}
/// <summary>
/// Returns the collection of meta tags recorded so far.
/// </summary>
/// <remarks>
/// Calls <c>GetReader()</c> first when no reader exists yet. The wait condition is
/// the same one <c>GetTitle()</c> uses: the method blocks, in 10 ms waits on this
/// instance's monitor, until <c>titleComplete</c> is set or the pipe reports that
/// it is full.
/// </remarks>
/// <returns>The live <c>metaTags</c> collection (not a copy).</returns>
public virtual System.Collections.Specialized.NameValueCollection GetMetaTags()
{
    if (pipeIn == null)
    {
        GetReader(); // spawn parsing thread
    }

    bool stillWaiting;
    do
    {
        lock (this)
        {
            // Full() is only consulted while the title is not yet complete.
            stillWaiting = !titleComplete && !pipeInStream.Full();
            if (stillWaiting)
            {
                System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
            }
        }
    }
    while (stillWaiting);

    return metaTags;
}
/// <summary>
/// Returns a short summary of the document: the first <c>SUMMARY_LENGTH</c>
/// characters of collected body text, trimmed, or the title when the summary is
/// empty or begins with the title.
/// </summary>
/// <remarks>
/// Calls <c>GetReader()</c> first when no reader exists yet, then polls in 10 ms
/// waits on this instance's monitor until the summary holds at least
/// <c>SUMMARY_LENGTH</c> characters or the pipe reports that it is full.
/// </remarks>
/// <returns>The trimmed summary, or the title as described above.</returns>
public virtual System.String GetSummary()
{
    if (pipeIn == null)
    {
        GetReader(); // spawn parsing thread
    }

    while (true)
    {
        lock (this)
        {
            if (summary.Length >= SUMMARY_LENGTH || pipeInStream.Full())
            {
                break;
            }
            System.Threading.Monitor.Wait(this, TimeSpan.FromMilliseconds(10));
        }
    }

    // The last append may have overshot the budget; cut back to exactly SUMMARY_LENGTH.
    if (summary.Length > SUMMARY_LENGTH)
    {
        summary.Length = SUMMARY_LENGTH;
    }

    System.String sum = summary.ToString().Trim();
    System.String tit = GetTitle();

    // Compare character by character (ordinal). The culture-sensitive
    // String.StartsWith(string) skips characters the current culture ignores,
    // so a title made only of such characters (for example a decoded soft
    // hyphen) would count as a prefix of any summary and hide it.
    // NOTE(review): an empty title is still a prefix of every summary, so a
    // document without a title yields an empty result here even when summary
    // text exists - unchanged from the previous behaviour; confirm it is intended.
    bool startsWithTitle = sum.Length >= tit.Length
        && System.String.CompareOrdinal(sum, 0, tit, 0, tit.Length) == 0;

    if (startsWithTitle || sum.Length == 0)
    {
        return tit;
    }
    return sum;
}
/// <summary>
/// Returns the reader from which the extracted text can be read, creating it
/// on the first call.
/// </summary>
/// <remarks>
/// The first call creates the shared in-memory buffer, a UTF-16BE reader and
/// writer over it, and starts a <c>ParserThread</c> for this parser. Later calls
/// return the same reader.
/// </remarks>
/// <returns>The reader stored in <c>pipeIn</c>.</returns>
public virtual System.IO.StreamReader GetReader()
{
    if (pipeIn != null)
    {
        return pipeIn;
    }

    pipeInStream = new MyPipedInputStream();
    System.IO.Stream sharedBuffer = pipeInStream.BaseStream;

    pipeOutStream = new System.IO.StreamWriter(sharedBuffer);
    pipeIn = new System.IO.StreamReader(
        sharedBuffer,
        System.Text.Encoding.GetEncoding("UTF-16BE"));
    pipeOut = new System.IO.StreamWriter(
        pipeOutStream.BaseStream,
        System.Text.Encoding.GetEncoding("UTF-16BE"));

    Support.ThreadClass parserThread = new ParserThread(this);
    parserThread.Start(); // start parsing

    return pipeIn;
}
/// <summary>
/// Appends <paramref name="text"/> to the summary unless the summary already
/// holds <c>SUMMARY_LENGTH</c> characters or more.
/// </summary>
/// <remarks>
/// When the append brings the summary up to <c>SUMMARY_LENGTH</c>,
/// <c>summaryComplete</c> is set and all threads waiting on this instance's
/// monitor are pulsed.
/// </remarks>
/// <param name="text">The text to append.</param>
internal virtual void AddToSummary(System.String text)
{
    if (summary.Length >= SUMMARY_LENGTH)
    {
        return; // budget already used up
    }

    summary.Append(text);

    if (summary.Length < SUMMARY_LENGTH)
    {
        return; // still room for more
    }

    lock (this)
    {
        summaryComplete = true;
        System.Threading.Monitor.PulseAll(this);
    }
}
/// <summary>
/// Routes a piece of document text to the title or the summary and writes it
/// to the output pipe.
/// </summary>
/// <remarks>
/// Text is dropped entirely while <c>inStyle</c> is set. While <c>inTitle</c> is
/// set the text goes to the title; otherwise it goes to the summary and, if the
/// title has content and was not yet marked complete, <c>titleComplete</c> is
/// set and waiting threads are pulsed.
/// </remarks>
/// <param name="text">The text to add.</param>
internal virtual void AddText(System.String text)
{
    if (inStyle)
    {
        return;
    }

    if (!inTitle)
    {
        AddToSummary(text);

        // The first body text after a non-empty title marks the title as finished.
        bool titleJustFinished = !titleComplete && title.Length != 0;
        if (titleJustFinished)
        {
            lock (this)
            {
                titleComplete = true; // tell waiting threads
                System.Threading.Monitor.PulseAll(this);
            }
        }
    }
    else
    {
        title.Append(text);
    }

    length += text.Length;
    pipeOut.Write(text);
    afterSpace = false;
}
/// <summary>
/// Stores the pending meta tag (<c>currentMetaTag</c> with value
/// <c>currentMetaContent</c>) in <c>metaTags</c>, replacing any value already
/// stored under that name, then clears both pending fields.
/// </summary>
internal virtual void AddMetaTag()
{
    metaTags.Set(currentMetaTag, currentMetaContent);

    currentMetaTag = null;
    currentMetaContent = null;
}
/// <summary>
/// Emits a single separator unless one was emitted immediately before.
/// </summary>
/// <remarks>
/// A blank is appended to the title (while <c>inTitle</c> is set) or to the
/// summary. The output pipe receives <c>eol</c> when the separator follows a
/// tag and a single blank otherwise; <c>length</c> grows by what was written.
/// </remarks>
internal virtual void AddSpace()
{
    if (afterSpace)
    {
        return; // never emit two separators in a row
    }

    if (!inTitle)
    {
        AddToSummary(" ");
    }
    else
    {
        title.Append(" ");
    }

    System.String separator;
    if (afterTag)
    {
        separator = eol;
    }
    else
    {
        separator = " ";
    }

    length += separator.Length;
    pipeOut.Write(separator);
    afterSpace = true;
}
/// <summary>
/// Parses a whole document: repeatedly looks at the kind of the next token and
/// either delegates to Tag(), Decl(), CommentTag() or ScriptTag(), or consumes a
/// Word, Entity, Punct or Space token and forwards it to AddText()/AddSpace().
/// After each element <c>afterTag</c> records whether the element was a
/// tag-like construct (true) or text/space (false).
/// </summary>
/// <remarks>
/// Throws ParseException when the second dispatch sees a token kind it does not
/// handle. Called methods may throw as well; their bodies are not all visible here.
/// </remarks>
public void HTMLDocument()
{
Token t;
while (true)
{
// Loop guard: jj_ntk holds the next token kind, or -1 when it still has to be
// fetched through Jj_ntk(). Keep looping only for kinds that start an element.
switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
{
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptStart:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagName:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.DeclName:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment1:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment2:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Word:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Entity:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Space:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Punct:
;
break;
default:
// Any other kind ends the element loop; jj_gen is recorded in jj_la1[0] first.
jj_la1[0] = jj_gen;
goto label_1_brk;
}
// Dispatch on the same token kind to the handler for that element.
switch ((jj_ntk == - 1)?Jj_ntk():jj_ntk)
{
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagName:
Tag();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.DeclName:
// The token returned by Decl() is stored in t but not used afterwards.
t = Decl();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment1:
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Comment2:
CommentTag();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.ScriptStart:
ScriptTag();
afterTag = true;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Word:
// Plain word: its image is added as text unchanged.
t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Word);
AddText(t.image); afterTag = false;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Entity:
// Entity: the image is passed through Entities.Decode before being added as text.
t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Entity);
AddText(Entities.Decode(t.image)); afterTag = false;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Punct:
t = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Punct);
AddText(t.image); afterTag = false;
break;
case Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Space:
// Whitespace token: the token itself is discarded, AddSpace() emits the separator.
Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.Space);
AddSpace(); afterTag = false;
break;
default:
// Unhandled kind: record jj_gen in jj_la1[1] and fail. Jj_consume_token(-1)
// presumably raises the parse error itself, with the explicit throw as a
// fallback - confirm against Jj_consume_token.
jj_la1[1] = jj_gen;
Jj_consume_token(- 1);
throw new ParseException();
}
}
label_1_brk: ;
// Token kind 0 is presumably the end-of-input token - confirm against the constants.
Jj_consume_token(0);
}
public void Tag()
{
Token t1, t2;
bool inImg = false;
t1 = Jj_consume_token(Lucene.Net.Demo.Html.HTMLParserConstants_Fields.TagName);
System.String tagName = t1.image.ToLower();
if (Tags.WS_ELEMS.Contains(tagName))
{
AddSpace();
}
inTitle = tagName.ToUpper().Equals("
inMetaTag = tagName.ToUpper().Equals("
inStyle = tagName.ToUpper().Equals("