/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
namespace Lucene.Net.Analysis
{
/// <summary>
/// Loader for text files that represent a list of stopwords.
/// </summary>
/// <remarks>
/// Version: $Id: WordlistLoader.java 706342 2008-10-20 17:19:29Z gsingers $
/// </remarks>
public class WordlistLoader
{
    /// <summary>
    /// Loads a text file and adds every line as an entry to a Hashtable (omitting
    /// leading and trailing whitespace). Every line of the file should contain only
    /// one word. The words need to be in lowercase if you make use of an
    /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    /// </summary>
    /// <remarks>
    /// The file is decoded with System.Text.Encoding.Default and is closed before
    /// this method returns, including when reading fails.
    /// </remarks>
    /// <param name="wordfile">File containing the wordlist</param>
    /// <returns>A Hashtable with the file's words</returns>
    public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile)
    {
        using (System.IO.StreamReader reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
        {
            return GetWordSet(reader);
        }
    }

    /// <summary>
    /// Loads a text file and adds every non-comment line as an entry to a Hashtable (omitting
    /// leading and trailing whitespace). Every line of the file should contain only
    /// one word. The words need to be in lowercase if you make use of an
    /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    /// </summary>
    /// <remarks>
    /// The file is decoded with System.Text.Encoding.Default and is closed before
    /// this method returns, including when reading fails.
    /// </remarks>
    /// <param name="wordfile">File containing the wordlist</param>
    /// <param name="comment">The comment string to ignore</param>
    /// <returns>A Hashtable with the file's words</returns>
    public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile, System.String comment)
    {
        using (System.IO.StreamReader reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
        {
            return GetWordSet(reader, comment);
        }
    }

    /// <summary>
    /// Reads lines from a Reader and adds every line as an entry to a Hashtable (omitting
    /// leading and trailing whitespace). Every line of the Reader should contain only
    /// one word. The words need to be in lowercase if you make use of an
    /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    /// </summary>
    /// <remarks>
    /// The reader is read to its end but is not closed here; closing it is left to
    /// the caller. Entries are stored through SupportClass.CollectionsHelper.Add.
    /// </remarks>
    /// <param name="reader">Reader containing the wordlist</param>
    /// <returns>A Hashtable with the reader's words</returns>
    public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader)
    {
        System.Collections.Hashtable result = new System.Collections.Hashtable();
        System.String word;
        while ((word = reader.ReadLine()) != null)
        {
            SupportClass.CollectionsHelper.Add(result, word.Trim());
        }
        return result;
    }

    /// <summary>
    /// Reads lines from a Reader and adds every non-comment line as an entry to a Hashtable (omitting
    /// leading and trailing whitespace). Every line of the Reader should contain only
    /// one word. The words need to be in lowercase if you make use of an
    /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
    /// </summary>
    /// <remarks>
    /// A line counts as a comment when it starts with <paramref name="comment"/>.
    /// The test is made on the raw line, before trimming, so a comment marker that
    /// is preceded by whitespace does not mark the line as a comment.
    /// The reader is read to its end but is not closed here; closing it is left to
    /// the caller. Entries are stored through SupportClass.CollectionsHelper.Add.
    /// </remarks>
    /// <param name="reader">Reader containing the wordlist</param>
    /// <param name="comment">The string representing a comment.</param>
    /// <returns>A Hashtable with the reader's words</returns>
    public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader, System.String comment)
    {
        System.Collections.Hashtable result = new System.Collections.Hashtable();
        System.String word;
        while ((word = reader.ReadLine()) != null)
        {
            // Ordinal comparison: the comment marker is a literal prefix, so the
            // outcome must not depend on the current culture's collation rules.
            if (!word.StartsWith(comment, System.StringComparison.Ordinal))
            {
                SupportClass.CollectionsHelper.Add(result, word.Trim());
            }
        }
        return result;
    }

    /// <summary>
    /// Reads a stem dictionary. Each line contains:
    /// <c>word\tstem</c>
    /// (i.e. two tab separated words)
    /// </summary>
    /// <remarks>
    /// The file is decoded with System.Text.Encoding.Default and is closed before
    /// this method returns. Each line is split at its first tab only, so the stem
    /// part may itself contain tabs. Lines without a tab (for example blank lines)
    /// are skipped. When a word occurs more than once, the last stem read wins.
    /// </remarks>
    /// <param name="wordstemfile">File containing the stem dictionary</param>
    /// <returns>stem dictionary that overrules the stemming algorithm</returns>
    /// <exception cref="System.NullReferenceException">
    /// <paramref name="wordstemfile"/> is null.
    /// </exception>
    /// <exception cref="System.IO.IOException">
    /// May be raised by the underlying reader when the file cannot be opened or read.
    /// </exception>
    public static System.Collections.Hashtable GetStemDict(System.IO.FileInfo wordstemfile)
    {
        // The exception type is kept as NullReferenceException (rather than
        // ArgumentNullException) so callers that already catch it keep working.
        if (wordstemfile == null)
            throw new System.NullReferenceException("wordstemfile may not be null");

        System.Collections.Hashtable result = new System.Collections.Hashtable();
        char[] tab = {'\t'};
        using (System.IO.StreamReader reader = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
        {
            System.String line;
            while ((line = reader.ReadLine()) != null)
            {
                // Split at the first tab only: [0] is the word, [1] is the stem.
                System.String[] wordstem = line.Split(tab, 2);
                if (wordstem.Length < 2)
                {
                    // No tab on this line, so there is no stem to record.
                    continue;
                }
                result[wordstem[0]] = wordstem[1];
            }
        }
        return result;
    }
}
}