/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; namespace Lucene.Net.Analysis { /// Loader for text files that represent a list of stopwords. /// /// /// Gerhard Schwarz /// /// $Id: WordlistLoader.java 192989 2005-06-22 19:59:03Z dnaber $ /// public class WordlistLoader { /// Loads a text file and adds every line as an entry to a HashSet (omitting /// leading and trailing whitespace). Every line of the file should contain only /// one word. The words need to be in lowercase if you make use of an /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer). /// /// /// File containing the wordlist /// /// A HashSet with the file's words /// public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile) { System.Collections.Hashtable result = new System.Collections.Hashtable(); System.IO.TextReader reader = null; try { reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default); result = GetWordSet(reader); } finally { if (reader != null) reader.Close(); } return result; } /// Reads lines from a Reader and adds every line as an entry to a HashSet (omitting /// leading and trailing whitespace). Every line of the Reader should contain only /// one word. The words need to be in lowercase if you make use of an /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer). /// /// /// Reader containing the wordlist /// /// A HashSet with the reader's words /// public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader) { System.Collections.Hashtable result = new System.Collections.Hashtable(); System.IO.TextReader br = null; try { br = (System.IO.TextReader) reader; System.String word = null; while ((word = br.ReadLine()) != null) { System.String tmp = word.Trim(); result.Add(tmp, tmp); } } finally { if (br != null) br.Close(); } return result; } /// Reads a stem dictionary. Each line contains: ///
word\tstem
/// (i.e. two tab seperated words) /// ///
/// stem dictionary that overrules the stemming algorithm /// /// IOException public static System.Collections.Hashtable GetStemDict(System.IO.FileInfo wordstemfile) { if (wordstemfile == null) throw new System.NullReferenceException("wordstemfile may not be null"); System.Collections.Hashtable result = new System.Collections.Hashtable(); System.IO.StreamReader br = null; System.IO.StreamReader fr = null; try { fr = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default); br = new System.IO.StreamReader(fr.BaseStream, fr.CurrentEncoding); System.String line; char[] tab = {'\t'}; while ((line = br.ReadLine()) != null) { System.String[] wordstem = line.Split(tab, 2); result[wordstem[0]] = wordstem[1]; } } finally { if (fr != null) fr.Close(); if (br != null) br.Close(); } return result; } } }