/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System.Collections.Generic;

namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Loader for text files that represent a list of stopwords.
    /// </summary>
    public class WordlistLoader
    {
        /// <summary>
        /// Loads a text file and adds every line as an entry to a set (omitting
        /// leading and trailing whitespace). Every line of the file should contain only
        /// one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// </summary>
        /// <remarks>
        /// The file is opened with <see cref="System.Text.Encoding.Default"/> and is
        /// closed before this method returns.
        /// </remarks>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <returns>A set with the file's words</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile)
        {
            using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
            {
                return GetWordSet(reader);
            }
        }

        /// <summary>
        /// Loads a text file and adds every non-comment line as an entry to a set (omitting
        /// leading and trailing whitespace). Every line of the file should contain only
        /// one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// </summary>
        /// <remarks>
        /// The file is opened with <see cref="System.Text.Encoding.Default"/> and is
        /// closed before this method returns.
        /// </remarks>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <param name="comment">The comment string to ignore</param>
        /// <returns>A set with the file's words</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, string comment)
        {
            using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
            {
                return GetWordSet(reader, comment);
            }
        }

        /// <summary>
        /// Reads lines from a reader and adds every line as an entry to a set (omitting
        /// leading and trailing whitespace). Every line of the reader should contain only
        /// one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// </summary>
        /// <remarks>
        /// The reader is consumed to its end but is not closed; the caller owns it.
        /// </remarks>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <returns>A set with the reader's words</returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader)
        {
            var result = Support.Compatibility.SetFactory.CreateHashSet<string>();
            string word;
            while ((word = reader.ReadLine()) != null)
            {
                result.Add(word.Trim());
            }
            return result;
        }

        /// <summary>
        /// Reads lines from a reader and adds every non-comment line as an entry to a set (omitting
        /// leading and trailing whitespace). Every line of the reader should contain only
        /// one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// </summary>
        /// <remarks>
        /// A line counts as a comment only when it begins with <paramref name="comment"/>
        /// at its very first character; the check runs before the line is trimmed, so an
        /// indented comment line is added as a word. The reader is consumed to its end
        /// but is not closed; the caller owns it.
        /// </remarks>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <param name="comment">The string representing a comment.</param>
        /// <returns>A set with the reader's words</returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader, string comment)
        {
            var result = Support.Compatibility.SetFactory.CreateHashSet<string>();
            string word;
            while ((word = reader.ReadLine()) != null)
            {
                // Ordinal comparison: the comment marker is a literal prefix, not
                // linguistic text, so the match must not depend on the current culture.
                if (!word.StartsWith(comment, System.StringComparison.Ordinal))
                {
                    result.Add(word.Trim());
                }
            }
            return result;
        }

        /// <summary>
        /// Reads a stem dictionary. Each line contains:
        /// <c>word\tstem</c>
        /// (i.e. two tab separated words). Empty lines are skipped. When a word occurs
        /// more than once, the stem from the last occurrence is kept.
        /// </summary>
        /// <param name="wordstemfile">File containing the tab separated word/stem pairs</param>
        /// <returns>Stem dictionary that overrules the stemming algorithm</returns>
        /// <exception cref="System.NullReferenceException">
        /// If <paramref name="wordstemfile"/> is null.
        /// </exception>
        /// <exception cref="System.IO.IOException">If the file cannot be opened or read.</exception>
        /// <exception cref="System.IndexOutOfRangeException">
        /// If a non-empty line contains no tab character.
        /// </exception>
        public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile)
        {
            // NullReferenceException (rather than ArgumentNullException) is kept on purpose
            // so that existing callers catching it keep working.
            if (wordstemfile == null)
            {
                throw new System.NullReferenceException("wordstemfile may not be null");
            }

            var result = new Dictionary<string, string>();
            char[] tab = { '\t' };

            // A single reader owns the file stream; the using block closes it on every path.
            using (var reader = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // A blank line carries neither word nor stem; skip it instead of failing.
                    if (line.Length == 0)
                    {
                        continue;
                    }

                    // Split at the first tab only, so the stem may itself contain tabs.
                    string[] wordstem = line.Split(tab, 2);
                    result[wordstem[0]] = wordstem[1];
                }
            }
            return result;
        }
    }
}