/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace Lucene.Net.Analysis.Hunspell {
    /// <summary>
    /// In-memory representation of a hunspell affix file and one or more hunspell dictionary
    /// files. Affix rules are indexed by their append string and words by their stem, so that
    /// both can be looked up by exact string match.
    /// </summary>
    public class HunspellDictionary {
        // Shared word form used for every dictionary entry that carries no flags.
        private static readonly HunspellWord NoFlags = new HunspellWord();

        // Directive keywords recognised at the start of an affix file line.
        private static readonly String PREFIX_KEY = "PFX";
        private static readonly String SUFFIX_KEY = "SFX";
        private static readonly String FLAG_KEY = "FLAG";
        private static readonly String AF_KEY = "AF";

        // Values accepted after the FLAG directive.
        private static readonly String NUM_FLAG_TYPE = "num";
        private static readonly String UTF8_FLAG_TYPE = "UTF-8";
        private static readonly String LONG_FLAG_TYPE = "long";

        // Format strings that anchor a rule condition to the start (prefix) or end (suffix) of a word.
        private static readonly String PREFIX_CONDITION_REGEX_PATTERN = @"^{0}";
        private static readonly String SUFFIX_CONDITION_REGEX_PATTERN = @"{0}$";

        // Characters that end the flag field of a dictionary entry.
        private static readonly Char[] FlagTerminators = { ' ', '\t' };

        private readonly Dictionary<String, List<HunspellAffix>> _prefixes = new Dictionary<String, List<HunspellAffix>>();
        private readonly Dictionary<String, List<HunspellAffix>> _suffixes = new Dictionary<String, List<HunspellAffix>>();
        private readonly Dictionary<String, List<HunspellWord>> _words = new Dictionary<String, List<HunspellWord>>();

        // Flag aliases declared with AF directives, keyed by their 1-based position rendered as a string.
        private readonly Dictionary<String, Char[]> _aliases = new Dictionary<String, Char[]>();

        // Default flag parsing strategy; replaced when the affix file contains a FLAG directive.
        private FlagParsingStrategy _flagParsingStrategy = new SimpleFlagParsingStrategy();

        /// <summary>
        /// Creates a new HunspellDictionary containing the information read from the provided
        /// streams to a hunspell affix file and a single hunspell dictionary file.
        /// </summary>
        /// <param name="affix">Stream for reading the hunspell affix file.</param>
        /// <param name="dictionary">Stream for reading the hunspell dictionary file.</param>
        /// <exception cref="IOException">Can be thrown while reading from the streams.</exception>
        /// <exception cref="InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
        public HunspellDictionary(Stream affix, Stream dictionary)
            : this(affix, new[] { dictionary }) {
        }

        /// <summary>
        /// Creates a new HunspellDictionary containing the information read from the provided
        /// streams to a hunspell affix file and any number of hunspell dictionary files.
        /// </summary>
        /// <param name="affix">Stream for reading the hunspell affix file.</param>
        /// <param name="dictionaries">Streams for reading the hunspell dictionary files.</param>
        /// <exception cref="ArgumentNullException">Thrown if <paramref name="affix"/> or <paramref name="dictionaries"/> is null.</exception>
        /// <exception cref="IOException">Can be thrown while reading from the streams.</exception>
        /// <exception cref="InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
        public HunspellDictionary(Stream affix, IEnumerable<Stream> dictionaries) {
            if (affix == null) throw new ArgumentNullException("affix");
            if (dictionaries == null) throw new ArgumentNullException("dictionaries");

            // The SET line of the affix file names the encoding used by the affix file and
            // by every dictionary file.
            var encodingName = ReadDictionaryEncoding(affix);
            var encoding = Encoding.GetEncoding(encodingName);

            ReadAffixFile(affix, encoding);
            foreach (var dictionary in dictionaries)
                ReadDictionaryFile(dictionary, encoding);
        }

        /// <summary>
        /// Looks up the HunspellWords stored under the given word.
        /// </summary>
        /// <param name="word">Word to look up; matched exactly against the dictionary entries.</param>
        /// <returns>The HunspellWords stored for the word, or null if none are found.</returns>
        /// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is null.</exception>
        public IEnumerable<HunspellWord> LookupWord(String word) {
            if (word == null) throw new ArgumentNullException("word");

            List<HunspellWord> list;
            return _words.TryGetValue(word, out list) ? list : null;
        }

        /// <summary>
        /// Looks up HunspellAffix prefixes that have an append that matches the String created
        /// from the given char array, offset and length.
        /// </summary>
        /// <param name="word">Char array to generate the String from.</param>
        /// <param name="offset">Offset in the char array that the String starts at.</param>
        /// <param name="length">Length from the offset that the String is.</param>
        /// <returns>List of HunspellAffix prefixes with an append that matches the String, or null if none are found.</returns>
        /// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is null.</exception>
        public IEnumerable<HunspellAffix> LookupPrefix(char[] word, int offset, int length) {
            if (word == null) throw new ArgumentNullException("word");

            return LookupAffix(_prefixes, word, offset, length);
        }

        /// <summary>
        /// Looks up HunspellAffix suffixes that have an append that matches the String created
        /// from the given char array, offset and length.
        /// </summary>
        /// <param name="word">Char array to generate the String from.</param>
        /// <param name="offset">Offset in the char array that the String starts at.</param>
        /// <param name="length">Length from the offset that the String is.</param>
        /// <returns>List of HunspellAffix suffixes with an append that matches the String, or null if none are found.</returns>
        /// <exception cref="ArgumentNullException">Thrown if <paramref name="word"/> is null.</exception>
        public IEnumerable<HunspellAffix> LookupSuffix(char[] word, int offset, int length) {
            if (word == null) throw new ArgumentNullException("word");

            return LookupAffix(_suffixes, word, offset, length);
        }

        /// <summary>
        /// Builds the lookup key from a slice of a char array and returns the affixes stored
        /// under it in the given map, or null if there are none.
        /// </summary>
        private static IEnumerable<HunspellAffix> LookupAffix(Dictionary<String, List<HunspellAffix>> affixes, char[] word, int offset, int length) {
            var key = new String(word, offset, length);

            List<HunspellAffix> list;
            return affixes.TryGetValue(key, out list) ? list : null;
        }

        /// <summary>
        /// Reads the affix file through the provided Stream, building up the prefix and suffix
        /// maps, the flag aliases and the flag parsing strategy.
        /// </summary>
        /// <param name="affixStream">Stream to read the content of the affix file from.</param>
        /// <param name="encoding">Encoding to decode the content of the file.</param>
        /// <exception cref="IOException">Can be thrown while reading from the Stream.</exception>
        /// <exception cref="InvalidDataException">Thrown if an affix or alias rule is malformed or the file ends in the middle of a rule block.</exception>
        private void ReadAffixFile(Stream affixStream, Encoding encoding) {
            if (affixStream == null) throw new ArgumentNullException("affixStream");
            if (encoding == null) throw new ArgumentNullException("encoding");

            // The constructor has already consumed the stream up to and including the SET line,
            // so reading resumes on the line after it. Disposing the reader also closes the stream.
            using (var reader = new StreamReader(affixStream, encoding)) {
                String line;
                while ((line = reader.ReadLine()) != null) {
                    // Directive keywords are plain ASCII identifiers, so compare ordinally.
                    if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal)) {
                        ParseAffix(_prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
                    } else if (line.StartsWith(SUFFIX_KEY, StringComparison.Ordinal)) {
                        ParseAffix(_suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
                    } else if (line.StartsWith(FLAG_KEY, StringComparison.Ordinal)) {
                        // Assume that the FLAG line comes before any prefix or suffixes.
                        // Store the strategy so it can be used when parsing the dic file.
                        _flagParsingStrategy = GetFlagParsingStrategy(line);
                    } else if (line.StartsWith(AF_KEY, StringComparison.Ordinal)) {
                        // Parse alias flags.
                        ParseAliasFlag(line, reader);
                    }
                }
            }
        }

        /// <summary>
        /// Parses a block of AF (alias flag) rules and stores each parsed flag set in the alias
        /// map under its 1-based position within the block.
        /// </summary>
        /// <param name="line">Header line of the block, of the form <c>AF count</c>.</param>
        /// <param name="reader">TextReader positioned on the first rule line of the block.</param>
        /// <exception cref="InvalidDataException">Thrown if the header or a rule line is malformed, or the file ends before all rules were read.</exception>
        private void ParseAliasFlag(String line, TextReader reader) {
            if (line == null) throw new ArgumentNullException("line");
            if (reader == null) throw new ArgumentNullException("reader");

            var args = Regex.Split(line, "\\s+");
            if (args.Length < 2)
                throw new InvalidDataException("Malformed AF header, rule count is missing: " + line);
            var numLines = Int32.Parse(args[1], CultureInfo.InvariantCulture);

            for (var i = 0; i < numLines; i++) {
                line = reader.ReadLine();
                if (line == null)
                    throw new InvalidDataException("Unexpected end of affix file while reading AF rules.");

                var ruleArgs = Regex.Split(line, "\\s+");
                if (ruleArgs[0] != "AF")
                    throw new InvalidDataException("File corrupted, should be AF directive : " + line);
                if (ruleArgs.Length < 2)
                    throw new InvalidDataException("Malformed AF rule, flags are missing: " + line);

                var appendFlags = _flagParsingStrategy.ParseFlags(ruleArgs[1]);
                _aliases.Add((i + 1).ToString(CultureInfo.InvariantCulture), appendFlags);
            }
        }

        /// <summary>
        /// Parses a specific affix rule block putting the result into the provided affix map.
        /// </summary>
        /// <param name="affixes">Map where the result of the parsing will be put, keyed by the append string of each rule.</param>
        /// <param name="header">Header line of the affix rule block, of the form <c>PFX|SFX flag cross_product count</c>.</param>
        /// <param name="reader">TextReader to read the rule lines of the block from.</param>
        /// <param name="conditionPattern">Format string used to generate the condition regex pattern from a rule's condition.</param>
        /// <exception cref="InvalidDataException">Thrown if the header or a rule line is malformed, an alias is unknown, or the file ends before all rules were read.</exception>
        private void ParseAffix(Dictionary<String, List<HunspellAffix>> affixes, String header, TextReader reader, String conditionPattern) {
            if (affixes == null) throw new ArgumentNullException("affixes");
            if (header == null) throw new ArgumentNullException("header");
            if (reader == null) throw new ArgumentNullException("reader");
            if (conditionPattern == null) throw new ArgumentNullException("conditionPattern");

            var args = Regex.Split(header, "\\s+");
            if (args.Length < 4)
                throw new InvalidDataException("Malformed affix header: " + header);

            var crossProduct = args[2].Equals("Y");
            var numLines = Int32.Parse(args[3], CultureInfo.InvariantCulture);

            for (var i = 0; i < numLines; i++) {
                var line = reader.ReadLine();
                if (line == null)
                    throw new InvalidDataException("Unexpected end of affix file while reading affix rules.");

                var ruleArgs = Regex.Split(line, "\\s+");
                if (ruleArgs.Length < 4)
                    throw new InvalidDataException("Malformed affix rule: " + line);

                var affix = new HunspellAffix();

                affix.Flag = _flagParsingStrategy.ParseFlag(ruleArgs[1]);
                // A strip value of "0" means that nothing is stripped.
                affix.Strip = (ruleArgs[2] == "0") ? "" : ruleArgs[2];

                var affixArg = ruleArgs[3];

                // The append may carry continuation flags after a '/'.
                var flagSep = affixArg.LastIndexOf('/');
                if (flagSep != -1) {
                    var appendFlags = ResolveFlags(affixArg.Substring(flagSep + 1));
                    // NOTE(review): when aliases are in use this sorts the array stored in the
                    // alias map in place, which is shared by every rule and word using that alias.
                    Array.Sort(appendFlags);
                    affix.AppendFlags = appendFlags;
                    affix.Append = affixArg.Substring(0, flagSep);
                } else {
                    affix.Append = affixArg;
                }

                // A rule without a condition column applies unconditionally; "." matches any character.
                var condition = ruleArgs.Length > 4 ? ruleArgs[4] : ".";
                affix.SetCondition(condition, String.Format(conditionPattern, condition));
                affix.IsCrossProduct = crossProduct;

                List<HunspellAffix> list;
                if (!affixes.TryGetValue(affix.Append, out list))
                    affixes.Add(affix.Append, list = new List<HunspellAffix>());

                list.Add(affix);
            }
        }

        /// <summary>
        /// Turns the raw flag field of an affix rule or dictionary entry into flags. When the
        /// affix file declared aliases the field is an alias key, otherwise it is parsed with
        /// the current flag parsing strategy.
        /// </summary>
        /// <param name="rawFlags">Raw flag field.</param>
        /// <returns>The flags denoted by the field.</returns>
        /// <exception cref="InvalidDataException">Thrown if aliases are in use and the field is not a known alias.</exception>
        private Char[] ResolveFlags(String rawFlags) {
            if (_aliases.Count == 0)
                return _flagParsingStrategy.ParseFlags(rawFlags);

            Char[] flags;
            if (!_aliases.TryGetValue(rawFlags, out flags))
                throw new InvalidDataException("Unknown flag alias: '" + rawFlags + "'");
            return flags;
        }

        /// <summary>
        /// Parses the encoding specified in the affix file readable through the provided Stream.
        /// The stream is left positioned just after the line that declared the encoding.
        /// </summary>
        /// <param name="affix">Stream for reading the affix file.</param>
        /// <returns>Encoding specified in the affix file.</returns>
        /// <exception cref="InvalidDataException">
        /// Thrown if the first non-empty non-comment line read from the file does not
        /// adhere to the format <c>SET encoding</c>, or if the file ends before such a line.
        /// </exception>
        private static String ReadDictionaryEncoding(Stream affix) {
            if (affix == null) throw new ArgumentNullException("affix");

            var builder = new StringBuilder();
            for (; ; ) {
                builder.Length = 0;

                // The encoding is not known yet, so read raw bytes and map each one directly
                // to a char; '\r' is dropped and '\n' ends the line.
                int ch;
                while ((ch = affix.ReadByte()) >= 0) {
                    if (ch == '\n') {
                        break;
                    }
                    if (ch != '\r') {
                        builder.Append((char)ch);
                    }
                }

                var line = builder.ToString();

                // Skip empty lines, comment lines and lines containing only whitespace.
                if (line.Length == 0 || line[0] == '#' || line.Trim().Length == 0) {
                    if (ch < 0)
                        throw new InvalidDataException("Unexpected end of affix file.");
                    continue;
                }

                // StartsWith copes with lines shorter than the keyword, which must be reported
                // as invalid data rather than as an out-of-range substring.
                if (line.StartsWith("SET ", StringComparison.Ordinal)) {
                    // cleanup the encoding string, too (whitespace)
                    return line.Substring(4).Trim();
                }

                throw new InvalidDataException("The first non-comment line in the affix file must " +
                                               "be a 'SET charset', was: '" + line + "'");
            }
        }

        /// <summary>
        /// Determines the appropriate <see cref="FlagParsingStrategy"/> based on the FLAG
        /// definition line taken from the affix file.
        /// </summary>
        /// <param name="flagLine">Line containing the flag information.</param>
        /// <returns>FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition.</returns>
        /// <exception cref="ArgumentException">Thrown if the flag type is missing or not one of num, UTF-8 or long.</exception>
        private static FlagParsingStrategy GetFlagParsingStrategy(String flagLine) {
            if (flagLine == null) throw new ArgumentNullException("flagLine");

            // Take whatever follows the keyword and trim it, so that a bare "FLAG" line or
            // surrounding whitespace is reported as an unknown type instead of failing on a
            // fixed substring offset.
            var flagType = flagLine.Substring(Math.Min(FLAG_KEY.Length, flagLine.Length)).Trim();

            if (NUM_FLAG_TYPE.Equals(flagType))
                return new NumFlagParsingStrategy();
            if (UTF8_FLAG_TYPE.Equals(flagType))
                return new SimpleFlagParsingStrategy();
            if (LONG_FLAG_TYPE.Equals(flagType))
                return new DoubleASCIIFlagParsingStrategy();

            throw new ArgumentException("Unknown flag type: " + flagType);
        }

        /// <summary>
        /// Reads the dictionary file through the provided Stream, building up the words map.
        /// </summary>
        /// <param name="dictionary">Stream to read the dictionary file through.</param>
        /// <param name="encoding">Encoding used to decode the contents of the file.</param>
        /// <exception cref="IOException">Can be thrown while reading from the file.</exception>
        /// <exception cref="InvalidDataException">Thrown if the file is empty or an entry refers to an unknown flag alias.</exception>
        /// <exception cref="FormatException">Thrown if the first line is not an integer.</exception>
        private void ReadDictionaryFile(Stream dictionary, Encoding encoding) {
            if (dictionary == null) throw new ArgumentNullException("dictionary");
            if (encoding == null) throw new ArgumentNullException("encoding");

            // NOTE(review): unlike ReadAffixFile, the reader is not disposed here, so the
            // dictionary stream is left open - confirm which behaviour is intended.
            var reader = new StreamReader(dictionary, encoding);

            // TODO: don't create millions of strings.
            var line = reader.ReadLine();
            if (line == null)
                throw new InvalidDataException("Unexpected end of dictionary file.");

            // The first line is the number of entries. The value itself is not needed, but
            // parsing it rejects files that do not start with an entry count.
            Int32.Parse(line, CultureInfo.InvariantCulture);

            // TODO: the flags themselves can be double-chars (long) or also numeric;
            // either way the trick is to encode them as char, but they must be parsed differently.
            while ((line = reader.ReadLine()) != null) {
                String entry;
                HunspellWord wordForm;

                var flagSep = line.LastIndexOf('/');
                if (flagSep == -1) {
                    wordForm = NoFlags;
                    entry = line;
                } else {
                    // There can be comments (morphological description) after the flags,
                    // separated from them by whitespace; they are not part of the flag field.
                    var end = line.IndexOfAny(FlagTerminators, flagSep);
                    var cflag = end == -1
                                    ? line.Substring(flagSep + 1)
                                    : line.Substring(flagSep + 1, end - flagSep - 1);

                    wordForm = new HunspellWord(ResolveFlags(cflag));
                    entry = line.Substring(0, flagSep);
                }

                List<HunspellWord> entries;
                if (!_words.TryGetValue(entry, out entries))
                    _words.Add(entry, entries = new List<HunspellWord>());

                entries.Add(wordForm);
            }
        }

        #region Nested type: DoubleASCIIFlagParsingStrategy

        /// <summary>
        /// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as
        /// two ASCII characters whose codes must be combined into a single character.
        /// </summary>
        private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy {
            /// <summary>
            /// Parses consecutive character pairs into one flag each.
            /// </summary>
            /// <param name="rawFlags">String holding an even number of characters.</param>
            /// <returns>One flag per character pair; empty for an empty string.</returns>
            /// <exception cref="ArgumentException">Thrown if the string has an odd number of characters.</exception>
            public override Char[] ParseFlags(String rawFlags) {
                if (rawFlags == null) throw new ArgumentNullException("rawFlags");
                if (rawFlags.Length % 2 != 0)
                    throw new ArgumentException("Invalid flags (should be even number of characters): " + rawFlags, "rawFlags");

                var flags = new Char[rawFlags.Length / 2];
                for (var i = 0; i < flags.Length; i++) {
                    // Pack the pair as high byte / low byte. Simply adding the two codes would
                    // map different pairs such as "AB" and "BA" onto the same flag.
                    // Assumes both characters are ASCII, as the FLAG long type requires.
                    flags[i] = (Char)((rawFlags[2 * i] << 8) | rawFlags[2 * i + 1]);
                }
                return flags;
            }
        }

        #endregion

        #region Nested type: FlagParsingStrategy

        /// <summary>
        /// Abstraction of the process of parsing flags taken from the affix and dic files.
        /// </summary>
        private abstract class FlagParsingStrategy {
            /// <summary>
            /// Parses the given String into a single flag.
            /// </summary>
            /// <param name="rawFlag">String to parse into a flag.</param>
            /// <returns>Parsed flag; the first one if the String holds several.</returns>
            /// <exception cref="ArgumentException">Thrown if the String does not contain any flag.</exception>
            public Char ParseFlag(String rawFlag) {
                if (rawFlag == null) throw new ArgumentNullException("rawFlag");

                var flags = ParseFlags(rawFlag);
                if (flags.Length == 0)
                    throw new ArgumentException("Flag must not be empty.", "rawFlag");
                return flags[0];
            }

            /// <summary>
            /// Parses the given String into multiple flags.
            /// </summary>
            /// <param name="rawFlags">String to parse into flags.</param>
            /// <returns>Parsed flags.</returns>
            public abstract Char[] ParseFlags(String rawFlags);
        }

        #endregion

        #region Nested type: NumFlagParsingStrategy

        /// <summary>
        /// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded in its
        /// numerical form. In the case of multiple flags, each number is separated by a comma.
        /// </summary>
        private class NumFlagParsingStrategy : FlagParsingStrategy {
            /// <summary>
            /// Parses a comma separated list of numbers into one flag per number.
            /// </summary>
            /// <param name="rawFlags">Comma separated list of numeric flags.</param>
            /// <returns>One flag per list element, each being the number cast to a char.</returns>
            /// <exception cref="FormatException">Thrown if a list element contains no digits.</exception>
            public override Char[] ParseFlags(String rawFlags) {
                var rawFlagParts = rawFlags.Trim().Split(',');
                var flags = new Char[rawFlagParts.Length];

                for (var i = 0; i < rawFlagParts.Length; i++) {
                    // Note: every non-digit is dropped, which removes the trailing X / leading I
                    // seen in the Nepali dictionary. It is unclear what the actual rule is.
                    var replaced = Regex.Replace(rawFlagParts[i], "[^0-9]", "");
                    flags[i] = (Char)Int32.Parse(replaced, CultureInfo.InvariantCulture);
                }

                return flags;
            }
        }

        #endregion

        #region Nested type: SimpleFlagParsingStrategy

        /// <summary>
        /// Simple implementation of <see cref="FlagParsingStrategy"/> that treats the chars in each
        /// String as individual flags. Can be used with both the ASCII and UTF-8 flag types.
        /// </summary>
        private class SimpleFlagParsingStrategy : FlagParsingStrategy {
            /// <summary>
            /// Returns every char of the given String as a separate flag.
            /// </summary>
            /// <param name="rawFlags">String whose chars are the flags.</param>
            /// <returns>The chars of the String.</returns>
            public override Char[] ParseFlags(String rawFlags) {
                return rawFlags.ToCharArray();
            }
        }

        #endregion
    }
}