////* // * Licensed to the Apache Software Foundation (ASF) under one or more // * contributor license agreements. See the NOTICE file distributed with // * this work for additional information regarding copyright ownership. // * The ASF licenses this file to You under the Apache License, Version 2.0 // * (the "License"); you may not use this file except in compliance with // * the License. You may obtain a copy of the License at // * // * http://www.apache.org/licenses/LICENSE-2.0 // * // * Unless required by applicable law or agreed to in writing, software // * distributed under the License is distributed on an "AS IS" BASIS, // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // * See the License for the specific language governing permissions and // * limitations under the License. // */ //using System; //using System.Collections.Generic; //using System.IO; //using System.Linq; //using System.Text; //using Lucene.Net.Support; //namespace Lucene.Net.Analysis.Compound.Hyphenation //{ ////* // * This tree structure stores the hyphenation patterns in an efficient way for // * fast lookup. It provides the provides the method to hyphenate a word. // * // * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. // */ //[Serializable] //public class HyphenationTree : TernaryTree, PatternConsumer //{ // private static readonly long serialVersionUID = -7842107987915665573L; // /* // * value space: stores the interletter values // */ // protected ByteVector vspace; // /* // * This map stores hyphenation exceptions // */ // protected HashMap stoplist; // /* // * This map stores the character classes // */ // protected TernaryTree classmap; // /* // * Temporary map to store interletter values on pattern loading. // */ // [NonSerialized] // private TernaryTree ivalues; // public HyphenationTree() { // stoplist = new HashMap(23); // usually a small table // classmap = new TernaryTree(); // vspace = new ByteVector(); // vspace.Alloc(1); // this reserves index 0, which we don't use // } // /* // * Packs the values by storing them in 4 bits, two values into a byte Values // * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the // * value. // * // * @param values a string of digits from '0' to '9' representing the // * interletter values. // * @return the index into the vspace array where the packed values are stored. // */ // protected int packValues(String values) { // int i, n = values.Length; // int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; // int offset = vspace.Alloc(m); // byte[] va = vspace.GetArray(); // for (i = 0; i < n; i++) { // int j = i >> 1; // byte v = (byte) ((values[i] - '0' + 1) & 0x0f); // if ((i & 1) == 1) { // va[j + offset] = (byte) (va[j + offset] | v); // } else { // va[j + offset] = (byte) (v << 4); // big endian // } // } // va[m - 1 + offset] = 0; // terminator // return offset; // } // protected String unpackValues(int k) { // StringBuilder buf = new StringBuilder(); // byte v = vspace.Get(k++); // while (v != 0) { // char c = (char) ((v >>> 4) - 1 + '0'); // buf.Append(c); // c = (char) (v & 0x0f); // if (c == 0) { // break; // } // c = (char) (c - 1 + '0'); // buf.Append(c); // v = vspace.Get(k++); // } // return buf.ToString(); // } // /* // * Read hyphenation patterns from an XML file. // * // * @param f the filename // * @throws HyphenationException In case the parsing fails // */ // public void loadPatterns(FileInfo f) // { // try { // InputSource src = new InputSource(f.toURL().toExternalForm()); // loadPatterns(src); // } catch (MalformedURLException e) { // throw new HyphenationException("Error converting the File '" + f // + "' to a URL: " + e.getMessage()); // } // } // /* // * Read hyphenation patterns from an XML file. // * // * @param source the InputSource for the file // * @throws HyphenationException In case the parsing fails // */ // public void loadPatterns(InputSource source) // { // PatternParser pp = new PatternParser(this); // ivalues = new TernaryTree(); // pp.parse(source); // // patterns/values should be now in the tree // // let's optimize a bit // trimToSize(); // vspace.trimToSize(); // classmap.trimToSize(); // // get rid of the auxiliary map // ivalues = null; // } // public String findPattern(String pat) { // int k = super.find(pat); // if (k >= 0) { // return unpackValues(k); // } // return ""; // } // /* // * String compare, returns 0 if equal or t is a substring of s // */ // protected int hstrcmp(char[] s, int si, char[] t, int ti) { // for (; s[si] == t[ti]; si++, ti++) { // if (s[si] == 0) { // return 0; // } // } // if (t[ti] == 0) { // return 0; // } // return s[si] - t[ti]; // } // protected byte[] getValues(int k) { // StringBuilder buf = new StringBuilder(); // byte v = vspace.get(k++); // while (v != 0) { // char c = (char) ((v >>> 4) - 1); // buf.append(c); // c = (char) (v & 0x0f); // if (c == 0) { // break; // } // c = (char) (c - 1); // buf.append(c); // v = vspace.get(k++); // } // byte[] res = new byte[buf.length()]; // for (int i = 0; i < res.length; i++) { // res[i] = (byte) buf.charAt(i); // } // return res; // } // /* // *

// * Search for all possible partial matches of word starting at index an update // * interletter values. In other words, it does something like: // *

// * // * for(i=0; i // *

// * But it is done in an efficient way since the patterns are stored in a // * ternary tree. In fact, this is the whole purpose of having the tree: doing // * this search without having to test every single pattern. The number of // * patterns for languages such as English range from 4000 to 10000. Thus, // * doing thousands of string comparisons for each word to hyphenate would be // * really slow without the tree. The tradeoff is memory, but using a ternary // * tree instead of a trie, almost halves the the memory used by Lout or TeX. // * It's also faster than using a hash table // *

// * // * @param word null terminated word to match // * @param index start index from word // * @param il interletter values array to update // */ // protected void searchPatterns(char[] word, int index, byte[] il) { // byte[] values; // int i = index; // char p, q; // char sp = word[i]; // p = root; // while (p > 0 && p < sc.length) { // if (sc[p] == 0xFFFF) { // if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { // values = getValues(eq[p]); // data pointer is in eq[] // int j = index; // for (int k = 0; k < values.length; k++) { // if (j < il.length && values[k] > il[j]) { // il[j] = values[k]; // } // j++; // } // } // return; // } // int d = sp - sc[p]; // if (d == 0) { // if (sp == 0) { // break; // } // sp = word[++i]; // p = eq[p]; // q = p; // // look for a pattern ending at this position by searching for // // the null char ( splitchar == 0 ) // while (q > 0 && q < sc.length) { // if (sc[q] == 0xFFFF) { // stop at compressed branch // break; // } // if (sc[q] == 0) { // values = getValues(eq[q]); // int j = index; // for (int k = 0; k < values.length; k++) { // if (j < il.length && values[k] > il[j]) { // il[j] = values[k]; // } // j++; // } // break; // } else { // q = lo[q]; // /* // * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but // * java chars are unsigned // */ // } // } // } else { // p = d < 0 ? lo[p] : hi[p]; // } // } // } // /* // * Hyphenate word and return a Hyphenation object. // * // * @param word the word to be hyphenated // * @param remainCharCount Minimum number of characters allowed before the // * hyphenation point. // * @param pushCharCount Minimum number of characters allowed after the // * hyphenation point. // * @return a {@link Hyphenation Hyphenation} object representing the // * hyphenated word or null if word is not hyphenated. // */ // public Hyphenation hyphenate(String word, int remainCharCount, // int pushCharCount) { // char[] w = word.toCharArray(); // return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); // } // /* // * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n // * may be absent, the first n is at offset, the first l is at offset + // * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied // * into word. In the first part of the routine len = w.length, in the second // * part of the routine len = word.length. Three indices are used: index(w), // * the index in w, index(word), the index in word, letterindex(word), the // * index in the letter part of word. The following relations exist: index(w) = // * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) = // * index(word) - 1 (see first loop). It follows that: index(w) - index(word) = // * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + // * iIgnoreAtBeginning // */ // /* // * Hyphenate word and return an array of hyphenation points. // * // * @param w char array that contains the word // * @param offset Offset to first character in word // * @param len Length of word // * @param remainCharCount Minimum number of characters allowed before the // * hyphenation point. // * @param pushCharCount Minimum number of characters allowed after the // * hyphenation point. // * @return a {@link Hyphenation Hyphenation} object representing the // * hyphenated word or null if word is not hyphenated. // */ // public Hyphenation hyphenate(char[] w, int offset, int len, // int remainCharCount, int pushCharCount) { // int i; // char[] word = new char[len + 3]; // // normalize word // char[] c = new char[2]; // int iIgnoreAtBeginning = 0; // int iLength = len; // boolean bEndOfLetters = false; // for (i = 1; i <= len; i++) { // c[0] = w[offset + i - 1]; // int nc = classmap.find(c, 0); // if (nc < 0) { // found a non-letter character ... // if (i == (1 + iIgnoreAtBeginning)) { // // ... before any letter character // iIgnoreAtBeginning++; // } else { // // ... after a letter character // bEndOfLetters = true; // } // iLength--; // } else { // if (!bEndOfLetters) { // word[i - iIgnoreAtBeginning] = (char) nc; // } else { // return null; // } // } // } // len = iLength; // if (len < (remainCharCount + pushCharCount)) { // // word is too short to be hyphenated // return null; // } // int[] result = new int[len + 1]; // int k = 0; // // check exception list first // String sw = new String(word, 1, len); // if (stoplist.containsKey(sw)) { // // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = // // null) // ArrayList hw = stoplist.get(sw); // int j = 0; // for (i = 0; i < hw.size(); i++) { // Object o = hw.get(i); // // j = index(sw) = letterindex(word)? // // result[k] = corresponding index(w) // if (o instanceof String) { // j += ((String) o).length(); // if (j >= remainCharCount && j < (len - pushCharCount)) { // result[k++] = j + iIgnoreAtBeginning; // } // } // } // } else { // // use algorithm to get hyphenation points // word[0] = '.'; // word start marker // word[len + 1] = '.'; // word end marker // word[len + 2] = 0; // null terminated // byte[] il = new byte[len + 3]; // initialized to zero // for (i = 0; i < len + 1; i++) { // searchPatterns(word, i, il); // } // // hyphenation points are located where interletter value is odd // // i is letterindex(word), // // i + 1 is index(word), // // result[k] = corresponding index(w) // for (i = 0; i < len; i++) { // if (((il[i + 1] & 1) == 1) && i >= remainCharCount // && i <= (len - pushCharCount)) { // result[k++] = i + iIgnoreAtBeginning; // } // } // } // if (k > 0) { // // trim result array // int[] res = new int[k+2]; // System.arraycopy(result, 0, res, 1, k); // // We add the synthetical hyphenation points // // at the beginning and end of the word // res[0]=0; // res[k+1]=len; // return new Hyphenation(res); // } else { // return null; // } // } // /* // * Add a character class to the tree. It is used by // * {@link PatternParser PatternParser} as callback to add character classes. // * Character classes define the valid word characters for hyphenation. If a // * word contains a character not defined in any of the classes, it is not // * hyphenated. It also defines a way to normalize the characters in order to // * compare them with the stored patterns. Usually pattern files use only lower // * case characters, in this case a class for letter 'a', for example, should // * be defined as "aA", the first character being the normalization char. // */ // public void AddClass(String chargroup) { // if (chargroup.Length > 0) { // char equivChar = chargroup[0]; // char[] key = new char[2]; // key[1] = (char)0; // for (int i = 0; i < chargroup.Length; i++) { // key[0] = chargroup[i]; // classmap.Insert(key, 0, equivChar); // } // } // } // /* // * Add an exception to the tree. It is used by // * {@link PatternParser PatternParser} class as callback to store the // * hyphenation exceptions. // * // * @param word normalized word // * @param hyphenatedword a vector of alternating strings and // * {@link Hyphen hyphen} objects. // */ // public void AddException(String word, ArrayList hyphenatedword) { // stoplist.Add(word, hyphenatedword); // } // /* // * Add a pattern to the tree. Mainly, to be used by // * {@link PatternParser PatternParser} class as callback to add a pattern to // * the tree. // * // * @param pattern the hyphenation pattern // * @param ivalue interletter weight values indicating the desirability and // * priority of hyphenating at a given point within the pattern. It // * should contain only digit characters. (i.e. '0' to '9'). // */ // public void AddPattern(String pattern, String ivalue) { // int k = ivalues.Find(ivalue); // if (k <= 0) { // k = packValues(ivalue); // ivalues.Insert(ivalue, (char)k); // } // Insert(pattern, (char) k); // } // public override void PrintStats() { // Console.WriteLine("Value space size = " + vspace.Length()); // base.PrintStats(); // } //} //}