Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
BeiderMorseEncoder |
|
| 1.4444444444444444;1.444 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language.bm; | |
19 | ||
20 | import org.apache.commons.codec.EncoderException; | |
21 | import org.apache.commons.codec.StringEncoder; | |
22 | ||
23 | /** | |
24 | * Encodes strings into their Beider-Morse phonetic encoding. | |
25 | * <p> | |
26 | * Beider-Morse phonetic encodings are optimised for family names. However, they may be useful for a wide range | |
27 | * of words. | |
28 | * <p> | |
29 | * This encoder is intentionally mutable to allow dynamic configuration through bean properties. As such, it | |
30 | * may not be thread-safe. If you require a guaranteed thread-safe encoding then use | |
31 | * {@link PhoneticEngine} directly. | |
32 | * <p> | |
33 | * <b>Encoding overview</b> | |
34 | * <p> | |
35 | * Beider-Morse phonetic encoding is a multi-step process. Firstly, a table of rules is consulted to guess what | |
36 | * language the word comes from. For example, if it ends in "<code>ault</code>" then it infers that the word is French. | |
37 | * Next, the word is translated into a phonetic representation using a language-specific phonetics table. Some | |
38 | * runs of letters can be pronounced in multiple ways, and a single run of letters may be potentially broken up | |
39 | * into phonemes at different places, so this stage results in a set of possible language-specific phonetic | |
40 | * representations. Lastly, this language-specific phonetic representation is processed by a table of rules that | |
41 | * re-writes it phonetically taking into account systematic pronunciation differences between languages, to move | |
42 | * it towards a pan-Indo-European phonetic representation. Again, sometimes there are multiple ways this could be | |
43 | * done and sometimes things that can be pronounced in several ways in the source language have only one way to | |
44 | * represent them in this average phonetic language, so the result is again a set of phonetic spellings. | |
45 | * <p> | |
46 | * Some names are treated as having multiple parts. This can be due to two things. Firstly, they may be hyphenated. | |
47 | * In this case, each individual hyphenated word is encoded, and then these are combined end-to-end for the final | |
48 | * encoding. Secondly, some names have standard prefixes, for example, "<code>Mac/Mc</code>" in Scottish (English) | |
49 | * names. As sometimes it is ambiguous whether the prefix is intended or is an accident of the spelling, the word | |
50 | * is encoded once with the prefix and once without it. The resulting encoding contains one and then the other result. | |
51 | * <p> | |
52 | * <b>Encoding format</b> | |
53 | * <p> | |
54 | * Individual phonetic spellings of an input word are represented in upper- and lower-case roman characters. Where | |
55 | * there are multiple possible phonetic representations, these are joined with a pipe (<code>|</code>) character. | |
56 | * If multiple hyphenated words were found, or if the word may contain a name prefix, each encoded word is placed | |
57 | * in parentheses and these blocks are then joined with hyphens. For example, "<code>d'ortley</code>" has a possible | |
58 | * prefix. The form without prefix encodes to "<code>ortlaj|ortlej</code>", while the form with prefix encodes to | |
59 | * "<code>dortlaj|dortlej</code>". Thus, the full, combined encoding is "{@code (ortlaj|ortlej)-(dortlaj|dortlej)}". | |
60 | * <p> | |
61 | * The encoded forms are often quite a bit longer than the input strings. This is because a single input may have many | |
62 | * potential phonetic interpretations. For example, "<code>Renault</code>" encodes to | |
63 | * "<code>rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult</code>". The <code>APPROX</code> rules will tend to produce larger | |
64 | * encodings as they consider a wider range of possible, approximate phonetic interpretations of the original word. | |
65 | * Down-stream applications may wish to further process the encoding for indexing or lookup purposes, for example, by | |
66 | * splitting on pipe (<code>|</code>) and indexing under each of these alternatives. | |
67 | * | |
68 | * @since 1.6 | |
69 | * @version $Id$ | |
70 | */ | |
71 | 36 | public class BeiderMorseEncoder implements StringEncoder { |
72 | // Implementation note: This class is a spring-friendly facade to PhoneticEngine. It allows read/write configuration | |
73 | // of an immutable PhoneticEngine instance that will be delegated to for the actual encoding. | |
74 | ||
75 | // the PhoneticEngine that encoding is delegated to; each setter replaces it with a newly configured instance | |
76 | 36 | private PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true); |
77 | ||
78 | @Override | |
79 | public Object encode(final Object source) throws EncoderException { | |
80 | 97 | if (!(source instanceof String)) { |
81 | 1 | throw new EncoderException("BeiderMorseEncoder encode parameter is not of type String"); |
82 | } | |
83 | 96 | return encode((String) source); |
84 | } | |
85 | ||
86 | @Override | |
87 | public String encode(final String source) throws EncoderException { | |
88 | 67090 | if (source == null) { |
89 | 1 | return null; |
90 | } | |
91 | 67089 | return this.engine.encode(source); |
92 | } | |
93 | ||
94 | /** | |
95 | * Gets the name type currently in operation. | |
96 | * | |
97 | * @return the NameType currently being used | |
98 | */ | |
99 | public NameType getNameType() { | |
100 | 1 | return this.engine.getNameType(); |
101 | } | |
102 | ||
103 | /** | |
104 | * Gets the rule type currently in operation. | |
105 | * | |
106 | * @return the RuleType currently being used | |
107 | */ | |
108 | public RuleType getRuleType() { | |
109 | 1 | return this.engine.getRuleType(); |
110 | } | |
111 | ||
112 | /** | |
113 | * Discovers if multiple possible encodings are concatenated. | |
114 | * | |
115 | * @return true if multiple encodings are concatenated, false if just the first one is returned | |
116 | */ | |
117 | public boolean isConcat() { | |
118 | 1 | return this.engine.isConcat(); |
119 | } | |
120 | ||
121 | /** | |
122 | * Sets how multiple possible phonetic encodings are combined. | |
123 | * | |
124 | * @param concat | |
125 | * true if multiple encodings are to be combined with a '|', false if just the first one is | |
126 | * to be considered | |
127 | */ | |
128 | public void setConcat(final boolean concat) { | |
129 | 1 | this.engine = new PhoneticEngine(this.engine.getNameType(), |
130 | this.engine.getRuleType(), | |
131 | concat, | |
132 | this.engine.getMaxPhonemes()); | |
133 | 1 | } |
134 | ||
135 | /** | |
136 | * Sets the type of name. Use {@link NameType#GENERIC} unless you specifically want phonetic encodings | |
137 | * optimized for Ashkenazi or Sephardic Jewish family names. | |
138 | * | |
139 | * @param nameType | |
140 | * the NameType in use | |
141 | */ | |
142 | public void setNameType(final NameType nameType) { | |
143 | 11 | this.engine = new PhoneticEngine(nameType, |
144 | this.engine.getRuleType(), | |
145 | this.engine.isConcat(), | |
146 | this.engine.getMaxPhonemes()); | |
147 | 11 | } |
148 | ||
149 | /** | |
150 | * Sets the rule type to apply. This will widen or narrow the range of phonetic encodings considered. | |
151 | * | |
152 | * @param ruleType | |
153 | * {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches | |
154 | */ | |
155 | public void setRuleType(final RuleType ruleType) { | |
156 | 12 | this.engine = new PhoneticEngine(this.engine.getNameType(), |
157 | ruleType, | |
158 | this.engine.isConcat(), | |
159 | this.engine.getMaxPhonemes()); | |
160 | 11 | } |
161 | ||
162 | /** | |
163 | * Sets the maximum number of phonemes that shall be considered by the engine. | |
164 | * | |
165 | * @param maxPhonemes | |
166 | * the maximum number of phonemes returned by the engine | |
167 | * @since 1.7 | |
168 | */ | |
169 | public void setMaxPhonemes(final int maxPhonemes) { | |
170 | 1 | this.engine = new PhoneticEngine(this.engine.getNameType(), |
171 | this.engine.getRuleType(), | |
172 | this.engine.isConcat(), | |
173 | maxPhonemes); | |
174 | 1 | } |
175 | ||
176 | } |