1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.accumulo.core.iterators.user;
18
19 import java.io.IOException;
20 import java.io.UnsupportedEncodingException;
21 import java.util.Map;
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24
25 import org.apache.accumulo.core.client.IteratorSetting;
26 import org.apache.accumulo.core.data.ByteSequence;
27 import org.apache.accumulo.core.data.Key;
28 import org.apache.accumulo.core.data.Value;
29 import org.apache.accumulo.core.iterators.Filter;
30 import org.apache.accumulo.core.iterators.IteratorEnvironment;
31 import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
32
33 /**
34 * A Filter that matches entries based on Java regular expressions.
35 */
36 public class RegExFilter extends Filter {
37
38 @Override
39 public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
40 RegExFilter result = (RegExFilter) super.deepCopy(env);
41 result.rowMatcher = copyMatcher(rowMatcher);
42 result.colfMatcher = copyMatcher(colfMatcher);
43 result.colqMatcher = copyMatcher(colqMatcher);
44 result.valueMatcher = copyMatcher(valueMatcher);
45 result.orFields = orFields;
46 return result;
47 }
48
49 public static final String ROW_REGEX = "rowRegex";
50 public static final String COLF_REGEX = "colfRegex";
51 public static final String COLQ_REGEX = "colqRegex";
52 public static final String VALUE_REGEX = "valueRegex";
53 public static final String OR_FIELDS = "orFields";
54 public static final String ENCODING = "encoding";
55 public static final String MATCH_SUBSTRING = "matchSubstring";
56
57 public static final String ENCODING_DEFAULT = "UTF-8";
58
59 private Matcher rowMatcher;
60 private Matcher colfMatcher;
61 private Matcher colqMatcher;
62 private Matcher valueMatcher;
63 private boolean orFields = false;
64 private boolean matchSubstring = false;
65
66 private String encoding = ENCODING_DEFAULT;
67
68 private Matcher copyMatcher(Matcher m) {
69 if (m == null)
70 return m;
71 else
72 return m.pattern().matcher("");
73 }
74
75 private boolean matches(Matcher matcher, ByteSequence bs) {
76 if (matcher != null) {
77 try {
78 matcher.reset(new String(bs.getBackingArray(), bs.offset(), bs.length(), encoding));
79 return matchSubstring ? matcher.find() : matcher.matches();
80 } catch (UnsupportedEncodingException e) {
81 e.printStackTrace();
82 }
83 }
84 return !orFields;
85 }
86
87 private boolean matches(Matcher matcher, byte data[], int offset, int len) {
88 if (matcher != null) {
89 try {
90 matcher.reset(new String(data, offset, len, encoding));
91 return matchSubstring ? matcher.find() : matcher.matches();
92 } catch (UnsupportedEncodingException e) {
93 e.printStackTrace();
94 }
95 }
96 return !orFields;
97 }
98
99 @Override
100 public boolean accept(Key key, Value value) {
101 if (orFields)
102 return ((matches(rowMatcher, rowMatcher == null ? null : key.getRowData()))
103 || (matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData()))
104 || (matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) || (matches(valueMatcher, value.get(), 0, value.get().length)));
105 return ((matches(rowMatcher, rowMatcher == null ? null : key.getRowData()))
106 && (matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData()))
107 && (matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) && (matches(valueMatcher, value.get(), 0, value.get().length)));
108 }
109
110 @Override
111 public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
112 super.init(source, options, env);
113 if (options.containsKey(ROW_REGEX)) {
114 rowMatcher = Pattern.compile(options.get(ROW_REGEX)).matcher("");
115 } else {
116 rowMatcher = null;
117 }
118
119 if (options.containsKey(COLF_REGEX)) {
120 colfMatcher = Pattern.compile(options.get(COLF_REGEX)).matcher("");
121 } else {
122 colfMatcher = null;
123 }
124
125 if (options.containsKey(COLQ_REGEX)) {
126 colqMatcher = Pattern.compile(options.get(COLQ_REGEX)).matcher("");
127 } else {
128 colqMatcher = null;
129 }
130
131 if (options.containsKey(VALUE_REGEX)) {
132 valueMatcher = Pattern.compile(options.get(VALUE_REGEX)).matcher("");
133 } else {
134 valueMatcher = null;
135 }
136
137 if (options.containsKey(OR_FIELDS)) {
138 orFields = Boolean.parseBoolean(options.get(OR_FIELDS));
139 } else {
140 orFields = false;
141 }
142
143 if (options.containsKey(MATCH_SUBSTRING)) {
144 matchSubstring = Boolean.parseBoolean(options.get(MATCH_SUBSTRING));
145 } else {
146 matchSubstring = false;
147 }
148
149 if (options.containsKey(ENCODING)) {
150 encoding = options.get(ENCODING);
151 }
152 }
153
154 @Override
155 public IteratorOptions describeOptions() {
156 IteratorOptions io = super.describeOptions();
157 io.setName("regex");
158 io.setDescription("The RegExFilter/Iterator allows you to filter for key/value pairs based on regular expressions");
159 io.addNamedOption(RegExFilter.ROW_REGEX, "regular expression on row");
160 io.addNamedOption(RegExFilter.COLF_REGEX, "regular expression on column family");
161 io.addNamedOption(RegExFilter.COLQ_REGEX, "regular expression on column qualifier");
162 io.addNamedOption(RegExFilter.VALUE_REGEX, "regular expression on value");
163 io.addNamedOption(RegExFilter.OR_FIELDS, "use OR instead of AND when multiple regexes given");
164 io.addNamedOption(RegExFilter.MATCH_SUBSTRING, "match on substrings");
165 io.addNamedOption(RegExFilter.ENCODING, "character encoding of byte array value (default is " + ENCODING_DEFAULT + ")");
166 return io;
167 }
168
169 @Override
170 public boolean validateOptions(Map<String,String> options) {
171 if (super.validateOptions(options) == false)
172 return false;
173
174 try {
175 if (options.containsKey(ROW_REGEX))
176 Pattern.compile(options.get(ROW_REGEX)).matcher("");
177
178 if (options.containsKey(COLF_REGEX))
179 Pattern.compile(options.get(COLF_REGEX)).matcher("");
180
181 if (options.containsKey(COLQ_REGEX))
182 Pattern.compile(options.get(COLQ_REGEX)).matcher("");
183
184 if (options.containsKey(VALUE_REGEX))
185 Pattern.compile(options.get(VALUE_REGEX)).matcher("");
186 } catch (Exception e) {
187 throw new IllegalArgumentException("bad regex", e);
188 }
189
190 if (options.containsKey(ENCODING)) {
191 try {
192 this.encoding = options.get(ENCODING);
193 if ("".equals(this.encoding))
194 encoding = ENCODING_DEFAULT;
195 new String("test".getBytes(), encoding);
196 } catch (UnsupportedEncodingException e) {
197 throw new IllegalArgumentException("invalid encoding " + ENCODING + ":" + this.encoding, e);
198 }
199 }
200
201 return true;
202 }
203
204 /**
205 * Encode the terms to match against in the iterator. Same as calling {@link #setRegexs(IteratorSetting, String, String, String, String, boolean, boolean)}
206 * with matchSubstring set to false
207 *
208 * @param si
209 * ScanIterator config to be updated
210 * @param rowTerm
211 * the pattern to match against the Key's row. Not used if null.
212 * @param cfTerm
213 * the pattern to match against the Key's column family. Not used if null.
214 * @param cqTerm
215 * the pattern to match against the Key's column qualifier. Not used if null.
216 * @param valueTerm
217 * the pattern to match against the Key's value. Not used if null.
218 * @param orFields
219 * if true, any of the non-null terms can match to return the entry
220 */
221 public static void setRegexs(IteratorSetting si, String rowTerm, String cfTerm, String cqTerm, String valueTerm, boolean orFields) {
222 setRegexs(si, rowTerm, cfTerm, cqTerm, valueTerm, orFields, false);
223 }
224
225 /**
226 * Encode the terms to match against in the iterator
227 *
228 * @param si
229 * ScanIterator config to be updated
230 * @param rowTerm
231 * the pattern to match against the Key's row. Not used if null.
232 * @param cfTerm
233 * the pattern to match against the Key's column family. Not used if null.
234 * @param cqTerm
235 * the pattern to match against the Key's column qualifier. Not used if null.
236 * @param valueTerm
237 * the pattern to match against the Key's value. Not used if null.
238 * @param matchSubstring
239 * if true then search expressions will match on partial strings
240 */
241 public static void setRegexs(IteratorSetting si, String rowTerm, String cfTerm, String cqTerm, String valueTerm, boolean orFields, boolean matchSubstring) {
242
243 if (rowTerm != null)
244 si.addOption(RegExFilter.ROW_REGEX, rowTerm);
245 if (cfTerm != null)
246 si.addOption(RegExFilter.COLF_REGEX, cfTerm);
247 if (cqTerm != null)
248 si.addOption(RegExFilter.COLQ_REGEX, cqTerm);
249 if (valueTerm != null)
250 si.addOption(RegExFilter.VALUE_REGEX, valueTerm);
251 si.addOption(RegExFilter.OR_FIELDS, String.valueOf(orFields));
252 si.addOption(RegExFilter.MATCH_SUBSTRING, String.valueOf(matchSubstring));
253
254 }
255
256 /**
257 * Set the encoding string to use when interpreting characters
258 *
259 * @param si
260 * ScanIterator config to be updated
261 * @param encoding
262 * the encoding string to use for character interpretation.
263 *
264 */
265 public static void setEncoding(IteratorSetting si, String encoding) {
266 if (!encoding.isEmpty()) {
267 si.addOption(RegExFilter.ENCODING, encoding);
268 }
269 }
270 }