View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.accumulo.core.iterators.user;
18  
19  import java.io.IOException;
20  import java.io.UnsupportedEncodingException;
21  import java.util.Map;
22  import java.util.regex.Matcher;
23  import java.util.regex.Pattern;
24  
25  import org.apache.accumulo.core.client.IteratorSetting;
26  import org.apache.accumulo.core.data.ByteSequence;
27  import org.apache.accumulo.core.data.Key;
28  import org.apache.accumulo.core.data.Value;
29  import org.apache.accumulo.core.iterators.Filter;
30  import org.apache.accumulo.core.iterators.IteratorEnvironment;
31  import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
32  
33  /**
34   * A Filter that matches entries based on Java regular expressions.
35   */
36  public class RegExFilter extends Filter {
37    
38    @Override
39    public SortedKeyValueIterator<Key,Value> deepCopy(IteratorEnvironment env) {
40      RegExFilter result = (RegExFilter) super.deepCopy(env);
41      result.rowMatcher = copyMatcher(rowMatcher);
42      result.colfMatcher = copyMatcher(colfMatcher);
43      result.colqMatcher = copyMatcher(colqMatcher);
44      result.valueMatcher = copyMatcher(valueMatcher);
45      result.orFields = orFields;
46      return result;
47    }
48    
49    public static final String ROW_REGEX = "rowRegex";
50    public static final String COLF_REGEX = "colfRegex";
51    public static final String COLQ_REGEX = "colqRegex";
52    public static final String VALUE_REGEX = "valueRegex";
53    public static final String OR_FIELDS = "orFields";
54    public static final String ENCODING = "encoding";
55    public static final String MATCH_SUBSTRING = "matchSubstring";
56    
57    public static final String ENCODING_DEFAULT = "UTF-8";
58    
59    private Matcher rowMatcher;
60    private Matcher colfMatcher;
61    private Matcher colqMatcher;
62    private Matcher valueMatcher;
63    private boolean orFields = false;
64    private boolean matchSubstring = false;
65    
66    private String encoding = ENCODING_DEFAULT;
67    
68    private Matcher copyMatcher(Matcher m) {
69      if (m == null)
70        return m;
71      else
72        return m.pattern().matcher("");
73    }
74    
75    private boolean matches(Matcher matcher, ByteSequence bs) {
76      if (matcher != null) {
77        try {
78          matcher.reset(new String(bs.getBackingArray(), bs.offset(), bs.length(), encoding));
79          return matchSubstring ? matcher.find() : matcher.matches();
80        } catch (UnsupportedEncodingException e) {
81          e.printStackTrace();
82        }
83      }
84      return !orFields;
85    }
86    
87    private boolean matches(Matcher matcher, byte data[], int offset, int len) {
88      if (matcher != null) {
89        try {
90          matcher.reset(new String(data, offset, len, encoding));
91          return matchSubstring ? matcher.find() : matcher.matches();
92        } catch (UnsupportedEncodingException e) {
93          e.printStackTrace();
94        }
95      }
96      return !orFields;
97    }
98    
99    @Override
100   public boolean accept(Key key, Value value) {
101     if (orFields)
102       return ((matches(rowMatcher, rowMatcher == null ? null : key.getRowData()))
103           || (matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData()))
104           || (matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) || (matches(valueMatcher, value.get(), 0, value.get().length)));
105     return ((matches(rowMatcher, rowMatcher == null ? null : key.getRowData()))
106         && (matches(colfMatcher, colfMatcher == null ? null : key.getColumnFamilyData()))
107         && (matches(colqMatcher, colqMatcher == null ? null : key.getColumnQualifierData())) && (matches(valueMatcher, value.get(), 0, value.get().length)));
108   }
109   
110   @Override
111   public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
112     super.init(source, options, env);
113     if (options.containsKey(ROW_REGEX)) {
114       rowMatcher = Pattern.compile(options.get(ROW_REGEX)).matcher("");
115     } else {
116       rowMatcher = null;
117     }
118     
119     if (options.containsKey(COLF_REGEX)) {
120       colfMatcher = Pattern.compile(options.get(COLF_REGEX)).matcher("");
121     } else {
122       colfMatcher = null;
123     }
124     
125     if (options.containsKey(COLQ_REGEX)) {
126       colqMatcher = Pattern.compile(options.get(COLQ_REGEX)).matcher("");
127     } else {
128       colqMatcher = null;
129     }
130     
131     if (options.containsKey(VALUE_REGEX)) {
132       valueMatcher = Pattern.compile(options.get(VALUE_REGEX)).matcher("");
133     } else {
134       valueMatcher = null;
135     }
136     
137     if (options.containsKey(OR_FIELDS)) {
138       orFields = Boolean.parseBoolean(options.get(OR_FIELDS));
139     } else {
140       orFields = false;
141     }
142     
143     if (options.containsKey(MATCH_SUBSTRING)) {
144       matchSubstring = Boolean.parseBoolean(options.get(MATCH_SUBSTRING));
145     } else {
146       matchSubstring = false;
147     }
148     
149     if (options.containsKey(ENCODING)) {
150       encoding = options.get(ENCODING);
151     }
152   }
153   
154   @Override
155   public IteratorOptions describeOptions() {
156     IteratorOptions io = super.describeOptions();
157     io.setName("regex");
158     io.setDescription("The RegExFilter/Iterator allows you to filter for key/value pairs based on regular expressions");
159     io.addNamedOption(RegExFilter.ROW_REGEX, "regular expression on row");
160     io.addNamedOption(RegExFilter.COLF_REGEX, "regular expression on column family");
161     io.addNamedOption(RegExFilter.COLQ_REGEX, "regular expression on column qualifier");
162     io.addNamedOption(RegExFilter.VALUE_REGEX, "regular expression on value");
163     io.addNamedOption(RegExFilter.OR_FIELDS, "use OR instead of AND when multiple regexes given");
164     io.addNamedOption(RegExFilter.MATCH_SUBSTRING, "match on substrings");
165     io.addNamedOption(RegExFilter.ENCODING, "character encoding of byte array value (default is " + ENCODING_DEFAULT + ")");
166     return io;
167   }
168   
169   @Override
170   public boolean validateOptions(Map<String,String> options) {
171     if (super.validateOptions(options) == false)
172       return false;
173     
174     try {
175       if (options.containsKey(ROW_REGEX))
176         Pattern.compile(options.get(ROW_REGEX)).matcher("");
177       
178       if (options.containsKey(COLF_REGEX))
179         Pattern.compile(options.get(COLF_REGEX)).matcher("");
180       
181       if (options.containsKey(COLQ_REGEX))
182         Pattern.compile(options.get(COLQ_REGEX)).matcher("");
183       
184       if (options.containsKey(VALUE_REGEX))
185         Pattern.compile(options.get(VALUE_REGEX)).matcher("");
186     } catch (Exception e) {
187       throw new IllegalArgumentException("bad regex", e);
188     }
189     
190     if (options.containsKey(ENCODING)) {
191       try {
192         this.encoding = options.get(ENCODING);
193         if ("".equals(this.encoding))
194           encoding = ENCODING_DEFAULT;
195         new String("test".getBytes(), encoding);
196       } catch (UnsupportedEncodingException e) {
197         throw new IllegalArgumentException("invalid encoding " + ENCODING + ":" + this.encoding, e);
198       }
199     }
200     
201     return true;
202   }
203   
204   /**
205    * Encode the terms to match against in the iterator. Same as calling {@link #setRegexs(IteratorSetting, String, String, String, String, boolean, boolean)}
206    * with matchSubstring set to false
207    * 
208    * @param si
209    *          ScanIterator config to be updated
210    * @param rowTerm
211    *          the pattern to match against the Key's row. Not used if null.
212    * @param cfTerm
213    *          the pattern to match against the Key's column family. Not used if null.
214    * @param cqTerm
215    *          the pattern to match against the Key's column qualifier. Not used if null.
216    * @param valueTerm
217    *          the pattern to match against the Key's value. Not used if null.
218    * @param orFields
219    *          if true, any of the non-null terms can match to return the entry
220    */
221   public static void setRegexs(IteratorSetting si, String rowTerm, String cfTerm, String cqTerm, String valueTerm, boolean orFields) {
222     setRegexs(si, rowTerm, cfTerm, cqTerm, valueTerm, orFields, false);
223   }
224   
225   /**
226    * Encode the terms to match against in the iterator
227    * 
228    * @param si
229    *          ScanIterator config to be updated
230    * @param rowTerm
231    *          the pattern to match against the Key's row. Not used if null.
232    * @param cfTerm
233    *          the pattern to match against the Key's column family. Not used if null.
234    * @param cqTerm
235    *          the pattern to match against the Key's column qualifier. Not used if null.
236    * @param valueTerm
237    *          the pattern to match against the Key's value. Not used if null.
238    * @param matchSubstring
239    *          if true then search expressions will match on partial strings
240    */
241   public static void setRegexs(IteratorSetting si, String rowTerm, String cfTerm, String cqTerm, String valueTerm, boolean orFields, boolean matchSubstring) {
242     
243     if (rowTerm != null)
244       si.addOption(RegExFilter.ROW_REGEX, rowTerm);
245     if (cfTerm != null)
246       si.addOption(RegExFilter.COLF_REGEX, cfTerm);
247     if (cqTerm != null)
248       si.addOption(RegExFilter.COLQ_REGEX, cqTerm);
249     if (valueTerm != null)
250       si.addOption(RegExFilter.VALUE_REGEX, valueTerm);
251     si.addOption(RegExFilter.OR_FIELDS, String.valueOf(orFields));
252     si.addOption(RegExFilter.MATCH_SUBSTRING, String.valueOf(matchSubstring));
253     
254   }
255   
256   /**
257    * Set the encoding string to use when interpreting characters
258    * 
259    * @param si
260    *          ScanIterator config to be updated
261    * @param encoding
262    *          the encoding string to use for character interpretation.
263    * 
264    */
265   public static void setEncoding(IteratorSetting si, String encoding) {
266     if (!encoding.isEmpty()) {
267       si.addOption(RegExFilter.ENCODING, encoding);
268     }
269   }
270 }