View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.hc.core5.util;
29  
30  import java.util.BitSet;
31  
32  import org.apache.hc.core5.annotation.Contract;
33  import org.apache.hc.core5.annotation.ThreadingBehavior;
34  
35  /**
36   * Tokenizer that can be used as a foundation for more complex parsing routines.
37   * Methods of this class are designed to produce near zero intermediate garbage
38   * and make no intermediate copies of input data.
39   * <p>
40   * This class is immutable and thread safe.
41   *
42   * @since 5.1
43   */
44  @Contract(threading = ThreadingBehavior.IMMUTABLE)
45  public class Tokenizer {
46  
47      public static class Cursor {
48  
49          private final int lowerBound;
50          private final int upperBound;
51          private int pos;
52  
53          public Cursor(final int lowerBound, final int upperBound) {
54              super();
55              Args.notNegative(lowerBound, "lowerBound");
56              Args.check(lowerBound <= upperBound, "lowerBound cannot be greater than upperBound");
57              this.lowerBound = lowerBound;
58              this.upperBound = upperBound;
59              this.pos = lowerBound;
60          }
61  
62          public int getLowerBound() {
63              return this.lowerBound;
64          }
65  
66          public int getUpperBound() {
67              return this.upperBound;
68          }
69  
70          public int getPos() {
71              return this.pos;
72          }
73  
74          public void updatePos(final int pos) {
75              Args.check(pos >= this.lowerBound, "pos: %s < lowerBound: %s", pos, this.lowerBound);
76              Args.check(pos <= this.upperBound, "pos: %s > upperBound: %s", pos, this.upperBound);
77              this.pos = pos;
78          }
79  
80          public boolean atEnd() {
81              return this.pos >= this.upperBound;
82          }
83  
84          @Override
85          public String toString() {
86              final StringBuilder buffer = new StringBuilder();
87              buffer.append('[');
88              buffer.append(this.lowerBound);
89              buffer.append('>');
90              buffer.append(this.pos);
91              buffer.append('>');
92              buffer.append(this.upperBound);
93              buffer.append(']');
94              return buffer.toString();
95          }
96  
97      }
98  
99      public static BitSet INIT_BITSET(final int ... b) {
100         final BitSet bitset = new BitSet();
101         for (final int aB : b) {
102             bitset.set(aB);
103         }
104         return bitset;
105     }
106 
107     /** Double quote */
108     public static final char DQUOTE = '\"';
109 
110     /** Backward slash / escape character */
111     public static final char ESCAPE = '\\';
112 
113     public static final int CR = 13; // <US-ASCII CR, carriage return (13)>
114     public static final int LF = 10; // <US-ASCII LF, linefeed (10)>
115     public static final int SP = 32; // <US-ASCII SP, space (32)>
116     public static final int HT = 9;  // <US-ASCII HT, horizontal-tab (9)>
117 
118     public static boolean isWhitespace(final char ch) {
119         return ch == SP || ch == HT || ch == CR || ch == LF;
120     }
121 
122     public static final Tokenizer INSTANCE = new Tokenizer();
123 
124     /**
125      * Extracts from the sequence of chars a token terminated with any of the given delimiters
126      * or a whitespace characters.
127      *
128      * @param buf buffer with the sequence of chars to be parsed
129      * @param cursor defines the bounds and current position of the buffer
130      * @param delimiters set of delimiting characters. Can be {@code null} if the token
131      *  is not delimited by any character.
132      */
133     public String parseContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
134         Args.notNull(buf, "Char sequence");
135         Args.notNull(cursor, "Parser cursor");
136         final StringBuilder dst = new StringBuilder();
137         copyContent(buf, cursor, delimiters, dst);
138         return dst.toString();
139     }
140 
141     /**
142      * Extracts from the sequence of chars a token terminated with any of the given delimiters
143      * discarding semantically insignificant whitespace characters.
144      *
145      * @param buf buffer with the sequence of chars to be parsed
146      * @param cursor defines the bounds and current position of the buffer
147      * @param delimiters set of delimiting characters. Can be {@code null} if the token
148      *  is not delimited by any character.
149      */
150     public String parseToken(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
151         Args.notNull(buf, "Char sequence");
152         Args.notNull(cursor, "Parser cursor");
153         final StringBuilder dst = new StringBuilder();
154         boolean whitespace = false;
155         while (!cursor.atEnd()) {
156             final char current = buf.charAt(cursor.getPos());
157             if (delimiters != null && delimiters.get(current)) {
158                 break;
159             } else if (isWhitespace(current)) {
160                 skipWhiteSpace(buf, cursor);
161                 whitespace = true;
162             } else {
163                 if (whitespace && dst.length() > 0) {
164                     dst.append(' ');
165                 }
166                 copyContent(buf, cursor, delimiters, dst);
167                 whitespace = false;
168             }
169         }
170         return dst.toString();
171     }
172 
173     /**
174      * Extracts from the sequence of chars a value which can be enclosed in quote marks and
175      * terminated with any of the given delimiters discarding semantically insignificant
176      * whitespace characters.
177      *
178      * @param buf buffer with the sequence of chars to be parsed
179      * @param cursor defines the bounds and current position of the buffer
180      * @param delimiters set of delimiting characters. Can be {@code null} if the value
181      *  is not delimited by any character.
182      */
183     public String parseValue(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
184         Args.notNull(buf, "Char sequence");
185         Args.notNull(cursor, "Parser cursor");
186         final StringBuilder dst = new StringBuilder();
187         boolean whitespace = false;
188         while (!cursor.atEnd()) {
189             final char current = buf.charAt(cursor.getPos());
190             if (delimiters != null && delimiters.get(current)) {
191                 break;
192             } else if (isWhitespace(current)) {
193                 skipWhiteSpace(buf, cursor);
194                 whitespace = true;
195             } else if (current == DQUOTE) {
196                 if (whitespace && dst.length() > 0) {
197                     dst.append(' ');
198                 }
199                 copyQuotedContent(buf, cursor, dst);
200                 whitespace = false;
201             } else {
202                 if (whitespace && dst.length() > 0) {
203                     dst.append(' ');
204                 }
205                 copyUnquotedContent(buf, cursor, delimiters, dst);
206                 whitespace = false;
207             }
208         }
209         return dst.toString();
210     }
211 
212     /**
213      * Skips semantically insignificant whitespace characters and moves the cursor to the closest
214      * non-whitespace character.
215      *
216      * @param buf buffer with the sequence of chars to be parsed
217      * @param cursor defines the bounds and current position of the buffer
218      */
219     public void skipWhiteSpace(final CharSequence buf, final Cursor cursor) {
220         Args.notNull(buf, "Char sequence");
221         Args.notNull(cursor, "Parser cursor");
222         int pos = cursor.getPos();
223         final int indexFrom = cursor.getPos();
224         final int indexTo = cursor.getUpperBound();
225         for (int i = indexFrom; i < indexTo; i++) {
226             final char current = buf.charAt(i);
227             if (!isWhitespace(current)) {
228                 break;
229             }
230             pos++;
231         }
232         cursor.updatePos(pos);
233     }
234 
235     /**
236      * Transfers content into the destination buffer until a whitespace character or any of
237      * the given delimiters is encountered.
238      *
239      * @param buf buffer with the sequence of chars to be parsed
240      * @param cursor defines the bounds and current position of the buffer
241      * @param delimiters set of delimiting characters. Can be {@code null} if the value
242      *  is delimited by a whitespace only.
243      * @param dst destination buffer
244      */
245     public void copyContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters,
246                             final StringBuilder dst) {
247         Args.notNull(buf, "Char sequence");
248         Args.notNull(cursor, "Parser cursor");
249         Args.notNull(dst, "String builder");
250         int pos = cursor.getPos();
251         final int indexFrom = cursor.getPos();
252         final int indexTo = cursor.getUpperBound();
253         for (int i = indexFrom; i < indexTo; i++) {
254             final char current = buf.charAt(i);
255             if ((delimiters != null && delimiters.get(current)) || isWhitespace(current)) {
256                 break;
257             }
258             pos++;
259             dst.append(current);
260         }
261         cursor.updatePos(pos);
262     }
263 
264     /**
265      * Transfers content into the destination buffer until a whitespace character,  a quote,
266      * or any of the given delimiters is encountered.
267      *
268      * @param buf buffer with the sequence of chars to be parsed
269      * @param cursor defines the bounds and current position of the buffer
270      * @param delimiters set of delimiting characters. Can be {@code null} if the value
271      *  is delimited by a whitespace or a quote only.
272      * @param dst destination buffer
273      */
274     public void copyUnquotedContent(final CharSequence buf, final Cursor cursor,
275             final BitSet delimiters, final StringBuilder dst) {
276         Args.notNull(buf, "Char sequence");
277         Args.notNull(cursor, "Parser cursor");
278         Args.notNull(dst, "String builder");
279         int pos = cursor.getPos();
280         final int indexFrom = cursor.getPos();
281         final int indexTo = cursor.getUpperBound();
282         for (int i = indexFrom; i < indexTo; i++) {
283             final char current = buf.charAt(i);
284             if ((delimiters != null && delimiters.get(current))
285                     || isWhitespace(current) || current == DQUOTE) {
286                 break;
287             }
288             pos++;
289             dst.append(current);
290         }
291         cursor.updatePos(pos);
292     }
293 
294     /**
295      * Transfers content enclosed with quote marks into the destination buffer.
296      *
297      * @param buf buffer with the sequence of chars to be parsed
298      * @param cursor defines the bounds and current position of the buffer
299      * @param dst destination buffer
300      */
301     public void copyQuotedContent(final CharSequence buf, final Cursor cursor,
302             final StringBuilder dst) {
303         Args.notNull(buf, "Char sequence");
304         Args.notNull(cursor, "Parser cursor");
305         Args.notNull(dst, "String builder");
306         if (cursor.atEnd()) {
307             return;
308         }
309         int pos = cursor.getPos();
310         int indexFrom = cursor.getPos();
311         final int indexTo = cursor.getUpperBound();
312         char current = buf.charAt(pos);
313         if (current != DQUOTE) {
314             return;
315         }
316         pos++;
317         indexFrom++;
318         boolean escaped = false;
319         for (int i = indexFrom; i < indexTo; i++, pos++) {
320             current = buf.charAt(i);
321             if (escaped) {
322                 if (current != DQUOTE && current != ESCAPE) {
323                     dst.append(ESCAPE);
324                 }
325                 dst.append(current);
326                 escaped = false;
327             } else {
328                 if (current == DQUOTE) {
329                     pos++;
330                     break;
331                 }
332                 if (current == ESCAPE) {
333                     escaped = true;
334                 } else if (current != CR && current != LF) {
335                     dst.append(current);
336                 }
337             }
338         }
339         cursor.updatePos(pos);
340     }
341 
342 }