View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.http.message;
29  
30  import java.util.NoSuchElementException;
31  
32  import org.apache.http.HeaderIterator;
33  import org.apache.http.ParseException;
34  import org.apache.http.TokenIterator;
35  import org.apache.http.util.Args;
36  
37  /**
38   * Basic implementation of a {@link TokenIterator}.
39   * This implementation parses {@code #token} sequences as
40   * defined by RFC 2616, section 2.
41   * It extends that definition somewhat beyond US-ASCII.
42   *
43   * @since 4.0
44   */
45  public class BasicTokenIterator implements TokenIterator {
46  
47      /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
48      // the order of the characters here is adjusted to put the
49      // most likely candidates at the beginning of the collection
50      public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
51  
52  
53      /** The iterator from which to obtain the next header. */
54      protected final HeaderIterator headerIt;
55  
56      /**
57       * The value of the current header.
58       * This is the header value that includes {@link #currentToken}.
59       * Undefined if the iteration is over.
60       */
61      protected String currentHeader;
62  
63      /**
64       * The token to be returned by the next call to {@link #nextToken()}.
65       * {@code null} if the iteration is over.
66       */
67      protected String currentToken;
68  
69      /**
70       * The position after {@link #currentToken} in {@link #currentHeader}.
71       * Undefined if the iteration is over.
72       */
73      protected int searchPos;
74  
75  
76      /**
77       * Creates a new instance of {@link BasicTokenIterator}.
78       *
79       * @param headerIterator    the iterator for the headers to tokenize
80       */
81      public BasicTokenIterator(final HeaderIterator headerIterator) {
82          super();
83          this.headerIt = Args.notNull(headerIterator, "Header iterator");
84          this.searchPos = findNext(-1);
85      }
86  
87  
88      // non-javadoc, see interface TokenIterator
89      @Override
90      public boolean hasNext() {
91          return (this.currentToken != null);
92      }
93  
94  
95      /**
96       * Obtains the next token from this iteration.
97       *
98       * @return  the next token in this iteration
99       *
100      * @throws NoSuchElementException   if the iteration is already over
101      * @throws ParseException   if an invalid header value is encountered
102      */
103     @Override
104     public String nextToken()
105         throws NoSuchElementException, ParseException {
106 
107         if (this.currentToken == null) {
108             throw new NoSuchElementException("Iteration already finished.");
109         }
110 
111         final String result = this.currentToken;
112         // updates currentToken, may trigger ParseException:
113         this.searchPos = findNext(this.searchPos);
114 
115         return result;
116     }
117 
118 
119     /**
120      * Returns the next token.
121      * Same as {@link #nextToken}, but with generic return type.
122      *
123      * @return  the next token in this iteration
124      *
125      * @throws NoSuchElementException   if there are no more tokens
126      * @throws ParseException   if an invalid header value is encountered
127      */
128     @Override
129     public final Object next()
130         throws NoSuchElementException, ParseException {
131         return nextToken();
132     }
133 
134 
135     /**
136      * Removing tokens is not supported.
137      *
138      * @throws UnsupportedOperationException    always
139      */
140     @Override
141     public final void remove()
142         throws UnsupportedOperationException {
143 
144         throw new UnsupportedOperationException
145             ("Removing tokens is not supported.");
146     }
147 
148 
149     /**
150      * Determines the next token.
151      * If found, the token is stored in {@link #currentToken}.
152      * The return value indicates the position after the token
153      * in {@link #currentHeader}. If necessary, the next header
154      * will be obtained from {@link #headerIt}.
155      * If not found, {@link #currentToken} is set to {@code null}.
156      *
157      * @param pos       the position in the current header at which to
158      *                  start the search, -1 to search in the first header
159      *
160      * @return  the position after the found token in the current header, or
161      *          negative if there was no next token
162      *
163      * @throws ParseException   if an invalid header value is encountered
164      */
165     protected int findNext(final int pos) throws ParseException {
166         int from = pos;
167         if (from < 0) {
168             // called from the constructor, initialize the first header
169             if (!this.headerIt.hasNext()) {
170                 return -1;
171             }
172             this.currentHeader = this.headerIt.nextHeader().getValue();
173             from = 0;
174         } else {
175             // called after a token, make sure there is a separator
176             from = findTokenSeparator(from);
177         }
178 
179         final int start = findTokenStart(from);
180         if (start < 0) {
181             this.currentToken = null;
182             return -1; // nothing found
183         }
184 
185         final int end = findTokenEnd(start);
186         this.currentToken = createToken(this.currentHeader, start, end);
187         return end;
188     }
189 
190 
191     /**
192      * Creates a new token to be returned.
193      * Called from {@link #findNext findNext} after the token is identified.
194      * The default implementation simply calls
195      * {@link java.lang.String#substring String.substring}.
196      * <p>
197      * If header values are significantly longer than tokens, and some
198      * tokens are permanently referenced by the application, there can
199      * be problems with garbage collection. A substring will hold a
200      * reference to the full characters of the original string and
201      * therefore occupies more memory than might be expected.
202      * To avoid this, override this method and create a new string
203      * instead of a substring.
204      * </p>
205      *
206      * @param value     the full header value from which to create a token
207      * @param start     the index of the first token character
208      * @param end       the index after the last token character
209      *
210      * @return  a string representing the token identified by the arguments
211      */
212     protected String createToken(final String value, final int start, final int end) {
213         return value.substring(start, end);
214     }
215 
216 
217     /**
218      * Determines the starting position of the next token.
219      * This method will iterate over headers if necessary.
220      *
221      * @param pos       the position in the current header at which to
222      *                  start the search
223      *
224      * @return  the position of the token start in the current header,
225      *          negative if no token start could be found
226      */
227     protected int findTokenStart(final int pos) {
228         int from = Args.notNegative(pos, "Search position");
229         boolean found = false;
230         while (!found && (this.currentHeader != null)) {
231 
232             final int to = this.currentHeader.length();
233             while (!found && (from < to)) {
234 
235                 final char ch = this.currentHeader.charAt(from);
236                 if (isTokenSeparator(ch) || isWhitespace(ch)) {
237                     // whitspace and token separators are skipped
238                     from++;
239                 } else if (isTokenChar(this.currentHeader.charAt(from))) {
240                     // found the start of a token
241                     found = true;
242                 } else {
243                     throw new ParseException
244                         ("Invalid character before token (pos " + from +
245                          "): " + this.currentHeader);
246                 }
247             }
248             if (!found) {
249                 if (this.headerIt.hasNext()) {
250                     this.currentHeader = this.headerIt.nextHeader().getValue();
251                     from = 0;
252                 } else {
253                     this.currentHeader = null;
254                 }
255             }
256         } // while headers
257 
258         return found ? from : -1;
259     }
260 
261 
262     /**
263      * Determines the position of the next token separator.
264      * Because of multi-header joining rules, the end of a
265      * header value is a token separator. This method does
266      * therefore not need to iterate over headers.
267      *
268      * @param pos       the position in the current header at which to
269      *                  start the search
270      *
271      * @return  the position of a token separator in the current header,
272      *          or at the end
273      *
274      * @throws ParseException
275      *         if a new token is found before a token separator.
276      *         RFC 2616, section 2.1 explicitly requires a comma between
277      *         tokens for {@code #}.
278      */
279     protected int findTokenSeparator(final int pos) {
280         int from = Args.notNegative(pos, "Search position");
281         boolean found = false;
282         final int to = this.currentHeader.length();
283         while (!found && (from < to)) {
284             final char ch = this.currentHeader.charAt(from);
285             if (isTokenSeparator(ch)) {
286                 found = true;
287             } else if (isWhitespace(ch)) {
288                 from++;
289             } else if (isTokenChar(ch)) {
290                 throw new ParseException
291                     ("Tokens without separator (pos " + from +
292                      "): " + this.currentHeader);
293             } else {
294                 throw new ParseException
295                     ("Invalid character after token (pos " + from +
296                      "): " + this.currentHeader);
297             }
298         }
299 
300         return from;
301     }
302 
303 
304     /**
305      * Determines the ending position of the current token.
306      * This method will not leave the current header value,
307      * since the end of the header value is a token boundary.
308      *
309      * @param from      the position of the first character of the token
310      *
311      * @return  the position after the last character of the token.
312      *          The behavior is undefined if {@code from} does not
313      *          point to a token character in the current header value.
314      */
315     protected int findTokenEnd(final int from) {
316         Args.notNegative(from, "Search position");
317         final int to = this.currentHeader.length();
318         int end = from+1;
319         while ((end < to) && isTokenChar(this.currentHeader.charAt(end))) {
320             end++;
321         }
322 
323         return end;
324     }
325 
326 
327     /**
328      * Checks whether a character is a token separator.
329      * RFC 2616, section 2.1 defines comma as the separator for
330      * {@code #token} sequences. The end of a header value will
331      * also separate tokens, but that is not a character check.
332      *
333      * @param ch        the character to check
334      *
335      * @return  {@code true} if the character is a token separator,
336      *          {@code false} otherwise
337      */
338     protected boolean isTokenSeparator(final char ch) {
339         return (ch == ',');
340     }
341 
342 
343     /**
344      * Checks whether a character is a whitespace character.
345      * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
346      * The optional preceeding line break is irrelevant, since header
347      * continuation is handled transparently when parsing messages.
348      *
349      * @param ch        the character to check
350      *
351      * @return  {@code true} if the character is whitespace,
352      *          {@code false} otherwise
353      */
354     protected boolean isWhitespace(final char ch) {
355 
356         // we do not use Character.isWhitspace(ch) here, since that allows
357         // many control characters which are not whitespace as per RFC 2616
358         return ((ch == '\t') || Character.isSpaceChar(ch));
359     }
360 
361 
362     /**
363      * Checks whether a character is a valid token character.
364      * Whitespace, control characters, and HTTP separators are not
365      * valid token characters. The HTTP specification (RFC 2616, section 2.2)
366      * defines tokens only for the US-ASCII character set, this
367      * method extends the definition to other character sets.
368      *
369      * @param ch        the character to check
370      *
371      * @return  {@code true} if the character is a valid token start,
372      *          {@code false} otherwise
373      */
374     protected boolean isTokenChar(final char ch) {
375 
376         // common sense extension of ALPHA + DIGIT
377         if (Character.isLetterOrDigit(ch)) {
378             return true;
379         }
380 
381         // common sense extension of CTL
382         if (Character.isISOControl(ch)) {
383             return false;
384         }
385 
386         // no common sense extension for this
387         if (isHttpSeparator(ch)) {
388             return false;
389         }
390 
391         // RFC 2616, section 2.2 defines a token character as
392         // "any CHAR except CTLs or separators". The controls
393         // and separators are included in the checks above.
394         // This will yield unexpected results for Unicode format characters.
395         // If that is a problem, overwrite isHttpSeparator(char) to filter
396         // out the false positives.
397         return true;
398     }
399 
400 
401     /**
402      * Checks whether a character is an HTTP separator.
403      * The implementation in this class checks only for the HTTP separators
404      * defined in RFC 2616, section 2.2. If you need to detect other
405      * separators beyond the US-ASCII character set, override this method.
406      *
407      * @param ch        the character to check
408      *
409      * @return  {@code true} if the character is an HTTP separator
410      */
411     protected boolean isHttpSeparator(final char ch) {
412         return (HTTP_SEPARATORS.indexOf(ch) >= 0);
413     }
414 
415 
416 } // class BasicTokenIterator
417