View Javadoc
1   package org.apache.maven.jxr.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.util.Collections;
23  import java.util.Vector;
24  
25  /**
26   * This is a small and fast word tokenizer. It has different characteristics from the normal Java tokenizer. It only
27   * considers clear words that are only ended with spaces as strings. EX: "Flight" would be a word but "Flight()" would
28   * not.
29   */
30  public class SimpleWordTokenizer
31  {
32  
33      /**
34       * Description of the Field
35       */
36      public static final char[] BREAKERS = { '(', ')', '[', ' ', '{', '}' };
37  
38      /**
39       * Break the given line into multiple StringUtils
40       */
41      public static StringEntry[] tokenize( String line )
42      {
43  
44          /*
45           * determine where to start processing this String... this could either be the start of the line or just keep
46           * going until the first
47           */
48          int start = getStart( line );
49  
50          // find the first non-BREAKER char and assume that is where you want to start
51  
52          if ( line == null || line.length() == 0 || start == -1 )
53          {
54              return new StringEntry[0];
55          }
56  
57          return tokenize( line, start );
58      }
59  
60      /**
61       * Tokenize the given line but only return StringUtils that match the parameter find.
62       *
63       * @param line String to search in
64       * @param find String to match.
65       */
66      public static StringEntry[] tokenize( String line, String find )
67      {
68  
69          Vector<StringEntry> v = new Vector<StringEntry>();
70  
71          for ( StringEntry se : tokenize( line ) )
72          {
73  
74              if ( se.toString().equals( find ) )
75              {
76                  v.addElement( se );
77              }
78  
79          }
80  
81          StringEntry[] found = new StringEntry[v.size()];
82          Collections.sort( v );
83          v.copyInto( found );
84          return found;
85      }
86  
87      /**
88       * Internal impl. Specify the start and end.
89       */
90      private static StringEntry[] tokenize( String line, int start )
91      {
92  
93          Vector<StringEntry> words = new Vector<StringEntry>();
94  
95          // algorithm works like this... break the line out into segments
96          // that are separated by spaces, and if the entire String doesn't contain
97          // a non-Alpha char then assume it is a word.
98          while ( true )
99          {
100 
101             int next = getNextBreak( line, start );
102 
103             if ( next < 0 || next <= start )
104             {
105                 break;
106             }
107 
108             String word = line.substring( start, next );
109 
110             if ( isWord( word ) )
111             {
112                 words.addElement( new StringEntry( word, start ) );
113             }
114 
115             start = next + 1;
116         }
117 
118         StringEntry[] found = new StringEntry[words.size()];
119         words.copyInto( found );
120         return found;
121     }
122 
123     /**
124      * Go through the entire String and if any character is not a Java identifier part (_, a, b, c, d, etc) then return
125      * false.
126      */
127     private static boolean isWord( String string )
128     {
129 
130         if ( string == null || string.length() == 0 )
131         {
132 
133             return false;
134         }
135 
136         for ( int i = 0; i < string.length(); ++i )
137         {
138 
139             char c = string.charAt( i );
140 
141             if ( !Character.isJavaIdentifierPart( c ) && c != '.' )
142             {
143                 return false;
144             }
145 
146         }
147 
148         return true;
149     }
150 
151     /**
152      * Go through the list of BREAKERS and find the closes one.
153      */
154     private static int getNextBreak( String string, int start )
155     {
156 
157         int breakPoint = -1;
158 
159         for ( int i = 0; i < BREAKERS.length; ++i )
160         {
161 
162             int next = string.indexOf( BREAKERS[i], start );
163 
164             if ( breakPoint == -1 || next < breakPoint && next != -1 )
165             {
166 
167                 breakPoint = next;
168 
169             }
170 
171         }
172 
173         // if the breakPoint is still -1 go to the end of the string
174         if ( breakPoint == -1 )
175         {
176             breakPoint = string.length();
177         }
178 
179         return breakPoint;
180     }
181 
182     /**
183      * Go through the list of BREAKERS and find the closes one.
184      */
185     private static int getStart( String string )
186     {
187 
188         for ( int i = 0; i < string.length(); ++i )
189         {
190 
191             if ( !isBreaker( string.charAt( i ) ) )
192             {
193                 return i;
194             }
195 
196         }
197 
198         return -1;
199     }
200 
201     /**
202      * Return true if the given char is considered a breaker.
203      */
204     private static boolean isBreaker( char c )
205     {
206 
207         for ( int i = 0; i < BREAKERS.length; ++i )
208         {
209 
210             if ( BREAKERS[i] == c )
211             {
212                 return true;
213             }
214 
215         }
216 
217         return false;
218     }
219 
220 }