View Javadoc
1   package org.apache.maven.index.updater;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.BufferedInputStream;
23  import java.io.DataInput;
24  import java.io.DataInputStream;
25  import java.io.EOFException;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.io.UTFDataFormatException;
29  import java.util.Date;
30  import java.util.zip.GZIPInputStream;
31  
32  import com.google.common.base.Strings;
33  import java.util.LinkedHashSet;
34  import java.util.Set;
35  import org.apache.lucene.document.Document;
36  import org.apache.lucene.document.Field;
37  import org.apache.lucene.document.Field.Index;
38  import org.apache.lucene.document.Field.Store;
39  import org.apache.lucene.index.IndexWriter;
40  import org.apache.maven.index.ArtifactInfo;
41  import org.apache.maven.index.context.IndexUtils;
42  import org.apache.maven.index.context.IndexingContext;
43  
44  /**
45   * An index data reader used to parse transfer index format.
46   *
47   * @author Eugene Kuleshov
48   */
49  public class IndexDataReader
50  {
51      private final DataInputStream dis;
52  
53      public IndexDataReader( final InputStream is )
54          throws IOException
55      {
56          // MINDEXER-13
57          // LightweightHttpWagon may have performed automatic decompression
58          // Handle it transparently
59          is.mark( 2 );
60          InputStream data;
61          if ( is.read() == 0x1f && is.read() == 0x8b ) // GZIPInputStream.GZIP_MAGIC
62          {
63              is.reset();
64              data = new BufferedInputStream( new GZIPInputStream( is, 1024 * 8 ), 1024 * 8 );
65          }
66          else
67          {
68              BufferedInputStream bis = new BufferedInputStream( is, 1024 * 8 );
69              bis.reset();
70              data = bis;
71          }
72  
73          this.dis = new DataInputStream( data );
74      }
75  
76      public IndexDataReadResult readIndex( IndexWriter w, IndexingContext context )
77          throws IOException
78      {
79          long timestamp = readHeader();
80  
81          Date date = null;
82  
83          if ( timestamp != -1 )
84          {
85              date = new Date( timestamp );
86  
87              IndexUtils.updateTimestamp( w.getDirectory(), date );
88          }
89  
90          int n = 0;
91  
92          Document doc;
93          Set<String> rootGroups = new LinkedHashSet<>();
94          Set<String> allGroups = new LinkedHashSet<>();
95  
96          while ( ( doc = readDocument() ) != null )
97          {
98              ArtifactInfo ai = IndexUtils.constructArtifactInfo( doc, context );
99              if ( ai != null )
100             {
101                 w.addDocument( IndexUtils.updateDocument( doc, context, false, ai ) );
102 
103                 rootGroups.add( ai.getRootGroup() );
104                 allGroups.add( ai.getGroupId() );
105 
106             }
107             else
108             {
109                 w.addDocument( doc );
110             }
111             n++;
112         }
113 
114         w.commit();
115 
116         IndexDataReadResult result = new IndexDataReadResult();
117         result.setDocumentCount( n );
118         result.setTimestamp( date );
119         result.setRootGroups( rootGroups );
120         result.setAllGroups( allGroups );
121 
122         return result;
123     }
124 
125     public long readHeader()
126         throws IOException
127     {
128         final byte hdrbyte = (byte) ( ( IndexDataWriter.VERSION << 24 ) >> 24 );
129 
130         if ( hdrbyte != dis.readByte() )
131         {
132             // data format version mismatch
133             throw new IOException( "Provided input contains unexpected data (0x01 expected as 1st byte)!" );
134         }
135 
136         return dis.readLong();
137     }
138 
139     public Document readDocument()
140         throws IOException
141     {
142         int fieldCount;
143         try
144         {
145             fieldCount = dis.readInt();
146         }
147         catch ( EOFException ex )
148         {
149             return null; // no more documents
150         }
151 
152         Document doc = new Document();
153 
154         for ( int i = 0; i < fieldCount; i++ )
155         {
156             doc.add( readField() );
157         }
158 
159         // Fix up UINFO field wrt MINDEXER-41
160         final Field uinfoField = (Field) doc.getField( ArtifactInfo.UINFO );
161         final String info =  doc.get( ArtifactInfo.INFO );
162         if ( uinfoField != null && !Strings.isNullOrEmpty( info ) )
163         {
164             final String[] splitInfo = ArtifactInfo.FS_PATTERN.split( info );
165             if ( splitInfo.length > 6 )
166             {
167                 final String extension = splitInfo[6];
168                 final String uinfoString = uinfoField.stringValue();
169                 if ( uinfoString.endsWith( ArtifactInfo.FS + ArtifactInfo.NA ) )
170                 {
171                     uinfoField.setStringValue( uinfoString + ArtifactInfo.FS + ArtifactInfo.nvl( extension ) );
172                 }
173             }
174         }
175 
176         return doc;
177     }
178 
179     private Field readField()
180         throws IOException
181     {
182         int flags = dis.read();
183 
184         Index index = Index.NO;
185         if ( ( flags & IndexDataWriter.F_INDEXED ) > 0 )
186         {
187             boolean isTokenized = ( flags & IndexDataWriter.F_TOKENIZED ) > 0;
188             index = isTokenized ? Index.ANALYZED : Index.NOT_ANALYZED;
189         }
190 
191         Store store = Store.NO;
192         if ( ( flags & IndexDataWriter.F_STORED ) > 0 )
193         {
194             store = Store.YES;
195         }
196 
197         String name = dis.readUTF();
198         String value = readUTF( dis );
199 
200         return new Field( name, value, store, index );
201     }
202 
203     private static String readUTF( DataInput in )
204         throws IOException
205     {
206         int utflen = in.readInt();
207 
208         byte[] bytearr;
209         char[] chararr;
210 
211         try
212         {
213             bytearr = new byte[utflen];
214             chararr = new char[utflen];
215         }
216         catch ( OutOfMemoryError e )
217         {
218             final IOException ex =
219                 new IOException( "Index data content is inappropriate (is junk?), leads to OutOfMemoryError!"
220                     + " See MINDEXER-28 for more information!" );
221             ex.initCause( e );
222             throw ex;
223         }
224 
225         int c, char2, char3;
226         int count = 0;
227         int chararrCount = 0;
228 
229         in.readFully( bytearr, 0, utflen );
230 
231         while ( count < utflen )
232         {
233             c = bytearr[count] & 0xff;
234             if ( c > 127 )
235             {
236                 break;
237             }
238             count++;
239             chararr[chararrCount++] = (char) c;
240         }
241 
242         while ( count < utflen )
243         {
244             c = bytearr[count] & 0xff;
245             switch ( c >> 4 )
246             {
247                 case 0:
248                 case 1:
249                 case 2:
250                 case 3:
251                 case 4:
252                 case 5:
253                 case 6:
254                 case 7:
255                     /* 0xxxxxxx */
256                     count++;
257                     chararr[chararrCount++] = (char) c;
258                     break;
259 
260                 case 12:
261                 case 13:
262                     /* 110x xxxx 10xx xxxx */
263                     count += 2;
264                     if ( count > utflen )
265                     {
266                         throw new UTFDataFormatException( "malformed input: partial character at end" );
267                     }
268                     char2 = bytearr[count - 1];
269                     if ( ( char2 & 0xC0 ) != 0x80 )
270                     {
271                         throw new UTFDataFormatException( "malformed input around byte " + count );
272                     }
273                     chararr[chararrCount++] = (char) ( ( ( c & 0x1F ) << 6 ) | ( char2 & 0x3F ) );
274                     break;
275 
276                 case 14:
277                     /* 1110 xxxx 10xx xxxx 10xx xxxx */
278                     count += 3;
279                     if ( count > utflen )
280                     {
281                         throw new UTFDataFormatException( "malformed input: partial character at end" );
282                     }
283                     char2 = bytearr[count - 2];
284                     char3 = bytearr[count - 1];
285                     if ( ( ( char2 & 0xC0 ) != 0x80 ) || ( ( char3 & 0xC0 ) != 0x80 ) )
286                     {
287                         throw new UTFDataFormatException( "malformed input around byte " + ( count - 1 ) );
288                     }
289                     chararr[chararrCount++] =
290                         (char) ( ( ( c & 0x0F ) << 12 ) | ( ( char2 & 0x3F ) << 6 ) | ( ( char3 & 0x3F ) << 0 ) );
291                     break;
292 
293                 default:
294                     /* 10xx xxxx, 1111 xxxx */
295                     throw new UTFDataFormatException( "malformed input around byte " + count );
296             }
297         }
298 
299         // The number of chars produced may be less than utflen
300         return new String( chararr, 0, chararrCount );
301     }
302 
303     /**
304      * An index data read result holder
305      */
306     public static class IndexDataReadResult
307     {
308         private Date timestamp;
309 
310         private int documentCount;
311 
312         private Set<String> rootGroups;
313 
314         private Set<String> allGroups;
315 
316         public void setDocumentCount( int documentCount )
317         {
318             this.documentCount = documentCount;
319         }
320 
321         public int getDocumentCount()
322         {
323             return documentCount;
324         }
325 
326         public void setTimestamp( Date timestamp )
327         {
328             this.timestamp = timestamp;
329         }
330 
331         public Date getTimestamp()
332         {
333             return timestamp;
334         }
335 
336         public void setRootGroups( Set<String> rootGroups )
337         {
338             this.rootGroups = rootGroups;
339         }
340 
341         public Set<String> getRootGroups()
342         {
343             return rootGroups;
344         }
345 
346         public void setAllGroups( Set<String> allGroups )
347         {
348             this.allGroups = allGroups;
349         }
350 
351         public Set<String> getAllGroups()
352         {
353             return allGroups;
354         }
355 
356     }
357 
358     /**
359      * Reads index content by using a visitor. <br>
360      * The visitor is called for each read documents after it has been populated with Lucene fields.
361      *
362      * @param visitor an index data visitor
363      * @param context indexing context
364      * @return statistics about read data
365      * @throws IOException in case of an IO exception during index file access
366      */
367     public IndexDataReadResult readIndex( final IndexDataReadVisitor visitor, final IndexingContext context )
368         throws IOException
369     {
370         dis.readByte(); // data format version
371 
372         long timestamp = dis.readLong();
373 
374         Date date = null;
375 
376         if ( timestamp != -1 )
377         {
378             date = new Date( timestamp );
379         }
380 
381         int n = 0;
382 
383         Document doc;
384         while ( ( doc = readDocument() ) != null )
385         {
386             visitor.visitDocument( IndexUtils.updateDocument( doc, context, false ) );
387 
388             n++;
389         }
390 
391         IndexDataReadResult result = new IndexDataReadResult();
392         result.setDocumentCount( n );
393         result.setTimestamp( date );
394         return result;
395     }
396 
397     /**
398      * Visitor of indexed Lucene documents.
399      */
400     public interface IndexDataReadVisitor
401     {
402 
403         /**
404          * Called on each read document. The document is already populated with fields.
405          *
406          * @param document read document
407          */
408         void visitDocument( Document document );
409 
410     }
411 
412 }