View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.index;
20  
21  import java.io.File;
22  import java.io.IOException;
23  import java.util.Arrays;
24  import java.util.HashSet;
25  
26  import org.apache.lucene.search.Query;
27  import org.apache.lucene.store.ByteBuffersDirectory;
28  import org.apache.lucene.store.Directory;
29  import org.apache.maven.index.context.IndexingContext;
30  import org.apache.maven.index.expr.SourcedSearchExpression;
31  import org.junit.Test;
32  
33  import static org.junit.Assert.assertEquals;
34  import static org.junit.Assert.assertNotNull;
35  
36  public class DuplicateSearchTest extends AbstractNexusIndexerTest {
37      protected File repo = new File(getBasedir(), "src/test/repo");
38  
39      protected IndexingContext context1;
40  
41      protected Directory contextDir1 = new ByteBuffersDirectory();
42  
43      protected IndexingContext context2;
44  
45      protected Directory contextDir2 = new ByteBuffersDirectory();
46  
47      @Override
48      protected void prepareNexusIndexer(NexusIndexer nexusIndexer) throws Exception {
49          // we have a context with ID "repo1-ctx" that contains index of repository with ID "repo1"
50          context = nexusIndexer.addIndexingContext("repo1-ctx", "repo1", repo, indexDir, null, null, FULL_CREATORS);
51          // we have a context with ID "repo2-ctx" that contains index of repository with ID "repo2"
52          context1 = nexusIndexer.addIndexingContext("repo2-ctx", "repo2", repo, contextDir1, null, null, FULL_CREATORS);
53          // we have a context with ID "repo3-ctx" that contains index of repository with ID "repo2"
54          context2 = nexusIndexer.addIndexingContext("repo3-ctx", "repo2", repo, contextDir2, null, null, FULL_CREATORS);
55  
56          // note: those three contexts, while representing different entities are actually indexing the same repository
57          // directory, hence, will have exactly same content! Also, context1 and context2 do say, they both index
58          // repository with ID "repo2"!
59  
60          nexusIndexer.scan(context);
61          nexusIndexer.scan(context1);
62          nexusIndexer.scan(context2);
63  
64          assertNotNull(context.getTimestamp());
65          assertNotNull(context1.getTimestamp());
66          assertNotNull(context2.getTimestamp());
67      }
68  
69      // a bit of explanation:
70      // we focus on a G "org.slf4j". The given section (subdir tree) looks like this (simplified):
71      // ├── org
72      //    ├── slf4j
73      //       ├── jcl104-over-slf4j
74      //       │   └── 1.4.2
75      //       │   ├── jcl104-over-slf4j-1.4.2-sources.jar
76      //       │   ├── jcl104-over-slf4j-1.4.2-sources.jar.sha1
77      //       │   ├── jcl104-over-slf4j-1.4.2.jar
78      //       │   ├── jcl104-over-slf4j-1.4.2.jar.sha1
79      //       │   ├── jcl104-over-slf4j-1.4.2.pom
80      //       │   └── jcl104-over-slf4j-1.4.2.pom.sha1
81      //       ├── slf4j-api
82      //       │   ├── 1.4.1
83      //       │   │   ├── slf4j-api-1.4.1-sources.jar
84      //       │   │   ├── slf4j-api-1.4.1-sources.jar.sha1
85      //       │   │   ├── slf4j-api-1.4.1.jar
86      //       │   │   ├── slf4j-api-1.4.1.jar.sha1
87      //       │   │   ├── slf4j-api-1.4.1.pom
88      //       │   │   └── slf4j-api-1.4.1.pom.sha1
89      //       │   └── 1.4.2
90      //       │   ├── slf4j-api-1.4.2-sources.jar
91      //       │   ├── slf4j-api-1.4.2-sources.jar.sha1
92      //       │   ├── slf4j-api-1.4.2.jar
93      //       │   ├── slf4j-api-1.4.2.jar.sha1
94      //       │   ├── slf4j-api-1.4.2.pom
95      //       │   └── slf4j-api-1.4.2.pom.sha1
96      //       └── slf4j-log4j12
97      //       └── 1.4.1
98      //       ├── slf4j-log4j12-1.4.1-bin.tar.gz
99      //       ├── slf4j-log4j12-1.4.1-bin.zip
100     //       ├── slf4j-log4j12-1.4.1-sources.jar
101     //       ├── slf4j-log4j12-1.4.1-sources.jar.sha1
102     //       ├── slf4j-log4j12-1.4.1.jar
103     //       ├── slf4j-log4j12-1.4.1.jar.sha1
104     //       ├── slf4j-log4j12-1.4.1.pom
105     //       └── slf4j-log4j12-1.4.1.pom.sha1
106     //
107     // Records on index are created as: each main and each "classified" artifact is one Document.
108     // Meaning, with structure above, for groupId "org.slf4j" we have 10 records:
109     // G:A:V
110     // org.slf4j:jcl104-over-slf4j:1.4.2:jar
111     // org.slf4j:jcl104-over-slf4j:1.4.2:jar:sources
112     // org.slf4j:slf4j-api:1.4.1:jar
113     // org.slf4j:slf4j-api:1.4.1:jar:sources
114     // org.slf4j:slf4j-api:1.4.2:jar
115     // org.slf4j:slf4j-api:1.4.2:jar:sources
116     // org.slf4j:slf4j-log4j12:1.4.1:jar
117     // org.slf4j:slf4j-log4j12:1.4.1:jar:sources
118     // org.slf4j:slf4j-log4j12:1.4.1:zip:bin
119     // org.slf4j:slf4j-log4j12:1.4.1:tar.gz:bin
120     //
121     // ArtifactInfo, along with GAV carries contextId and repositoryId too!
122 
123     @Test
124     public void testProveSvnRev1158917IsWrong() throws IOException {
125         // change is SVN Rev1158917 (http://svn.apache.org/viewvc?view=revision&revision=1158917) is wrong (and is
126         // undone)
127         // because after removing it, we still dont have GAV dupes in results, here is a proof:
128 
129         Query query = nexusIndexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.slf4j"));
130         FlatSearchRequest fsReq = new FlatSearchRequest(query);
131         fsReq.getContexts().add(context);
132         fsReq.getContexts().add(context1);
133         fsReq.getContexts().add(context2);
134 
135         FlatSearchResponse fsResp = nexusIndexer.searchFlat(fsReq);
136 
137         assertEquals(
138                 "We have 10 GAVs coming from three contextes",
139                 10,
140                 fsResp.getResults().size());
141 
142         // Why? Look at the FlatSearchRequest default comparator it uses, it is ArtifactInfo.VERSION_COMPARATOR
143         // that neglects contextId and repositoryId and compares GAVs only, and the Collection fixed in SVN Rev1158917
144         // is actually a Set<ArtifactInfo with proper comparator set.
145     }
146 
147     @Test
148     public void testHowUniqueSearchShouldBeDone() throws IOException {
149         // my use case: I am searching for duplicates in given two contexts belonging to given groupId "org.slf4j"
150         // I expect to find intersection of two reposes, since both of those indexes/reposes contains that
151 
152         Query query = nexusIndexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.slf4j"));
153 
154         FlatSearchRequest fsReq = new FlatSearchRequest(query);
155         fsReq.setArtifactInfoComparator(ArtifactInfo.CONTEXT_VERSION_COMPARATOR);
156         fsReq.getContexts().add(context);
157         fsReq.getContexts().add(context1);
158         fsReq.getContexts().add(context2);
159 
160         FlatSearchResponse fsResp = nexusIndexer.searchFlat(fsReq);
161 
162         assertEquals(
163                 "We have 10 GAVs coming from three contextes, it is 30",
164                 30,
165                 fsResp.getResults().size());
166 
167         // Why? We set explicitly the comparator to CONTEXT_VERSION_COMPARATOR, that compares GAV+contextId, hence,
168         // will return all hits from all participating contexts.
169     }
170 
171     @Test
172     public void testHowtoPerformAggregatedSearch() throws IOException {
173         // Note: currently this is implemented for IteratorSearches only! TBD for Flat and Grouped searches
174 
175         // my use case: searching across multiple contexts, querying how many combinations of GAs exists in groupId
176         // "org.slf4j".
177 
178         Query query = nexusIndexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.slf4j"));
179 
180         IteratorSearchRequest isReq = new IteratorSearchRequest(query);
181 
182         // so, how many different GA combinations exists, this is almost equal to SQLs group by "groupId, artifactId"
183         isReq.setArtifactInfoFilter(
184                 new UniqueArtifactFilterPostprocessor(new HashSet<>(Arrays.asList(MAVEN.GROUP_ID, MAVEN.ARTIFACT_ID))));
185         isReq.getContexts().add(context);
186         isReq.getContexts().add(context1);
187         isReq.getContexts().add(context2);
188 
189         // Note: iteratorSearch is completely different beast that flat or grouped searches. While it excels in
190         // low memory consumption and extra features (like presented here), it needs special care: you have to handle it
191         // as resource, since lazy loading requires context locking, and if you forget to do so, you will end up with a
192         // flaky
193         // application that will most probably fail (by deadlocking itself or thrashing indexes).
194 
195         IteratorSearchResponse isResp = null;
196         int actualResultCount = 0;
197 
198         try {
199             isResp = nexusIndexer.searchIterator(isReq);
200 
201             // consume the iterator to count actual result set size
202             for (ArtifactInfo ai : isResp) {
203                 actualResultCount++;
204             }
205         } finally {
206             if (isResp != null) {
207                 isResp.close();
208             }
209         }
210 
211         assertEquals("Iterator delivered to us 3 results, since we have 3 GA combinations", 3, actualResultCount);
212         assertEquals(
213                 "IteratorSearch is strange beast, due to it's nature, it cannot say how many elements it (will) return in advance, due to filtering, postprocessing, etc",
214                 -1,
215                 isResp.getReturnedHitsCount());
216         assertEquals(
217                 "The processing/search tackled 10 GAVs coming from three contextes, it is 30. This is the record count that were hit by processing of this search, but IS NOT the count results (it depends on filtering, comparators, etc)!",
218                 30,
219                 isResp.getTotalHitsCount());
220     }
221 }