/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.maven.index;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;

import org.apache.lucene.search.Query;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.maven.index.context.IndexingContext;
import org.apache.maven.index.expr.SourcedSearchExpression;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

/**
 * Exercises flat and iterator searches across three indexing contexts that all index the very same repository
 * directory, to show how the number of returned hits depends on the comparator or filter set on the request.
 */
public class DuplicateSearchTest extends AbstractNexusIndexerTest {
    protected File repo = new File(getBasedir(), "src/test/repo");

    protected IndexingContext context1;

    protected Directory contextDir1 = new ByteBuffersDirectory();

    protected IndexingContext context2;

    protected Directory contextDir2 = new ByteBuffersDirectory();

    /**
     * Registers three indexing contexts over the same repository directory and scans each of them.
     *
     * @param nexusIndexer the indexer the contexts are added to and scanned with
     * @throws Exception if adding or scanning a context fails
     */
    @Override
    protected void prepareNexusIndexer(NexusIndexer nexusIndexer) throws Exception {
        // we have a context with ID "repo1-ctx" that contains index of repository with ID "repo1"
        context = nexusIndexer.addIndexingContext("repo1-ctx", "repo1", repo, indexDir, null, null, FULL_CREATORS);
        // we have a context with ID "repo2-ctx" that contains index of repository with ID "repo2"
        context1 = nexusIndexer.addIndexingContext("repo2-ctx", "repo2", repo, contextDir1, null, null, FULL_CREATORS);
        // we have a context with ID "repo3-ctx" that contains index of repository with ID "repo2"
        context2 = nexusIndexer.addIndexingContext("repo3-ctx", "repo2", repo, contextDir2, null, null, FULL_CREATORS);

        // note: those three contexts, while representing different entities, are actually indexing the same
        // repository directory, hence they will have exactly the same content! Also, context1 and context2 both
        // claim to index the repository with ID "repo2"!

        nexusIndexer.scan(context);
        nexusIndexer.scan(context1);
        nexusIndexer.scan(context2);

        assertNotNull(context.getTimestamp());
        assertNotNull(context1.getTimestamp());
        assertNotNull(context2.getTimestamp());
    }

    // a bit of explanation:
    // we focus on a G "org.slf4j". The given section (subdir tree) looks like this (simplified):
    // ├── org
    //     ├── slf4j
    //         ├── jcl104-over-slf4j
    //         │   └── 1.4.2
    //         │       ├── jcl104-over-slf4j-1.4.2-sources.jar
    //         │       ├── jcl104-over-slf4j-1.4.2-sources.jar.sha1
    //         │       ├── jcl104-over-slf4j-1.4.2.jar
    //         │       ├── jcl104-over-slf4j-1.4.2.jar.sha1
    //         │       ├── jcl104-over-slf4j-1.4.2.pom
    //         │       └── jcl104-over-slf4j-1.4.2.pom.sha1
    //         ├── slf4j-api
    //         │   ├── 1.4.1
    //         │   │   ├── slf4j-api-1.4.1-sources.jar
    //         │   │   ├── slf4j-api-1.4.1-sources.jar.sha1
    //         │   │   ├── slf4j-api-1.4.1.jar
    //         │   │   ├── slf4j-api-1.4.1.jar.sha1
    //         │   │   ├── slf4j-api-1.4.1.pom
    //         │   │   └── slf4j-api-1.4.1.pom.sha1
    //         │   └── 1.4.2
    //         │       ├── slf4j-api-1.4.2-sources.jar
    //         │       ├── slf4j-api-1.4.2-sources.jar.sha1
    //         │       ├── slf4j-api-1.4.2.jar
    //         │       ├── slf4j-api-1.4.2.jar.sha1
    //         │       ├── slf4j-api-1.4.2.pom
    //         │       └── slf4j-api-1.4.2.pom.sha1
    //         └── slf4j-log4j12
    //             └── 1.4.1
    //                 ├── slf4j-log4j12-1.4.1-bin.tar.gz
    //                 ├── slf4j-log4j12-1.4.1-bin.zip
    //                 ├── slf4j-log4j12-1.4.1-sources.jar
    //                 ├── slf4j-log4j12-1.4.1-sources.jar.sha1
    //                 ├── slf4j-log4j12-1.4.1.jar
    //                 ├── slf4j-log4j12-1.4.1.jar.sha1
    //                 ├── slf4j-log4j12-1.4.1.pom
    //                 └── slf4j-log4j12-1.4.1.pom.sha1
    //
    // Records on index are created as: each main and each "classified" artifact is one Document.
    // Meaning, with structure above, for groupId "org.slf4j" we have 10 records:
    // G:A:V
    // org.slf4j:jcl104-over-slf4j:1.4.2:jar
    // org.slf4j:jcl104-over-slf4j:1.4.2:jar:sources
    // org.slf4j:slf4j-api:1.4.1:jar
    // org.slf4j:slf4j-api:1.4.1:jar:sources
    // org.slf4j:slf4j-api:1.4.2:jar
    // org.slf4j:slf4j-api:1.4.2:jar:sources
    // org.slf4j:slf4j-log4j12:1.4.1:jar
    // org.slf4j:slf4j-log4j12:1.4.1:jar:sources
    // org.slf4j:slf4j-log4j12:1.4.1:zip:bin
    // org.slf4j:slf4j-log4j12:1.4.1:tar.gz:bin
    //
    // ArtifactInfo, along with GAV carries contextId and repositoryId too!

    /**
     * Runs a flat search for groupId "org.slf4j" over all three contexts with the request's default comparator
     * and expects 10 results.
     *
     * @throws IOException if the search fails
     */
    @Test
    public void testProveSvnRev1158917IsWrong() throws IOException {
        // the change in SVN Rev1158917 (http://svn.apache.org/viewvc?view=revision&revision=1158917) is wrong (and
        // is undone), because after removing it we still don't have GAV dupes in results; here is a proof:

        Query query = nexusIndexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.slf4j"));
        FlatSearchRequest fsReq = new FlatSearchRequest(query);
        fsReq.getContexts().add(context);
        fsReq.getContexts().add(context1);
        fsReq.getContexts().add(context2);

        FlatSearchResponse fsResp = nexusIndexer.searchFlat(fsReq);

        assertEquals(
                "We have 10 GAVs coming from three contextes",
                10,
                fsResp.getResults().size());

        // Why? Look at the default comparator FlatSearchRequest uses: it is ArtifactInfo.VERSION_COMPARATOR,
        // which neglects contextId and repositoryId and compares GAVs only, and the Collection fixed in
        // SVN Rev1158917 is actually a Set<ArtifactInfo> with a proper comparator set.
    }

    /**
     * Runs the same flat search with {@code ArtifactInfo.CONTEXT_VERSION_COMPARATOR} set explicitly and expects
     * 30 results.
     *
     * @throws IOException if the search fails
     */
    @Test
    public void testHowUniqueSearchShouldBeDone() throws IOException {
        // my use case: I am searching for duplicates in the given contexts belonging to groupId "org.slf4j".
        // I expect to find the intersection of the repositories, since each of those indexes/repositories
        // contains that groupId.

        Query query = nexusIndexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.slf4j"));

        FlatSearchRequest fsReq = new FlatSearchRequest(query);
        fsReq.setArtifactInfoComparator(ArtifactInfo.CONTEXT_VERSION_COMPARATOR);
        fsReq.getContexts().add(context);
        fsReq.getContexts().add(context1);
        fsReq.getContexts().add(context2);

        FlatSearchResponse fsResp = nexusIndexer.searchFlat(fsReq);

        assertEquals(
                "We have 10 GAVs coming from three contextes, it is 30",
                30,
                fsResp.getResults().size());

        // Why? We explicitly set the comparator to CONTEXT_VERSION_COMPARATOR, which compares GAV+contextId, hence
        // the search will return all hits from all participating contexts.
    }

    /**
     * Runs an iterator search over all three contexts with a filter that keeps one hit per groupId/artifactId
     * pair, then checks the number of iterated results and the hit counts reported by the response.
     *
     * @throws IOException if the search or closing the response fails
     */
    @Test
    public void testHowtoPerformAggregatedSearch() throws IOException {
        // Note: currently this is implemented for IteratorSearches only! TBD for Flat and Grouped searches

        // my use case: searching across multiple contexts, querying how many combinations of GAs exist in groupId
        // "org.slf4j".

        Query query = nexusIndexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.slf4j"));

        IteratorSearchRequest isReq = new IteratorSearchRequest(query);

        // so, how many different GA combinations exist; this is almost equal to SQL's group by
        // "groupId, artifactId"
        isReq.setArtifactInfoFilter(
                new UniqueArtifactFilterPostprocessor(new HashSet<>(Arrays.asList(MAVEN.GROUP_ID, MAVEN.ARTIFACT_ID))));
        isReq.getContexts().add(context);
        isReq.getContexts().add(context1);
        isReq.getContexts().add(context2);

        // Note: iteratorSearch is a completely different beast than flat or grouped searches. While it excels in
        // low memory consumption and extra features (like presented here), it needs special care: you have to
        // handle it as a resource, since lazy loading requires context locking, and if you forget to do so, you
        // will end up with a flaky application that will most probably fail (by deadlocking itself or thrashing
        // indexes).

        int actualResultCount = 0;

        // try-with-resources guarantees the response is closed even when iteration or an assertion fails
        try (IteratorSearchResponse isResp = nexusIndexer.searchIterator(isReq)) {
            // consume the iterator to count actual result set size; the elements themselves are not inspected
            for (ArtifactInfo ignored : isResp) {
                actualResultCount++;
            }

            assertEquals("Iterator delivered to us 3 results, since we have 3 GA combinations", 3, actualResultCount);
            assertEquals(
                    "IteratorSearch is strange beast, due to it's nature, it cannot say how many elements it (will) return in advance, due to filtering, postprocessing, etc",
                    -1,
                    isResp.getReturnedHitsCount());
            assertEquals(
                    "The processing/search tackled 10 GAVs coming from three contextes, it is 30. This is the record count that were hit by processing of this search, but IS NOT the count results (it depends on filtering, comparators, etc)!",
                    30,
                    isResp.getTotalHitsCount());
        }
    }
}