Coverage Report - org.apache.any23.plugin.crawler.SiteCrawler

Classes in this File    Line Coverage    Branch Coverage    Complexity
SiteCrawler             0% (0/59)        0% (0/24)          2.294
SiteCrawler$1           0% (0/3)         N/A                2.294
 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.plugin.crawler;

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;
 
/**
 * A basic <em>site crawler</em> to extract semantic content
 * from small/medium size sites.
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SiteCrawler {

    public static final String DEFAULT_PAGE_FILTER_RE =
        ".*(\\.(" +
                    "css|js"                            +
                    "|bmp|gif|jpe?g|png|tiff?"          +
                    "|mid|mp2|mp3|mp4|wav|wma"          +
                    "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
                    "|pdf"        +
                    "|swf"        +
                    "|zip|rar|gz" +
                    "|xml|txt"    +
        "))$";
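
    // Illustrative sketch: the pattern above is matched against candidate page
    // URLs, and matching URLs are skipped by the crawler (see defaultFilters
    // below). Assuming the regex as defined, with hypothetical example URLs:
    //
    //   Pattern filter = Pattern.compile(DEFAULT_PAGE_FILTER_RE);
    //   filter.matcher("http://example.org/logo.png").matches();   // true  -> skipped
    //   filter.matcher("http://example.org/index.html").matches(); // false -> crawled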
 
    /**
     * Default number of crawler instances.
     */
    public static final int DEFAULT_NUM_OF_CRAWLERS = 10;

    /**
     * Default crawler implementation.
     */
    public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class;

    /**
     * Default filter applied to skip contents.
     */
    public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE);

    /**
     * The crawler threads controller.
     */
    private final CrawlController controller;

    /**
     * Crawler listeners.
     */
    private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>();

    /**
     * Actual number of crawler instances.
     */
    private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS;

    /**
     * Actual web crawler.
     */
    private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;

    /**
     * Max allowed depth, <code>-1</code> means no limit.
     */
    private int maxDepth = -1;

    /**
     * Max allowed pages, <code>-1</code> means no limit.
     */
    private int maxPages = -1;
 
    /**
     * Politeness delay between subsequent calls, <code>-1</code> means no delay.
     */
    private int politenessDelay = -1;
 
    /**
     * Internal executor service.
     */
    private ExecutorService service;
 
    /**
     * Constructor.
     *
     * @param storageFolder location used to store the temporary data structures used by the crawler.
     */
    public SiteCrawler(File storageFolder) {
        try {
            controller = new CrawlController( storageFolder.getAbsolutePath() );
        } catch (Exception e) {
            throw new IllegalArgumentException("Error while initializing crawler controller.", e);
        }
    }
 
    /**
     * @return the number of crawler instances.
     */
    public int getNumOfCrawlers() {
        return numOfCrawlers;
    }

    /**
     * Sets the number of crawler instances.
     *
     * @param n an integer &gt; 0.
     */
    public void setNumOfCrawlers(int n) {
        if(n <= 0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0 .");
        this.numOfCrawlers = n;
    }
 
    /**
     * @return the actual web crawler class.
     */
    public Class<? extends WebCrawler> getWebCrawler() {
        return webCrawler;
    }

    /**
     * Sets the actual crawler class.
     *
     * @param c a not <code>null</code> web crawler class.
     */
    public void setWebCrawler(Class<? extends WebCrawler> c) {
        if(c == null) throw new NullPointerException("c cannot be null.");
        this.webCrawler = c;
    }
 
    /**
     * @return the max allowed crawl depth, <code>-1</code> means no limit.
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * Sets the maximum depth.
     *
     * @param maxDepth maximum allowed depth. <code>-1</code> means no limit.
     */
    public void setMaxDepth(int maxDepth) {
        if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
        if(maxDepth > 0) try {
            controller.setMaximumCrawlDepth(maxDepth);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error while setting maxDepth.", e);
        }
        this.maxDepth = maxDepth;
    }
 
    /**
     * @return the max number of allowed pages, <code>-1</code> means no limit.
     */
    public int getMaxPages() {
        return maxPages;
    }

    /**
     * Sets the maximum number of collected pages.
     *
     * @param maxPages maximum allowed pages. <code>-1</code> means no limit.
     */
    public void setMaxPages(int maxPages) {
        if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
        if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
        this.maxPages = maxPages;
    }
 
    /**
     * @return the politeness delay in milliseconds.
     */
    public int getPolitenessDelay() {
        return politenessDelay;
    }

    /**
     * Sets the politeness delay. <code>-1</code> means no politeness.
     *
     * @param millis delay in milliseconds.
     */
    public void setPolitenessDelay(int millis) {
        if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1");
        if(millis >= 0) controller.setPolitenessDelay(millis);
        this.politenessDelay = millis;
    }
 
    /**
     * Registers a {@link CrawlerListener} to this crawler.
     *
     * @param listener the listener to be registered.
     */
    public void addListener(CrawlerListener listener) {
        listeners.add(listener);
    }

    /**
     * Deregisters a {@link CrawlerListener} from this crawler.
     *
     * @param listener the listener to be deregistered.
     */
    public void removeListener(CrawlerListener listener) {
        listeners.remove(listener);
    }
 
    /**
     * Starts the crawling process.
     *
     * @param seed the starting URL for the crawler process.
     * @param filters filters to be applied to the crawler process. Can be <code>null</code>.
     * @param wait if <code>true</code> the process will wait for the crawler termination.
     * @throws Exception if an error occurs while starting the crawling process.
     */
    public synchronized void start(
            final URL seed, final Pattern filters, final boolean wait
    ) throws Exception {
        SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners));
        controller.addSeed(seed.toExternalForm());
        final Runnable internalRunnable = new Runnable() {
            @Override
            public void run() {
                controller.start(getWebCrawler(), getNumOfCrawlers());
            }
        };
        if(wait) {
            internalRunnable.run();
        } else {
            if(service != null) throw new IllegalStateException("Another service seems to run.");
            service = Executors.newSingleThreadExecutor();
            service.execute(internalRunnable);
        }
    }
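
    // Usage sketch for the non-blocking mode (the seed URL is hypothetical):
    //
    //   crawler.start(new URL("http://example.org/"), null, false); // returns immediately
    //   // ... the crawl proceeds on the internal single-thread executor ...
    //   crawler.stop();                                             // interrupts the background crawl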
 
    /**
     * Starts the crawler process with the {@link #defaultFilters}.
     *
     * @param seed the starting URL for the crawler process.
     * @param wait if <code>true</code> the process will wait for the crawler termination.
     * @throws Exception if an error occurs while starting the crawling process.
     */
    public void start(final URL seed, final boolean wait) throws Exception {
        start(seed, defaultFilters, wait);
    }
 
    /**
     * Interrupts the crawler process, if started with the <code>wait</code> flag set to <code>false</code>.
     */
    public synchronized void stop() {
        // Guard against the blocking (wait == true) or not-yet-started case, where no executor exists.
        if(service != null) service.shutdownNow();
    }
 
 }
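
A minimal usage sketch for the class above (the storage path and seed URL are
hypothetical; the calls are the ones defined in the listing):

    File storage = new File("/tmp/any23-crawler");        // scratch area for the crawler's temporary data
    SiteCrawler crawler = new SiteCrawler(storage);
    crawler.setMaxDepth(3);                               // follow links at most 3 levels deep
    crawler.setMaxPages(100);                             // fetch at most 100 pages
    crawler.setPolitenessDelay(500);                      // 500 ms between subsequent requests
    crawler.start(new URL("http://example.org/"), true);  // blocking crawl with the default filters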