Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
DefaultWebCrawler |
|
| 3.5;3.5 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.any23.plugin.crawler; | |
19 | ||
20 | import edu.uci.ics.crawler4j.crawler.Page; | |
21 | import edu.uci.ics.crawler4j.crawler.WebCrawler; | |
22 | import edu.uci.ics.crawler4j.url.WebURL; | |
23 | import org.slf4j.Logger; | |
24 | import org.slf4j.LoggerFactory; | |
25 | ||
26 | import java.util.regex.Pattern; | |
27 | ||
28 | /** | |
29 | * Default {@link WebCrawler} implementation. | |
30 | * | |
31 | * @author Michele Mostarda (mostarda@fbk.eu) | |
32 | */ | |
33 | 0 | public class DefaultWebCrawler extends WebCrawler { |
34 | ||
35 | 0 | private static final Logger logger = LoggerFactory.getLogger(DefaultWebCrawler.class); |
36 | ||
37 | /** | |
38 | * Shared data reference. | |
39 | */ | |
40 | 0 | private final SharedData sharedData = SharedData.getInstance(); |
41 | ||
42 | /** | |
43 | * Page filter pattern. | |
44 | */ | |
45 | 0 | private final Pattern pattern = sharedData.getPattern(); |
46 | ||
47 | /** | |
48 | * Override this method to specify whether the given URL should be visited or not. | |
49 | */ | |
50 | @Override | |
51 | public boolean shouldVisit(WebURL url) { | |
52 | 0 | if (url.getURL() == null) return false; |
53 | 0 | final String href = url.getURL().toLowerCase(); |
54 | 0 | if( ! href.startsWith( sharedData.getSeed() ) ) return false; |
55 | 0 | return pattern == null || ! pattern.matcher(href).matches(); |
56 | } | |
57 | ||
58 | /** | |
59 | * Override this method to implement the single page processing logic. | |
60 | */ | |
61 | @Override | |
62 | public void visit(Page page) { | |
63 | 0 | logger.trace("Visiting page: " + page.getWebURL().getURL()); |
64 | 0 | sharedData.notifyPage(page); |
65 | 0 | } |
66 | ||
67 | } | |
68 |