Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
SharedData |
|
| 2.0;2 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.any23.plugin.crawler; | |
19 | ||
20 | import edu.uci.ics.crawler4j.crawler.Page; | |
21 | ||
22 | import java.util.List; | |
23 | import java.util.regex.Pattern; | |
24 | ||
25 | /** | |
26 | * This class hosts shared data structures accessible | |
27 | * to all the {@link DefaultWebCrawler} instances | |
28 | * run by the {@link SiteCrawler}. | |
29 | * | |
30 | * @author Michele Mostarda (mostarda@fbk.eu) | |
31 | */ | |
32 | public class SharedData { | |
33 | ||
34 | /** | |
35 | * Singleton instance. | |
36 | */ | |
37 | private static SharedData instance; | |
38 | ||
39 | /** | |
40 | * Crawl seed. | |
41 | */ | |
42 | private final String seed; | |
43 | ||
44 | /** | |
45 | * Crawl page filter pattern. | |
46 | */ | |
47 | private final Pattern pattern; | |
48 | ||
49 | /** | |
50 | * List of crawler listeners. | |
51 | */ | |
52 | private final List<CrawlerListener> listeners; | |
53 | ||
54 | // /** | |
55 | // * Output triple handler. | |
56 | // */ | |
57 | // private final TripleHandler tripleHandler; | |
58 | ||
59 | /** | |
60 | * @return the singleton instance. | |
61 | */ | |
62 | protected static SharedData getInstance() { | |
63 | 0 | if(instance == null) throw new IllegalStateException("The configuration has not yet initialized."); |
64 | 0 | return instance; |
65 | } | |
66 | ||
67 | /** | |
68 | * Initializes the crawler data. | |
69 | * | |
70 | * @param seed crawler seed. | |
71 | * @param regex page filter regex. | |
72 | * @param listeners the listeners to be notified of the crawler activity. | |
73 | */ | |
74 | protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) { | |
75 | 0 | instance = new SharedData(seed, regex, listeners); |
76 | 0 | } |
77 | ||
78 | /** | |
79 | * Internal constructor. | |
80 | * | |
81 | * @param seed | |
82 | * @param pattern | |
83 | * @param listeners | |
84 | */ | |
85 | 0 | private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) { |
86 | 0 | if(seed == null || seed.trim().length() == 0) |
87 | 0 | throw new IllegalArgumentException( |
88 | String.format("Invalid seed '%s'", seed) | |
89 | ); | |
90 | ||
91 | 0 | this.seed = seed; |
92 | 0 | this.pattern = pattern; |
93 | 0 | this.listeners = listeners; |
94 | 0 | } |
95 | ||
96 | /** | |
97 | * @return crawl seed. | |
98 | */ | |
99 | protected String getSeed() { | |
100 | 0 | return seed; |
101 | } | |
102 | ||
103 | /** | |
104 | * @return page filter pattern. | |
105 | */ | |
106 | protected Pattern getPattern() { | |
107 | 0 | return pattern; |
108 | } | |
109 | ||
110 | /** | |
111 | * Notifies all listeners that a page has been discovered. | |
112 | * | |
113 | * @param page the discovered page. | |
114 | */ | |
115 | protected void notifyPage(Page page) { | |
116 | 0 | for(CrawlerListener listener : listeners) { |
117 | 0 | listener.visitedPage(page); |
118 | } | |
119 | 0 | } |
120 | ||
121 | } |