# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Builds an Apache Nutch 2.x image: checks out the 2.x branch from Subversion,
# compiles it with `ant runtime`, installs a site configuration plus test URLs,
# and starts /etc/bootstrap.sh as the container command.
#
# Base image: meabed/debian-jdk
# Build with: docker build -t apache/nutch:2.x .
#
# The build context must provide: testUrls/, config/nutch-site.xml, bootstrap.sh

# NOTE(review): the base image is untagged (implicitly :latest), so builds are
# not reproducible. Pin a tag or digest once a known-good one is identified.
FROM meabed/debian-jdk

# MAINTAINER is deprecated; a label carries the same contact information.
LABEL maintainer="Nutch Developers <dev@nutch.apache.org>"

USER root

# Build-time only: keeps apt/debconf from prompting without leaking the
# setting into the runtime environment of the container.
ARG DEBIAN_FRONTEND=noninteractive

# Build tooling: ant to compile Nutch, subversion to fetch the sources.
# The apt package lists are removed in the same layer that downloads them so
# they do not end up in the image.
RUN apt-get update \
 && apt-get install -y --fix-missing \
      ant \
      subversion \
 && rm -rf /var/lib/apt/lists/*

# Download Nutch.
# NOTE(review): the checkout uses plain http and tracks the moving head of the
# branch; consider https and a pinned revision (svn co -r <rev>).
RUN mkdir -p /opt/downloads \
 && cd /opt/downloads \
 && svn co http://svn.apache.org/repos/asf/nutch/branches/2.x apache-nutch-2.x

RUN ln -s /opt/downloads/apache-nutch-2.x /opt/apache-nutch-2.x

ENV NUTCH_ROOT=/opt/apache-nutch-2.x \
    HOME=/root

# Nutch-default
# RUN sed -i '/^ http.agent.name<\/name>$/{$!{N;s/^ http.agent.name<\/name>\n <\/value>$/ http.agent.name<\/name>\n Nutch 2.X Cassandra Docker<\/value>/;ty;P;D;:y}}' $NUTCH_ROOT/conf/nutch-default.xml

# In ivy.xml, delete the line directly after and then the line directly before
# every line containing name="gora-cassandra". The two passes must run in this
# order, as in the original recipe.
# NOTE(review): presumably those neighbouring lines are the XML comment
# delimiters that disable the gora-cassandra dependency -- confirm against
# ivy.xml. This step also relies on vim being present in the base image.
RUN vim -c 'g/name="gora-cassandra"/+1d' -c 'x' $NUTCH_ROOT/ivy/ivy.xml \
 && vim -c 'g/name="gora-cassandra"/-1d' -c 'x' $NUTCH_ROOT/ivy/ivy.xml

# Compile Nutch; the result is used below via $NUTCH_ROOT/runtime/local.
RUN cd $NUTCH_ROOT && ant runtime

# Native libs: drop whatever the checkout ships. -f keeps the build from
# failing when the glob matches nothing.
RUN rm -f $NUTCH_ROOT/lib/native/*
#RUN mkdir -p $NUTCH_ROOT/lib/native/Linux-amd64-64
#RUN curl -Ls http://dl.bintray.com/meabed/hadoop-debian/hadoop-native-64-2.5.1.tar|tar -x -C $NUTCH_ROOT/lib/native/Linux-amd64-64/

# Modification and compilation again
#ADD plugin/nutch2-index-html/src/plugin/ $NUTCH_ROOT/src/plugin/
#RUN sed -i '/dir="index-more" target="deploy".*/ s/.*/&\n /' #$NUTCH_ROOT/src/plugin/build.xml
#RUN sed -i '/dir="index-more" target="clean".*/ s/.*/&\n /' #$NUTCH_ROOT/src/plugin/build.xml
#RUN cd $NUTCH_ROOT && ant runtime

RUN ln -s /opt/apache-nutch-2.x/runtime/local /opt/nutch
ENV NUTCH_HOME=/opt/nutch

# urls folder we will use in crawling:
#   $NUTCH_HOME/bin/crawl urls crawlId(test01) elasticsearch_node_name(iData) iteration(1)
# testUrls receives the test URLs copied in below. The original used
# `CMD mkdir` for it, which never runs at build time and is overridden by the
# final CMD; RUN actually creates the directory.
RUN mkdir -p $NUTCH_HOME/urls $NUTCH_HOME/testUrls

# Adding test urls to use in crawling
COPY testUrls $NUTCH_HOME/testUrls

# Adding rawcontent that hold html of the page field in index to elasticsearch
#RUN sed -i '/field name="date" type.*/ s/.*/&\n\n \n/' $NUTCH_HOME/conf/schema.xml

# Remove the default nutch-site.xml and replace it with our configuration.
RUN rm -f $NUTCH_HOME/conf/nutch-site.xml
COPY config/nutch-site.xml $NUTCH_HOME/conf/nutch-site.xml

# Port that nutchserver will use
ENV NUTCHSERVER_PORT=8899

# Container start script: owned by root and accessible by its owner only.
COPY bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh \
 && chmod 700 /etc/bootstrap.sh

VOLUME ["/data"]

# NOTE(review): the container runs as root (see USER root above); switching to
# a dedicated user would also require relaxing the 700 mode on bootstrap.sh.
CMD ["/etc/bootstrap.sh", "-d"]

EXPOSE 8899