<%--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
--%><%@ page import="java.io.FilterInputStream,
                 java.io.IOException,
                 java.io.InputStream,
                 java.io.InputStreamReader,
                 java.net.URL,
                 java.net.URLConnection,
                 java.net.URLDecoder,
                 java.net.URLEncoder,
                 java.util.ArrayList,
                 java.util.Arrays,
                 java.util.Calendar,
                 java.util.Collections,
                 java.util.Iterator,
                 java.util.List,
                 javax.jcr.Node,
                 javax.jcr.Repository,
                 javax.jcr.Session,
                 javax.jcr.SimpleCredentials,
                 javax.swing.text.AttributeSet,
                 javax.swing.text.html.HTML,
                 javax.swing.text.html.HTMLDocument,
                 javax.swing.text.html.HTMLEditorKit,
                 org.apache.jackrabbit.j2ee.RepositoryAccessServlet,
                 org.apache.jackrabbit.util.Text" %><%@ page contentType="text/html;charset=UTF-8" %><%

    // Obtain the repository from the servlet context and log in as "user"
    // with an empty password. Any failure renders an error message and ends
    // the page.
    Repository rep;
    Session jcrSession;
    String wspName;
    try {
        rep = RepositoryAccessServlet.getRepository(pageContext.getServletContext());
        jcrSession = rep.login(new SimpleCredentials("user", "".toCharArray()));
        wspName = jcrSession.getWorkspace().getName();
    } catch (Throwable e) {
        // NOTE(review): e.getMessage() may be null; whether
        // Text.encodeIllegalXMLCharacters accepts null is not visible here.
        %>Error while accessing the repository: <%= Text.encodeIllegalXMLCharacters(e.getMessage()) %>
<% %>Check the configuration or use the easy setup wizard.<%
        return;
    }

    // From here on the session is open; the matching finally clause at the
    // end of the page logs it out.
    try {
        // Re-interpret the "seed" parameter: its ISO-8859-1 bytes are decoded
        // again as UTF-8 (presumably the container decoded the request as
        // ISO-8859-1 - confirm).
        String seedWord = request.getParameter("seed");
        if (seedWord != null) {
            seedWord = new String(seedWord.getBytes("ISO-8859-1"), "UTF-8");
        }

        // Number of documents to import; stays 0 when the parameter is
        // missing or not an integer.
        int numDocs = 0;
        List filetypes = new ArrayList();
        if (request.getParameter("num") != null) {
            try {
                numDocs = Integer.parseInt(request.getParameter("num"));
            } catch (NumberFormatException e) {
                // ignore: numDocs keeps its default of 0
            }
        }

        // Selected file types, or all default types when none were submitted.
        String[] types = request.getParameterValues("filetype");
        if (types != null) {
            filetypes.addAll(Arrays.asList(types));
        } else {
            filetypes = DEFAULT_TYPES;
        }

        // With a seed word, a positive document count and at least one file
        // type the import runs; otherwise the input form is shown.
        if (seedWord != null && numDocs > 0 && filetypes.size() > 0) { %> Welcome to Apache Jackrabbit - Populate workspace: <%= Text.encodeIllegalXMLCharacters(wspName) %>

Populate workspace: "<%= Text.encodeIllegalXMLCharacters(wspName) %>"


Overall progress

Downloading document

<%
    // Import loop: for every requested file type, page through the search
    // results (10 per page) and store each new document below the root node
    // as /<reversed host name parts>/<path segments>/<file name>.
    Node root = jcrSession.getRootNode();
    int n = 0; // number of documents imported so far, across all types
    for (int typeIdx = 0; typeIdx < filetypes.size(); typeIdx++) {
        String type = (String) filetypes.get(typeIdx);
        int offset = 0;
        // Each type gets an equal share of numDocs; the bound is cumulative.
        while (n < numDocs * (typeIdx + 1) / filetypes.size()) {
            final URL[] urls = new Search(type, seedWord, offset).getURLs();
            if (urls.length == 0) {
                // no more search results for this type
                break;
            }
            for (int i = 0; i < urls.length; i++) {
                final URL currentURL = urls[i];
                String path = urls[i].getPath();
                if (path.startsWith("/")) {
                    path = path.substring(1);
                }
                final String host = urls[i].getHost();

                // Folder hierarchy: host name parts in reverse order followed
                // by the path segments; the last segment is the file name.
                List folderNames = new ArrayList();
                folderNames.addAll(Arrays.asList(host.split("\\.")));
                Collections.reverse(folderNames);
                folderNames.addAll(Arrays.asList(path.split("/", 0)));
                final String fileName = URLDecoder.decode(
                        (String) folderNames.remove(folderNames.size() - 1), "UTF-8")
                        .replaceAll(":", "_");
                if (fileName.length() == 0) {
                    // A URL with an empty path yields an empty file name.
                    // Empty folder names are skipped below, so skip this URL
                    // as well instead of passing "" to hasNode/addNode.
                    continue;
                }

                // Walk down (and create where missing) the folder nodes.
                Node node = root;
                for (Iterator fn = folderNames.iterator(); fn.hasNext(); ) {
                    String name = URLDecoder.decode((String) fn.next(), "UTF-8");
                    name = name.replaceAll(":", "_");
                    if (name.length() == 0) {
                        continue;
                    }
                    if (!node.hasNode(name)) {
                        node.addNode(name, "nt:folder");
                    }
                    node = node.getNode(name);
                }

                if (!node.hasNode(fileName)) {
                    final JspWriter fOut = out;
                    Node file = node.addNode(fileName, "nt:file");
                    final Node resource = file.addNode("jcr:content", "nt:resource");
                    // Carries a failure from the download thread back to
                    // this thread.
                    final Exception[] ex = new Exception[1];
                    Thread t = new Thread(new Runnable() {
                        public void run() {
                            try {
                                String info = fileName + " (" + host + ")";
                                URLConnection con = currentURL.openConnection();
                                InputStream in = con.getInputStream();
                                try {
                                    synchronized (fOut) {
                                        fOut.println("");
                                        fOut.flush();
                                    }
                                    int length = con.getContentLength();
                                    if (length != -1) {
                                        // known size: report download progress
                                        in = new ProgressInputStream(in, length, info, "dp", fOut);
                                    }
                                    resource.setProperty("jcr:data", in);
                                    String mimeType = URLConnection.guessContentTypeFromName(fileName);
                                    if (mimeType == null) {
                                        if (fileName.endsWith(".doc")) {
                                            mimeType = "application/msword";
                                        } else if (fileName.endsWith(".xls")) {
                                            mimeType = "application/vnd.ms-excel";
                                        } else if (fileName.endsWith(".ppt")) {
                                            mimeType = "application/mspowerpoint";
                                        } else {
                                            mimeType = "application/octet-stream";
                                        }
                                    }
                                    resource.setProperty("jcr:mimeType", mimeType);
                                    // getLastModified() returns 0 when the
                                    // date is unknown; keep the current time
                                    // then instead of storing the 1970 epoch.
                                    Calendar lastModified = Calendar.getInstance();
                                    long modified = con.getLastModified();
                                    if (modified > 0) {
                                        lastModified.setTimeInMillis(modified);
                                    }
                                    resource.setProperty("jcr:lastModified", lastModified);
                                } finally {
                                    in.close();
                                }
                            } catch (Exception e) {
                                ex[0] = e;
                            }
                        }
                    });
                    t.start();

                    // Poll every 100 ms until the download thread ends and
                    // write to the response on every 10th iteration.
                    for (int s = 0; t.isAlive(); s++) {
                        Thread.sleep(100);
                        if (s % 10 == 0) {
                            synchronized (fOut) {
                                fOut.println("");
                                fOut.flush();
                            }
                        }
                    }

                    if (ex[0] == null) {
                        // success: persist the new nodes and count the document
                        jcrSession.save();
                        n++;
                        synchronized (fOut) {
                            fOut.println("");
                            fOut.flush();
                        }
                        if (n >= numDocs * (typeIdx + 1) / filetypes.size()) {
                            // share for this file type is complete
                            break;
                        }
                    } else {
                        // failure: discard all pending (unsaved) changes
                        jcrSession.refresh(false);
                    }
                }
            }
            offset += 10;
        }
    }
%>
<% } else { request.setAttribute("title", "Populate workspace " + wspName); %>

This page allows you to populate the workspace with documents downloaded from the Internet.

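<form method="post">
Seed word (optional): <input name="seed" type="text" size="30" value="<%= seedWord == null ? "" : Text.encodeIllegalXMLCharacters(seedWord) %>"/><br/>
Number of documents: <input name="num" type="text" size="30"/><br/>
Document types:<br/>
<input name="filetype" type="checkbox" value="pdf" <%= filetypes.contains("pdf") ? "checked" : "" %>/> Adobe Acrobat PDF<br/>
<input name="filetype" type="checkbox" value="rtf" <%= filetypes.contains("rtf") ? "checked" : "" %>/> Rich Text Format<br/>
<input name="filetype" type="checkbox" value="doc" <%= filetypes.contains("doc") ? "checked" : "" %>/> Microsoft Word<br/>
<input name="filetype" type="checkbox" value="ppt" <%= filetypes.contains("ppt") ? "checked" : "" %>/> Microsoft PowerPoint<br/>
<input name="filetype" type="checkbox" value="xls" <%= filetypes.contains("xls") ? "checked" : "" %>/> Microsoft Excel<br/>
<input type="submit" value="Populate"/>
</form>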
<%
        }
    } finally {
        // always release the JCR session opened at the top of the page
        if (jcrSession != null) {
            jcrSession.logout();
        }
    }
%><%!

    /** File types that are searched when the request selects none. */
    public static final List DEFAULT_TYPES = Arrays.asList(
            new String[]{"pdf", "rtf", "doc", "ppt", "xls"});

    /**
     * Runs a single Google web search for documents of one file type and
     * extracts the matching links from the result page.
     */
    public static class Search {

        /** File extension (without the dot) that result links must end with. */
        private final String filetype;

        /** Search term placed in front of the <code>filetype:</code> operator. */
        private final String term;

        /** Value sent as the <code>start</code> query parameter. */
        private final int start;

        /**
         * @param filetype file extension to search for, e.g. "pdf"
         * @param term     the search term
         * @param start    value of the <code>start</code> query parameter
         */
        public Search(String filetype, String term, int start) {
            this.filetype = filetype;
            this.term = term;
            this.start = start;
        }

        /**
         * Requests the search result page, parses it with the Swing HTML
         * parser and collects the target of every anchor whose
         * <code>href</code> ends with <code>"." + filetype</code> and whose
         * host name does not contain "google".
         *
         * @return the collected URLs, possibly an empty array
         * @throws Exception if the request or the parsing fails
         */
        public URL[] getURLs() throws Exception {
            List urls = new ArrayList();
            String query = term + " filetype:" + filetype;
            URL google = new URL("http://www.google.com/search?q="
                    + URLEncoder.encode(query, "UTF-8") + "&start=" + start);
            URLConnection con = google.openConnection();
            con.setRequestProperty("User-Agent", "");
            InputStream in = con.getInputStream();
            try {
                HTMLEditorKit kit = new HTMLEditorKit();
                HTMLDocument doc = new HTMLDocument();
                doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
                kit.read(new InputStreamReader(in, "UTF-8"), doc, 0);
                HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
                while (it.isValid()) {
                    AttributeSet attr = it.getAttributes();
                    if (attr != null) {
                        String href = (String) attr.getAttribute(HTML.Attribute.HREF);
                        if (href != null && href.endsWith("." + filetype)) {
                            // resolve relative links against the search host
                            URL url = new URL(new URL("http", "www.google.com", "dummy"), href);
                            if (url.getHost().indexOf("google") == -1) {
                                urls.add(url);
                            }
                        }
                    }
                    it.next();
                }
            } finally {
                in.close();
            }
            return (URL[]) urls.toArray(new URL[urls.size()]);
        }
    }

    /**
     * Input stream wrapper that counts the bytes read and writes a progress
     * line to a <code>JspWriter</code> roughly every 16k and once the
     * announced length has been reached.
     */
    public static class ProgressInputStream extends FilterInputStream {

        /** Announced total length of the stream in bytes. */
        private final int length;

        private final String fileName;

        private final String varName;

        /** Writer receiving the progress output; also used as the lock. */
        private final JspWriter out;

        /** Number of bytes read so far. */
        private long read;

        /** Byte count that must be exceeded before the next report. */
        private long nextReport = (16 * 1024);

        /**
         * @param in       the stream to read from
         * @param length   announced total number of bytes
         * @param fileName display name of the document
         * @param varName  name passed along for the progress output
         * @param out      writer that receives the progress output
         */
        public ProgressInputStream(InputStream in, int length, String fileName,
                                   String varName, JspWriter out) {
            super(in);
            this.length = length;
            this.fileName = fileName;
            this.varName = varName;
            this.out = out;
        }

        /**
         * Reads one byte and counts it as a single byte of progress.
         */
        public int read() throws IOException {
            int r = super.read();
            // super.read() returns the byte VALUE (0-255), not a byte count,
            // so exactly one byte is reported unless the stream has ended.
            reportProgress(r == -1 ? -1 : 1);
            return r;
        }

        /**
         * Reads into the whole array. Progress is reported by
         * {@link #read(byte[], int, int)}; FilterInputStream.read(byte[])
         * forwards to that method as well, so reporting here too would count
         * every byte twice.
         */
        public int read(byte b[]) throws IOException {
            return read(b, 0, b.length);
        }

        /**
         * Reads up to <code>len</code> bytes and reports the number actually
         * read.
         */
        public int read(byte b[], int off, int len) throws IOException {
            int r = super.read(b, off, len);
            reportProgress(r);
            return r;
        }

        /**
         * Adds <code>r</code> bytes to the running total and emits a progress
         * line when the next 16k threshold has been passed or the announced
         * length has been reached.
         *
         * @param r number of bytes just read, or -1 at end of stream
         */
        private void reportProgress(int r) throws IOException {
            if (r != -1) {
                read += r;
                if (read > nextReport || read == length) {
                    // report every 16k
                    synchronized (out) {
                        // progress in per mille of the announced length
                        // NOTE(review): s, fileName and varName are not part
                        // of the string printed below as it appears here;
                        // presumably markup that used them was lost - confirm.
                        double s = 1000d * (double) read / (double) length;
                        out.println("");
                        out.flush();
                    }
                    nextReport += (16 * 1024);
                }
            }
        }
    }
%>