/* * (c) Copyright 2004, 2005, 2006, 2007 Hewlett-Packard Development Company, LP * [See end of file] */ package arqo.histogram.impl; import java.util.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.hp.hpl.jena.graph.Node; /** * The class represents a histogram for the object * value distribution for a predicate. * * @author Markus Stocker * @version $Id$ */ public class Histogram { // The number of classes for the histogram private int numOfClasses = 4 ; private double lowerBound = 0d ; private double spread = 0d ; private double classSize = 0d ; private Node firstNode = null ; private Node lastNode = null ; private Map histogram = new TreeMap() ; // Map private static Log log = LogFactory.getLog(Histogram.class) ; public Histogram(List nodes) { Collections.sort(nodes, new NodeComparator()) ; firstNode = (Node)nodes.get(0) ; lastNode = (Node)nodes.get(nodes.size() - 1) ; log.debug("Histogram bounds: " + firstNode.toString() + ", " + lastNode.toString()) ; spread = getSpread(firstNode, lastNode) ; classSize = spread / numOfClasses ; log.debug("Histogram spread and class size: " + spread + ", " + classSize) ; for (int i = 0; i < numOfClasses; i++) { double classLowerBound = lowerBound + (i * classSize) ; HistogramClass histogramClass = new HistogramClass() ; histogramClass.setLowerBound(classLowerBound) ; histogram.put(new Double(classLowerBound), histogramClass) ; log.debug("Create histogram class with lower bound: " + classLowerBound) ; } for (Iterator iter = nodes.iterator(); iter.hasNext(); ) { Node node = (Node)iter.next() ; double classLowerBound = getLowerBound(getSpread(firstNode, node), classSize) ; HistogramClass histogramClass = (HistogramClass)histogram.get(new Double(classLowerBound)) ; //log.debug("Increment histogram class for node: " + getSpread(firstNode, node) + ", " + classLowerBound + ", " + node.toString()) ; histogramClass.increment() ; } } private double getLowerBound(double spread, double classSize) { int i = 0 ; double bound = lowerBound + (i * classSize); double nextBound = lowerBound + (++i * classSize) ; // Search in which histogram class the element falls into while (spread > nextBound) { bound = nextBound ; nextBound = lowerBound + (++i * classSize) ; } return bound ; } private double getSpread(Node lowerBound, Node upperBound) { if (lowerBound.isURI() && upperBound.isURI()) return new Double(Math.abs(lowerBound.getURI().compareTo(upperBound.getURI()))).doubleValue() ; Object literal1 = lowerBound.getLiteralValue() ; Object literal2 = upperBound.getLiteralValue() ; String lexical1 = lowerBound.getLiteralLexicalForm() ; String lexical2 = upperBound.getLiteralLexicalForm() ; if (literal1 instanceof Number && literal2 instanceof Number) return Math.abs(new Double(lexical2).doubleValue() - new Double(lexical1).doubleValue()) ; return new Double(Math.abs(lexical1.compareTo(lexical2))).doubleValue() ; } class NodeComparator implements Comparator { public int compare(Object object1, Object object2) { if (! (object1 instanceof Node)) throw new ClassCastException() ; if (! (object2 instanceof Node)) throw new ClassCastException() ; Node node1 = (Node)object1 ; Node node2 = (Node)object2 ; if (node1.isURI() && node2.isURI()) { int value = node1.getURI().compareTo(node2.getURI()) ; if (node1.getURI().equals("http://www.University17.edu")) System.out.println(value + " " + node1.getURI() + " " + node2.getURI()) ; return value ; } Object literal1 = node1.getLiteralValue() ; Object literal2 = node2.getLiteralValue() ; String lexical1 = node1.getLiteralLexicalForm() ; String lexical2 = node2.getLiteralLexicalForm() ; if (literal1 instanceof Number && literal2 instanceof Number) return (new Double(lexical1)).compareTo(new Double(lexical2)) ; if (literal1 instanceof String && literal2 instanceof String) return lexical1.compareTo(lexical2) ; throw new ClassCastException("Nodes have to be of the same type: " + lexical1 + ", " + lexical2) ; } } /** * Return the histogram class frequency for a node (an object). * If the object is not found in the histogram, the method returns * Integer.MAX_VALUE. This is important for the selectivity estimation * of patterns where the predicate is unbound and the object bound, since * we calculate the object selectivity over every histogram and for some, * the object might fall into no histogram class and thus no frequency value, * is returned, i.e. we return Integer.MAX_VALUE. Returning 0 is a bad * idea, since it is a value which actually might be returned by a * histogram class lookup and thus we cannot decide whether or not * the value should be considered. * * @param node * @return Integer */ public int getClassFrequency(Node node) { double classLowerBound = getLowerBound(getSpread(firstNode, node), classSize) ; HistogramClass histogramClass = (HistogramClass)histogram.get(new Double(classLowerBound)) ; return histogramClass.getFrequency() ; } public int getClassFrequencyGEQ(Node node) { int frequency = 0 ; double classLowerBound = getLowerBound(getSpread(firstNode, node), classSize) ; for (Iterator iter = histogram.keySet().iterator(); iter.hasNext(); ) { Double lowerBound = (Double)iter.next() ; HistogramClass histogramClass = (HistogramClass)histogram.get(lowerBound) ; if (lowerBound.doubleValue() >= classLowerBound) frequency += histogramClass.getFrequency() ; } return frequency ; } public int getClassFrequencyG(Node node) { int frequency = 0 ; double classLowerBound = getLowerBound(getSpread(firstNode, node), classSize) ; for (Iterator iter = histogram.keySet().iterator(); iter.hasNext(); ) { Double lowerBound = (Double)iter.next() ; HistogramClass histogramClass = (HistogramClass)histogram.get(lowerBound) ; if (lowerBound.doubleValue() > classLowerBound) frequency += histogramClass.getFrequency() ; } return frequency ; } public int getClassFrequencyLEQ(Node node) { int frequency = 0 ; double classLowerBound = getLowerBound(getSpread(firstNode, node), classSize) ; for (Iterator iter = histogram.keySet().iterator(); iter.hasNext(); ) { Double lowerBound = (Double)iter.next() ; HistogramClass histogramClass = (HistogramClass)histogram.get(lowerBound) ; if (lowerBound.doubleValue() <= classLowerBound) frequency += histogramClass.getFrequency() ; } return frequency ; } public int getClassFrequencyL(Node node) { int frequency = 0 ; double classLowerBound = getLowerBound(getSpread(firstNode, node), classSize) ; for (Iterator iter = histogram.keySet().iterator(); iter.hasNext(); ) { Double lowerBound = (Double)iter.next() ; HistogramClass histogramClass = (HistogramClass)histogram.get(lowerBound) ; if (lowerBound.doubleValue() < classLowerBound) frequency += histogramClass.getFrequency() ; } return frequency ; } /** * Return a list of histogram classes * * @return Set */ public Set getClasses() { Set histogramClasses = new HashSet() ; // Set for (Iterator iter = histogram.values().iterator(); iter.hasNext(); ) histogramClasses.add((HistogramClass)iter.next()) ; return histogramClasses ; } /** * Set the histogram lower bound * * @param lowerBound */ public void setLowerBound(double lowerBound) { this.lowerBound = lowerBound ; } /** * Get the histogram lower bound * * @return double */ public double getLowerBound() { return this.lowerBound ; } /** * Set the histogram class size * * @param classSize */ public void setClassSize(double classSize) { this.classSize = classSize ; } /** * Get the histogram class size * * @return double */ public double getClassSize() { return this.classSize ; } /** * Return the number of elements contained in the histogram. * * @return int */ public int size() { int size = 0 ; // Step through the histogram classes and add up the frequencies for (Iterator iter = histogram.keySet().iterator(); iter.hasNext(); ) { Double key = (Double)iter.next() ; HistogramClass histogramClass = (HistogramClass)histogram.get(key) ; size += histogramClass.getFrequency() ; } return size ; } } /* * (c) Copyright 2004, 2005, 2006, 2007 Hewlett-Packard Development Company, LP * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */