1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.math4.legacy.stat.correlation; 18 19 import org.apache.commons.statistics.distribution.TDistribution; 20 import org.apache.commons.math4.legacy.exception.DimensionMismatchException; 21 import org.apache.commons.math4.legacy.exception.MathIllegalArgumentException; 22 import org.apache.commons.math4.legacy.exception.NullArgumentException; 23 import org.apache.commons.math4.legacy.exception.util.LocalizedFormats; 24 import org.apache.commons.math4.legacy.linear.BlockRealMatrix; 25 import org.apache.commons.math4.legacy.linear.RealMatrix; 26 import org.apache.commons.math4.legacy.stat.regression.SimpleRegression; 27 import org.apache.commons.math4.core.jdkmath.JdkMath; 28 29 /** 30 * Computes Pearson's product-moment correlation coefficients for pairs of arrays 31 * or columns of a matrix. 32 * 33 * <p>The constructors that take <code>RealMatrix</code> or 34 * <code>double[][]</code> arguments generate correlation matrices. The 35 * columns of the input matrices are assumed to represent variable values. 36 * Correlations are given by the formula</p> 37 * 38 * <p><code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> 39 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> 40 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.</p> 41 * 42 * <p>To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()} 43 * to construct an instance with no data and then {@link #correlation(double[], double[])}. 44 * Correlation matrices can also be computed directly from an instance with no data using 45 * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()}, 46 * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the 47 * constructors supplying data or a covariance matrix must be used to create the instance.</p> 48 * 49 * @since 2.0 50 */ 51 public class PearsonsCorrelation { 52 53 /** correlation matrix. */ 54 private final RealMatrix correlationMatrix; 55 56 /** number of observations. */ 57 private final int nObs; 58 59 /** 60 * Create a PearsonsCorrelation instance without data. 61 */ 62 public PearsonsCorrelation() { 63 super(); 64 correlationMatrix = null; 65 nObs = 0; 66 } 67 68 /** 69 * Create a PearsonsCorrelation from a rectangular array 70 * whose columns represent values of variables to be correlated. 71 * 72 * Throws MathIllegalArgumentException if the input array does not have at least 73 * two columns and two rows. Pairwise correlations are set to NaN if one 74 * of the correlates has zero variance. 75 * 76 * @param data rectangular array with columns representing variables 77 * @throws MathIllegalArgumentException if the input data array is not 78 * rectangular with at least two rows and two columns. 79 * @see #correlation(double[], double[]) 80 */ 81 public PearsonsCorrelation(double[][] data) { 82 this(new BlockRealMatrix(data)); 83 } 84 85 /** 86 * Create a PearsonsCorrelation from a RealMatrix whose columns 87 * represent variables to be correlated. 88 * 89 * Throws MathIllegalArgumentException if the matrix does not have at least 90 * two columns and two rows. Pairwise correlations are set to NaN if one 91 * of the correlates has zero variance. 92 * 93 * @param matrix matrix with columns representing variables to correlate 94 * @throws MathIllegalArgumentException if the matrix does not contain sufficient data 95 * @see #correlation(double[], double[]) 96 */ 97 public PearsonsCorrelation(RealMatrix matrix) { 98 nObs = matrix.getRowDimension(); 99 correlationMatrix = computeCorrelationMatrix(matrix); 100 } 101 102 /** 103 * Create a PearsonsCorrelation from a {@link Covariance}. The correlation 104 * matrix is computed by scaling the Covariance's covariance matrix. 105 * The Covariance instance must have been created from a data matrix with 106 * columns representing variable values. 107 * 108 * @param covariance Covariance instance 109 */ 110 public PearsonsCorrelation(Covariance covariance) { 111 RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); 112 if (covarianceMatrix == null) { 113 throw new NullArgumentException(LocalizedFormats.COVARIANCE_MATRIX); 114 } 115 nObs = covariance.getN(); 116 correlationMatrix = covarianceToCorrelation(covarianceMatrix); 117 } 118 119 /** 120 * Create a PearsonsCorrelation from a covariance matrix. The correlation 121 * matrix is computed by scaling the covariance matrix. 122 * 123 * @param covarianceMatrix covariance matrix 124 * @param numberOfObservations the number of observations in the dataset used to compute 125 * the covariance matrix 126 */ 127 public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { 128 nObs = numberOfObservations; 129 correlationMatrix = covarianceToCorrelation(covarianceMatrix); 130 } 131 132 /** 133 * Returns the correlation matrix. 134 * 135 * <p>This method will return null if the non-argument constructor was used 136 * to create this instance, even if {@link #computeCorrelationMatrix(double[][])} 137 * has been called before it is activated.</p> 138 * 139 * @return correlation matrix 140 */ 141 public RealMatrix getCorrelationMatrix() { 142 return correlationMatrix; 143 } 144 145 /** 146 * Returns a matrix of standard errors associated with the estimates 147 * in the correlation matrix.<br> 148 * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard 149 * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> 150 * 151 * <p>The formula used to compute the standard error is <br> 152 * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> 153 * where <code>r</code> is the estimated correlation coefficient and 154 * <code>n</code> is the number of observations in the source dataset.</p> 155 * 156 * <p>To use this method, one of the constructors that supply an input 157 * matrix must have been used to create this instance.</p> 158 * 159 * @return matrix of correlation standard errors 160 * @throws NullPointerException if this instance was created with no data. 161 */ 162 public RealMatrix getCorrelationStandardErrors() { 163 int nVars = correlationMatrix.getColumnDimension(); 164 double[][] out = new double[nVars][nVars]; 165 for (int i = 0; i < nVars; i++) { 166 for (int j = 0; j < nVars; j++) { 167 double r = correlationMatrix.getEntry(i, j); 168 out[i][j] = JdkMath.sqrt((1 - r * r) /(nObs - 2)); 169 } 170 } 171 return new BlockRealMatrix(out); 172 } 173 174 /** 175 * Returns a matrix of p-values associated with the (two-sided) null 176 * hypothesis that the corresponding correlation coefficient is zero. 177 * 178 * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability 179 * that a random variable distributed as <code>t<sub>n-2</sub></code> takes 180 * a value with absolute value greater than or equal to <br> 181 * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> 182 * 183 * <p>The values in the matrix are sometimes referred to as the 184 * <i>significance</i> of the corresponding correlation coefficients.</p> 185 * 186 * <p>To use this method, one of the constructors that supply an input 187 * matrix must have been used to create this instance.</p> 188 * 189 * @return matrix of p-values 190 * @throws org.apache.commons.math4.legacy.exception.MaxCountExceededException 191 * if an error occurs estimating probabilities 192 * @throws NullPointerException if this instance was created with no data. 193 */ 194 public RealMatrix getCorrelationPValues() { 195 TDistribution tDistribution = TDistribution.of(nObs - 2); 196 int nVars = correlationMatrix.getColumnDimension(); 197 double[][] out = new double[nVars][nVars]; 198 for (int i = 0; i < nVars; i++) { 199 for (int j = 0; j < nVars; j++) { 200 if (i == j) { 201 out[i][j] = 0d; 202 } else { 203 double r = correlationMatrix.getEntry(i, j); 204 double t = JdkMath.abs(r * JdkMath.sqrt((nObs - 2)/(1 - r * r))); 205 out[i][j] = 2 * tDistribution.cumulativeProbability(-t); 206 } 207 } 208 } 209 return new BlockRealMatrix(out); 210 } 211 212 213 /** 214 * Computes the correlation matrix for the columns of the 215 * input matrix, using {@link #correlation(double[], double[])}. 216 * 217 * Throws MathIllegalArgumentException if the matrix does not have at least 218 * two columns and two rows. Pairwise correlations are set to NaN if one 219 * of the correlates has zero variance. 220 * 221 * @param matrix matrix with columns representing variables to correlate 222 * @return correlation matrix 223 * @throws MathIllegalArgumentException if the matrix does not contain sufficient data 224 * @see #correlation(double[], double[]) 225 */ 226 public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { 227 checkSufficientData(matrix); 228 int nVars = matrix.getColumnDimension(); 229 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); 230 for (int i = 0; i < nVars; i++) { 231 for (int j = 0; j < i; j++) { 232 double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); 233 outMatrix.setEntry(i, j, corr); 234 outMatrix.setEntry(j, i, corr); 235 } 236 outMatrix.setEntry(i, i, 1d); 237 } 238 return outMatrix; 239 } 240 241 /** 242 * Computes the correlation matrix for the columns of the 243 * input rectangular array. The columns of the array represent values 244 * of variables to be correlated. 245 * 246 * Throws MathIllegalArgumentException if the matrix does not have at least 247 * two columns and two rows or if the array is not rectangular. Pairwise 248 * correlations are set to NaN if one of the correlates has zero variance. 249 * 250 * @param data matrix with columns representing variables to correlate 251 * @return correlation matrix 252 * @throws MathIllegalArgumentException if the array does not contain sufficient data 253 * @see #correlation(double[], double[]) 254 */ 255 public RealMatrix computeCorrelationMatrix(double[][] data) { 256 return computeCorrelationMatrix(new BlockRealMatrix(data)); 257 } 258 259 /** 260 * Computes the Pearson's product-moment correlation coefficient between two arrays. 261 * 262 * <p>Throws MathIllegalArgumentException if the arrays do not have the same length 263 * or their common length is less than 2. Returns {@code NaN} if either of the arrays 264 * has zero variance (i.e., if one of the arrays does not contain at least two distinct 265 * values).</p> 266 * 267 * @param xArray first data array 268 * @param yArray second data array 269 * @return Returns Pearson's correlation coefficient for the two arrays 270 * @throws DimensionMismatchException if the arrays lengths do not match 271 * @throws MathIllegalArgumentException if there is insufficient data 272 */ 273 public double correlation(final double[] xArray, final double[] yArray) { 274 SimpleRegression regression = new SimpleRegression(); 275 if (xArray.length != yArray.length) { 276 throw new DimensionMismatchException(xArray.length, yArray.length); 277 } else if (xArray.length < 2) { 278 throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_DIMENSION, 279 xArray.length, 2); 280 } else { 281 for(int i=0; i<xArray.length; i++) { 282 regression.addData(xArray[i], yArray[i]); 283 } 284 return regression.getR(); 285 } 286 } 287 288 /** 289 * Derives a correlation matrix from a covariance matrix. 290 * 291 * <p>Uses the formula <br> 292 * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where 293 * <code>r(·,·)</code> is the correlation coefficient and 294 * <code>s(·)</code> means standard deviation.</p> 295 * 296 * @param covarianceMatrix the covariance matrix 297 * @return correlation matrix 298 */ 299 public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { 300 int nVars = covarianceMatrix.getColumnDimension(); 301 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); 302 for (int i = 0; i < nVars; i++) { 303 double sigma = JdkMath.sqrt(covarianceMatrix.getEntry(i, i)); 304 outMatrix.setEntry(i, i, 1d); 305 for (int j = 0; j < i; j++) { 306 double entry = covarianceMatrix.getEntry(i, j) / 307 (sigma * JdkMath.sqrt(covarianceMatrix.getEntry(j, j))); 308 outMatrix.setEntry(i, j, entry); 309 outMatrix.setEntry(j, i, entry); 310 } 311 } 312 return outMatrix; 313 } 314 315 /** 316 * Throws MathIllegalArgumentException if the matrix does not have at least 317 * two columns and two rows. 318 * 319 * @param matrix matrix to check for sufficiency 320 * @throws MathIllegalArgumentException if there is insufficient data 321 */ 322 private void checkSufficientData(final RealMatrix matrix) { 323 int nRows = matrix.getRowDimension(); 324 int nCols = matrix.getColumnDimension(); 325 if (nRows < 2 || nCols < 2) { 326 throw new MathIllegalArgumentException(LocalizedFormats.INSUFFICIENT_ROWS_AND_COLUMNS, 327 nRows, nCols); 328 } 329 } 330 }