public class StatsUtils extends Object
Constructor and Description |
---|
StatsUtils() |
Modifier and Type | Method and Description |
---|---|
static Long |
addWithExpDecay(List<Long> distinctVals) |
static boolean |
areBasicStatsUptoDateForQueryAnswering(Table table,
Map<String,String> params)
Checks whether the basic stats for the table are up-to-date for query planning.
|
static boolean |
areColumnStatsUptoDateForQueryAnswering(Table table,
Map<String,String> params,
String colName)
Checks whether the column stats for the table are up-to-date for query planning.
|
static Statistics |
collectStatistics(HiveConf conf,
PrunedPartitionList partList,
ColumnStatsList colStatsCache,
Table table,
TableScanOperator tableScanOperator)
Collect table, partition and column level statistics
|
static Statistics |
collectStatistics(HiveConf conf,
PrunedPartitionList partList,
Table table,
List<ColumnInfo> schema,
List<String> neededColumns,
ColumnStatsList colStatsCache,
List<String> referencedColumns,
boolean fetchColStats) |
static ColStatistics.Range |
combineRange(ColStatistics.Range range1,
ColStatistics.Range range2) |
static int |
estimateRowSizeFromSchema(HiveConf conf,
List<ColumnInfo> schema) |
static int |
estimateRowSizeFromSchema(HiveConf conf,
List<ColumnInfo> schema,
List<String> neededColumns) |
static long |
getAvailableMemory(org.apache.hadoop.conf.Configuration conf) |
static long |
getAvgColLenOf(HiveConf conf,
ObjectInspector oi,
String colType)
Get the raw data size of variable length data types
|
static long |
getAvgColLenOfFixedLengthTypes(String colType)
Get size of fixed length primitives
|
static List<Long> |
getBasicStatForPartitions(Table table,
List<Partition> parts,
String statType)
Get basic stats of partitions
|
static long |
getBasicStatForTable(Table table,
String statType)
Get basic stats of table
|
static ColStatistics |
getColStatistics(ColumnStatisticsObj cso,
String tabName,
String colName)
Convert ColumnStatisticsObj to ColStatistics
|
static ColStatistics |
getColStatisticsFromExpression(HiveConf conf,
Statistics parentStats,
ExprNodeDesc end)
Get column statistics from expression nodes
|
static List<ColStatistics> |
getColStatisticsFromExprMap(HiveConf conf,
Statistics parentStats,
Map<String,ExprNodeDesc> colExprMap,
RowSchema rowSchema)
Get column statistics from parent statistics.
|
static List<ColStatistics> |
getColStatisticsUpdatingTableAlias(Statistics parentStats,
RowSchema rowSchema)
Get column statistics from parent statistics given the
row schema of its child.
|
static ColStatistics |
getColStatsForPartCol(ColumnInfo ci,
PartitionIterable partList,
HiveConf conf) |
static long |
getDataSizeFromColumnStats(long numRows,
List<ColStatistics> colStats)
Compute raw data size from column statistics
|
static List<Long> |
getFileSizeForPartitions(HiveConf conf,
List<Partition> parts)
Find the bytes on disk occupied by a list of partitions
|
static long |
getFileSizeForTable(HiveConf conf,
Table table)
Find the bytes on disk occupied by a table
|
static String |
getFullyQualifiedTableName(String dbName,
String tabName) |
static long |
getMaxIfOverflow(long val)
Negative numbers of rows or negative data sizes are invalid.
|
static int |
getNDVPartitionColumn(PartitionIterable partitions,
String partColName) |
static long |
getNumRows(HiveConf conf,
List<ColumnInfo> schema,
Table table,
PrunedPartitionList partitionList,
AtomicInteger noColsMissingStats)
Returns number of rows if it exists.
|
static long |
getNumRows(Table table)
Get the number of rows of a given table
|
static List<String> |
getQualifedReducerKeyNames(List<String> keyExprs)
Get qualified column name from output key column names
|
static long |
getRangeDelta(ColStatistics.Range range) |
static long |
getRawDataSize(Table table)
Get the raw data size of a given table
|
static float |
getScaledSelectivity(ColStatistics csPK,
ColStatistics csFK)
Scale selectivity based on key range ratio.
|
static long |
getSizeOfComplexTypes(HiveConf conf,
ObjectInspector oi)
Get the size of complex data types
|
static long |
getSizeOfMap(StandardConstantMapObjectInspector scmoi)
Estimate the size of map object
|
static long |
getSizeOfPrimitiveTypeArraysFromType(String colType,
int length,
HiveConf conf)
Get the size of arrays of primitive types
|
static long |
getSizeOfStruct(StandardConstantStructObjectInspector soi) |
static long |
getSumIgnoreNegatives(List<Long> vals)
Get sum of all values in the list that are >0
|
static List<ColStatistics> |
getTableColumnStats(Table table,
List<ColumnInfo> schema,
List<String> neededColumns,
ColumnStatsList colStatsCache)
Get table level column statistics from metastore for needed columns
|
static long |
getTotalSize(Table table)
Get the total size of a given table
|
static long |
getWritableSize(ObjectInspector oi,
Object value)
Get size of primitive data types based on their respective writable object inspector
|
static boolean |
hasDiscreteRange(ColStatistics colStat) |
static void |
inferAndSetPrimaryKey(long numRows,
List<ColStatistics> colStats)
Based on the provided column statistics and number of rows, this method infers whether the column
can be a primary key.
|
static boolean |
inferForeignKey(ColStatistics csPK,
ColStatistics csFK)
Infer foreign key relationship from given column statistics.
|
static long |
safeAdd(long a,
long b)
Bounded addition - overflows become MAX_VALUE
|
static List<Long> |
safeMult(List<Long> l,
float b) |
static long |
safeMult(long a,
double b)
Bounded multiplication - overflows become MAX_VALUE
|
static long |
safeMult(long a,
long b)
Bounded multiplication - overflows become MAX_VALUE
|
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, ColumnStatsList colStatsCache, Table table, TableScanOperator tableScanOperator) throws HiveException
Parameters:
conf - hive configuration
partList - partition list
table - table
tableScanOperator - table scan operator
Throws:
HiveException
public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table, PrunedPartitionList partitionList, AtomicInteger noColsMissingStats)
Parameters:
conf
schema
table
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean fetchColStats) throws HiveException
Throws:
HiveException
public static void inferAndSetPrimaryKey(long numRows, List<ColStatistics> colStats)
Parameters:
numRows - number of rows
colStats - column statistics
public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK)
Parameters:
csPK - column statistics of primary key
csFK - column statistics of potential foreign key
public static float getScaledSelectivity(ColStatistics csPK, ColStatistics csFK)
Parameters:
csPK - column statistics of primary key
csFK - column statistics of potential foreign key
public static long getRangeDelta(ColStatistics.Range range)
public static ColStatistics getColStatsForPartCol(ColumnInfo ci, PartitionIterable partList, HiveConf conf)
public static int getNDVPartitionColumn(PartitionIterable partitions, String partColName)
public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema)
public static int estimateRowSizeFromSchema(HiveConf conf, List<ColumnInfo> schema, List<String> neededColumns)
public static long getFileSizeForTable(HiveConf conf, Table table)
Parameters:
conf - hive conf
table - table
public static List<Long> getFileSizeForPartitions(HiveConf conf, List<Partition> parts)
Parameters:
conf - hive conf
parts - partition list
public static long getSumIgnoreNegatives(List<Long> vals)
Parameters:
vals - list of values
public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName, String colName)
Parameters:
cso - ColumnStatisticsObj
tabName - table name
colName - column name
public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache)
Parameters:
table - table
schema - output schema
neededColumns - list of needed columns
public static long getAvgColLenOf(HiveConf conf, ObjectInspector oi, String colType)
Parameters:
conf - hive conf
oi - object inspector
colType - column type
public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi)
public static long getAvgColLenOfFixedLengthTypes(String colType)
Parameters:
colType - column type
public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int length, HiveConf conf)
public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi)
Parameters:
scmoi - object inspector
public static long getSizeOfStruct(StandardConstantStructObjectInspector soi)
public static long getWritableSize(ObjectInspector oi, Object value)
Parameters:
oi - object inspector
value - value
public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf, Statistics parentStats, Map<String,ExprNodeDesc> colExprMap, RowSchema rowSchema)
Parameters:
conf - hive conf
parentStats - parent statistics
colExprMap - column expression map
rowSchema - row schema
public static List<ColStatistics> getColStatisticsUpdatingTableAlias(Statistics parentStats, RowSchema rowSchema)
Parameters:
parentStats - parent statistics
rowSchema - row schema
public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats, ExprNodeDesc end)
Parameters:
conf - hive conf
parentStats - parent statistics
end - expression nodes
public static long getNumRows(Table table)
public static long getRawDataSize(Table table)
public static long getTotalSize(Table table)
public static long getBasicStatForTable(Table table, String statType)
Parameters:
table - table
statType - type of stats
public static List<Long> getBasicStatForPartitions(Table table, List<Partition> parts, String statType)
Parameters:
table - table
parts - partitions
statType - type of stats
public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats)
Parameters:
numRows - number of rows
colStats - column statistics
public static String getFullyQualifiedTableName(String dbName, String tabName)
public static List<String> getQualifedReducerKeyNames(List<String> keyExprs)
Parameters:
keyExprs - output key names
public static long getAvailableMemory(org.apache.hadoop.conf.Configuration conf)
public static long getMaxIfOverflow(long val)
Parameters:
val - input value
public static long safeMult(long a, double b)
public static long safeAdd(long a, long b)
public static long safeMult(long a, long b)
public static boolean hasDiscreteRange(ColStatistics colStat)
public static ColStatistics.Range combineRange(ColStatistics.Range range1, ColStatistics.Range range2)
public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map<String,String> params)
Copyright © 2022 The Apache Software Foundation. All rights reserved.