Skip to content

Commit

Permalink
Allow SegmentMetadataQuery to skip cardinality and size calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
jon-wei committed Sep 22, 2015
1 parent aaa8a88 commit e6a6284
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 43 deletions.
19 changes: 19 additions & 0 deletions docs/content/querying/segmentmetadataquery.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ There are several main parts to a segment metadata query:
|toInclude|A JSON Object representing what columns should be included in the result. Defaults to "all".|no|
|merge|Merge all individual segment metadata results into a single result|no|
|context|See [Context](../querying/query-context.html)|no|
|analysisTypes|A list of Strings specifying what column properties (e.g. cardinality, size) should be calculated and returned in the result. Defaults to ["cardinality", "size"]. See section [analysisTypes](#analysistypes) for more details.|no|

The format of the result is:

Expand Down Expand Up @@ -86,3 +87,21 @@ The grammar is as follows:
``` json
"toInclude": { "type": "list", "columns": [<string list of column names>]}
```

### analysisTypes

This is a list of properties that determines the amount of information returned about the columns, i.e. analyses to be performed on the columns.

By default, all analysis types will be used. If a property is not needed, omitting it from this list will result in a more efficient query.

There are 2 types of column analyses:

#### cardinality

* Estimated floor of cardinality for each column. Only relevant for dimension columns.

#### size

* Estimated byte size for the segment columns if they were stored in a flat format

* Estimated total segment byte size if it was stored in a flat format
1 change: 1 addition & 0 deletions processing/src/main/java/io/druid/query/Druids.java
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,7 @@ public SegmentMetadataQuery build()
toInclude,
merge,
context,
null,
false
);
}
Expand Down
105 changes: 77 additions & 28 deletions processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import com.metamx.common.logger.Logger;
import com.metamx.common.StringUtils;
import io.druid.query.metadata.metadata.ColumnAnalysis;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import io.druid.segment.QueryableIndex;
import io.druid.segment.StorageAdapter;
import io.druid.segment.column.BitmapIndex;
Expand All @@ -38,6 +39,7 @@
import io.druid.segment.serde.ComplexMetrics;

import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;

Expand All @@ -55,7 +57,7 @@ public class SegmentAnalyzer
*/
private static final int NUM_BYTES_IN_TEXT_FLOAT = 8;

public Map<String, ColumnAnalysis> analyze(QueryableIndex index)
public Map<String, ColumnAnalysis> analyze(QueryableIndex index, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
Preconditions.checkNotNull(index, "Index cannot be null");

Expand All @@ -69,16 +71,16 @@ public Map<String, ColumnAnalysis> analyze(QueryableIndex index)
final ValueType type = capabilities.getType();
switch (type) {
case LONG:
analysis = analyzeLongColumn(column);
analysis = analyzeLongColumn(column, analysisTypes);
break;
case FLOAT:
analysis = analyzeFloatColumn(column);
analysis = analyzeFloatColumn(column, analysisTypes);
break;
case STRING:
analysis = analyzeStringColumn(column);
analysis = analyzeStringColumn(column, analysisTypes);
break;
case COMPLEX:
analysis = analyzeComplexColumn(column);
analysis = analyzeComplexColumn(column, analysisTypes);
break;
default:
log.warn("Unknown column type[%s].", type);
Expand All @@ -90,13 +92,13 @@ public Map<String, ColumnAnalysis> analyze(QueryableIndex index)

columns.put(
Column.TIME_COLUMN_NAME,
lengthBasedAnalysis(index.getColumn(Column.TIME_COLUMN_NAME), NUM_BYTES_IN_TIMESTAMP)
lengthBasedAnalysis(index.getColumn(Column.TIME_COLUMN_NAME), NUM_BYTES_IN_TIMESTAMP, analysisTypes)
);

return columns;
}

public Map<String, ColumnAnalysis> analyze(StorageAdapter adapter)
public Map<String, ColumnAnalysis> analyze(StorageAdapter adapter, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
Preconditions.checkNotNull(adapter, "Adapter cannot be null");
Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
Expand All @@ -114,16 +116,34 @@ public Map<String, ColumnAnalysis> analyze(StorageAdapter adapter)
ValueType capType = capabilities.getType();
switch (capType) {
case LONG:
analysis = lengthBasedAnalysisForAdapter(capType.name(), capabilities, numRows, Longs.BYTES);
analysis = lengthBasedAnalysisForAdapter(
analysisTypes,
capType.name(), capabilities,
numRows, Longs.BYTES
);
break;
case FLOAT:
analysis = lengthBasedAnalysisForAdapter(capType.name(), capabilities, numRows, NUM_BYTES_IN_TEXT_FLOAT);
analysis = lengthBasedAnalysisForAdapter(
analysisTypes,
capType.name(), capabilities,
numRows, NUM_BYTES_IN_TEXT_FLOAT
);
break;
case STRING:
analysis = new ColumnAnalysis(capType.name(), 0, adapter.getDimensionCardinality(columnName), null);
analysis = new ColumnAnalysis(
capType.name(),
0,
analysisHasCardinality(analysisTypes) ? adapter.getDimensionCardinality(columnName) : 0,
null
);
break;
case COMPLEX:
analysis = new ColumnAnalysis(capType.name(), 0, null, null);
analysis = new ColumnAnalysis(
capType.name(),
0,
null,
null
);
break;
default:
log.warn("Unknown column type[%s].", capType);
Expand All @@ -135,33 +155,39 @@ public Map<String, ColumnAnalysis> analyze(StorageAdapter adapter)

columns.put(
Column.TIME_COLUMN_NAME,
lengthBasedAnalysisForAdapter(ValueType.LONG.name(), null, numRows, NUM_BYTES_IN_TIMESTAMP)
lengthBasedAnalysisForAdapter(analysisTypes, ValueType.LONG.name(), null, numRows, NUM_BYTES_IN_TIMESTAMP)
);

return columns;
}

public ColumnAnalysis analyzeLongColumn(Column column)

public ColumnAnalysis analyzeLongColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
return lengthBasedAnalysis(column, Longs.BYTES);
return lengthBasedAnalysis(column, Longs.BYTES, analysisTypes);
}

public ColumnAnalysis analyzeFloatColumn(Column column)
public ColumnAnalysis analyzeFloatColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
return lengthBasedAnalysis(column, NUM_BYTES_IN_TEXT_FLOAT);
return lengthBasedAnalysis(column, NUM_BYTES_IN_TEXT_FLOAT, analysisTypes);
}

private ColumnAnalysis lengthBasedAnalysis(Column column, final int numBytes)
private ColumnAnalysis lengthBasedAnalysis(Column column, final int numBytes, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
final ColumnCapabilities capabilities = column.getCapabilities();
if (capabilities.hasMultipleValues()) {
return ColumnAnalysis.error("multi_value");
}

return new ColumnAnalysis(capabilities.getType().name(), column.getLength() * numBytes, null, null);
int size = 0;
if (analysisHasSize(analysisTypes)) {
size = column.getLength() * numBytes;
}

return new ColumnAnalysis(capabilities.getType().name(), size, null, null);
}

public ColumnAnalysis analyzeStringColumn(Column column)
public ColumnAnalysis analyzeStringColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
final ColumnCapabilities capabilities = column.getCapabilities();

Expand All @@ -170,21 +196,28 @@ public ColumnAnalysis analyzeStringColumn(Column column)

int cardinality = bitmapIndex.getCardinality();
long size = 0;
for (int i = 0; i < cardinality; ++i) {
String value = bitmapIndex.getValue(i);

if (value != null) {
size += StringUtils.toUtf8(value).length * bitmapIndex.getBitmap(value).size();
if (analysisHasSize(analysisTypes)) {
for (int i = 0; i < cardinality; ++i) {
String value = bitmapIndex.getValue(i);
if (value != null) {
size += StringUtils.toUtf8(value).length * bitmapIndex.getBitmap(value).size();
}
}
}

return new ColumnAnalysis(capabilities.getType().name(), size, cardinality, null);
return new ColumnAnalysis(
capabilities.getType().name(),
size,
analysisHasCardinality(analysisTypes) ? cardinality : 0,
null
);
}

return ColumnAnalysis.error("string_no_bitmap");
}

public ColumnAnalysis analyzeComplexColumn(Column column)
public ColumnAnalysis analyzeComplexColumn(Column column, EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
final ColumnCapabilities capabilities = column.getCapabilities();
final ComplexColumn complexColumn = column.getComplexColumn();
Expand All @@ -202,8 +235,10 @@ public ColumnAnalysis analyzeComplexColumn(Column column)

final int length = column.getLength();
long size = 0;
for (int i = 0; i < length; ++i) {
size += inputSizeFn.apply(complexColumn.getRowValue(i));
if (analysisHasSize(analysisTypes)) {
for (int i = 0; i < length; ++i) {
size += inputSizeFn.apply(complexColumn.getRowValue(i));
}
}

return new ColumnAnalysis(typeName, size, null, null);
Expand All @@ -220,14 +255,28 @@ private List<String> getStorageAdapterColumnNames(StorageAdapter adapter)
}

private ColumnAnalysis lengthBasedAnalysisForAdapter(
EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes,
String type, ColumnCapabilities capabilities,
int numRows, final int numBytes
)
{
if (capabilities != null && capabilities.hasMultipleValues()) {
return ColumnAnalysis.error("multi_value");
}
return new ColumnAnalysis(type, numRows * numBytes, null, null);
return new ColumnAnalysis(
type,
analysisHasSize(analysisTypes) ? numRows * numBytes : 0,
null,
null
);
}

/**
 * Returns whether the requested analysis types include the SIZE analysis,
 * i.e. whether byte-size estimates should be computed for this query.
 */
private boolean analysisHasSize(EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
  final SegmentMetadataQuery.AnalysisType wanted = SegmentMetadataQuery.AnalysisType.SIZE;
  return analysisTypes.contains(wanted);
}

/**
 * Returns whether the requested analysis types include the CARDINALITY analysis,
 * i.e. whether per-dimension cardinality estimates should be computed for this query.
 */
private boolean analysisHasCardinality(EnumSet<SegmentMetadataQuery.AnalysisType> analysisTypes)
{
  final SegmentMetadataQuery.AnalysisType wanted = SegmentMetadataQuery.AnalysisType.CARDINALITY;
  return analysisTypes.contains(wanted);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import io.druid.segment.QueryableIndex;
import io.druid.segment.Segment;
import io.druid.segment.StorageAdapter;

import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -82,15 +83,23 @@ public Sequence<SegmentAnalysis> run(Query<SegmentAnalysis> inQ, Map<String, Obj
SegmentMetadataQuery query = (SegmentMetadataQuery) inQ;

final QueryableIndex index = segment.asQueryableIndex();

final Map<String, ColumnAnalysis> analyzedColumns;
final int numRows;
long totalSize = 0;
if (index == null) {
// IncrementalIndexSegments (used by in-memory hydrants in the realtime service) do not have a QueryableIndex
analyzedColumns = analyzer.analyze(segment.asStorageAdapter());
StorageAdapter segmentAdapter = segment.asStorageAdapter();
analyzedColumns = analyzer.analyze(segmentAdapter, query.getAnalysisTypes());
numRows = segmentAdapter.getNumRows();
} else {
analyzedColumns = analyzer.analyze(index);
analyzedColumns = analyzer.analyze(index, query.getAnalysisTypes());
numRows = index.getNumRows();
}

if (query.hasSize()) {
// Initialize with the size of the whitespace, 1 byte per
totalSize = analyzedColumns.size() * index.getNumRows();
totalSize = analyzedColumns.size() * numRows;
}

Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
Expand Down
Loading

0 comments on commit e6a6284

Please sign in to comment.