Skip to content

Commit

Permalink
Update TPC-DS statistics with data size
Browse files Browse the repository at this point in the history
  • Loading branch information
losipiuk authored and arhimondr committed Aug 2, 2018
1 parent fc5419d commit b69e78b
Show file tree
Hide file tree
Showing 79 changed files with 3,383 additions and 2,341 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,21 @@ public class ColumnStatisticsData
private final long nullsCount;
private final Optional<Object> min;
private final Optional<Object> max;
private final Optional<Long> dataSize;

/**
 * Builds the statistics record for a single column. The constructor is marked
 * {@code @JsonCreator}, and each argument is bound to the JSON property named in
 * its {@code @JsonProperty} annotation.
 *
 * @param distinctValuesCount value of the "distinctValuesCount" property; stored as-is
 * @param nullsCount value of the "nullsCount" property; stored as-is
 * @param min value of the "min" property; the Optional reference itself must not be null
 * @param max value of the "max" property; the Optional reference itself must not be null
 * @param dataSize value of the "dataSize" property; the Optional reference itself must not be null
 * @throws NullPointerException if {@code min}, {@code max} or {@code dataSize} is null
 *         (assumes requireNonNull is java.util.Objects.requireNonNull; the import is not visible here)
 */
@JsonCreator
public ColumnStatisticsData(
@JsonProperty("distinctValuesCount") long distinctValuesCount,
@JsonProperty("nullsCount") long nullsCount,
@JsonProperty("min") Optional<Object> min,
// NOTE(review): the next two lines look like the removed and the added version of the
// same "max" parameter as rendered by the diff view; confirm against the real file.
@JsonProperty("max") Optional<Object> max)
@JsonProperty("max") Optional<Object> max,
@JsonProperty("dataSize") Optional<Long> dataSize)
{
this.distinctValuesCount = distinctValuesCount;
this.nullsCount = nullsCount;
this.min = requireNonNull(min);
this.max = requireNonNull(max);
// Only this check carries an explicit message; the min/max checks above have none.
this.dataSize = requireNonNull(dataSize, "dataSize is null");
}

public long getDistinctValuesCount()
Expand All @@ -60,4 +63,9 @@ public Optional<Object> getMax()
{
return max;
}

/**
 * Returns the data size held by this object, exactly as it was passed to the
 * constructor.
 *
 * @return the stored {@code Optional<Long>}; never null, because the constructor
 *         rejects a null reference
 */
public Optional<Long> getDataSize()
{
return dataSize;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ private ColumnStatistics toColumnStatistics(ColumnStatisticsData columnStatistic
columnStatisticsData.getMax()
.map(value -> toPrestoValue(value, type)))
.setDistinctValuesCount(new Estimate(columnStatisticsData.getDistinctValuesCount()))
.setDataSize(columnStatisticsData.getDataSize().map(Estimate::new).orElse(Estimate.unknownValue()))
.setFraction(new Estimate(((double) rowCount - nullCount) / rowCount))
.build());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,187 +5,218 @@
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 1,
"max" : 2
"max" : 2,
"dataSize" : null
},
"cc_call_center_id" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "AAAAAAAABAAAAAAA",
"max" : "AAAAAAAACAAAAAAA"
"max" : "AAAAAAAACAAAAAAA",
"dataSize" : 32
},
"cc_rec_start_date" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : 10227,
"max" : 10227
"max" : 10227,
"dataSize" : null
},
"cc_rec_end_date" : {
"distinctValuesCount" : 1,
"nullsCount" : 1,
"min" : 11322,
"max" : 11322
"max" : 11322,
"dataSize" : null
},
"cc_closed_date_sk" : {
"distinctValuesCount" : 0,
"nullsCount" : 2,
"min" : null,
"max" : null
"max" : null,
"dataSize" : null
},
"cc_open_date_sk" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 2450806,
"max" : 2450952
"max" : 2450952,
"dataSize" : null
},
"cc_name" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Mid Atlantic",
"max" : "NY Metro"
"max" : "NY Metro",
"dataSize" : 20
},
"cc_class" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "large",
"max" : "medium"
"max" : "medium",
"dataSize" : 11
},
"cc_employees" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 2,
"max" : 6
"max" : 6,
"dataSize" : null
},
"cc_sq_ft" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 1138,
"max" : 2268
"max" : 2268,
"dataSize" : null
},
"cc_hours" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "8AM-4PM",
"max" : "8AM-8AM"
"max" : "8AM-8AM",
"dataSize" : 14
},
"cc_manager" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Bob Belcher",
"max" : "Felipe Perkins"
"max" : "Felipe Perkins",
"dataSize" : 25
},
"cc_mkt_id" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 2,
"max" : 6
"max" : 6,
"dataSize" : null
},
"cc_mkt_class" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "A bit narrow forms matter animals. Consist",
"max" : "More than other authori"
"max" : "More than other authori",
"dataSize" : 65
},
"cc_mkt_desc" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Largely blank years put substantially deaf, new others. Question",
"max" : "Shared others could not count fully dollars. New members ca"
"max" : "Shared others could not count fully dollars. New members ca",
"dataSize" : 123
},
"cc_market_manager" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Julius Durham",
"max" : "Julius Tran"
"max" : "Julius Tran",
"dataSize" : 24
},
"cc_division" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 3,
"max" : 5
"max" : 5,
"dataSize" : null
},
"cc_division_name" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "anti",
"max" : "pri"
"max" : "pri",
"dataSize" : 7
},
"cc_company" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 1,
"max" : 6
"max" : 6,
"dataSize" : null
},
"cc_company_name" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "cally",
"max" : "ought"
"max" : "ought",
"dataSize" : 10
},
"cc_street_number" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "730",
"max" : "984"
"max" : "984",
"dataSize" : 6
},
"cc_street_name" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Ash Hill",
"max" : "Center Hill"
"max" : "Center Hill",
"dataSize" : 19
},
"cc_street_type" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Boulevard",
"max" : "Way"
"max" : "Way",
"dataSize" : 12
},
"cc_suite_number" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : "Suite 0",
"max" : "Suite 70"
"max" : "Suite 70",
"dataSize" : 15
},
"cc_city" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : "Midway",
"max" : "Midway"
"max" : "Midway",
"dataSize" : 6
},
"cc_county" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : "Williamson County",
"max" : "Williamson County"
"max" : "Williamson County",
"dataSize" : 17
},
"cc_state" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : "TN",
"max" : "TN"
"max" : "TN",
"dataSize" : 2
},
"cc_zip" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : "31904",
"max" : "31904"
"max" : "31904",
"dataSize" : 5
},
"cc_country" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : "United States",
"max" : "United States"
"max" : "United States",
"dataSize" : 13
},
"cc_gmt_offset" : {
"distinctValuesCount" : 1,
"nullsCount" : 0,
"min" : -500,
"max" : -500
"max" : -500,
"dataSize" : null
},
"cc_tax_percentage" : {
"distinctValuesCount" : 2,
"nullsCount" : 0,
"min" : 11,
"max" : 12
"max" : 12,
"dataSize" : null
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,55 +5,64 @@
"distinctValuesCount" : 11718,
"nullsCount" : 0,
"min" : 1,
"max" : 11718
"max" : 11718,
"dataSize" : null
},
"cp_catalog_page_id" : {
"distinctValuesCount" : 11718,
"nullsCount" : 0,
"min" : "AAAAAAAAAAABAAAA",
"max" : "AAAAAAAAPPPBAAAA"
"max" : "AAAAAAAAPPPBAAAA",
"dataSize" : 187488
},
"cp_start_date_sk" : {
"distinctValuesCount" : 91,
"nullsCount" : 101,
"min" : 2450815,
"max" : 2453005
"max" : 2453005,
"dataSize" : null
},
"cp_end_date_sk" : {
"distinctValuesCount" : 97,
"nullsCount" : 108,
"min" : 2450844,
"max" : 2453186
"max" : 2453186,
"dataSize" : null
},
"cp_department" : {
"distinctValuesCount" : 1,
"nullsCount" : 120,
"min" : "DEPARTMENT",
"max" : "DEPARTMENT"
"max" : "DEPARTMENT",
"dataSize" : 10
},
"cp_catalog_number" : {
"distinctValuesCount" : 109,
"nullsCount" : 104,
"min" : 1,
"max" : 109
"max" : 109,
"dataSize" : null
},
"cp_catalog_page_number" : {
"distinctValuesCount" : 108,
"nullsCount" : 116,
"min" : 1,
"max" : 108
"max" : 108,
"dataSize" : null
},
"cp_description" : {
"distinctValuesCount" : 11609,
"nullsCount" : 109,
"min" : "A bit asleep rooms cannot feel short dry secondary leads. Ab",
"max" : "Youngsters should get very. Bad, necessary years must pick telecommunications. Co"
"max" : "Youngsters should get very. Bad, necessary years must pick telecommunications. Co",
"dataSize" : 865538
},
"cp_type" : {
"distinctValuesCount" : 3,
"nullsCount" : 110,
"min" : "bi-annual",
"max" : "quarterly"
"max" : "quarterly",
"dataSize" : 25
}
}
}
Loading

0 comments on commit b69e78b

Please sign in to comment.