Skip to content

Commit 967faaf

Browse files
author
h2o-ops
committed
Merge remote-tracking branch 'origin/rel-3.46.0'
2 parents b28b049 + b3b813e commit 967faaf

File tree

10 files changed

+73
-26
lines changed

10 files changed

+73
-26
lines changed

h2o-py/h2o/display.py

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
:copyright: (c) 2016 H2O.ai
66
:license: Apache License Version 2.0 (see LICENSE for details)
77
"""
8+
# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
89
from contextlib import contextmanager
910
import os
1011
import sys

h2o-py/h2o/h2o.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -868,14 +868,22 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
868868
if ind in skipped_columns:
869869
use_type[ind]=False
870870

871-
if column_names is not None:
871+
if column_names is not None:
872872
if not isinstance(column_names, list): raise ValueError("col_names should be a list")
873873
if (skipped_columns is not None) and len(skipped_columns)>0:
874-
if (len(column_names)) != parse_column_len:
874+
# When converting a Python object to an H2OFrame, column_names includes all columns even if
875+
# skipped columns are specified. In that case, we need to make sure that
876+
# len(column_names) - len(skipped_columns) == parse_column_len.
877+
# When importing a file with skipped columns specified, column_names contains only the columns that
878+
# are not skipped. Hence, in that case, we need to check len(column_names) == parse_column_len.
879+
# Combining the two, a correct parse satisfies len(column_names) - len(skipped_columns) == parse_column_len
880+
# or len(column_names) == parse_column_len. Hence, we raise an error when
881+
# not (len(column_names) - len(skipped_columns) == parse_column_len or len(column_names) == parse_column_len).
882+
if not((len(column_names) == parse_column_len) or ((len(column_names)-len(skipped_columns))==parse_column_len)):
875883
raise ValueError(
876-
"length of col_names should be equal to the number of columns parsed: %d vs %d"
877-
% (len(column_names), parse_column_len))
878-
else:
884+
"length of col_names minus length of skipped_columns should equal the number of columns parsed: "
885+
"%d vs %d" % (len(column_names), parse_column_len))
886+
else: # no skipped columns here
879887
if len(column_names) != len(j["column_types"]): raise ValueError(
880888
"length of col_names should be equal to the number of columns: %d vs %d"
881889
% (len(column_names), len(j["column_types"])))

h2o-py/h2o/plot/_matplotlib.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11

22
def get_matplotlib_pyplot(server, raise_if_not_available=False):
3+
# when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
34
try:
45
# noinspection PyUnresolvedReferences
56
import matplotlib
6-
matplotlib.use("Agg")
7+
if server:
8+
matplotlib.use("Agg")
79
try:
810
# noinspection PyUnresolvedReferences
911
import matplotlib.pyplot as plt
@@ -25,6 +27,7 @@ def get_matplotlib_pyplot(server, raise_if_not_available=False):
2527

2628

2729
def get_polycollection(server, raise_if_not_available=False):
30+
# when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
2831
try:
2932
from matplotlib.collections import PolyCollection as polycoll
3033
return polycoll
@@ -36,6 +39,7 @@ def get_polycollection(server, raise_if_not_available=False):
3639

3740

3841
def get_matplotlib_cm(function_name):
42+
# when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
3943
try:
4044
from matplotlib import cm
4145
return cm
@@ -45,6 +49,7 @@ def get_matplotlib_cm(function_name):
4549

4650

4751
def get_mplot3d_axes(function_name):
52+
# when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
4853
try:
4954
# noinspection PyUnresolvedReferences
5055
from mpl_toolkits.mplot3d import Axes3D

h2o-py/h2o/plot/_plot_result.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- encoding: utf-8 -*-
22
# mutable versions of py immutable types
3+
# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
34
from h2o.exceptions import H2OError
45

56
__no_export = set(dir()) # all variables defined above this are not exported

h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py

+5-7
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame():
125125
assert dupl4.columns == ["n1", "s1"]
126126

127127

128-
def H2OFrame_skipped_columns_is_BUGGY():
129-
try:
130-
h2o.H2OFrame(data, skipped_columns=[1])
131-
assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947
132-
except ValueError as e:
133-
assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
128+
def H2OFrame_skipped_columns_BUG_fixed():
    """Check that skipping one column yields a frame with exactly one fewer column.

    Builds one H2OFrame from ``data`` (defined elsewhere in this test module) with
    ``skipped_columns=[1]`` and one without, then compares their column counts.
    """
    skipped = h2o.H2OFrame(data, skipped_columns=[1])
    full = h2o.H2OFrame(data)
    expected_ncol = full.ncol - 1
    # Report the expected value first and the observed value second, matching the message labels.
    assert skipped.ncol == expected_ncol, \
        "expected number of columns: {0}, actual column numbers: {1}".format(expected_ncol, skipped.ncol)
134132

135133

136134
pu.run_tests([
@@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY():
141139
H2OFrame_from_pandas,
142140
H2OFrame_from_scipy,
143141
H2OFrame_from_H2OFrame,
144-
H2OFrame_skipped_columns_is_BUGGY
142+
H2OFrame_skipped_columns_BUG_fixed
145143
])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import sys
2+
sys.path.insert(1,"../../")
3+
import h2o
4+
from tests import pyunit_utils
5+
6+
# Regression test: skipped_columns was reported not to work when passed to h2o.H2OFrame
7+
def test_skipped_columns():
    """Build an H2OFrame from a 4-column Python list while skipping columns 1 and 2.

    The resulting frame is expected to contain only the two remaining columns.
    """
    rows = [[1, 4, "a", 1], [2, 5, "b", 0], [3, 6, "", 1]]
    parsed = h2o.H2OFrame(rows, skipped_columns=[1, 2])
    ncol = parsed.ncol
    assert ncol == 2, "Expected column number: 2. Actual: {0}".format(ncol)
11+
12+
if __name__ == "__main__":
13+
pyunit_utils.standalone_test(test_skipped_columns)
14+
else:
15+
test_skipped_columns()

h2o-r/h2o-package/R/frame.R

+14-8
Original file line numberDiff line numberDiff line change
@@ -4109,6 +4109,7 @@ use.package <- function(package,
41094109
#'
41104110
#' @param x An \code{R} object.
41114111
#' @param destination_frame A string with the desired name for the H2OFrame
4112+
#' @param skipped_columns A list of integers specifying the columns to be skipped, i.e. not parsed into the final frame
41124113
#' @param use_datatable allow usage of data.table
41134114
#' @param \dots arguments passed to method arguments.
41144115
#' @export
@@ -4135,15 +4136,19 @@ use.package <- function(package,
41354136
#' stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m))
41364137
#' }
41374138
#' }
4138-
as.h2o <- function(x, destination_frame="", ...) {
4139+
as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) {
  # Validate the requested frame key before any conversion work is attempted.
  .key.validate(destination_frame)
  if (is.null(skipped_columns)) {
    # No columns to skip: use regular S3 dispatch on the class of x.
    UseMethod("as.h2o")
  } else {
    # Skipping happens at parse time, so route the call straight to the data.frame
    # method, which hands skipped_columns to h2o.uploadFile. Forward `...` as well so
    # caller-supplied options (e.g. use_datatable) are not silently dropped.
    as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns, ...)
  }
}
41424147

41434148
#' @rdname as.h2o
41444149
#' @method as.h2o default
41454150
#' @export
4146-
as.h2o.default <- function(x, destination_frame="", ...) {
4151+
as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) {
41474152
if( destination_frame=="" ) {
41484153
subx <- destination_frame.guess(deparse(substitute(x)))
41494154
destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = ""))
@@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) {
41524157
data.frame(C1=x)
41534158
else
41544159
as.data.frame(x, ...)
4155-
as.h2o.data.frame(x, destination_frame=destination_frame)
4160+
as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
41564161
}
41574162

41584163
#' @rdname as.h2o
41594164
#' @method as.h2o H2OFrame
41604165
#' @export
4161-
as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
4166+
as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) {
41624167
if( destination_frame=="" ) {
41634168
subx <- destination_frame.guess(deparse(substitute(x)))
41644169
destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy")
@@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
41734178
#' @seealso \code{\link{use.package}}
41744179
#' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}
41754180
#' @export
4176-
as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) {
4181+
as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
41774182
if( destination_frame=="" ) {
41784183
subx <- destination_frame.guess(deparse(substitute(x)))
41794184
destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame")
@@ -4203,7 +4208,8 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
42034208
if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt))
42044209
#if (verbose) pt <- proc.time()[[3]] # timings inside
42054210
h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types,
4206-
col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)))
4211+
col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)),
4212+
skipped_columns=skipped_columns)
42074213
#if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt))
42084214
file.remove(tmpf)
42094215
h2f
@@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
42154221
#' To speedup execution time for large sparse matrices, use h2o datatable. Make sure you have installed and imported data.table and slam packages.
42164222
#' Turn on h2o datatable by options("h2o.use.data.table"=TRUE)
42174223
#' @export
4218-
as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) {
4224+
as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
42194225
if( destination_frame=="") {
42204226
subx <- destination_frame.guess(deparse(substitute(x)))
42214227
destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix")

h2o-r/h2o-package/R/parse.R

+4-2
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA
219219
else
220220
col.names
221221
if (!is.null(parseSetup$column_names) &&
222-
(length(parseSetup$column_names) != parsedColLength)) {
223-
stop("length of col.names must equal to the number of columns in dataset")
222+
(length(parseSetup$column_names) != parsedColLength)) { # should equal, if not, need to check skipped_columns
223+
if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength))
224+
|| is.null(skipped_columns)) # if no skipped column, this is an error. If skipped columns, check length
225+
stop("length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
224226
}
225227
# change column names to what the user specified
226228
if (!is.null(skipped_columns)) {

h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R

+4-3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ source("../../scripts/h2o-r-test-setup.R")
1111
test.continuous.or.categorical <- function() {
1212
df.hex <- h2o.uploadFile(locate("smalldata/jira/hexdev_29.csv"),
1313
col.types = c("enum", "enum", "enum"))
14+
browser()
1415

1516
expect_true(is.factor(df.hex$h1))
1617
expect_true(is.factor(df.hex$h2))
@@ -36,7 +37,7 @@ test.continuous.or.categorical <- function() {
3637

3738
e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
3839
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
39-
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
40+
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
4041

4142
# col.types as character vector
4243
df.hex2 <- h2o.importFile(locate("smalldata/iris/iris.csv"), col.types=c("Numeric","Numeric","Enum","Numeric","Enum"))
@@ -66,7 +67,7 @@ test.continuous.or.categorical <- function() {
6667

6768
e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
6869
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
69-
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
70+
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
7071

7172
# col.types as character vector
7273
df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"),
@@ -98,7 +99,7 @@ test.continuous.or.categorical <- function() {
9899

99100
e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
100101
col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
101-
expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
102+
expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
102103

103104
# col.types as character vector
104105
df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"), col.names=c("C1","C2","C3","C4","C5"),
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
2+
source("../../scripts/h2o-r-test-setup.R")
3+
4+
test.skipped_columns <- function() {
  # Convert the iris data.frame to an H2OFrame, asking as.h2o to skip columns 1 and 2.
  skip_idx <- c(1,2)
  iris_hf <- as.h2o(iris, skipped_columns=skip_idx)
  # The resulting frame must have exactly two fewer columns than the source data.frame.
  expected_ncol <- ncol(iris) - 2
  expect_true(ncol(iris_hf) == expected_ncol)
  print("Columns are skipped!!!")
}
9+
10+
doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns)

0 commit comments

Comments
 (0)