Merge remote-tracking branch 'origin/rel-3.46.0'

h2o-ops · h2o-ops · commit 967faaf9405c · 2024-05-29T14:56:35.000-07:00
diff --git a/h2o-py/h2o/display.py b/h2o-py/h2o/display.py
@@ -5,6 +5,7 @@
 :copyright: (c) 2016 H2O.ai
 :license:   Apache License Version 2.0 (see LICENSE for details)
 """
+# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
 from contextlib import contextmanager
 import os
 import sys
diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py
@@ -868,14 +868,22 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
             if ind in skipped_columns:
                 use_type[ind]=False
 
-    if column_names is not None:
+    if column_names is not None: 
         if not isinstance(column_names, list): raise ValueError("col_names should be a list")
         if (skipped_columns is not None) and len(skipped_columns)>0:
-            if (len(column_names)) != parse_column_len:
+          # when we are converting a python object to H2OFrame, column_names will include all columns despite
+          # skipped columns are specified.  In this case, we need to make sure that 
+          # len(column_names)-len(skipped_columns)==parse_column_len
+          # When we are importing a file with skipped columns mentioned, column_names will only contain columns that
+          # are not skipped.  Hence, in this case, we need to check len(column_names) == parse_column_len.
+          # To combine the two, correct parsing will have conditions len(column_names)-len(skipped_columns)==parse_column_len
+          # or len(column_names)==parse_column_len.  Hence, we will raise an error when 
+          # not(len(column_names)-len(skipped_columns)==parse_column_len or len(column_names)==parse_column_len happened.
+         if not((len(column_names) == parse_column_len) or ((len(column_names)-len(skipped_columns))==parse_column_len)):
                 raise ValueError(
-                    "length of col_names should be equal to the number of columns parsed: %d vs %d"
-                    % (len(column_names), parse_column_len))
-        else:
+                    "length of col_names minus length of skipped_columns should equal the number of columns parsed: "
+                    "%d vs %d" % (len(column_names), parse_column_len))
+        else: # no skipped columns here
             if len(column_names) != len(j["column_types"]): raise ValueError(
                 "length of col_names should be equal to the number of columns: %d vs %d"
                 % (len(column_names), len(j["column_types"])))
diff --git a/h2o-py/h2o/plot/_matplotlib.py b/h2o-py/h2o/plot/_matplotlib.py
@@ -1,9 +1,11 @@
 
 def get_matplotlib_pyplot(server, raise_if_not_available=False):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         # noinspection PyUnresolvedReferences
         import matplotlib
-        matplotlib.use("Agg")
+        if server:
+            matplotlib.use("Agg")
         try:
             # noinspection PyUnresolvedReferences
             import matplotlib.pyplot as plt
@@ -25,6 +27,7 @@ def get_matplotlib_pyplot(server, raise_if_not_available=False):
 
 
 def get_polycollection(server, raise_if_not_available=False):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         from matplotlib.collections import PolyCollection as polycoll
         return polycoll
@@ -36,6 +39,7 @@ def get_polycollection(server, raise_if_not_available=False):
     
     
 def get_matplotlib_cm(function_name):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         from matplotlib import cm
         return cm
@@ -45,6 +49,7 @@ def get_matplotlib_cm(function_name):
 
 
 def get_mplot3d_axes(function_name):
+    # when changing this function, please make sure it doesn't break explanations in jupyter, vscode and ipython
     try:
         # noinspection PyUnresolvedReferences
         from mpl_toolkits.mplot3d import Axes3D
diff --git a/h2o-py/h2o/plot/_plot_result.py b/h2o-py/h2o/plot/_plot_result.py
@@ -1,5 +1,6 @@
 # -*- encoding: utf-8 -*-
 # mutable versions of py immutable types
+# when changing this module, please make sure it doesn't break explanations in jupyter, vscode and ipython
 from h2o.exceptions import H2OError
 
 __no_export = set(dir())  # all variables defined above this are not exported
diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
@@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame():
     assert dupl4.columns == ["n1", "s1"]
 
 
-def H2OFrame_skipped_columns_is_BUGGY():
-    try:
-        h2o.H2OFrame(data, skipped_columns=[1])
-        assert False, "skipped_columns handling may be fixed now"  # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 
-    except ValueError as e:
-        assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
+def H2OFrame_skipped_columns_BUG_fixed():
+    f1 = h2o.H2OFrame(data, skipped_columns=[1])
+    f2 = h2o.H2OFrame(data)
+    assert f1.ncol == (f2.ncol-1), "expected number of columns: {0}, actual column numbers: {1}".format(f1.ncol, (f2.ncol-1))
 
 
 pu.run_tests([
@@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY():
     H2OFrame_from_pandas,
     H2OFrame_from_scipy,
     H2OFrame_from_H2OFrame,
-    H2OFrame_skipped_columns_is_BUGGY
+    H2OFrame_skipped_columns_BUG_fixed
 ])
diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15947_skipped_column_error.py b/h2o-py/tests/testdir_parser/pyunit_GH_15947_skipped_column_error.py
@@ -0,0 +1,15 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+
+# Seb has reported that skipped_columns does not work if skipped_columns is called with h2o.H2OFrame
+def test_skipped_columns():
+    data = [[1, 4, "a",  1], [2, 5, "b",  0], [3, 6, "", 1]]
+    frame = h2o.H2OFrame(data, skipped_columns=[1, 2])
+    assert frame.ncol == 2, "Expected column number: 2.  Actual: {0}".format(frame.ncol)
+ 
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_skipped_columns)
+else:
+    test_skipped_columns()
diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R
@@ -4109,6 +4109,7 @@ use.package <- function(package,
 #'
 #' @param x An \code{R} object.
 #' @param destination_frame A string with the desired name for the H2OFrame
+#' @param skipped_columns A list of integer containing columns to be skipped and not parsed into the final frame
 #' @param use_datatable allow usage of data.table
 #' @param \dots arguments passed to method arguments.
 #' @export
@@ -4135,15 +4136,19 @@ use.package <- function(package,
 #'   stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m))
 #' }
 #' }
-as.h2o <- function(x, destination_frame="", ...) {
+as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) {
   .key.validate(destination_frame)
-  UseMethod("as.h2o")
+  if (is.null(skipped_columns)) {
+    UseMethod("as.h2o")
+  } else {
+    as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
+  }
 }
 
 #' @rdname as.h2o
 #' @method as.h2o default
 #' @export
-as.h2o.default <- function(x, destination_frame="", ...) {
+as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) {
   if( destination_frame=="" ) {
     subx <- destination_frame.guess(deparse(substitute(x)))
     destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = ""))
@@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) {
     data.frame(C1=x)
   else
     as.data.frame(x, ...)
-  as.h2o.data.frame(x, destination_frame=destination_frame)
+  as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns)
 }
 
 #' @rdname as.h2o
 #' @method as.h2o H2OFrame
 #' @export
-as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
+as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) {
   if( destination_frame=="" ) {
     subx <- destination_frame.guess(deparse(substitute(x)))
     destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy")
@@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) {
 #' @seealso \code{\link{use.package}}
 #' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}
 #' @export
-as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) {
+as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
   if( destination_frame=="" ) {
     subx <- destination_frame.guess(deparse(substitute(x)))
     destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame")
@@ -4203,7 +4208,8 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
   if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt))
   #if (verbose) pt <- proc.time()[[3]] # timings inside
   h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types,
-                        col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)))
+                        col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)), 
+                        skipped_columns=skipped_columns)
   #if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt))
   file.remove(tmpf)
   h2f
@@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...)
 #' To speedup execution time for large sparse matrices, use h2o datatable.  Make sure you have installed and imported data.table and slam packages.
 #' Turn on h2o datatable by options("h2o.use.data.table"=TRUE)
 #' @export
-as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) {
+as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) {
   if( destination_frame=="") {
     subx <- destination_frame.guess(deparse(substitute(x)))
     destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix")
diff --git a/h2o-r/h2o-package/R/parse.R b/h2o-r/h2o-package/R/parse.R
@@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA
     else
       col.names
     if (!is.null(parseSetup$column_names) &&
-        (length(parseSetup$column_names) != parsedColLength)) {
-      stop("length of col.names must equal to the number of columns in dataset")
+        (length(parseSetup$column_names) != parsedColLength)) { # should equal, if not, need to check skipped_columns
+        if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength))
+            || is.null(skipped_columns)) # if no skipped column, this is an error.  If skipped columns, check length
+            stop("length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
     }
     # change column names to what the user specified
     if (!is.null(skipped_columns)) {
diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R
@@ -11,6 +11,7 @@ source("../../scripts/h2o-r-test-setup.R")
 test.continuous.or.categorical <- function() {
   df.hex <- h2o.uploadFile(locate("smalldata/jira/hexdev_29.csv"),
     col.types = c("enum", "enum", "enum"))
+  browser()
 
   expect_true(is.factor(df.hex$h1))
   expect_true(is.factor(df.hex$h2))
@@ -36,7 +37,7 @@ test.continuous.or.categorical <- function() {
 
   e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
                                col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
-  expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+  expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
 
   # col.types as character vector
   df.hex2 <- h2o.importFile(locate("smalldata/iris/iris.csv"), col.types=c("Numeric","Numeric","Enum","Numeric","Enum"))
@@ -66,7 +67,7 @@ test.continuous.or.categorical <- function() {
 
   e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
                                col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
-  expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+  expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
 
   # col.types as character vector
   df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"),
@@ -98,7 +99,7 @@ test.continuous.or.categorical <- function() {
 
   e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"),
                                col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x)
-  expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset")
+  expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset")
 
   # col.types as character vector
   df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"), col.names=c("C1","C2","C3","C4","C5"),
diff --git a/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R
@@ -0,0 +1,10 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../scripts/h2o-r-test-setup.R")
+
+test.skipped_columns <- function() {
+  iris_hf <- as.h2o(iris, skipped_columns=c(1,2))
+  expect_true(ncol(iris_hf) == (ncol(iris)-2))
+  print("Columns are skipped!!!")
+}
+
+doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns)