
Commit c228c42

Merge branch 'master' of github.com:0xdata/h2o
2 parents: 9666a61 + 4e6cd26

19 files changed: +600, -366 lines

R/examples/ClassifiersMSE.R (+43)

@@ -0,0 +1,43 @@
+## This script demonstrates how to compute MSE for classification problems
+## All the data manipulations happen in H2O
+## For simplicity, models are trained with default options, without validation data, and the training MSE is computed
+
+## Start H2O
+library(h2o)
+h2oServer = h2o.init(nthreads = -1)
+
+## Import data
+iris_hex <- as.h2o(h2oServer, iris) # this uploads iris from R to H2O; use h2o.importFile() for big data!
+response <- 5
+predictors <- colnames(iris_hex[,-response])
+classes <- levels(iris_hex[,response])
+print(classes)
+
+## Compute actual per-row class probabilities (1 or 0)
+resp_hex <- iris_hex[,response]
+actual_hex <- resp_hex == classes[1]
+for (level in levels(resp_hex)[2:length(classes)]) {
+  actual_hex <- cbind(actual_hex, resp_hex == level)
+}
+summary(actual_hex)
+
+## Train H2O classifiers
+models <- list()
+models <- c(models, h2o.deeplearning(x = predictors, y = response, data = iris_hex))
+models <- c(models, h2o.randomForest(x = predictors, y = response, data = iris_hex))
+models <- c(models, h2o.randomForest(x = predictors, y = response, data = iris_hex, type = "BigData"))
+models <- c(models, h2o.gbm(x = predictors, y = response, data = iris_hex))
+models <- c(models, h2o.naiveBayes(x = predictors, y = response, data = iris_hex))
+
+## Report Training MSE for all models
+for (model in models) {
+  ## Make predictions, extract probabilities
+  train_preds_hex <- h2o.predict(model, iris_hex)[,-1] # h2o.predict returns N+1 columns: label + N probabilities
+
+  mse <- 0
+  for (i in 1:length(classes)) {
+    mse <- mse + mean((train_preds_hex[,i] - actual_hex[,i])^2)
+  }
+  print(paste0(model@key, " <---- MSE: ", mse))
+}
+
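For a quick sanity check of the per-class MSE arithmetic outside H2O, here is a minimal base-R sketch of the same computation; fake_probs is a hypothetical stand-in for the probability columns h2o.predict() returns, not real model output:

## Build the 0/1 indicator matrix the script constructs in H2O
actual <- sapply(levels(iris$Species), function(cl) as.numeric(iris$Species == cl))
## Hypothetical predicted probabilities, normalized so each row sums to 1
set.seed(1)
fake_probs <- matrix(runif(nrow(iris) * 3), ncol = 3)
fake_probs <- fake_probs / rowSums(fake_probs)
## Same accumulation as the loop above: sum over classes of the per-column mean squared error
mse <- sum(sapply(1:ncol(actual), function(i) mean((fake_probs[, i] - actual[, i])^2)))
print(mse)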

h2o-docs/source/faq/general.rst (+6, -5)

@@ -4,19 +4,20 @@
 General Issues
 ==============
 
-## Downloading and Sending Logs
+Downloading and Sending Logs
+----------------------------
 
-1. From the drop-down **Admin** menu, select **Inspect Log** or go to <http://localhost:54321/LogView.html>.
+1. From the drop-down **Admin** menu, select **Inspect Log** or go to http://localhost:54321/LogView.html.
 2. On the **Log View** page, click the **Download Logs** button in the
    upper left hand corner to download your logs.
 
-.. Image:: Logsdownload.png
+.. Image:: Logsdownload.png
    :width: 70%
 
 
-3. Email the logs to <[email protected]> or <[email protected]> and include the following information:
+3. Email the logs to [email protected] or [email protected] and include the following information:
   - H\ :sub:`2`\ O version number
-  - Your environment (laptop or server)
+  - Your environment (laptop or server)
   - Your operating system (Linux, Windows, OS X)
   - Any other programs you are using with H\ :sub:`2`\ O (Hadoop, Cloudera, R)
   - If you are using a cluster or other specific configuration
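If you prefer to pull logs from R rather than the browser, a sketch along these lines should work, assuming your version of the h2o R package exposes h2o.downloadAllLogs() (verify with help(package = "h2o"), as the signature has varied across releases):

library(h2o)
h2oServer <- h2o.init()
## Writes a zip of all node logs into the current directory; the filename is illustrative
h2o.downloadAllLogs(h2oServer, dirname = ".", filename = "h2o_logs.zip")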

h2o-docs/source/faq/hadoop_related.rst (+44, -31)

@@ -43,28 +43,35 @@ description of your Hadoop environment, including the Hadoop distribution and ve
 
 Common Hadoop Questions
 """"""""""""""""""""""""
-*What's the syntax for the file path of a data set sitting in hdfs?*
+**What's the syntax for the file path of a data set sitting in hdfs?**
 
 To locate an HDFS file, go to **Data > Import** and enter **hdfs://** in the **path** field. H\ :sub:`2`\ O automatically detects any HDFS paths. This is a good way to verify the path to your data set before importing through R or any other non-web API.
 
-*When interacting with an H2O cluster launched on multiple Hadoop nodes, is it necessary for R to be installed on all the data nodes?*
+---
+
+**When interacting with an H\ :sub:`2`\ O cluster launched on multiple Hadoop nodes, is it necessary for R to be installed on all the data nodes?**
 
 No - as long as the R instance can communicate with one of the nodes in the network, R can be installed on any of the nodes, or even on a local machine that will securely tunnel into the cluster.
 
-*Is it possible to launch the H2O cluster on Hadoop nodes using R's h2o.init() command?*
+---
+
+**Is it possible to launch the H\ :sub:`2`\ O cluster on Hadoop nodes using R's** `h2o.init()` **command?**
+
+No - follow the instructions in :ref:`Hadoop_Tutorial` and add the IP address to the `h2o.init()` function to connect to the cluster.
 
-No - follow the instructions in :ref:`Hadoop_Tutorial` and add the IP address to the **h2o.init()** function to connect to the cluster.
+---
 
-*What does "ERROR: Output directory hdfs://sandbox.hortonworks.com:8020/user/root/hdfsOutputDir already exists?" mean?*
+**What does** `"ERROR: Output directory hdfs://sandbox.hortonworks.com:8020/user/root/hdfsOutputDir already exists?"` **mean?**
 
 Each mapper task gets its own output directory in HDFS. To prevent overwriting multiple users' files, each mapper task must have a unique output directory name. Change
-the **-output hdfsOutputDir** argument to **-output hdfsOutputDir1** and the task should launch.
+the `-output hdfsOutputDir` argument to `-output hdfsOutputDir1` and the task should launch.
 
-*What should I do if H2O starts to launch but times out in 120 seconds?*
+---
 
+**What should I do if H\ :sub:`2`\ O starts to launch but times out in 120 seconds?**
 
-1.YARN or MapReduce's configuration is not configured correctly. Enable launching for mapper tasks of specified memory sizes. If YARN only allows mapper tasks with a maximum memory size of 1g and the request requires 2g, then the request will timeout at the default of 120 seconds. Read `Configuration Setup <http://hortonworks.com/blog/how-to-plan-and-configure-yarn-in-hdp-2-0/>`_ to make sure your setup will run.
 
+1. YARN or MapReduce's configuration is not configured correctly. Enable launching for mapper tasks of specified memory sizes. If YARN only allows mapper tasks with a maximum memory size of 1g and the request requires 2g, then the request will timeout at the default of 120 seconds. Read `Configuration Setup <http://hortonworks.com/blog/how-to-plan-and-configure-yarn-in-hdp-2-0/>`_ to make sure your setup will run.
 
 2. The nodes are not communicating with each other. If you request a cluster of two nodes and the output shows a stall in reporting the other nodes and forming a cluster (as shown in the following example), check that the security settings for the network connection between the two nodes are not preventing the nodes from communicating with each other. You should also check to make sure that the flatfile that is generated and being passed has the correct home address; if there are multiple local IP addresses, this could be an issue.
 

@@ -94,17 +101,21 @@ the **-output hdfsOutputDir** argument to **-output hdfsOutputDir1** and the tas
 [Sending flatfile to node 10.115.5.25:54321]
 H2O node 10.115.57.45:54321 reports H2O cluster size 1
 H2O node 10.115.5.25:54321 reports H2O cluster size 1
+
+---
 
-*What should I do if the H2O job launches but terminates after 600 seconds?*
+**What should I do if the H2O job launches but terminates after 600 seconds?**
 
 The likely cause is a driver mismatch - check to make sure the Hadoop distribution matches the driver jar file used to launch H\ :sub:`2`\ O. If your distribution is not currently
 available in the package, `email us <support@0xdata>`_ for a new driver file.
 
-*What should I do if I want to create a job with a bigger heap size but YARN doesn't launch and H2O times out?*
+---
 
-First, try the job again but with a smaller heap size (**-mapperXmx**) and a smaller number of nodes (**-nodes**) to verify that a small launch can proceed at all.
+**What should I do if I want to create a job with a bigger heap size but YARN doesn't launch and H\ :sub:`2`\ O times out?**
 
-If the cluster manager settings are configured for the default maximum memory size but the memory required for the request exceeds that amount, YARN will not launch and H2O will time out.
+First, try the job again but with a smaller heap size (`-mapperXmx`) and a smaller number of nodes (`-nodes`) to verify that a small launch can proceed at all.
+
+If the cluster manager settings are configured for the default maximum memory size but the memory required for the request exceeds that amount, YARN will not launch and H\ :sub:`2`\ O will time out.
 If you have a default configuration, change the configuration settings in your cluster manager to enable launching of mapper tasks for specific memory sizes. Use the following formula to calculate the amount of memory required:
 
 ::

@@ -134,51 +145,53 @@ Output from an H2O launch is shown below:
 mapreduce.map.memory.mb: 36864
 
 
-**mapreduce.map.memory.mb** must be less than the YARN memory configuration values for the launch to succeed. See the examples below for how to change the memory configuration values for your version of Hadoop.
+`mapreduce.map.memory.mb` must be less than the YARN memory configuration values for the launch to succeed. See the examples below for how to change the memory configuration values for your version of Hadoop.
 
 
 **For Cloudera, configure the settings in Cloudera Manager. Depending on how the cluster is configured, you may need to change the settings for more than one role group.**
 
-1. Click **Configuration** and enter the following search term in quotes: **yarn.nodemanager.resource.memory-mb**.
+1. Click **Configuration** and enter the following search term in quotes: **yarn.nodemanager.resource.memory-mb**.
 
-2. Enter the amount of memory (in GB) to allocate in the **Value** field. If more than one group is listed, change the values for all listed groups.
+2. Enter the amount of memory (in GB) to allocate in the **Value** field. If more than one group is listed, change the values for all listed groups.
 
 .. image:: TroubleshootingHadoopClouderayarnnodemgr.png
    :width: 100 %
 
-3. Click the **Save Changes** button in the upper-right corner.
-4. Enter the following search term in quotes: **yarn.scheduler.maximum-allocation-mb**
-5. Change the value, click the **Save Changes** button in the upper-right corner, and redeploy.
+3. Click the **Save Changes** button in the upper-right corner.
+4. Enter the following search term in quotes: **yarn.scheduler.maximum-allocation-mb**
+5. Change the value, click the **Save Changes** button in the upper-right corner, and redeploy.
 
 .. image:: TroubleshootingHadoopClouderayarnscheduler.png
    :width: 100%
 
 
-For Hortonworks, `configure <http://docs.hortonworks.com/HDPDocuments/Ambari-1.6.0.0/bk_Monitoring_Hadoop_Book/content/monitor-chap2-3-3_2x.html>`_ the settings in Ambari.
-1. Select **YARN**, then click the **Configs** tab.
-2. Select the group.
-3. In the **Node Manager** section, enter the amount of memory (in MB) to allocate in the **yarn.nodemanager.resource.memory-mb** entry field.
+**For Hortonworks,** `configure <http://docs.hortonworks.com/HDPDocuments/Ambari-1.6.0.0/bk_Monitoring_Hadoop_Book/content/monitor-chap2-3-3_2x.html>`_ **the settings in Ambari.**
+
+1. Select **YARN**, then click the **Configs** tab.
+2. Select the group.
+3. In the **Node Manager** section, enter the amount of memory (in MB) to allocate in the **yarn.nodemanager.resource.memory-mb** entry field.
 
 .. image:: TroubleshootingHadoopAmbariNodeMgr.png
    :width: 100 %
 
-4. In the **Scheduler** section, enter the amount of memory (in MB)to allocate in the **yarn.scheduler.maximum-allocation-mb** entry field.
+4. In the **Scheduler** section, enter the amount of memory (in MB) to allocate in the **yarn.scheduler.maximum-allocation-mb** entry field.
 
 .. image:: TroubleshootingHadoopAmbariyarnscheduler.png
    :width: 100 %
 
-5. Click the **Save** button at the bottom of the page and redeploy the cluster.
+5. Click the **Save** button at the bottom of the page and redeploy the cluster.
 
 
-For MapR:
-1. Edit the **yarn-site.xml** file for the node running the ResourceManager.
-2. Change the values for the **yarn.nodemanager.resource.memory-mb** and **yarn.scheduler.maximum-allocation-mb** properties.
-3. Restart the ResourceManager and redeploy the cluster.
+**For MapR:**
+
+1. Edit the **yarn-site.xml** file for the node running the ResourceManager.
+2. Change the values for the `yarn.nodemanager.resource.memory-mb` and `yarn.scheduler.maximum-allocation-mb` properties.
+3. Restart the ResourceManager and redeploy the cluster.
 
 
 To verify the values were changed, check the values for the following properties:
 
-* **<name>yarn.nodemanager.resource.memory-mb</name>**
-* **<name>yarn.scheduler.maximum-allocation-mb</name>**
+- `<name>yarn.nodemanager.resource.memory-mb</name>`
+- `<name>yarn.scheduler.maximum-allocation-mb</name>`
 
-
+---
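As a concrete illustration of the answer about h2o.init() above: launch H2O on Hadoop first (via the driver jar), then point the R client at one of the cluster nodes. A minimal sketch, reusing a node address from the example output above purely for illustration:

library(h2o)
## Connect to an already-running H2O cloud on the Hadoop cluster; this does not launch a new one
h2oServer <- h2o.init(ip = "10.115.57.45", port = 54321)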

src/main/java/hex/CoxPH.java (+1, -1)

@@ -196,7 +196,7 @@ protected float[] score0(double[] data, float[] preds) {
     }
     for (int j = n_cats; j < n_data; ++j)
       numsHasNA |= Double.isNaN(data[j]);
-    if (numsHasNA || (catsHasNA && !catsAllNA) || (n_num == 0 && catsAllNA)) {
+    if (numsHasNA || (catsHasNA && !catsAllNA) || (n_num > 0 && catsAllNA)) {
       for (int i = 1; i <= 2 * n_time; ++i)
         preds[i] = Float.NaN;
     } else {

src/main/java/hex/FrameTask.java (+24, -13)

@@ -136,6 +136,22 @@ public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double []
 
   final int [][] _catLvls;
 
+  /**
+   * Apply data transformation on the given column.
+   *
+   * @param c - index into the fully expanded vector
+   * @param v - value of the column to be transformed
+   * @return v transformed by the transformation (e.g. standardization) defined by this dataset for this column
+   */
+  public double applyTransform(int c, double v){
+    if(c >= _catOffsets[_catOffsets.length-1]) {
+      c -= _cats;
+      if (_normSub != null) v -= _normSub[c];
+      if (_normMul != null) v *= _normMul[c];
+    }
+    return v;
+  }
+
   /**
    * Prepare a Frame (with a single response) to be processed by the FrameTask
    * 1) Place response at the end

@@ -149,6 +165,9 @@ public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double []
    * @return Frame to be used by FrameTask
    */
   public static Frame prepareFrame(Frame source, Vec response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols, boolean dropNACols) {
+    return prepareFrame(source, response != null ? new Vec[]{response} : null, ignored_cols, toEnum, dropConstantCols, dropNACols);
+  }
+  public static Frame prepareFrame(Frame source, Vec[] response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols, boolean dropNACols) {
     Frame fr = new Frame(Key.makeSystem(Key.make().toString()), source._names.clone(), source.vecs().clone());
     if (ignored_cols != null) fr.remove(ignored_cols);
     final Vec[] vecs = fr.vecs();

@@ -159,21 +178,13 @@ public static Frame prepareFrame(Frame source, Vec response, int[] ignored_cols,
 
     // put response to the end (if not already)
     if (response != null) {
-      for (int i = 0; i < vecs.length - 1; ++i) {
-        if (vecs[i] == response) {
-          final String n = fr._names[i];
-          if (toEnum && !vecs[i].isEnum()) fr.add(n, fr.remove(i).toEnum()); //convert int classes to enums
-          else fr.add(n, fr.remove(i));
-          break;
-        }
-      }
-      // special case for when response was at the end already
-      if (toEnum && !response.isEnum() && vecs[vecs.length - 1] == response) {
-        final String n = fr._names[vecs.length - 1];
-        fr.add(n, fr.remove(vecs.length - 1).toEnum());
+      for (Vec v : response) {
+        int id = fr.find(v);
+        final String n = fr._names[id];
+        if (toEnum && !vecs[id].isEnum()) fr.add(n, fr.remove(id).toEnum()); //convert int classes to enums
+        else fr.add(n, fr.remove(id));
       }
     }
-
     ArrayList<Integer> constantOrNAs = new ArrayList<Integer>();
     {
       ArrayList<Integer> constantCols = new ArrayList<Integer>();
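For reference, here is a sketch in R of the arithmetic the new applyTransform helper performs on a numeric column; it assumes _normSub holds the column mean and _normMul the reciprocal standard deviation (the Java code treats both generically, so that reading is an assumption based on the "e.g. standardization" Javadoc):

## Mirrors applyTransform: optionally shift, then optionally scale
apply_transform <- function(v, norm_sub = NULL, norm_mul = NULL) {
  if (!is.null(norm_sub)) v <- v - norm_sub
  if (!is.null(norm_mul)) v <- v * norm_mul
  v
}
x <- c(5.1, 4.9, 4.7)
apply_transform(x, norm_sub = mean(x), norm_mul = 1 / sd(x))  # standardized values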