Commit
r492: bug in testing dim; added mt to examples
Also improved doc a little bit.
lh3 committed Mar 1, 2017
1 parent 87c73da commit e58688e
Showing 6 changed files with 31 additions and 31 deletions.
README.md (37 changes: 14 additions & 23 deletions)

@@ -19,17 +19,17 @@ echo 15315611231621249 78 | ./examples/rnn-bit -Ai mul100.kan -

KANN is a standalone and lightweight library in C for constructing and training
small to medium artificial neural networks such as [multi-layer
-perceptrons][mlp], [convolutional neural networks][cnn], [recurrent neural
+perceptrons][mlp], [convolutional neural networks][cnn] and [recurrent neural
networks][rnn] (including [LSTM][lstm] and [GRU][gru]). It implements
graph-based reverse-mode [automatic differentiation][ad] and allows one to build
topologically complex neural networks with recurrence, shared weights and
-multiple inputs/outputs/costs (e.g. with [variational autoencoder][vae]). In
-comparison to mainstream deep learning frameworks such as [TensorFlow][tf],
-KANN is not as scalable, but it is close in flexibility, has a much smaller
-code base and only depends on the standard C library. In comparison to other
-lightweight frameworks such as [tiny-dnn][td], KANN is still smaller, times
-faster and much more versatile, supporting RNN, VAE and non-standard neural
-networks that may fail these lightweight frameworks.
+multiple inputs/outputs/costs. In comparison to mainstream deep learning
+frameworks such as [TensorFlow][tf], KANN is not as scalable, but it is close
+in flexibility, has a much smaller code base and only depends on the standard C
+library. In comparison to other lightweight frameworks such as [tiny-dnn][td],
+KANN is still smaller, several times faster and much more versatile, supporting
+RNN, VAE and non-standard neural networks that these lightweight frameworks may
+not support.

KANN could be potentially useful when you want to experiment with small to medium
neural networks in C/C++, to deploy not-so-large models without worrying about
@@ -40,17 +40,16 @@ neural networks in C/C++, to deploy not-so-large models without worrying about
* Flexible. Model construction by building a computational graph with
operators. Support RNNs, weight sharing and multiple inputs/outputs.

-* Reasonably efficient. Support mini-batching. Optimized matrix product and
-convolution, coming close to (though not as fast as) OpenBLAS and mainstream
-deep learning frameworks on CPUs.
+* Efficient. Reasonably optimized matrix product and convolution. Support
+mini-batching and effective multi-threading. Sometimes faster than mainstream
+frameworks in their CPU-only mode.

-* Small. As of now, KANN has less than 3000 lines of code in four source code
+* Small. As of now, KANN has less than 4000 lines of code in four source code
files, with no non-standard dependencies by default.

### Limitations

-* CPU only. No out-of-box support of multi-threading (experimental support on
-the mt branch). As such, KANN is **not** intended for training huge neural
+* CPU only. As such, KANN is **not** intended for training huge neural
networks.

* Bidirectional RNNs and seq2seq models require manual unrolling, which is
@@ -66,15 +65,7 @@ installation is needed. To compile examples:
```sh
make
```
-This generates a few binaries in the [examples](examples) directory. If you
-have BLAS installed, you can ask KANN to use BLAS for matrix multiplication:
-```sh
-make CBLAS=/usr/local
-```
-This usually speeds up MLP and RNN, and may take the advantage of multiple CPU
-cores if your BLAS library is compiled with the multi-core support.
-Convolutional networks won't benefit from BLAS as KANN is not reducing
-convolution to matrix multiplication like Caffe and other libraries.
+This generates a few executables in the [examples](examples) directory.

## Documentation

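The "Flexible" bullet above refers to building models as computational graphs of operators. As a concrete illustration (not code from this commit), here is a minimal sketch in the style of examples/mlp.c, using KANN calls that appear in this repository (kann_layer_input, kann_layer_dense, kad_relu, kann_layer_cost, kann_new, kann_mt, kann_train_fnn1); the toy data, layer sizes and hyper-parameters below are made up:

```c
#include <stdlib.h>
#include "kann.h"

int main(void)
{
	int i, j, n = 256, n_in = 4, mini_size = 64, n_threads = 2;
	float **x, **y;
	kad_node_t *t;
	kann_t *ann;

	// build the computational graph: input -> dense -> ReLU -> binary cross-entropy cost
	t = kann_layer_input(n_in);
	t = kad_relu(kann_layer_dense(t, 16));
	ann = kann_new(kann_layer_cost(t, 1, KANN_C_CEB), 0);

	// toy training data: the label is 1 if the sum of the inputs is positive
	x = (float**)calloc(n, sizeof(float*));
	y = (float**)calloc(n, sizeof(float*));
	for (i = 0; i < n; ++i) {
		float s = 0.0f;
		x[i] = (float*)calloc(n_in, sizeof(float));
		y[i] = (float*)calloc(1, sizeof(float));
		for (j = 0; j < n_in; ++j)
			s += x[i][j] = 2.0f * (float)rand() / RAND_MAX - 1.0f;
		y[i][0] = s > 0.0f? 1.0f : 0.0f;
	}

	if (n_threads > 1) kann_mt(ann, n_threads, mini_size); // the call added to the examples in this commit
	kann_train_fnn1(ann, 0.001f, mini_size, 50, 10, 0.1f, n, x, y);

	kann_delete(ann);
	for (i = 0; i < n; ++i) free(x[i]), free(y[i]);
	free(x); free(y);
	return 0;
}
```

Such a program would be compiled and linked against kann.c and kautodiff.c in the same way as the bundled examples.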
examples/mlp.c (7 changes: 5 additions & 2 deletions)

@@ -18,13 +18,13 @@ static kann_t *model_gen(int n_in, int n_out, int loss_type, int n_h_layers, int
int main(int argc, char *argv[])
{
int max_epoch = 50, mini_size = 64, max_drop_streak = 10, loss_type = KANN_C_CEB;
-int i, j, c, n_h_neurons = 64, n_h_layers = 1, seed = 11;
+int i, j, c, n_h_neurons = 64, n_h_layers = 1, seed = 11, n_threads = 1;
kann_data_t *in = 0;
kann_t *ann = 0;
char *out_fn = 0, *in_fn = 0;
float lr = 0.001f, frac_val = 0.1f, h_dropout = 0.0f;

-while ((c = getopt(argc, argv, "n:l:s:r:m:B:o:i:d:v:M")) >= 0) {
+while ((c = getopt(argc, argv, "n:l:s:r:m:B:o:i:d:v:Mt:")) >= 0) {
if (c == 'n') n_h_neurons = atoi(optarg);
else if (c == 'l') n_h_layers = atoi(optarg);
else if (c == 's') seed = atoi(optarg);
@@ -36,6 +36,7 @@ int main(int argc, char *argv[])
else if (c == 'd') h_dropout = atof(optarg);
else if (c == 'v') frac_val = atof(optarg);
else if (c == 'M') loss_type = KANN_C_CEM;
+else if (c == 't') n_threads = atoi(optarg);
}
if (argc - optind < 1) {
FILE *fp = stdout;
@@ -54,6 +55,7 @@ int main(int argc, char *argv[])
fprintf(fp, " -m INT max number of epochs [%d]\n", max_epoch);
fprintf(fp, " -B INT mini-batch size [%d]\n", mini_size);
fprintf(fp, " -v FLOAT fraction of data used for validation [%g]\n", frac_val);
+fprintf(fp, " -t INT number of threads [%d]\n", n_threads);
return 1;
}
if (argc - optind == 1 && in_fn == 0) {
@@ -75,6 +77,7 @@ int main(int argc, char *argv[])
assert(in->n_row == out->n_row);
if (ann) assert(kann_dim_out(ann) == out->n_col);
else ann = model_gen(in->n_col, out->n_col, loss_type, n_h_layers, n_h_neurons, h_dropout);
+if (n_threads > 1) kann_mt(ann, n_threads, mini_size);
kann_train_fnn1(ann, lr, mini_size, max_epoch, max_drop_streak, frac_val, in->n_row, in->x, out->x);
if (out_fn) kann_save(out_fn, ann);
kann_data_free(out);
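Not part of the diff: with the new -t option wired up as above, training the MLP example with multiple threads might look like the following; the .knd file names are placeholders for KANN-formatted input and label matrices.

```sh
# placeholder file names; -t 4 requests 4 worker threads, -o saves the model
./examples/mlp -t 4 -o mlp-model.kan train-x.knd train-y.knd
```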
examples/mnist-cnn.c (6 changes: 4 additions & 2 deletions)

@@ -9,17 +9,18 @@ int main(int argc, char *argv[])
kann_t *ann;
kann_data_t *x, *y;
char *fn_in = 0, *fn_out = 0;
-int c, mini_size = 64, max_epoch = 20, max_drop_streak = 10, seed = 131, n_h_fc = 128, n_h_flt = 32;
+int c, mini_size = 64, max_epoch = 20, max_drop_streak = 10, seed = 131, n_h_fc = 128, n_h_flt = 32, n_threads = 1;
float lr = 0.001f, dropout = 0.2f, frac_val = 0.1f;

-while ((c = getopt(argc, argv, "i:o:m:h:f:d:s:")) >= 0) {
+while ((c = getopt(argc, argv, "i:o:m:h:f:d:s:t:")) >= 0) {
if (c == 'i') fn_in = optarg;
else if (c == 'o') fn_out = optarg;
else if (c == 'm') max_epoch = atoi(optarg);
else if (c == 'h') n_h_fc = atoi(optarg);
else if (c == 'f') n_h_flt = atoi(optarg);
else if (c == 'd') dropout = atof(optarg);
else if (c == 's') seed = atoi(optarg);
+else if (c == 't') n_threads = atoi(optarg);
}

if (argc - optind == 0 || (argc - optind == 1 && fn_in == 0)) {
@@ -51,6 +52,7 @@ int main(int argc, char *argv[])

if (y) { // training
assert(y->n_col == 10);
+if (n_threads > 1) kann_mt(ann, n_threads, mini_size);
kann_train_fnn1(ann, lr, mini_size, max_epoch, max_drop_streak, frac_val, x->n_row, x->x, y->x);
if (fn_out) kann_save(fn_out, ann);
kann_data_free(y);
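Similarly (again not from the commit), the CNN example only calls kann_mt() in its training branch, so -t matters when label data is supplied; the file names below are placeholders for MNIST images and one-hot labels in KANN's .knd format.

```sh
# placeholder file names; -t 4 requests 4 worker threads during training
./examples/mnist-cnn -t 4 -o mnist-cnn.kan train-x.knd train-y.knd
```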
kann.h (4 changes: 4 additions & 0 deletions)

@@ -97,6 +97,10 @@ void kann_delete_unrolled(kann_t *a); // delete a network generated by kann_unro
/**
* Enable/disable multi-threading (requiring pthread)
*
+* KANN splits a mini-batch into $n_threads mini-mini-batches and puts each of
+* them on one thread. So far, only kann_cost() takes advantage of
+* multi-threading.
+*
* @param ann network
* @param n_threads number of threads; <=1 to completely disable multi-threading
* @param max_batch_size max mini-batch size; shall be no smaller than n_threads
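For instance, with the examples' default mini-batch of 64 and -t 4, kann_cost() would evaluate four mini-mini-batches of 16 samples concurrently. A condensed sketch of the call pattern, with the surrounding variables assumed to be set up as in examples/mlp.c:

```c
/* mini_size doubles as max_batch_size here, so it should be no smaller than
 * n_threads; when n_threads <= 1 the call is skipped and training stays
 * single-threaded. */
if (n_threads > 1) kann_mt(ann, n_threads, mini_size);
kann_train_fnn1(ann, lr, mini_size, max_epoch, max_drop_streak, frac_val, n, x, y);
```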
kautodiff.c (2 changes: 1 addition & 1 deletion)

@@ -30,7 +30,7 @@ static inline kad_node_t *kad_vleaf(uint8_t flag, float *x, float *g, int n_d, v
{
int i;
kad_node_t *p;
-if (n_d >= KAD_MAX_DIM) return 0;
+if (n_d > KAD_MAX_DIM) return 0;
p = (kad_node_t*)calloc(1, sizeof(kad_node_t));
p->n_d = n_d;
for (i = 0; i < n_d; ++i)
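The intent of the one-character change above, as read from the surrounding code: KAD_MAX_DIM is an inclusive limit of four dimensions, so a leaf with exactly four dimensions is legal and only n_d above the limit should be rejected. An illustrative case (dimensions made up):

```c
/* A 4-d image batch, as fed to the CNN example, reaches kad_vleaf() with
 * n_d == 4: the old test (n_d >= KAD_MAX_DIM) would return NULL for it,
 * while the corrected test (n_d > KAD_MAX_DIM) accepts it. */
kad_node_t *t = kad_feed(4, 64, 1, 28, 28); // batch x channel x height x width
```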
kautodiff.h (6 changes: 3 additions & 3 deletions)

@@ -27,7 +27,7 @@
#ifndef KANN_AUTODIFF_H
#define KANN_AUTODIFF_H

-#define KAD_VERSION "r491"
+#define KAD_VERSION "r492"

#include <stdio.h>
#include <stdint.h>
@@ -36,8 +36,8 @@
#define KAD_MAX_DIM 4 // max dimension
#define KAD_MAX_OP 64 // max number of operators

-/* A computational graph is an acyclic directed graph. In the graph, an
-* external node represents a variable, a constant or a feed; an internal node
+/* A computational graph is a directed acyclic graph. In the graph, an external
+* node represents a variable, a constant or a feed; an internal node
* represents an operator; an edge from node v to w indicates v is an operand
* of w.
*/
