Add documentation and remove unneeded build of decoder packages

icaas · Nov 9, 2018 · 38b5447 · 38b5447
1 parent 4d05c9a
commit 38b5447
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -213,13 +213,13 @@ cd DeepSpeech
 pip3 install -r requirements.txt
 ```
 
-You'll also need to download `native_client.tar.xz` or build the native client files yourself to get the custom TensorFlow OP needed for decoding the outputs of the neural network. You can use `util/taskcluster.py` to download the files for your architecture:
+You'll also need to install the `ds_ctcdecoder` Python package, which is required for decoding the outputs of the acoustic model into text. We have binaries available in our CI infrastructure; you can use `util/taskcluster.py` to get a URL to the decoder package. When you pass the `--decoder` option, the script will print the URL to the appropriate decoder package for your platform and Python version:
 
 ```bash
-python3 util/taskcluster.py --target .
+pip3 install $(python3 util/taskcluster.py --decoder)
 ```
 
-This will download the native client files for the x86_64 architecture without CUDA support, and extract them into the current folder. If you prefer building the binaries from source, see the [native_client README file](native_client/README.md). We also have binaries with CUDA enabled ("--arch gpu") and for ARM7 ("--arch arm").
+This command will download and install the `ds_ctcdecoder` package. If you prefer building the binaries from source, see the [native_client README file](native_client/README.md). You can override the platform with `--arch` if you want the package for ARM7 (`--arch arm`) or ARM64 (`--arch arm64`).
 
 ### Recommendations
 

diff --git a/native_client/README.md b/native_client/README.md
@@ -55,7 +55,7 @@ If you'd like to build the binaries yourself, you'll need the following pre-requ
 
 It is required to use our fork of TensorFlow since it includes fixes for common problems encountered when building the native client files.
 
-If you'd like to build the language bindings, you'll also need:
+If you'd like to build the language bindings or the decoder package, you'll also need:
 
 * [SWIG](http://www.swig.org/)
 * [node-pre-gyp](https://github.com/mapbox/node-pre-gyp) (for Node.JS bindings only)
@@ -115,7 +115,7 @@ Included are a set of generated Python bindings. After following the above build
 ```
 cd native_client/python
 make bindings
-sudo pip install dist/deepspeech*
+pip install dist/deepspeech*
 ```
 
 The API mirrors the C++ API and is demonstrated in [client.py](python/client.py). Refer to [deepspeech.h](deepspeech.h) for documentation.
@@ -131,3 +131,13 @@ make npm-pack
 ```
 
 This will create the package `deepspeech-0.3.0.tgz` in `native_client/javascript`.
+
+## Building the CTC decoder package
+
+To build the `ds_ctcdecoder` package, you'll need the general requirements listed above (in particular SWIG). The command below builds the bindings using 8 processes for compilation. Adjust the parameter accordingly for more or less parallelism.
+
+```
+cd native_client/ctcdecode
+make bindings NUM_PROCESSES=8
+pip install dist/*.whl
+```
diff --git a/requirements.txt b/requirements.txt
@@ -14,3 +14,4 @@ six
 requests
 tables
 attrdict
+setuptools
diff --git a/taskcluster/host-build.sh b/taskcluster/host-build.sh
@@ -21,7 +21,5 @@ do_deepspeech_binary_build
 
 do_deepspeech_python_build
 
-do_deepspeech_decoder_build
-
 do_deepspeech_nodejs_build
 
diff --git a/util/taskcluster.py b/util/taskcluster.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function, absolute_import, division
 
+import platform
 import subprocess
 import sys
 import os
@@ -16,7 +17,7 @@
 
 TASKCLUSTER_SCHEME = os.getenv('TASKCLUSTER_SCHEME', DEFAULT_SCHEMES['deepspeech'])
 
-def get_tc_url(arch_string=None, artifact_name='native_client.tar.xz', branch_name='master'):
+def get_tc_url(arch_string, artifact_name='native_client.tar.xz', branch_name='master'):
     assert arch_string is not None
     assert artifact_name is not None
     assert len(artifact_name) > 0
@@ -59,13 +60,16 @@ def maybe_download_tc_bin(**kwargs):
     final_stat = os.stat(final_file)
     os.chmod(final_file, final_stat.st_mode | stat.S_IEXEC)
 
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
 if __name__ == '__main__':
     import argparse
 
     parser = argparse.ArgumentParser(description='Tooling to ease downloading of components from TaskCluster.')
-    parser.add_argument('--target', required=True,
+    parser.add_argument('--target', required=False,
                         help='Where to put the native client binary files')
-    parser.add_argument('--arch', required=False, default='cpu',
+    parser.add_argument('--arch', required=False,
                         help='Which architecture to download binaries for. "arm" for ARM 7 (32-bit), "gpu" for CUDA enabled x86_64 binaries, "cpu" for CPU-only x86_64 binaries, "osx" for CPU-only x86_64 OSX binaries. Optional ("cpu" by default)')
     parser.add_argument('--artifact', required=False,
                         default='native_client.tar.xz',
@@ -74,9 +78,55 @@ def maybe_download_tc_bin(**kwargs):
                         help='Name of the TaskCluster scheme to use.')
     parser.add_argument('--branch', required=False, default='master',
                         help='Branch name to use. Defaulting to "master".')
+    parser.add_argument('--decoder', action='store_true',
+                        help='Get URL to ds_ctcdecoder Python package.')
 
     args = parser.parse_args()
 
+    if not args.target and not args.decoder:
+        print('Pass either --target or --decoder.')
+        exit(1)
+
+    is_arm = 'arm' in platform.machine()
+    is_mac = 'darwin' in sys.platform
+    is_64bit = sys.maxsize > (2**31 - 1)
+    is_ucs2 = sys.maxunicode < 0x10ffff
+
+    if not args.arch:
+        if is_arm:
+            args.arch = 'arm64' if is_64bit else 'arm'
+        elif is_mac:
+            args.arch = 'osx'
+        else:
+            args.arch = 'cpu'
+
+    if args.decoder:
+        plat = platform.system().lower()
+        arch = platform.machine()
+
+        if plat == 'linux' and arch == 'x86_64':
+            plat = 'manylinux1'
+
+        if plat == 'darwin':
+            plat = 'macosx_10_10'
+
+        from pkg_resources import parse_version
+        ds_version = parse_version(read('../VERSION'))
+
+        m_or_mu = 'mu' if is_ucs2 else 'm'
+        pyver = ''.join(map(str, sys.version_info[0:2]))
+
+        artifact = "ds_ctcdecoder-{ds_version}-cp{pyver}-cp{pyver}{m_or_mu}-{platform}_{arch}.whl".format(
+            ds_version=ds_version,
+            pyver=pyver,
+            m_or_mu=m_or_mu,
+            platform=plat,
+            arch=arch
+        )
+
+        print(get_tc_url(args.arch, artifact, args.branch))
+        exit(0)
+
     if args.source is not None:
         if args.source in DEFAULT_SCHEMES:
             TASKCLUSTER_SCHEME = DEFAULT_SCHEMES[args.source]
diff --git a/taskcluster/host-build.sh b/taskcluster/host-build.sh
@@ -21,7 +21,5 @@ do_deepspeech_binary_build
 
 do_deepspeech_python_build
 
-do_deepspeech_decoder_build
-
 do_deepspeech_nodejs_build