From c43fee541da8a15c3871c1963a2927ab4dca3e07 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sat, 1 Aug 2015 11:27:13 -0700 Subject: [PATCH] enable basic sphinx doc --- README.md | 13 +- doc/.gitignore | 7 + doc/Makefile | 192 ++++++++++++++++++++++ doc/conf.py | 158 ++++++++++++++++++ doc/external_memory.md | 10 +- doc/{README.md => index.md} | 26 ++- doc/input_format.md | 9 +- doc/parameter.md | 2 +- doc/python/python_api.rst | 36 ++++ doc/{python.md => python/python_intro.md} | 3 +- doc/sphinx_util.py | 50 ++++++ python-package/xgboost/__init__.py | 4 + python-package/xgboost/core.py | 63 ++++--- python-package/xgboost/sklearn.py | 12 +- 14 files changed, 529 insertions(+), 56 deletions(-) create mode 100644 doc/.gitignore create mode 100644 doc/Makefile create mode 100644 doc/conf.py rename doc/{README.md => index.md} (87%) create mode 100644 doc/python/python_api.rst rename doc/{python.md => python/python_intro.md} (98%) create mode 100644 doc/sphinx_util.py diff --git a/README.md b/README.md index 0f6ffc7faa3b..be93e99fda76 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ DMLC/XGBoost [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. +An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data @@ -13,7 +13,7 @@ Contents -------- * [What's New](#whats-new) * [Version](#version) -* [Documentation](doc/README.md) +* [Documentation](doc/index.md) * [Build Instruction](doc/build.md) * [Features](#features) * [Distributed XGBoost](multi-node) @@ -43,15 +43,14 @@ Version Features -------- - -* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py), - [R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R), +* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py), + [R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R), [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl) * Its fast! Benchmark numbers comparing xgboost, H20, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml) * Memory efficient - Handles sparse matrices, supports external memory * Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links) * Distributed version runs on Hadoop (YARN), MPI, SGE etc., scales to billions of examples. 
- + Bug Reporting ------------- @@ -74,4 +73,4 @@ License XGBoost in Graphlab Create -------------------------- * XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html) -* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: +* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge: diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 000000000000..382c3419ff43 --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1,7 @@ +html +latex +*.sh +_* +doxygen +parser.py +*.pyc diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 000000000000..40bba2a280db --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished."
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/rabit" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 000000000000..b08f495f58ae --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +# +# documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 23 19:40:08 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. 
+# +# All configuration values have a default; values that are commented out +# serve to show the default. +import sys +import os, subprocess +import shlex +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +libpath = os.path.join(curr_path, '../python-package/') +sys.path.insert(0, libpath) +sys.path.insert(0, curr_path) + +from sphinx_util import MarkdownParser + +# -- General configuration ------------------------------------------------ + +# General information about the project. +project = u'xgboost' +author = u'%s developers' % project +copyright = u'2015, %s' % author +github_doc_root = 'https://github.com/dmlc/xgboost/tree/master/doc/' + +# add markdown parser +MarkdownParser.github_doc_root = github_doc_root +source_parsers = { + '.md': MarkdownParser, +} +os.environ['XGBOOST_BUILD_DOC'] = '1' +# Version information. +import xgboost +version = xgboost.__version__ +release = xgboost.__version__ + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.mathjax', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md'] + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Output file base name for HTML help builder. +htmlhelp_basename = project + 'doc' + +# -- Options for LaTeX output --------------------------------------------- +latex_elements = { +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, '%s.tex' % project, project, + author, 'manual'), +] + +# hook for doxygen +def run_doxygen(folder): + """Run the doxygen make command in the designated folder.""" + try: + retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True) + if retcode < 0: + sys.stderr.write("doxygen terminated by signal %s" % (-retcode)) + except OSError as e: + sys.stderr.write("doxygen execution failed: %s" % e) + +def generate_doxygen_xml(app): + """Run the doxygen make commands if we're on the ReadTheDocs server""" + read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' + if read_the_docs_build: + run_doxygen('..') + +def setup(app): + # Add hook for building doxygen xml when needed + # no c++ API for now + # app.connect("builder-inited", generate_doxygen_xml) + pass diff --git a/doc/external_memory.md b/doc/external_memory.md index f8eec83fc8d1..e50c02e570fb 100644 --- a/doc/external_memory.md +++ b/doc/external_memory.md @@ -1,5 +1,5 @@ Using XGBoost External Memory Version(beta) -==== +=========================================== There is no big difference between using external memory version and in-memory version. The only difference is the filename format. @@ -19,13 +19,13 @@ You can find that there is additional ```#dtrain.cache``` following the libsvm f For CLI version, simply use ```"../data/agaricus.txt.train#dtrain.cache"``` in filename. Performance Note -==== +---------------- * the parameter ```nthread``` should be set to number of ***real*** cores - Most modern CPU offer hyperthreading, which means you can have a 4 core cpu with 8 threads - Set nthread to be 4 for maximum performance in such case Distributed Version -==== +------------------- The external memory mode naturally works on distributed version, you can simply set path like ``` data = "hdfs:///path-to-data/#dtrain.cache" @@ -34,8 +34,8 @@ xgboost will cache the data to the local position. When you run on YARN, the cur so that you can directly use ```dtrain.cache``` to cache to current folder. 
-Usage Note: -==== +Usage Note +---------- * This is a experimental version - If you like to try and test it, report results to https://github.com/dmlc/xgboost/issues/244 * Currently only importing from libsvm format is supported diff --git a/doc/README.md b/doc/index.md similarity index 87% rename from doc/README.md rename to doc/index.md index e8df7d57de12..5d8d5b26f647 100644 --- a/doc/README.md +++ b/doc/index.md @@ -1,6 +1,9 @@ -List of Documentations -==== -* [Using XGBoost in Python](python.md) +XGBoost Documentation +===================== + + + +* [Using XGBoost in Python](python/python_intro.md) * [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd) * [Learning to use xgboost by example](../demo) * [External Memory Version](external_memory.md) @@ -11,24 +14,29 @@ List of Documentations - [Notes on Parameter Tunning](param_tuning.md) * Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) -How to get started -==== + +How to Get Started +------------------ * Try to read the [binary classification example](../demo/binary_classification) for getting started example * Find the guide specific language guide above for the language you like to use * [Learning to use xgboost by example](../demo) contains lots of useful examples Highlight Links -==== +Example Highlight Links +----------------------- This section is about blogposts, presentation and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs to here, send a pull request. * [Kaggle CrowdFlower winner's solution by Chenglong Chen](https://github.com/ChenglongChen/Kaggle_CrowdFlower) * [Kaggle Malware Prediction winner's solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware) * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) * [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit) * Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y) -* [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/) +* [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/) + +API Reference +------------- + * [Python API Reference](python/python_api.rst) Contribution -==== +------------ Contribution of documents and use-cases are welcomed! * This package use Google C++ style * Check tool of codestyle diff --git a/doc/input_format.md b/doc/input_format.md index 557b875121f0..3986d07fb182 100644 --- a/doc/input_format.md +++ b/doc/input_format.md @@ -1,12 +1,13 @@ -Input Format -==== +Text Input Format of DMatrix +============================ + ## Basic Input Format As we have mentioned, XGBoost takes LibSVM format. For training or predicting, XGBoost takes an instance file with the format as below: train.txt ``` 1 101:1.2 102:0.03 -0 1:2.1 10001:300 10002:400 +0 1:2.1 10001:300 10002:400 0 0:1.3 1:0.3 1 0:0.01 1:0.3 0 0:0.2 1:0.3 @@ -37,7 +38,7 @@ train.txt.weight 0.5 ``` It means that XGBoost will emphasize more on the first and fourth instance， that is to say positive instances while training.
-The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost. +The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost. ## Initial Margin file XGBoost supports providing each instance an initial margin prediction. For example, if we have a initial prediction using logistic regression for "train.txt" file, we can create the following file: diff --git a/doc/parameter.md b/doc/parameter.md index 13eefa0fec6a..53cdd806f2b9 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -1,5 +1,5 @@ XGBoost Parameters -==== +================== Before running XGboost, we must set three types of parameters, general parameters, booster parameters and task parameters: - General parameters relates to which booster we are using to do boosting, commonly tree or linear model - Booster parameters depends on which booster you have chosen diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst new file mode 100644 index 000000000000..e665efe84a2d --- /dev/null +++ b/doc/python/python_api.rst @@ -0,0 +1,36 @@ +Python API Reference +==================== +This page gives the Python API reference of xgboost. + +Core Data Structure +------------------- +.. automodule:: xgboost.core + +.. autoclass:: xgboost.DMatrix + :members: + :show-inheritance: + +.. autoclass:: xgboost.Booster + :members: + :show-inheritance: + + +Learning API +------------ +.. automodule:: xgboost.training + +.. autofunction:: xgboost.train + +.. autofunction:: xgboost.cv + + +Scikit-Learn API +---------------- +.. automodule:: xgboost.sklearn +.. autoclass:: xgboost.XGBRegressor + :members: + :show-inheritance: +.. autoclass:: xgboost.XGBClassifier + :members: + :show-inheritance: + diff --git a/doc/python.md b/doc/python/python_intro.md similarity index 98% rename from doc/python.md rename to doc/python/python_intro.md index 93b5c43d4bac..2acb73b3c340 100644 --- a/doc/python.md +++ b/doc/python/python_intro.md @@ -1,5 +1,5 @@ XGBoost Python Module -==== +===================== This page will introduce XGBoost Python module, including: * [Building and Import](#building-and-import) @@ -8,6 +8,7 @@ This page will introduce XGBoost Python module, including: * [Train Model](#training-model) * [Early Stopping](#early-stopping) * [Prediction](#prediction) +* [API Reference](python_api.md) A [walk through python example](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) for UCI Mushroom dataset is provided. 
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py new file mode 100644 index 000000000000..33c98d3815bc --- /dev/null +++ b/doc/sphinx_util.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +"""Helper hacking utilty function for customization.""" +import sys +import os +import subprocess + +# TODO: make less hacky way than this one +if os.environ.get('READTHEDOCS', None) == 'True': + subprocess.call('cd ..; rm -rf recommonmark;' + + 'git clone https://github.com/tqchen/recommonmark;' + + 'cp recommonmark/recommonmark/parser.py doc/parser', shell=True) + +sys.path.insert(0, os.path.abspath('..')) +import parser + +class MarkdownParser(parser.CommonMarkParser): + github_doc_root = None + doc_suffix = set(['md', 'rst']) + + @staticmethod + def remap_url(url): + if MarkdownParser.github_doc_root is None or url is None: + return url + if url.startswith('#'): + return url + arr = url.split('#', 1) + ssuffix = arr[0].rsplit('.', 1) + + if len(ssuffix) == 2 and (ssuffix[-1] in MarkdownParser.doc_suffix + and arr[0].find('://') == -1): + arr[0] = ssuffix[0] + '.html' + return '#'.join(arr) + else: + if arr[0].find('://') == -1: + return MarkdownParser.github_doc_root + url + else: + return url + + def reference(self, block): + block.destination = remap_url(block.destination) + return super(MarkdownParser, self).reference(block) + +# inplace modify the function in recommonmark module to allow link remap +old_ref = parser.reference + +def reference(block): + block.destination = MarkdownParser.remap_url(block.destination) + return old_ref(block) + +parser.reference = reference diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py index 6f967b8378dc..b284c27e0d90 100644 --- a/python-package/xgboost/__init__.py +++ b/python-package/xgboost/__init__.py @@ -10,3 +10,7 @@ from .sklearn import XGBModel, XGBClassifier, XGBRegressor __version__ = '0.4' + +__all__ = ['DMatrix', 'Booster', + 'train', 'cv', + 'XGBModel', 'XGBClassifier', 'XGBRegressor'] diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 85017cb82709..0849d276cf9e 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -50,20 +50,24 @@ def find_lib_path(): else: dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] - if len(lib_path) == 0: + if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False): raise XGBoostLibraryNotFound( - 'Cannot find XGBoost Libarary in the candicate path %s,' + - 'Did you run build.sh in root oath?' % str(dll_path)) + 'Cannot find XGBoost Libarary in the candicate path, ' + + 'did you run build.sh in root path?\n' + 'List of candidates:\n' + ('\n'.join(dll_path))) return lib_path + def _load_lib(): """Load xgboost Library.""" lib_path = find_lib_path() + if len(lib_path) == 0: + return None lib = ctypes.cdll.LoadLibrary(lib_path[0]) lib.XGBGetLastError.restype = ctypes.c_char_p - return lib + # load the XGBoost library globally _LIB = _load_lib() @@ -119,6 +123,7 @@ class DMatrix(object): DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. 
+ You can construct DMatrix from numpy.arrays """ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): """ @@ -127,15 +132,16 @@ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): Parameters ---------- data : string/numpy array/scipy.sparse - Data source, string type is the path of svmlight format txt file, - xgb buffer or path to cache_file - label : list or numpy 1-D array (optional) + Data source of DMatrix. + When data is string type, it represents the path libsvm format txt file, + or binary file that xgboost can read from. + label : list or numpy 1-D array, optional Label of the training data. - missing : float + missing : float, optional Value in the data which needs to be present as a missing value. - weight : list or numpy 1-D array (optional) + weight : list or numpy 1-D array , optional Weight for each instance. - silent: boolean + silent : boolean, optional Whether print messages during construction """ # force into void_p, mac need to pass things in as void_p @@ -469,13 +475,22 @@ def copy(self): """Copy the booster object. Returns - -------- - a copied booster model + ------- + booster: `Booster` + a copied booster model """ return self.__copy__() def set_param(self, params, value=None): - """Set parameters into the DMatrix.""" + """Set parameters into the Booster. + + Parameters + ---------- + params: dict/list/str + list of key,value paris, dict of key to value or simply str key + value: optional + value of the specified parameter, when params is str key + """ if isinstance(params, collections.Mapping): params = params.items() elif isinstance(params, STRING_TYPES) and value is not None: @@ -485,7 +500,7 @@ def set_param(self, params, value=None): def update(self, dtrain, iteration, fobj=None): """ - Update (one iteration). + Update for one iteration, with objective function calculated internally. Parameters ---------- @@ -507,7 +522,7 @@ def update(self, dtrain, iteration, fobj=None): def boost(self, dtrain, grad, hess): """ - Update. + Boost the booster for one iteration, with customized gradient statistics. Parameters ---------- @@ -542,7 +557,8 @@ def eval_set(self, evals, iteration=0, feval=None): Returns ------- - evaluation result + result: str + Evaluation result string. """ if feval is None: for d in evals: @@ -567,18 +583,21 @@ def eval_set(self, evals, iteration=0, feval=None): def eval(self, data, name='eval', iteration=0): """Evaluate the model on mat. - Parameters - --------- + ---------- data : DMatrix The dmatrix storing the input. - name : str (default = 'eval') - The name of the dataset + name : str, optional + The name of the dataset. + iteration : int, optional + The current iteration number. - iteration : int (default = 0) - The current iteration number + Returns + ------- + result: str + Evaluation result string. """ return self.eval_set([(data, name)], iteration) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 4a5771724b03..6f176972aced 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -206,9 +206,9 @@ def predict(self, data): class XGBClassifier(XGBModel, XGBClassifierBase): # pylint: disable=missing-docstring,too-many-arguments,invalid-name - __doc__ = """ - Implementation of the scikit-learn API for XGBoost classification - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) + __doc__ = """Implementation of the scikit-learn API for XGBoost classification. 
+ + """ + '\n'.join(XGBModel.__doc__.split('\n')[2:]) def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, @@ -335,7 +335,5 @@ def predict_proba(self, data): class XGBRegressor(XGBModel, XGBRegressorBase): # pylint: disable=missing-docstring - __doc__ = """ - Implementation of the scikit-learn API for XGBoost regression - """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) - + __doc__ = """Implementation of the scikit-learn API for XGBoost regression. + """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])