Commit 89edb6f — bolkedebruin committed Jun 3, 2016 (2 parents: fb5a3b3 + f72b0d3)
Showing 5 changed files with 301 additions and 156 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -104,6 +104,7 @@ Currently **officially** using Airflow:
* [Lucid](http://luc.id) [[@jbrownlucid](https://github.com/jbrownlucid) & [@kkourtchikov](https://github.com/kkourtchikov)]
* [Lyft](https://www.lyft.com/) [[@SaurabhBajaj](https://github.com/SaurabhBajaj)]
* [Nerdwallet](https://www.nerdwallet.com)
* [Qubole](https://qubole.com) [[@msumit](https://github.com/msumit)]
* [Sense360](https://github.com/Sense360) [[@kamilmroczek](https://github.com/KamilMroczek)]
* [Sidecar](https://hello.getsidecar.com/) [[@getsidecar](https://github.com/getsidecar)]
* [SimilarWeb](https://www.similarweb.com/) [[@similarweb](https://github.com/similarweb)]
144 changes: 65 additions & 79 deletions airflow/contrib/operators/qubole_operator.py
@@ -5,96 +5,82 @@

class QuboleOperator(BaseOperator):
"""
Executes commands on Qubole (https://qubole.com).
Execute tasks (commands) on QDS (https://qubole.com).
:param qubole_conn_id: Connection id which consists of qds auth_token
:type qubole_conn_id: str
kwargs:
:command_type: type of command to be executed, e.g. hivecmd, shellcmd, hadoopcmd
:tags: array of tags to be assigned with the command
:cluster_label: cluster label on which the command will be executed
:name: name to be given to command
**Arguments specific to command types**
mandatory:
:param command_type: type of command to be executed, e.g. hivecmd, shellcmd, hadoopcmd
other:
hivecmd:
:param query: inline query statement
:param script_location: s3 location containing query statement
:param sample_size:
:param macros: macro values which were used in query
:param tags: array of tags to be assigned with the command
:param cluster_label: cluster label on which to execute command
:param name: name to be given to command
:query: inline query statement
:script_location: s3 location containing query statement
:sample_size: size of sample in bytes on which to run query
:macros: macro values which were used in query
prestocmd:
:param query: inline query statement
:param script_location: s3 location containing query statement
:param macros: macro values which were used in query
:param tags: array of tags to be assigned with the command
:param cluster_label: cluster label on which to execute command
:param name: name to be given to command
:query: inline query statement
:script_location: s3 location containing query statement
:macros: macro values which were used in query
hadoopcmd:
:param sub_command: must be one of these ["jar", "s3distcp", "streaming"] followed by 1 or more args
:param tags: array of tags to be assigned with the command
:param cluster_label: cluster label on which to execute command
:param name: name to be given to command
:sub_command: must be one of these ["jar", "s3distcp", "streaming"] followed by 1 or more args
shellcmd:
:param script: inline command with args
:param script_location: s3 location containing query statement
:param files: list of files in s3 bucket as file1,file2 format. These files will be copied into the working
directory where the qubole command is being executed.
:param archives: list of archives in s3 bucket as archive1,archive2 format. These will be unarchived into
the working directory where the qubole command is being executed
:param parameters: any extra args which need to be passed to script (only when script_location is supplied)
:param tags: array of tags to be assigned with the command
:param cluster_label: cluster label on which to execute command
:param name: name to be given to command
:script: inline command with args
:script_location: s3 location containing query statement
:files: list of files in s3 bucket as file1,file2 format. These files will be copied into the working directory where the qubole command is being executed.
:archives: list of archives in s3 bucket as archive1,archive2 format. These will be unarchived into the working directory where the qubole command is being executed
:parameters: any extra args which need to be passed to script (only when script_location is supplied)
pigcmd:
:param script: inline query statement (latin_statements)
:param script_location: s3 location containing pig query
:param parameters: any extra args which need to be passed to script (only when script_location is supplied)
:param tags: array of tags to be assigned with the command
:param cluster_label: cluster label on which to execute command
:param name: name to be given to command
dbtapquerycmd:
:param db_tap_id: data store ID of the target database, in Qubole.
:param query: inline query statement
:param macros: macro values which were used in query
:param tags: array of tags to be assigned with the command
:param name: name to be given to command
:script: inline query statement (latin_statements)
:script_location: s3 location containing pig query
:parameters: any extra args which need to be passed to script (only when script_location is supplied)
sparkcmd:
:param program: the complete Spark Program in Scala, SQL, Command, R, or Python
:param cmdline: spark-submit command line, all required information must be specified in cmdline itself.
:param sql: inline sql query
:param script_location: s3 location containing query statement
:param language: language of the program, Scala, SQL, Command, R, or Python
:param app_id: ID of a Spark job server app
:param arguments: spark-submit command line arguments
:param user_program_arguments: arguments that the user program takes in
:param macros: macro values which were used in query
:param tags: array of tags to be assigned with the command
:param cluster_label: cluster label on which to execute command
:param name: name to be given to command
:program: the complete Spark Program in Scala, SQL, Command, R, or Python
:cmdline: spark-submit command line, all required information must be specified in cmdline itself.
:sql: inline sql query
:script_location: s3 location containing query statement
:language: language of the program, Scala, SQL, Command, R, or Python
:app_id: ID of a Spark job server app
:arguments: spark-submit command line arguments
:user_program_arguments: arguments that the user program takes in
:macros: macro values which were used in query
dbtapquerycmd:
:db_tap_id: data store ID of the target database, in Qubole.
:query: inline query statement
:macros: macro values which were used in query
dbexportcmd:
:param mode: 1 (simple), 2 (advanced)
:param hive_table: Name of the hive table
:param partition_spec: partition specification for Hive table.
:param dbtap_id: data store ID of the target database, in Qubole.
:param db_table: name of the db table
:param db_update_mode: allowinsert or updateonly
:param db_update_keys: columns used to determine the uniqueness of rows
:param export_dir: HDFS/S3 location from which data will be exported.
:param fields_terminated_by:
:param tags: array of tags to be assigned with the command
:param name: name to be given to command
:mode: 1 (simple), 2 (advanced)
:hive_table: Name of the hive table
:partition_spec: partition specification for Hive table.
:dbtap_id: data store ID of the target database, in Qubole.
:db_table: name of the db table
:db_update_mode: allowinsert or updateonly
:db_update_keys: columns used to determine the uniqueness of rows
:export_dir: HDFS/S3 location from which data will be exported.
:fields_terminated_by: hex of the char used as column separator in the dataset.
dbimportcmd:
:param mode: 1 (simple), 2 (advanced)
:param hive_table: Name of the hive table
:param dbtap_id: data store ID of the target database, in Qubole.
:param db_table: name of the db table
:param where_clause: where clause, if any
:param parallelism: number of parallel db connections to use for extracting data
:param extract_query: SQL query to extract data from db. $CONDITIONS must be part of the where clause.
:param boundary_query: Query to be used to get the range of row IDs to be extracted
:param split_column: Column used as row ID to split data into ranges (mode 2)
:param tags: array of tags to be assigned with the command
:param name: name to be given to command
:mode: 1 (simple), 2 (advanced)
:hive_table: Name of the hive table
:dbtap_id: data store ID of the target database, in Qubole.
:db_table: name of the db table
:where_clause: where clause, if any
:parallelism: number of parallel db connections to use for extracting data
:extract_query: SQL query to extract data from db. $CONDITIONS must be part of the where clause.
:boundary_query: Query to be used to get the range of row IDs to be extracted
:split_column: Column used as row ID to split data into ranges (mode 2)
.. note:: The following fields are template-supported: ``query``, ``script_location``, ``sub_command``, ``script``, ``files``,
``archives``, ``program``, ``cmdline``, ``sql``, ``where_clause``, ``extract_query``, ``boundary_query``, ``macros``, ``tags``,
``name``, ``parameters``. You can also use ``.txt`` files for template-driven use cases.
"""

template_fields = ('query', 'script_location', 'sub_command', 'script', 'files', 'archives', 'program', 'cmdline', 'sql', 'where_clause', 'extract_query', 'boundary_query', 'macros', 'tags', 'name')
template_fields = ('query', 'script_location', 'sub_command', 'script', 'files', 'archives', 'program', 'cmdline',
'sql', 'where_clause', 'extract_query', 'boundary_query', 'macros', 'tags', 'name', 'parameters')
template_ext = ('.txt',)
ui_color = '#3064A1'
ui_fgcolor = '#fff'
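
For orientation, below is a minimal usage sketch of the operator documented above, as it might appear in a DAG file. The DAG settings, table names, `dbtap_id`, and the `my_qubole_conn` connection id are hypothetical placeholders; only `command_type` and the per-command kwargs (`query`, `cluster_label`, `tags`, `mode`, `hive_table`, `db_table`, `dbtap_id`) are taken from the docstring.

```python
# Hypothetical sketch: run a Hive query on QDS, then export the table.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.qubole_operator import QuboleOperator

dag = DAG(
    dag_id='qubole_example',        # placeholder DAG id
    start_date=datetime(2016, 6, 1),
    schedule_interval=None,
)

# hivecmd: run an inline Hive query on the cluster with the given label
hive_task = QuboleOperator(
    task_id='run_hive_query',
    command_type='hivecmd',
    query='SELECT count(*) FROM default.my_table',  # hypothetical table
    cluster_label='default',
    tags=['example'],
    qubole_conn_id='my_qubole_conn',  # hypothetical connection id
    dag=dag,
)

# dbexportcmd: export the Hive table to an external database (simple mode)
export_task = QuboleOperator(
    task_id='export_to_db',
    command_type='dbexportcmd',
    mode=1,
    hive_table='my_table',       # hypothetical
    db_table='my_table_copy',    # hypothetical
    dbtap_id='1234',             # hypothetical data store id
    qubole_conn_id='my_qubole_conn',
    dag=dag,
)

# The export runs only after the Hive query succeeds.
hive_task.set_downstream(export_task)
```

Each task maps one-to-one onto a QDS command; since `query` and `tags` are template fields, they could also be supplied as Jinja templates or ``.txt`` files per the note above.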
22 changes: 16 additions & 6 deletions dev/README.md
@@ -18,18 +18,22 @@ Usage: airflow-pr [OPTIONS] COMMAND [ARGS]...
This tool should be used by Airflow committers to test PRs, merge them
into the master branch, and close related JIRA issues.
NOTE: this tool will restore your current branch when it finishes, but
you will lose any uncommitted changes.
Before you begin, make sure you have created the 'apache' and 'github' git
remotes. You can use the "setup_git_remotes" command to do this
automatically. If you do not want to use these remote names, you can tell
the PR tool by setting the appropriate environment variables. For more
information, run:
*** Please commit any changes you wish to keep before proceeding. ***
airflow-pr merge --help
Options:
--help Show this message and exit.
Commands:
close_jira Close a JIRA issue (without merging a PR)
merge Merge a GitHub PR into Airflow master
work_local Clone a GitHub PR locally for testing (no push)
close_jira Close a JIRA issue (without merging a PR)
merge Merge a GitHub PR into Airflow master
setup_git_remotes Set up default git remotes
work_local Clone a GitHub PR locally for testing (no push)
```

#### Commands
@@ -40,6 +44,8 @@ Execute `airflow-pr work_local` to only merge the PR locally. The tool will pause...

Execute `airflow-pr close_jira` to close a JIRA issue without needing to merge a PR. You will be prompted for an issue number and close comment.

Execute `airflow-pr setup_git_remotes` to configure the default (expected) git remotes. See below for details.

### Configuration

#### Python Libraries
@@ -49,8 +55,12 @@ pip install click jira
```

#### git Remotes
tl;dr: run `airflow-pr setup_git_remotes` before using the tool for the first time.

Before using the merge tool, users need to make sure their git remotes are configured. By default, the tool assumes a setup like the one below, where the GitHub repo remote is named `github` and the Apache repo remote is named `apache`. If users have other remote names, they can be supplied by setting the environment variables `GITHUB_REMOTE_NAME` and `APACHE_REMOTE_NAME`, respectively.

Users can configure this automatically by running `airflow-pr setup_git_remotes`.

```bash
$ git remote -v
apache https://git-wip-us.apache.org/repos/asf/incubator-airflow.git (fetch)
```