From d20ce8851f375611c10aba0cdd32a345df7f2c48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=86=89=E5=B0=8F=E9=BE=99?= Date: Fri, 6 Dec 2019 18:18:15 +0800 Subject: [PATCH] Update website for 2.4.2 release (#5801) * Update website for 2.4.2 release Signed-off-by: xiaolong.ran * update website docs for release 2.4.2 Signed-off-by: xiaolong.ran --- site2/website/releases.json | 1 + .../version-2.4.2/admin-api-functions.md | 546 +++++ .../admin-api-non-persistent-topics.md | 264 +++ .../version-2.4.2/admin-api-overview.md | 89 + .../admin-api-partitioned-topics.md | 377 +++ .../version-2.4.2/administration-dashboard.md | 60 + .../version-2.4.2/administration-upgrade.md | 153 ++ .../version-2.4.2/client-libraries-java.md | 708 ++++++ .../version-2.4.2/concepts-messaging.md | 415 ++++ .../version-2.4.2/concepts-tiered-storage.md | 18 + .../version-2.4.2/cookbooks-compaction.md | 127 + .../version-2.4.2/develop-bare-metal.md | 446 ++++ .../version-2.4.2/functions-cli.md | 196 ++ .../version-2.4.2/functions-debug.md | 448 ++++ .../version-2.4.2/functions-deploy.md | 211 ++ .../version-2.4.2/functions-develop.md | 899 +++++++ .../version-2.4.2/functions-overview.md | 188 ++ .../version-2.4.2/functions-runtime.md | 142 ++ .../version-2.4.2/functions-worker.md | 243 ++ .../getting-started-standalone.md | 226 ++ .../version-2.4.2/io-cdc-canal.md | 175 ++ .../version-2.4.2/io-cdc-debezium.md | 261 ++ .../version-2.4.2/io-connectors.md | 31 + .../versioned_docs/version-2.4.2/io-debug.md | 329 +++ .../version-2.4.2/io-develop.md | 196 ++ .../versioned_docs/version-2.4.2/io-jdbc.md | 24 + .../versioned_docs/version-2.4.2/io-netty.md | 148 ++ .../versioned_docs/version-2.4.2/io-use.md | 1505 ++++++++++++ .../version-2.4.2/reference-configuration.md | 494 ++++ .../reference-connector-admin.md | 601 +++++ .../version-2.4.2/reference-metrics.md | 244 ++ .../version-2.4.2/reference-pulsar-admin.md | 2095 +++++++++++++++++ .../schema-evolution-compatibility.md | 784 ++++++ 
.../version-2.4.2/schema-get-started.md | 91 + .../version-2.4.2/schema-manage.md | 786 +++++++ .../version-2.4.2/schema-understand.md | 576 +++++ .../version-2.4.2/security-athenz.md | 93 + .../version-2.4.2/security-authorization.md | 100 + .../version-2.4.2/security-kerberos.md | 391 +++ .../version-2.4.2/security-overview.md | 38 + .../security-tls-authentication.md | 156 ++ .../version-2.4.2/security-tls-transport.md | 230 ++ .../version-2.4.2/security-token-client.md | 123 + .../version-2.4.2-sidebars.json | 142 ++ site2/website/versions.json | 1 + 45 files changed, 15371 insertions(+) create mode 100644 site2/website/versioned_docs/version-2.4.2/admin-api-functions.md create mode 100644 site2/website/versioned_docs/version-2.4.2/admin-api-non-persistent-topics.md create mode 100644 site2/website/versioned_docs/version-2.4.2/admin-api-overview.md create mode 100644 site2/website/versioned_docs/version-2.4.2/admin-api-partitioned-topics.md create mode 100644 site2/website/versioned_docs/version-2.4.2/administration-dashboard.md create mode 100644 site2/website/versioned_docs/version-2.4.2/administration-upgrade.md create mode 100644 site2/website/versioned_docs/version-2.4.2/client-libraries-java.md create mode 100644 site2/website/versioned_docs/version-2.4.2/concepts-messaging.md create mode 100644 site2/website/versioned_docs/version-2.4.2/concepts-tiered-storage.md create mode 100644 site2/website/versioned_docs/version-2.4.2/cookbooks-compaction.md create mode 100644 site2/website/versioned_docs/version-2.4.2/develop-bare-metal.md create mode 100644 site2/website/versioned_docs/version-2.4.2/functions-cli.md create mode 100644 site2/website/versioned_docs/version-2.4.2/functions-debug.md create mode 100644 site2/website/versioned_docs/version-2.4.2/functions-deploy.md create mode 100644 site2/website/versioned_docs/version-2.4.2/functions-develop.md create mode 100644 site2/website/versioned_docs/version-2.4.2/functions-overview.md create mode 100644 
site2/website/versioned_docs/version-2.4.2/functions-runtime.md create mode 100644 site2/website/versioned_docs/version-2.4.2/functions-worker.md create mode 100644 site2/website/versioned_docs/version-2.4.2/getting-started-standalone.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-cdc-canal.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-cdc-debezium.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-connectors.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-debug.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-develop.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-jdbc.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-netty.md create mode 100644 site2/website/versioned_docs/version-2.4.2/io-use.md create mode 100644 site2/website/versioned_docs/version-2.4.2/reference-configuration.md create mode 100644 site2/website/versioned_docs/version-2.4.2/reference-connector-admin.md create mode 100644 site2/website/versioned_docs/version-2.4.2/reference-metrics.md create mode 100644 site2/website/versioned_docs/version-2.4.2/reference-pulsar-admin.md create mode 100644 site2/website/versioned_docs/version-2.4.2/schema-evolution-compatibility.md create mode 100644 site2/website/versioned_docs/version-2.4.2/schema-get-started.md create mode 100644 site2/website/versioned_docs/version-2.4.2/schema-manage.md create mode 100644 site2/website/versioned_docs/version-2.4.2/schema-understand.md create mode 100644 site2/website/versioned_docs/version-2.4.2/security-athenz.md create mode 100644 site2/website/versioned_docs/version-2.4.2/security-authorization.md create mode 100644 site2/website/versioned_docs/version-2.4.2/security-kerberos.md create mode 100644 site2/website/versioned_docs/version-2.4.2/security-overview.md create mode 100644 site2/website/versioned_docs/version-2.4.2/security-tls-authentication.md create mode 100644 
site2/website/versioned_docs/version-2.4.2/security-tls-transport.md create mode 100644 site2/website/versioned_docs/version-2.4.2/security-token-client.md create mode 100644 site2/website/versioned_sidebars/version-2.4.2-sidebars.json diff --git a/site2/website/releases.json b/site2/website/releases.json index 8544a40636f8b..0f4da24423a69 100644 --- a/site2/website/releases.json +++ b/site2/website/releases.json @@ -1,4 +1,5 @@ [ + "2.4.2", "2.4.1", "2.4.0", "2.3.2", diff --git a/site2/website/versioned_docs/version-2.4.2/admin-api-functions.md b/site2/website/versioned_docs/version-2.4.2/admin-api-functions.md new file mode 100644 index 0000000000000..adb37df492720 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/admin-api-functions.md @@ -0,0 +1,546 @@ +--- +id: version-2.4.2-admin-api-functions +title: Managing functions +sidebar_label: Functions +original_id: admin-api-functions +--- + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics +* apply a user-supplied processing logic to each message +* publish the results of the computation to another topic + +Functions can be managed via the following methods. + +Method | Description +---|--- +**Admin CLI** | The [`functions`](reference-pulsar-admin.md#functions) command of the [`pulsar-admin`](reference-pulsar-admin.md) tool. +**REST API** |The `/admin/v3/functions` endpoint of the admin {@inject: rest:REST:/} API. +**Java Admin API**| The `functions` method of the {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object in the [Java API](client-libraries-java.md). + +## Function resources + +You can perform the following operations on functions. + +### Create a function + +You can create a Pulsar function in cluster mode (deploy it on a Pulsar cluster) using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`create`](reference-pulsar-admin.md#functions-create) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --inputs test-input-topic \ + --output persistent://public/default/test-output-topic \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --jar /examples/api-examples.jar +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName} + +#### Java Admin API + +```java +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +functionConfig.setProcessingGuarantees(FunctionConfig.ProcessingGuarantees.ATLEAST_ONCE); +functionConfig.setTopicsPattern(sourceTopicPattern); +functionConfig.setSubName(subscriptionName); +functionConfig.setAutoAck(true); +functionConfig.setOutput(sinkTopic); +admin.functions().createFunction(functionConfig, fileName); +``` + +### Update a function + +You can update a Pulsar function that has been deployed to a Pulsar cluster using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`update`](reference-pulsar-admin.md#functions-update) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions update \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --output persistent://public/default/update-output-topic \ + # other options +``` + +#### REST Admin API + +{@inject: endpoint|PUT|/admin/v3/functions/{tenant}/{namespace}/{functionName} + +#### Java Admin API + +```java +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setTenant(tenant); +functionConfig.setNamespace(namespace); +functionConfig.setName(functionName); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setParallelism(1); +functionConfig.setClassName("org.apache.pulsar.functions.api.examples.ExclamationFunction"); +UpdateOptions updateOptions = new UpdateOptions(); +updateOptions.setUpdateAuthData(updateAuthData); +admin.functions().updateFunction(functionConfig, userCodeFile, updateOptions); +``` + +### Start an instance of a function + +You can start a stopped function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. + +```shell +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/{instanceId}/start + +#### Java Admin API + +```java +admin.functions().startFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); +``` + +### Start all instances of a function + +You can start all stopped function instances using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`start`](reference-pulsar-admin.md#functions-start) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions start \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/start + +#### Java Admin API + +```java +admin.functions().startFunction(tenant, namespace, functionName); +``` + +### Stop an instance of a function + +You can stop a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/{instanceId}/stop + +#### Java Admin API + +```java +admin.functions().stopFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); +``` + +### Stop all instances of a function + +You can stop all function instances using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`stop`](reference-pulsar-admin.md#functions-stop) subcommand. + +**Example** + +```shell +$ pulsar-admin functions stop \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/stop + +#### Java Admin API + +```java +admin.functions().stopFunction(tenant, namespace, functionName); +``` + +### Restart an instance of a function + +You can restart a function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/{instanceId}/restart + +#### Java Admin API + +```java +admin.functions().restartFunction(tenant, namespace, functionName, Integer.parseInt(instanceId)); +``` + +### Restart all instances of a function + +You can restart all function instances using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`restart`](reference-pulsar-admin.md#functions-restart) subcommand. + +**Example** + +```shell +$ pulsar-admin functions restart \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/restart + +#### Java Admin API + +```java +admin.functions().restartFunction(tenant, namespace, functionName); +``` + +### List all functions + +You can list all Pulsar functions running under a specific tenant and namespace using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`list`](reference-pulsar-admin.md#functions-list) subcommand. + +**Example** + +```shell +$ pulsar-admin functions list \ + --tenant public \ + --namespace default +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace} + +#### Java Admin API + +```java +admin.functions().getFunctions(tenant, namespace); +``` + +### Delete a function + +You can delete a Pulsar function that is running on a Pulsar cluster using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`delete`](reference-pulsar-admin.md#functions-delete) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions delete \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v3/functions/{tenant}/{namespace}/{functionName} + +#### Java Admin API + +```java +admin.functions().deleteFunction(tenant, namespace, functionName); +``` + +### Get info about a function + +You can get information about a Pulsar function currently running in cluster mode using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`get`](reference-pulsar-admin.md#functions-get) subcommand. + +**Example** + +```shell +$ pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace}/{functionName} + +#### Java Admin API + +```java +admin.functions().getFunction(tenant, namespace, functionName); +``` + +### Get status of an instance of a function + +You can get the current status of a Pulsar function instance with `instance-id` using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. + +**Example** + +```shell +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace}/{functionName}/{instanceId}/status + +#### Java Admin API + +```java +admin.functions().getFunctionStatus(tenant, namespace, functionName, Integer.parseInt(instanceId)); +``` + +### Get status of all instances of a function + +You can get the current status of all instances of a Pulsar function using Admin CLI, REST API or Java Admin API. + +#### Admin CLI + +Use the [`status`](reference-pulsar-admin.md#functions-status) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace}/{functionName}/status + +#### Java Admin API + +```java +admin.functions().getFunctionStatus(tenant, namespace, functionName); +``` + +### Get stats of an instance of a function + +You can get the current stats of a Pulsar Function instance with `instance-id` using Admin CLI, REST API or Java admin API. + +#### Admin CLI + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. + +**Example** + +```shell +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --instance-id 1 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace}/{functionName}/{instanceId}/stats + +#### Java Admin API + +```java +admin.functions().getFunctionStats(tenant, namespace, functionName, Integer.parseInt(instanceId)); +``` + +### Get stats of all instances of a function + +You can get the current stats of a Pulsar function using Admin CLI, REST API or Java admin API. + +#### Admin CLI + +Use the [`stats`](reference-pulsar-admin.md#functions-stats) subcommand. + +**Example** + +```shell +$ pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace}/{functionName}/stats + +#### Java Admin API + +```java +admin.functions().getFunctionStats(tenant, namespace, functionName); +``` + +### Trigger a function + +You can trigger a specified Pulsar function with a supplied value using Admin CLI, REST API or Java admin API. + +#### Admin CLI + +Use the [`trigger`](reference-pulsar-admin.md#functions-trigger) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --topic (the name of input topic) \ + --trigger-value \"hello pulsar\" + # or --trigger-file (the path of trigger file) +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/trigger + +#### Java Admin API + +```java +admin.functions().triggerFunction(tenant, namespace, functionName, topic, triggerValue, triggerFile); +``` + +### Put state associated with a function + +You can put the state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +#### Admin CLI + +Use the [`putstate`](reference-pulsar-admin.md#functions-putstate) subcommand. + +**Example** + +```shell +$ pulsar-admin functions putstate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --state "{\"key\":\"pulsar\", \"stringValue\":\"hello pulsar\"}" +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v3/functions/{tenant}/{namespace}/{functionName}/state/{key} + +#### Java Admin API + +```java +TypeReference typeRef = new TypeReference() {}; +FunctionState stateRepr = ObjectMapperFactory.getThreadLocal().readValue(state, typeRef); +admin.functions().putFunctionState(tenant, namespace, functionName, stateRepr); +``` + +### Fetch state associated with a function + +You can fetch the current state associated with a Pulsar function using Admin CLI, REST API or Java admin API. + +#### Admin CLI + +Use the [`querystate`](reference-pulsar-admin.md#functions-querystate) subcommand. 
+ +**Example** + +```shell +$ pulsar-admin functions querystate \ + --tenant public \ + --namespace default \ + --name (the name of Pulsar Functions) \ + --key (the key of state) +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v3/functions/{tenant}/{namespace}/{functionName}/state/{key} + +#### Java Admin API + +```java +admin.functions().getFunctionState(tenant, namespace, functionName, key); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/admin-api-non-persistent-topics.md b/site2/website/versioned_docs/version-2.4.2/admin-api-non-persistent-topics.md new file mode 100644 index 0000000000000..24b4e21d95974 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/admin-api-non-persistent-topics.md @@ -0,0 +1,264 @@ +--- +id: version-2.4.2-admin-api-non-persistent-topics +title: Managing non-persistent topics +sidebar_label: Non-Persistent topics +original_id: admin-api-non-persistent-topics +--- + +Non-persistent topics can be used in applications that only need to consume messages published in real time and +do not need a persistence guarantee. They can also reduce message-publish latency by removing the overhead of +persisting messages. + +In all of the instructions and commands below, the topic name structure is: + +```shell +non-persistent://tenant/namespace/topic +``` + +## Non-persistent topics resources + +### Get stats + +It shows current statistics of a given non-partitioned topic. 
+ + - **msgRateIn**: The sum of all local and replication publishers' publish rates in messages per second + + - **msgThroughputIn**: Same as above, but in bytes per second instead of messages per second + + - **msgRateOut**: The sum of all local and replication consumers' dispatch rates in messages per second + + - **msgThroughputOut**: Same as above, but in bytes per second instead of messages per second + + - **averageMsgSize**: The average size in bytes of messages published within the last interval + + - **publishers**: The list of all local publishers into the topic. There can be zero or thousands + + - **averageMsgSize**: Average message size in bytes from this publisher within the last interval + + - **producerId**: Internal identifier for this producer on this topic + + - **producerName**: Internal identifier for this producer, generated by the client library + + - **address**: IP address and source port for the connection of this producer + + - **connectedSince**: Timestamp this producer was created or last reconnected + + - **subscriptions**: The list of all local subscriptions to the topic + + - **my-subscription**: The name of this subscription (client defined) + + - **type**: This subscription type + + - **consumers**: The list of connected consumers for this subscription + + - **consumerName**: Internal identifier for this consumer, generated by the client library + + - **availablePermits**: The number of messages this consumer has space for in the client library's listen queue. A value less than 1 means the client library's queue is full and receive() isn't being called. A positive value means this consumer is ready to be dispatched messages. 
+ + - **replication**: This section gives the stats for cross-colo replication of this topic + + - **connected**: Whether the outbound replicator is connected + + - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker + + - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute. + + - **msgDropRate**: For publishers, the broker only allows a configured number of in-flight messages per connection and drops all other published messages above the threshold. The broker also drops messages for subscriptions when the limit is unavailable and the connection is not writable. + + +```json +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "msgDropRate" : 0.0, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null, + "msgDropRate" : 0.0 + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers" : [ { + "msgRateOut" : 20343.506296021893, + "msgThroughputOut" : 2.0979855364233278E7, + "msgRateRedeliver" : 0.0, + "consumerName" : "fe3c0", + "availablePermits" : 950, + "unackedMessages" : 0, + "blockedConsumerOnUnackedMsgs" : false, + "address" : "/10.73.210.249:60578", + "connectedSince" : "2017-07-26 15:13:48.026-0700", + "clientVersion" : "1.19-incubating-SNAPSHOT" + } ], + "msgDropRate" : 432.2390921571593 + + } + }, + "replication": {} +} +``` + +#### pulsar-admin + +Topic stats can be fetched using the [`stats`](reference-pulsar-admin.md#stats) command. 
+ +```shell +$ pulsar-admin non-persistent stats \ + non-persistent://test-tenant/ns1/tp1 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/non-persistent/:tenant/:namespace/:topic/stats|operation/getStats} + + +#### Java + +```java +String topic = "non-persistent://my-tenant/my-namespace/my-topic"; +admin.nonPersistentTopics().getStats(topic); +``` + +### Get internal stats + +It shows detailed statistics of a topic. + +#### pulsar-admin + +Topic internal-stats can be fetched using [`stats-internal`](reference-pulsar-admin.md#stats-internal) command. + +```shell +$ pulsar-admin non-persistent stats-internal \ + non-persistent://test-tenant/ns1/tp1 + +{ + "entriesAddedCounter" : 48834, + "numberOfEntries" : 0, + "totalSize" : 0, + "cursors" : { + "s1" : { + "waitingReadOp" : false, + "pendingReadOps" : 0, + "messagesConsumedCounter" : 0, + "cursorLedger" : 0, + "cursorLedgerLastEntry" : 0 + } + } +} + +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/non-persistent/:tenant/:namespace/:topic/internalStats|operation/getInternalStats} + +#### Java + +```java +String topic = "non-persistent://my-tenant/my-namespace/my-topic"; +admin.nonPersistentTopics().getInternalStats(topic); +``` + +### Create partitioned topic + +Partitioned topics in Pulsar must be explicitly created. When creating a new partitioned topic you need to provide a name for the topic as well as the desired number of partitions. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to avoid generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). 
+ +#### pulsar-admin + +```shell +$ bin/pulsar-admin non-persistent create-partitioned-topic \ + non-persistent://my-tenant/my-namespace/my-topic \ + --partitions 4 +``` + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/non-persistent/:tenant/:namespace/:topic/partitions|operation/createPartitionedTopic} + +#### Java + +```java +String topicName = "non-persistent://my-tenant/my-namespace/my-topic"; +int numPartitions = 4; +admin.nonPersistentTopics().createPartitionedTopic(topicName, numPartitions); +``` + +### Get metadata + +Partitioned topics have metadata associated with them that you can fetch as a JSON object. The following metadata fields are currently available: + +Field | Meaning +:-----|:------- +`partitions` | The number of partitions into which the topic is divided + +#### pulsar-admin + +```shell +$ pulsar-admin non-persistent get-partitioned-topic-metadata \ + non-persistent://my-tenant/my-namespace/my-topic +{ + "partitions": 4 +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/non-persistent/:tenant/:namespace/:topic/partitions|operation/getPartitionedMetadata} + + +#### Java + +```java +String topicName = "non-persistent://my-tenant/my-namespace/my-topic"; +admin.nonPersistentTopics().getPartitionedTopicMetadata(topicName); +``` + +### Unload topic + +It unloads a topic. + +#### pulsar-admin + +Topic can be unloaded using [`unload`](reference-pulsar-admin.md#unload) command. 
+ +```shell +$ pulsar-admin non-persistent unload \ + non-persistent://test-tenant/ns1/tp1 +``` + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/non-persistent/:tenant/:namespace/:topic/unload|operation/unloadTopic} + +#### Java + +```java +String topic = "non-persistent://my-tenant/my-namespace/my-topic"; +admin.nonPersistentTopics().unload(topic); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/admin-api-overview.md b/site2/website/versioned_docs/version-2.4.2/admin-api-overview.md new file mode 100644 index 0000000000000..8c82c27901489 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/admin-api-overview.md @@ -0,0 +1,89 @@ +--- +id: version-2.4.2-admin-api-overview +title: The Pulsar admin interface +sidebar_label: Overview +original_id: admin-api-overview +--- + +The Pulsar admin interface enables you to manage all of the important entities in a Pulsar [instance](reference-terminology.md#instance), such as [tenants](reference-terminology.md#tenant), [topics](reference-terminology.md#topic), and [namespaces](reference-terminology.md#namespace). + +You can currently interact with the admin interface via: + +- Making HTTP calls against the admin {@inject: rest:REST:/} API provided by Pulsar [brokers](reference-terminology.md#broker). Some REST API calls might be redirected to the broker that owns the topic + with [`307 Temporary Redirect`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/307), so HTTP callers should handle `307 Temporary Redirect`. If you are using `curl`, you should specify `-L` + to handle redirections. +- The `pulsar-admin` CLI tool, which is available in the `bin` folder of your [Pulsar installation](getting-started-standalone.md): + +```shell +$ bin/pulsar-admin +``` + +Full documentation for this tool can be found in the [Pulsar command-line tools](reference-pulsar-admin.md) doc. + +- A Java client interface. 
+ +> #### The REST API is the admin interface +> Under the hood, both the `pulsar-admin` CLI tool and the Java client use the REST API. If you’d like to implement your own admin interface client, you should use the REST API as well. Full documentation can be found in the {@inject: rest:REST:/} API reference. + +In this document, examples from each of the three available interfaces will be shown. + +## Admin setup + +Each of Pulsar's three admin interfaces---the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool, the [Java admin API](/api/admin), and the {@inject: rest:REST:/} API ---requires some special setup if you have [authentication](security-overview.md#authentication-providers) enabled in your Pulsar [instance](reference-terminology.md#instance). + +### pulsar-admin + +If you have [authentication](security-overview.md#authentication-providers) enabled, you will need to provide an auth configuration to use the [`pulsar-admin`](reference-pulsar-admin.md) tool. By default, the configuration for the `pulsar-admin` tool is found in the [`conf/client.conf`](reference-configuration.md#client) file. Here are the available parameters: + +|Name|Description|Default| +|----|-----------|-------| +|webServiceUrl|The web URL for the cluster.|http://localhost:8080/| +|brokerServiceUrl|The Pulsar protocol URL for the cluster.|pulsar://localhost:6650/| +|authPlugin|The authentication plugin.| | +|authParams|The authentication parameters for the cluster, as a comma-separated string.| | +|useTls|Whether or not TLS authentication will be enforced in the cluster.|false| +|tlsAllowInsecureConnection|Accept untrusted TLS certificate from client.|false| +|tlsTrustCertsFilePath|Path for the trusted TLS certificate file.| | + +### REST API + +You can find documentation for the REST API exposed by Pulsar [brokers](reference-terminology.md#broker) in this reference {@inject: rest:document:/}. 
+ +### Java admin client + +To use the Java admin API, instantiate a {@inject: javadoc:PulsarAdmin:/admin/org/apache/pulsar/client/admin/PulsarAdmin} object, specifying a URL for a Pulsar [broker](reference-terminology.md#broker) and a {@inject: javadoc:PulsarAdminBuilder:/admin/org/apache/pulsar/client/admin/PulsarAdminBuilder}. Here's a minimal example using `localhost`: + +```java +String url = "http://localhost:8080"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); +``` + +If you have multiple brokers to use, you can use multi-host like Pulsar service. 
For example, +```java +String url = "http://localhost:8080,localhost:8081,localhost:8082"; +// Pass auth-plugin class fully-qualified name if Pulsar-security enabled +String authPluginClassName = "com.org.MyAuthPluginClass"; +// Pass auth-param if auth-plugin class requires it +String authParams = "param1=value1"; +boolean useTls = false; +boolean tlsAllowInsecureConnection = false; +String tlsTrustCertsFilePath = null; +PulsarAdmin admin = PulsarAdmin.builder() +.authentication(authPluginClassName,authParams) +.serviceHttpUrl(url) +.tlsTrustCertsFilePath(tlsTrustCertsFilePath) +.allowTlsInsecureConnection(tlsAllowInsecureConnection) +.build(); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/admin-api-partitioned-topics.md b/site2/website/versioned_docs/version-2.4.2/admin-api-partitioned-topics.md new file mode 100644 index 0000000000000..433ac8bf58456 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/admin-api-partitioned-topics.md @@ -0,0 +1,377 @@ +--- +id: version-2.4.2-admin-api-partitioned-topics +title: Managing partitioned topics +sidebar_label: Partitioned topics +original_id: admin-api-partitioned-topics +--- + + +You can use Pulsar's [admin API](admin-api-overview.md) to create and manage partitioned topics. + +In all of the instructions and commands below, the topic name structure is: + +```shell +persistent://tenant/namespace/topic +``` + +## Partitioned topics resources + +### Create + +Partitioned topics in Pulsar must be explicitly created. When creating a new partitioned topic you +need to provide a name for the topic as well as the desired number of partitions. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. 
+> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +#### pulsar-admin + +You can create partitioned topics using the [`create-partitioned-topic`](reference-pulsar-admin.md#create-partitioned-topic) +command and specifying the topic name as an argument and the number of partitions using the `-p` or `--partitions` flag. +Here's an example: + +```shell +$ bin/pulsar-admin topics create-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 4 +``` + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/createPartitionedTopic} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +int numPartitions = 4; +admin.persistentTopics().createPartitionedTopic(topicName, numPartitions); +``` + +## Nonpartitioned topics resources + +### Create + +Nonpartitioned topics in Pulsar must be explicitly created if allowAutoTopicCreation or createIfMissing is disabled. +When creating a non-partitioned topic, you need to provide a topic name. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +#### pulsar-admin + +You can create non-partitioned topics using the [`create`](reference-pulsar-admin.md#create) +command and specifying the topic name as an argument. 
This is an example command: + +```shell +$ bin/pulsar-admin topics create persistent://my-tenant/my-namespace/my-topic +``` + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic|operation/createNonPartitionedTopic} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createNonPartitionedTopic(topicName); +``` + +### Get metadata + +Partitioned topics have metadata associated with them that you can fetch as a JSON object. +The following metadata fields are currently available: + +Field | Meaning +:-----|:------- +`partitions` | The number of partitions into which the topic is divided + +#### pulsar-admin + +You can see the number of partitions in a partitioned topic using the +[`get-partitioned-topic-metadata`](reference-pulsar-admin.md#get-partitioned-topic-metadata) +subcommand. Here's an example: + +```shell +$ pulsar-admin topics get-partitioned-topic-metadata \ + persistent://my-tenant/my-namespace/my-topic +{ + "partitions": 4 +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/getPartitionedMetadata} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getPartitionedTopicMetadata(topicName); +``` + +### Update + +You can update the number of partitions on an existing partitioned topic +*if* the topic is non-global. To update, the new number of partitions must be greater +than the existing number. + +Decreasing the number of partitions would require deleting the topic, which is not supported in Pulsar. + +Already created partitioned producers and consumers will automatically find the newly created partitions. + +#### pulsar-admin + +Partitioned topics can be updated using the +[`update-partitioned-topic`](reference-pulsar-admin.md#update-partitioned-topic) command.
+ +```shell +$ pulsar-admin topics update-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 8 +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/updatePartitionedTopic} + +#### Java + +```java +admin.persistentTopics().updatePartitionedTopic(persistentTopic, numPartitions); +``` + +### Delete + +#### pulsar-admin + +Partitioned topics can be deleted using the +[`delete-partitioned-topic`](reference-pulsar-admin.md#delete-partitioned-topic) command, specifying the topic by name: + +```shell +$ bin/pulsar-admin topics delete-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/deletePartitionedTopic} + +#### Java + +```java +admin.persistentTopics().delete(persistentTopic); +``` + +### List + +It provides a list of persistent topics existing under a given namespace. + +#### pulsar-admin + +```shell +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace|operation/getPartitionedTopicList} + +#### Java + +```java +admin.persistentTopics().getList(namespace); +``` + +### Stats + +It shows current statistics of a given partitioned topic.
Here's an example payload: + +```json +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} +``` + +The following stats are available: + +|Stat|Description| +|----|-----------| +|msgRateIn|The sum of all local and replication publishers’ publish rates in messages per second| +|msgThroughputIn|Same as msgRateIn but in bytes per second instead of messages per second| +|msgRateOut|The sum of all local and replication consumers’ dispatch rates in messages per second| +|msgThroughputOut|Same as msgRateOut but in bytes per second instead of messages per second| +|averageMsgSize|Average message size, in bytes, from this publisher within the last interval| +|storageSize|The sum of the ledgers’ storage size for this topic| +|publishers|The list of all local publishers into the topic. 
There can be anywhere from zero to thousands.| +|producerId|Internal identifier for this producer on this topic| +|producerName|Internal identifier for this producer, generated by the client library| +|address|IP address and source port for the connection of this producer| +|connectedSince|Timestamp this producer was created or last reconnected| +|subscriptions|The list of all local subscriptions to the topic| +|my-subscription|The name of this subscription (client defined)| +|msgBacklog|The count of messages in backlog for this subscription| +|type|This subscription type| +|msgRateExpired|The rate at which messages were discarded instead of dispatched from this subscription due to TTL| +|consumers|The list of connected consumers for this subscription| +|consumerName|Internal identifier for this consumer, generated by the client library| +|availablePermits|The number of messages this consumer has space for in the client library’s listen queue. A value of 0 means the client library’s queue is full and receive() isn’t being called. A nonzero value means this consumer is ready to be dispatched messages.| +|replication|This section gives the stats for cross-colo replication of this topic| +|replicationBacklog|The outbound replication backlog in messages| +|connected|Whether the outbound replicator is connected| +|replicationDelayInSeconds|How long the oldest message has been waiting to be sent through the connection, if connected is true| +|inboundConnection|The IP and port of the broker in the remote cluster’s publisher connection to this broker| +|inboundConnectedSince|The TCP connection being used to publish messages to the remote cluster. 
If there are no local publishers connected, this connection is automatically closed after a minute.| + +#### pulsar-admin + +The stats for the partitioned topic and its connected producers and consumers can be fetched by using the +[`partitioned-stats`](reference-pulsar-admin.md#partitioned-stats) command, specifying the topic by name: + +```shell +$ pulsar-admin topics partitioned-stats \ + persistent://test-tenant/namespace/topic \ + --per-partition +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/partitioned-stats|operation/getPartitionedStats} + +#### Java + +```java +admin.persistentTopics().getStats(persistentTopic); +``` + +### Internal stats + +It shows detailed statistics of a topic. + +|Stat|Description| +|----|-----------| +|entriesAddedCounter|Messages published since this broker loaded this topic| +|numberOfEntries|Total number of messages being tracked| +|totalSize|Total storage size in bytes of all messages| +|currentLedgerEntries|Count of messages written to the ledger currently open for writing| +|currentLedgerSize|Size in bytes of messages written to ledger currently open for writing| +|lastLedgerCreatedTimestamp|Time when last ledger was created| +|lastLedgerCreationFailureTimestamp|time when last ledger was failed| +|waitingCursorsCount|How many cursors are caught up and waiting for a new message to be published| +|pendingAddEntriesCount|How many messages have (asynchronous) write requests we are waiting on completion| +|lastConfirmedEntry|The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger has been opened or is currently being opened but has no entries written yet.| +|state|The state of the cursor ledger. Open means we have a cursor ledger for saving updates of the markDeletePosition.| +|ledgers|The ordered list of all ledgers for this topic holding its messages| +|cursors|The list of all cursors on this topic. 
There will be one for every subscription you saw in the topic stats.| +|markDeletePosition|The ack position: the last message the subscriber acknowledged receiving| +|readPosition|The latest position of subscriber for reading message| +|waitingReadOp|This is true when the subscription has read the latest message published to the topic and is waiting on new messages to be published.| +|pendingReadOps|The counter for how many outstanding read requests to the BookKeepers we have in progress| +|messagesConsumedCounter|Number of messages this cursor has acked since this broker loaded this topic| +|cursorLedger|The ledger being used to persistently store the current markDeletePosition| +|cursorLedgerLastEntry|The last entryid used to persistently store the current markDeletePosition| +|individuallyDeletedMessages|If Acks are being done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position| +|lastLedgerSwitchTimestamp|The last time the cursor ledger was rolled over| + + +```json +{ + "entriesAddedCounter": 20449518, + "numberOfEntries": 3233, + "totalSize": 331482, + "currentLedgerEntries": 3233, + "currentLedgerSize": 331482, + "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825", + "lastLedgerCreationFailureTimestamp": null, + "waitingCursorsCount": 1, + "pendingAddEntriesCount": 0, + "lastConfirmedEntry": "324711539:3232", + "state": "LedgerOpened", + "ledgers": [ + { + "ledgerId": 324711539, + "entries": 0, + "size": 0 + } + ], + "cursors": { + "my-subscription": { + "markDeletePosition": "324711539:3133", + "readPosition": "324711539:3233", + "waitingReadOp": true, + "pendingReadOps": 0, + "messagesConsumedCounter": 20449501, + "cursorLedger": 324702104, + "cursorLedgerLastEntry": 21, + "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]", + "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313", + "state": "Open" + } + } +} +``` + +#### pulsar-admin + +The internal 
stats for the partitioned topic can be fetched by using the +[`stats-internal`](reference-pulsar-admin.md#stats-internal) command, specifying the topic by name: + +```shell +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/namespace/topic +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/internalStats|operation/getInternalStats} + +#### Java + +```java +admin.persistentTopics().getInternalStats(persistentTopic); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/administration-dashboard.md b/site2/website/versioned_docs/version-2.4.2/administration-dashboard.md new file mode 100644 index 0000000000000..69ae0f9e4a83a --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/administration-dashboard.md @@ -0,0 +1,60 @@ +--- +id: version-2.4.2-administration-dashboard +title: The Pulsar dashboard +sidebar_label: Dashboard +original_id: administration-dashboard +--- + +The Pulsar dashboard is a web application that enables users to monitor current stats for all [topics](reference-terminology.md#topic) in tabular form. + +The dashboard is a data collector that polls stats from all the brokers in a Pulsar instance (across multiple clusters) and stores all the information in a [PostgreSQL](https://www.postgresql.org/) database. + +A [Django](https://www.djangoproject.com) web app is used to render the collected data. + +## Install + +The easiest way to use the dashboard is to run it inside a [Docker](https://www.docker.com/products/docker) container. 
+ +```shell +$ SERVICE_URL=http://broker.example.com:8080/ +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + apachepulsar/pulsar-dashboard:{{pulsar:version}} +``` + +The {@inject: github:`Dockerfile`:/dashboard/Dockerfile} can be found in the `dashboard` directory. You can also build an image from scratch: + +```shell +$ docker build -t apachepulsar/pulsar-dashboard dashboard +``` + +If token authentication is enabled: +> Provided token should have super-user access. +```shell +$ SERVICE_URL=http://broker.example.com:8080/ +$ JWT_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + -e JWT_TOKEN=$JWT_TOKEN \ + apachepulsar/pulsar-dashboard +``` + +You need to specify only one service URL for a Pulsar cluster. Internally, the collector will figure out all the existing clusters and the brokers from where it needs to pull the metrics. If you're connecting the dashboard to Pulsar running in standalone mode, the URL will be `http://<broker-ip-address>:8080` by default. `<broker-ip-address>` is the IP address or hostname of the machine running Pulsar standalone. The IP address or hostname should be accessible from the Docker instance running the dashboard. + +Once the Docker container is running, the web dashboard will be accessible via `localhost` or whichever host is being used by Docker. + +> The `SERVICE_URL` that the dashboard uses needs to be reachable from inside the Docker container + +If the Pulsar service is running in standalone mode in `localhost`, the `SERVICE_URL` would have to +be the IP of the machine. + +Similarly, given the Pulsar standalone advertises itself with localhost by default, we need to +explicitly set the advertised address to the host IP.
For example: + +```shell +$ bin/pulsar standalone --advertised-address 1.2.3.4 +``` + +### Known issues + +Only Pulsar Token [authentication](security-overview.md#authentication-providers) is supported as of now. diff --git a/site2/website/versioned_docs/version-2.4.2/administration-upgrade.md b/site2/website/versioned_docs/version-2.4.2/administration-upgrade.md new file mode 100644 index 0000000000000..78bdd73d6ba7b --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/administration-upgrade.md @@ -0,0 +1,153 @@ +--- +id: version-2.4.2-administration-upgrade +title: Upgrade Guide +sidebar_label: Upgrade +original_id: administration-upgrade +--- + +## Upgrade guidelines + +Apache Pulsar is comprised of multiple components, ZooKeeper, bookies and brokers. These components are either stateful or stateless. You do not have to upgrade ZooKeeper nodes unless you have special requirement. While upgrading, you need to pay attention to bookies (stateful), brokers and proxies (stateless). + +The following are some guidelines on upgrading a Pulsar cluster. Read the guidelines before upgrading. + +- Backup all your configuration files before upgrading. +- Read guide entirely, make a plan, and then execute the plan. When you make upgrade plan, you need to take your specific requirements and environment into consideration. +- Pay attention to the upgrading order of components. In general, you do not need to upgrade + your ZooKeeper or configuration store cluster (the global ZooKeeper cluster). You + need to upgrade bookies first, and then upgrade brokers, proxies, and your clients. +- If `autorecovery` is enabled, you need to disable `autorecovery` in the upgrade process, and re-enable it after completing the process. +- Read the release notes carefully for each release. Release notes contain features, configuration changes that might impact your upgrade. 
+- Upgrade a small subset of nodes of each type to canary test the new version before upgrading all nodes of that type in the cluster. When you have upgraded the canary nodes, run for a while to ensure that they are working correctly. +- Upgrade one data center to verify new version before upgrading all data centers if your cluster is running in multi-cluster replicated mode. + +> Note: Currently, Apache Pulsar is compatible between versions. + +## Upgrade sequence + +To upgrade an Apache Pulsar cluster, follow the upgrade sequence. + +1. Upgrade ZooKeeper (optional) +- Canary test: test an upgraded version in one or a small set of ZooKeeper nodes. +- Rolling upgrade: rollout the upgraded version to all ZooKeeper servers incrementally, one at a time. Monitor your dashboard during the whole rolling upgrade process. +2. Upgrade bookies +- Canary test: test an upgraded version in one or a small set of bookies. +- Rolling upgrade: + - a. Disable `autorecovery` with the following command. + ```shell + bin/bookkeeper shell autorecovery -disable + ``` + - b. Rollout the upgraded version to all bookies in the cluster after you have determined a version is safe after canary. + - c. After all bookies are upgraded, re-enable `autorecovery` with the following command. + ```shell + bin/bookkeeper shell autorecovery -enable + ``` +3. Upgrade brokers +- Canary test: test an upgraded version in one or a small set of brokers. +- Rolling upgrade: rollout the upgraded version to all brokers in the cluster after you have determined a version is safe after canary. +4. Upgrade proxies +- Canary test: test an upgraded version in one or a small set of proxies. +- Rolling upgrade: rollout the upgraded version to all proxies in the cluster after you have determined a version is safe after canary. + +## Upgrade ZooKeeper (optional) +While upgrading ZooKeeper servers, you can do canary test first, and then upgrade all ZooKeeper servers in the cluster. 
+ +### Canary test + +You can test an upgraded version in one of ZooKeeper servers before upgrading all ZooKeeper servers in your cluster. + +To upgrade ZooKeeper server to a new version, complete the following steps: + +1. Stop a ZooKeeper server. +2. Upgrade the binary and configuration files. +3. Start the ZooKeeper server with the new binary files. +4. Use `pulsar zookeeper-shell` to connect to the newly upgraded ZooKeeper server and run a few commands to verify if it works as expected. +5. Run the ZooKeeper server for a few days, observe and make sure the ZooKeeper cluster runs well. + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic ZooKeeper node, revert the binary and configuration, and restart the ZooKeeper with the reverted binary. + +### Upgrade all ZooKeeper servers + +After canary test to upgrade one ZooKeeper in your cluster, you can upgrade all ZooKeeper servers in your cluster. + +You can upgrade all ZooKeeper servers one by one by following steps in canary test. + +## Upgrade bookies + +While upgrading bookies, you can do canary test first, and then upgrade all bookies in the cluster. +For more details, you can read Apache BookKeeper [Upgrade guide](http://bookkeeper.apache.org/docs/latest/admin/upgrade). + +### Canary test + +You can test an upgraded version in one or a small set of bookies before upgrading all bookies in your cluster. + +To upgrade bookie to a new version, complete the following steps: + +1. Stop a bookie. +2. Upgrade the binary and configuration files. +3. Start the bookie in `ReadOnly` mode. It is to verify if the bookie of this new version runs well for read workload. + ```shell + bin/pulsar bookie --readOnly + ``` +4. When the bookie runs successfully in `ReadOnly` mode, stop the bookie and restart it in `Write/Read` mode. + ```shell + bin/pulsar bookie + ``` +5. Observe and make sure the cluster serves both write and read traffic. 
+ +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic bookie node. Other bookies in the cluster will replace this problematic bookie node with autorecovery. + +### Upgrade all bookies + +After canary test to upgrade some bookies in your cluster, you can upgrade all bookies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once (a downtime upgrade) or one bookie at a time (a rolling upgrade). + +In a rolling upgrade scenario, upgrade one bookie at a time. In a downtime upgrade scenario, shut down the entire cluster, upgrade each bookie, and then start the cluster. + +While upgrading in both scenarios, the procedure is the same for each bookie. + +1. Stop the bookie. +2. Upgrade the software (either new binary or new configuration files). +3. Start the bookie. + +> **Advanced operations** +> When upgrading a large BookKeeper cluster in rolling upgrade scenario, it is slow to upgrade one bookie at a time. If you have configured rack-aware or region-aware placement policy, you can upgrade bookies rack by rack or region by region. It speeds up the whole upgrade process. + +## Upgrade brokers and proxies + +The upgrade procedure for brokers and proxies is the same. Brokers and proxies are `stateless`, so it is easy to upgrade the two services. + +### Canary test + +You can test an upgraded version in one or a small set of nodes before upgrading all nodes in your cluster. + +To upgrade to a new version, complete the following steps: + +1. Stop a broker (or proxy). +2. Upgrade the binary and configuration file. +3. Start a broker (or proxy). + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic broker (or proxy) node. Revert to the old version and restart the broker (or proxy). + +### Upgrade all brokers/proxies + +After canary test to upgrade some brokers/proxies in your cluster, you can upgrade all brokers/proxies in your cluster.
+ +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, you can upgrade one broker or one proxy at a time if the size of the cluster is small. If your cluster is large, you can upgrade brokers or proxies in batches. When you upgrade a batch of brokers or proxies, make sure the remaining brokers and proxies in the cluster have enough capacity to handle the traffic during upgrade. + +In a downtime upgrade scenario, shut down the entire cluster, upgrade each broker/proxy, and then start the cluster. + +While upgrading in both scenarios, the procedure is the same for each broker or proxy. + +1. Stop the broker or proxy. +2. Upgrade the software (either new binary or new configuration files). +3. Start the broker or proxy. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/client-libraries-java.md b/site2/website/versioned_docs/version-2.4.2/client-libraries-java.md new file mode 100644 index 0000000000000..ebb07c4d701a1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/client-libraries-java.md @@ -0,0 +1,708 @@ +--- +id: version-2.4.2-client-libraries-java +title: The Pulsar Java client +sidebar_label: Java +original_id: client-libraries-java +--- + +The Pulsar Java client can be used both to create Java producers, consumers, and [readers](#reader-interface) of messages and to perform [administrative tasks](admin-api-overview.md). The current version of the Java client is **{{pulsar:version}}**. 
+ +Javadoc for the Pulsar client is divided up into two domains, by package: + +Package | Description | Maven Artifact +:-------|:------------|:-------------- +[`org.apache.pulsar.client.api`](/api/client) | The producer and consumer API | [org.apache.pulsar:pulsar-client:{{pulsar:version}}](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C{{pulsar:version}}%7Cjar) +[`org.apache.pulsar.client.admin`](/api/admin) | The Java [admin API](admin-api-overview.md) | [org.apache.pulsar:pulsar-client-admin:{{pulsar:version}}](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-admin%7C{{pulsar:version}}%7Cjar) + +This document will focus only on the client API for producing and consuming messages on Pulsar topics. For a guide to using the Java admin client, see [The Pulsar admin interface](admin-api-overview.md). + +## Installation + +The latest version of the Pulsar Java client library is available via [Maven Central](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C{{pulsar:version}}%7Cjar). To use the latest version, add the `pulsar-client` library to your build configuration. + +### Maven + +If you're using Maven, add this to your `pom.xml`: + +```xml +<!-- in your <properties> block --> +<pulsar.version>{{pulsar:version}}</pulsar.version> + +<!-- in your <dependencies> block --> +<dependency> + <groupId>org.apache.pulsar</groupId> + <artifactId>pulsar-client</artifactId> + <version>${pulsar.version}</version> +</dependency> +``` + +### Gradle + +If you're using Gradle, add this to your `build.gradle` file: + +```groovy +def pulsarVersion = '{{pulsar:version}}' + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-client', version: pulsarVersion +} +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650.
Here's an example for `localhost`: + +```http +pulsar://localhost:6650 +``` + +If you have more than one broker, the URL may look like this: +```http +pulsar://localhost:6650,localhost:6651,localhost:6652 +``` + +A URL for a production Pulsar cluster may look something like this: + +```http +pulsar://pulsar.us-west.example.com:6650 +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look something like this: + +```http +pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +## Client configuration + +You can instantiate a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object using just a URL for the target Pulsar [cluster](reference-terminology.md#cluster), like this: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +``` + +If you have multiple brokers, you can instantiate a `PulsarClient` like this: +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650,localhost:6651,localhost:6652") + .build(); +``` + +> #### Default broker URLs for standalone clusters +> If you're running a cluster in [standalone mode](getting-started-standalone.md), the broker will be available at the `pulsar://localhost:6650` URL by default. + +If you create a client, you may use the `loadConf` configuration. Below are the available parameters used in `loadConf`. + +| Type | Name | Description | Default +|---|---|---|--- +String | `serviceUrl` |Service URL provider for Pulsar service | None +String | `authPluginClassName` | Name of the authentication plugin | None +String | `authParams` | String represents parameters for the authentication plugin
**Example**
key1:val1,key2:val2|None +long|`operationTimeoutMs`|Operation timeout |30000 +long|`statsIntervalSeconds`|Interval between each stat info
Stats are activated with a positive `statsIntervalSeconds` value
`statsIntervalSeconds` should be set to at least 1 second |60 +int|`numIoThreads`| Number of threads used for handling connections to brokers | 1 +int|`numListenerThreads`|Number of threads used for handling message listeners | 1 +boolean|`useTcpNoDelay`|Whether to use TCP no-delay flag on the connection to disable Nagle algorithm |true +boolean |`useTls` |Whether to use TLS encryption on the connection| false +String | `tlsTrustCertsFilePath` |Path to the trusted TLS certificate file|None +boolean|`tlsAllowInsecureConnection`|Whether the Pulsar client accepts untrusted TLS certificate from broker | false +boolean | `tlsHostnameVerificationEnable` | Whether to enable TLS hostname verification|false +int|`concurrentLookupRequest`|Number of concurrent lookup requests allowed to send on each broker connection to prevent overload on broker|5000 +int|`maxLookupRequest`|Maximum number of lookup requests allowed on each broker connection to prevent overload on broker | 50000 +int|`maxNumberOfRejectedRequestPerConnection`|Maximum number of requests rejected by a broker in a certain time frame (30 seconds), after which the current connection is closed and the client creates a new connection to connect to a different broker|50 +int|`keepAliveIntervalSeconds`|Keep-alive interval, in seconds, for each client-broker connection|30 +int|`connectionTimeoutMs`|Duration of waiting for a connection to a broker to be established
If the duration passes without a response from a broker, the connection attempt is dropped|10000 +int|`requestTimeoutMs`|Maximum duration for completing a request |60000 +int|`defaultBackoffIntervalNanos`| Default duration for a backoff interval | TimeUnit.MILLISECONDS.toNanos(100); +long|`maxBackoffIntervalNanos`|Maximum duration for a backoff interval|TimeUnit.SECONDS.toNanos(30) + +Check out the Javadoc for the {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} class for a full listing of configurable parameters. + +> In addition to client-level configuration, you can also apply [producer](#configuring-producers) and [consumer](#configuring-consumers) specific configuration, as you'll see in the sections below. + +## Producers + +In Pulsar, producers write messages to topics. Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object (as in the section [above](#client-configuration)), you can create a {@inject: javadoc:Producer:/client/org/apache/pulsar/client/api/Producer} for a specific Pulsar [topic](reference-terminology.md#topic). + +```java +Producer producer = client.newProducer() + .topic("my-topic") + .create(); + +// You can then send messages to the broker and topic you specified: +producer.send("My message".getBytes()); +``` + +By default, producers produce messages that consist of byte arrays. You can produce different types, however, by specifying a message [schema](#schemas). 
+ +```java +Producer stringProducer = client.newProducer(Schema.STRING) + .topic("my-topic") + .create(); +stringProducer.send("My message"); +``` + +> You should always make sure to close your producers, consumers, and clients when they are no longer needed: +> ```java +> producer.close(); +> consumer.close(); +> client.close(); +> ``` +> +> Close operations can also be asynchronous: +> ```java +> producer.closeAsync() +> .thenRun(() -> System.out.println("Producer closed")) +> .exceptionally((ex) -> { +> System.err.println("Failed to close producer: " + ex); +> return null; +> }); +> ``` + +### Configuring producers + +If you instantiate a `Producer` object specifying only a topic name, as in the example above, the producer will use the default configuration. To use a non-default configuration, there's a variety of configurable parameters that you can set. + +For a full listing, see the Javadoc for the {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder} class. Here's an example: + +```java +Producer producer = client.newProducer() + .topic("my-topic") + .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS) + .sendTimeout(10, TimeUnit.SECONDS) + .blockIfQueueFull(true) + .create(); +``` + +### Message routing + +When using partitioned topics, you can specify the routing mode whenever you publish messages using a producer. For more on specifying a routing mode using the Java client, see the [Partitioned Topics](cookbooks-partitioned.md) cookbook. + +### Async send + +You can also publish messages [asynchronously](concepts-messaging.md#send-modes) using the Java client. With async send, the producer will put the message in a blocking queue and return immediately. The client library will then send the message to the broker in the background. If the queue is full (max size configurable), the producer could be blocked or fail immediately when calling the API, depending on arguments passed to the producer.
+ +Here's an example async send operation: + +```java +producer.sendAsync("my-async-message".getBytes()).thenAccept(msgId -> { + System.out.printf("Message with ID %s successfully sent", msgId); +}); +``` + +As you can see from the example above, async send operations return a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId} wrapped in a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). + +### Configuring messages + +In addition to a value, it's possible to set additional items on a given message: + +```java +producer.newMessage() + .key("my-message-key") + .value("my-async-message".getBytes()) + .property("my-key", "my-value") + .property("my-other-key", "my-other-value") + .send(); +``` + +As for the previous case, it's also possible to terminate the builder chain with `sendAsync()` and +get a future returned. + +## Consumers + +In Pulsar, consumers subscribe to topics and handle messages that producers publish to those topics. You can instantiate a new [consumer](reference-terminology.md#consumer) by first instantiating a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object and passing it a URL for a Pulsar broker (as [above](#client-configuration)). + +Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object, you can create a {@inject: javadoc:Consumer:/client/org/apache/pulsar/client/api/Consumer} by specifying a [topic](reference-terminology.md#topic) and a [subscription](concepts-messaging.md#subscription-modes). + +```java +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscribe(); +``` + +The `subscribe` method will automatically subscribe the consumer to the specified topic and subscription. One way to make the consumer listen on the topic is to set up a `while` loop. 
In this example loop, the consumer listens for messages, prints the contents of any message that's received, and then [acknowledges](reference-terminology.md#acknowledgment-ack) that the message has been processed. If the processing logic fails, we use [negative acknowledgement](reference-terminology.md#acknowledgment-ack) +to have the message redelivered at a later point in time. + +```java +while (true) { + // Wait for a message + Message msg = consumer.receive(); + + try { + // Do something with the message + System.out.printf("Message received: %s", new String(msg.getData())); + + // Acknowledge the message so that it can be deleted by the message broker + consumer.acknowledge(msg); + } catch (Exception e) { + // Message failed to process, redeliver later + consumer.negativeAcknowledge(msg); + } +} +``` + +### Configuring consumers + +If you instantiate a `Consumer` object specifying only a topic and subscription name, as in the example above, the consumer will use the default configuration. To use a non-default configuration, there's a variety of configurable parameters that you can set. For a full listing, see the Javadoc for the {@inject: javadoc:ConsumerBuilder:/client/org/apache/pulsar/client/api/ConsumerBuilder} class. + +Here's an example configuration: + +```java +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .ackTimeout(10, TimeUnit.SECONDS) + .subscriptionType(SubscriptionType.Exclusive) + .subscribe(); +``` + +### Async receive + +The `receive` method will receive messages synchronously (the consumer process will be blocked until a message is available). You can also use [async receive](concepts-messaging.md#receive-modes), which will return immediately with a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) object that completes once a new message is available.
+ +Here's an example: + +```java +CompletableFuture asyncMessage = consumer.receiveAsync(); +``` + +Async receive operations return a {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} wrapped inside of a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously using [multi-topic subscriptions](concepts-messaging.md#multi-topic-subscriptions). To use multi-topic subscriptions you can supply either a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. + +Here are some examples: + +```java +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +ConsumerBuilder consumerBuilder = pulsarClient.newConsumer() + .subscriptionName(subscription); + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(allTopicsInNamespace) + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(someTopicsInNamespace) + .subscribe(); +``` + +You can also subscribe to an explicit list of topics (across namespaces if you wish): + +```java +List topics = Arrays.asList( + "topic-1", + "topic-2", + "topic-3" +); + +Consumer multiTopicConsumer = consumerBuilder + .topics(topics) + .subscribe(); + +// Alternatively: +Consumer multiTopicConsumer = consumerBuilder + .topics( + "topic-1", + "topic-2", + "topic-3" + ) + .subscribe(); +``` + +You can also subscribe to multiple topics asynchronously 
using the `subscribeAsync` method rather than the synchronous `subscribe` method. Here's an example: + +```java +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default.*"); +consumerBuilder + .topics(topics) + .subscribeAsync() + .thenAccept(this::receiveMessageFromConsumer); + +private void receiveMessageFromConsumer(Consumer consumer) { + consumer.receiveAsync().thenAccept(message -> { + // Do something with the received message + receiveMessageFromConsumer(consumer); + }); +} +``` + +### Subscription modes + +Pulsar has various [subscription modes](concepts-messaging.md#subscription-modes) to match different scenarios. A topic can have multiple subscriptions with different subscription modes. However, a subscription can only have one subscription mode at a time. + +A subscription is identified by its subscription name, and a subscription name can specify only one subscription mode at a time. You can change the subscription mode, but you must first take all existing consumers of this subscription offline. + +Different subscription modes have different message distribution modes. This section describes the differences between subscription modes and how to use them. + +To better describe their differences, assume that you have a topic named "my-topic" and that the producer has published 10 messages.
+ +```java +Producer producer = client.newProducer(Schema.STRING) + .topic("my-topic") + .enableBatching(false) + .create(); +// 3 messages with "key-1", 3 messages with "key-2", 2 messages with "key-3" and 2 messages with "key-4" +producer.newMessage().key("key-1").value("message-1-1").send(); +producer.newMessage().key("key-1").value("message-1-2").send(); +producer.newMessage().key("key-1").value("message-1-3").send(); +producer.newMessage().key("key-2").value("message-2-1").send(); +producer.newMessage().key("key-2").value("message-2-2").send(); +producer.newMessage().key("key-2").value("message-2-3").send(); +producer.newMessage().key("key-3").value("message-3-1").send(); +producer.newMessage().key("key-3").value("message-3-2").send(); +producer.newMessage().key("key-4").value("message-4-1").send(); +producer.newMessage().key("key-4").value("message-4-2").send(); +``` + +#### Exclusive + +Create a new consumer and subscribe with the `Exclusive` subscription mode. + +```java +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Exclusive) + .subscribe(); +``` + +Only the first consumer is allowed to attach to the subscription; other consumers receive an error. The first consumer receives all 10 messages, and the consuming order is the same as the producing order. + +> Note: +> +> If the topic is a partitioned topic, the first consumer subscribes to all partitions; other consumers are not assigned any partitions and receive an error. + +#### Failover + +Create new consumers and subscribe with the `Failover` subscription mode.
+ +```java +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe(); +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe(); +//consumer1 is the active consumer, consumer2 is the standby consumer. +//consumer1 receives 5 messages and then crashes, consumer2 takes over as an active consumer. + + +``` + +Multiple consumers can attach to the same subscription, yet only the first consumer is active, and the others are on standby. When the active consumer is disconnected, messages will be dispatched to one of the standby consumers, and that standby consumer becomes the active consumer. + +If the first active consumer receives 5 messages and is disconnected, the standby consumer becomes the active consumer. consumer1 will receive: + +``` +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-2", "message-2-1") +("key-2", "message-2-2") +``` + +consumer2 will receive: + +``` +("key-2", "message-2-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +("key-4", "message-4-1") +("key-4", "message-4-2") +``` + +> Note: +> +> If a topic is a partitioned topic, each partition has only one active consumer: messages of one partition are distributed to only one consumer, while messages of multiple partitions are distributed to multiple consumers. + +#### Shared + +Create new consumers and subscribe with the `Shared` subscription mode: + +```java +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe(); + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe(); +//Both consumer1 and consumer2 are active consumers.
+``` + +In shared subscription mode, multiple consumers can attach to the same subscription and messages are delivered in a round robin distribution across consumers. + +If a broker dispatches only one message at a time, consumer1 will receive: + +``` +("key-1", "message-1-1") +("key-1", "message-1-3") +("key-2", "message-2-2") +("key-3", "message-3-1") +("key-4", "message-4-1") +``` + +consumer2 will receive: + +``` +("key-1", "message-1-2") +("key-2", "message-2-1") +("key-2", "message-2-3") +("key-3", "message-3-2") +("key-4", "message-4-2") +``` + +`Shared` subscription is different from `Exclusive` and `Failover` subscription modes. `Shared` subscription has better flexibility, but cannot provide an ordering guarantee. + +#### Key_Shared + +`Key_Shared` is a new subscription mode available since the 2.4.0 release. Create new consumers and subscribe with the `Key_Shared` subscription mode: + +```java +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe(); + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe(); +//Both consumer1 and consumer2 are active consumers. +``` + +`Key_Shared` subscription is like `Shared` subscription in that all consumers can attach to the same subscription. Unlike `Shared` subscription, however, messages with the same key are delivered to only one consumer, in order. The following is one possible distribution of messages between the consumers (by default, you do not know in advance which keys will be assigned to which consumer, but a key is assigned to only one consumer at a time).
+ +consumer1 will receive: + +``` +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +``` + +consumer2 will receive: + +``` +("key-2", "message-2-1") +("key-2", "message-2-2") +("key-2", "message-2-3") +("key-4", "message-4-1") +("key-4", "message-4-2") +``` + +> Note: +> +> If the message key is not specified, messages without a key are dispatched to one consumer, in order, by default. + +## Reader interface + +With the [reader interface](concepts-clients.md#reader-interface), Pulsar clients can "manually position" themselves within a topic, reading all messages from a specified message onward. The Pulsar API for Java enables you to create {@inject: javadoc:Reader:/client/org/apache/pulsar/client/api/Reader} objects by specifying a topic, a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId}, and {@inject: javadoc:ReaderConfiguration:/client/org/apache/pulsar/client/api/ReaderConfiguration}. + +Here's an example: + +```java +ReaderConfiguration conf = new ReaderConfiguration(); +byte[] msgIdBytes = // Some message ID byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +while (true) { + Message message = reader.readNext(); + // Process message +} +``` + +In the example above, a `Reader` object is instantiated for a specific topic and message (by ID); the reader then iterates over each message in the topic after the message identified by `msgIdBytes` (how that value is obtained depends on the application). + +The code sample above shows pointing the `Reader` object to a specific message (by ID), but you can also use `MessageId.earliest` to point to the earliest available message on the topic or `MessageId.latest` to point to the most recent available message. + +## Schemas + +In Pulsar, all message data consists of byte arrays "under the hood."
[Message schemas](schema-get-started.md) enable you to use other types of data when constructing and handling messages (from simple types like strings to more complex, application-specific types). If you construct, say, a [producer](#producers) without specifying a schema, then the producer can only produce messages of type `byte[]`. Here's an example: + +```java +Producer producer = client.newProducer() + .topic(topic) + .create(); +``` + +The producer above is equivalent to a `Producer<byte[]>` (in fact, you should *always* explicitly specify the type). If you'd like to use a producer for a different type of data, you'll need to specify a **schema** that informs Pulsar which data type will be transmitted over the [topic](reference-terminology.md#topic). + +### Schema example + +Let's say that you have a `SensorReading` class that you'd like to transmit over a Pulsar topic: + +```java +public class SensorReading { + public float temperature; + + public SensorReading(float temperature) { + this.temperature = temperature; + } + + // A no-arg constructor is required + public SensorReading() { + } + + public float getTemperature() { + return temperature; + } + + public void setTemperature(float temperature) { + this.temperature = temperature; + } +} +``` + +You could then create a `Producer<SensorReading>` (or `Consumer<SensorReading>`) like so: + +```java +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-readings") + .create(); +``` + +The following schema formats are currently available for Java: + +* No schema or the byte array schema (which can be applied using `Schema.BYTES`): + + ```java + Producer bytesProducer = client.newProducer(Schema.BYTES) + .topic("some-raw-bytes-topic") + .create(); + ``` + + Or, equivalently: + + ```java + Producer bytesProducer = client.newProducer() + .topic("some-raw-bytes-topic") + .create(); + ``` + +* `String` for normal UTF-8-encoded string data.
This schema can be applied using `Schema.STRING`: + + ```java + Producer stringProducer = client.newProducer(Schema.STRING) + .topic("some-string-topic") + .create(); + ``` + +* JSON schemas can be created for POJOs using `Schema.JSON`. Here's an example: + + ```java + Producer pojoProducer = client.newProducer(Schema.JSON(MyPojo.class)) + .topic("some-pojo-topic") + .create(); + ``` + +* Protobuf schemas can be generated using `Schema.PROTOBUF`. The following example shows how to create the Protobuf schema and use it to instantiate a new producer: + + ```java + Producer protobufProducer = client.newProducer(Schema.PROTOBUF(MyProtobuf.class)) + .topic("some-protobuf-topic") + .create(); + ``` + +* Avro schemas can be defined with the help of `Schema.AVRO`. The next code snippet demonstrates the creation and usage of the Avro schema: + + ```java + Producer avroProducer = client.newProducer(Schema.AVRO(MyAvro.class)) + .topic("some-avro-topic") + .create(); + ``` + +## Authentication + +Pulsar currently supports two authentication schemes: [TLS](security-tls-authentication.md) and [Athenz](security-athenz.md). The Pulsar Java client can be used with both. + +### TLS Authentication + +To use [TLS](security-tls-authentication.md), you need to enable TLS using the `enableTls` method, point your Pulsar client to a TLS cert path, and provide paths to cert and key files.
+ +Here's an example configuration: + +```java +Map authParams = new HashMap<>(); +authParams.put("tlsCertFile", "/path/to/client-cert.pem"); +authParams.put("tlsKeyFile", "/path/to/client-key.pem"); + +Authentication tlsAuth = AuthenticationFactory + .create(AuthenticationTls.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(tlsAuth) + .build(); +``` + +### Athenz + +To use [Athenz](security-athenz.md) as an authentication provider, you need to [use TLS](#tls-authentication) and provide values for four parameters in a hash: + +* `tenantDomain` +* `tenantService` +* `providerDomain` +* `privateKey` + +You can also set an optional `keyId`. Here's an example configuration: + +```java +Map authParams = new HashMap<>(); +authParams.put("tenantDomain", "shopping"); // Tenant domain name +authParams.put("tenantService", "some_app"); // Tenant service name +authParams.put("providerDomain", "pulsar"); // Provider domain name +authParams.put("privateKey", "file:///path/to/private.pem"); // Tenant private key path +authParams.put("keyId", "v1"); // Key id for the tenant private key (optional, default: "0") + +Authentication athenzAuth = AuthenticationFactory + .create(AuthenticationAthenz.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(athenzAuth) + .build(); +``` + +> #### Supported pattern formats +> The `privateKey` parameter supports the following three pattern formats: +> * `file:///path/to/file` +> * `file:/path/to/file` +> * `data:application/x-pem-file;base64,<base64-encoded value>` diff --git a/site2/website/versioned_docs/version-2.4.2/concepts-messaging.md b/site2/website/versioned_docs/version-2.4.2/concepts-messaging.md new file mode 100644 index 
0000000000000..3d6f0f7f4d3db --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/concepts-messaging.md @@ -0,0 +1,415 @@ +--- +id: version-2.4.2-concepts-messaging +title: Messaging Concepts +sidebar_label: Messaging +original_id: concepts-messaging +--- + +Pulsar is built on the [publish-subscribe](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) pattern, aka pub-sub. In this pattern, [producers](#producers) publish messages to [topics](#topics). [Consumers](#consumers) can then [subscribe](#subscription-modes) to those topics, process incoming messages, and send an acknowledgement when processing is complete. + +Once a subscription has been created, all messages will be [retained](concepts-architecture-overview.md#persistent-storage) by Pulsar, even if the consumer gets disconnected. Retained messages will be discarded only when a consumer acknowledges that they've been successfully processed. + +## Messages + +Messages are the basic "unit" of Pulsar. They're what producers publish to topics and what consumers then consume from topics (and acknowledge when the message has been processed). Messages are the analogue of letters in a postal service system. + +Component | Purpose +:---------|:------- +Value / data payload | The data carried by the message. All Pulsar messages carry raw bytes, although message data can also conform to data [schemas](schema-get-started.md) +Key | Messages can optionally be tagged with keys, which can be useful for things like [topic compaction](concepts-topic-compaction.md) +Properties | An optional key/value map of user-defined properties +Producer name | The name of the producer that produced the message (producers are automatically given default names, but you can apply your own explicitly as well) +Sequence ID | Each Pulsar message belongs to an ordered sequence on its topic. A message's sequence ID is its ordering in that sequence. 
+Publish time | The timestamp of when the message was published (automatically applied by the producer) +Event time | An optional timestamp that applications can attach to the message representing when something happened, e.g. when the message was processed. The event time of a message is 0 if none is explicitly set. + + +> For a more in-depth breakdown of Pulsar message contents, see the documentation on Pulsar's [binary protocol](developing-binary-protocol.md). + +## Producers + +A producer is a process that attaches to a topic and publishes messages to a Pulsar [broker](reference-terminology.md#broker) for processing. + +### Send modes + +Producers can send messages to brokers either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync send | The producer will wait for acknowledgement from the broker after sending each message. If acknowledgment isn't received then the producer will consider the send operation a failure. | +| Async send | The producer will put the message in a blocking queue and return immediately. The client library will then send the message to the broker in the background. If the queue is full (max size [configurable](reference-configuration.md#broker), the producer could be blocked or fail immediately when calling the API, depending on arguments passed to the producer. | + +### Compression + +Messages published by producers can be compressed during transportation in order to save bandwidth. 
Pulsar currently supports the following types of compression: + +* [LZ4](https://github.com/lz4/lz4) +* [ZLIB](https://zlib.net/) +* [ZSTD](https://facebook.github.io/zstd/) +* [SNAPPY](https://google.github.io/snappy/) + +### Batching + +If batching is enabled, the producer will accumulate and send a batch of messages in a single request. Batching size is defined by the maximum number of messages and maximum publish latency. + +## Consumers + +A consumer is a process that attaches to a topic via a subscription and then receives messages. + +### Receive modes + +Messages can be received from [brokers](reference-terminology.md#broker) either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:--------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync receive | A sync receive will be blocked until a message is available. | +| Async receive | An async receive will return immediately with a future value---a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) in Java, for example---that completes once a new message is available. | + +### Listeners + +Client libraries provide listener implementation for consumers. For example, the [Java client](client-libraries-java.md) provides a {@inject: javadoc:MesssageListener:/client/org/apache/pulsar/client/api/MessageListener} interface. In this interface, the `received` method is called whenever a new message is received. + +### Acknowledgement + +When a consumer has consumed a message successfully, the consumer sends an acknowledgement request to the broker, so that the broker will discard the message. Otherwise, it [stores](concepts-architecture-overview.md#persistent-storage) the message. + +Messages can be acknowledged either one by one or cumulatively. 
With cumulative acknowledgement, the consumer only needs to acknowledge the last message it received. All messages in the stream up to (and including) the provided message will not be re-delivered to that consumer. + + +> Cumulative acknowledgement cannot be used with [shared subscription mode](#subscription-modes), because shared mode involves multiple consumers having access to the same subscription. + +In the shared subscription mode, messages can be acknowledged individually. + +### Negative acknowledgement + +When a consumer fails to consume a message and wants to consume it again, the consumer can send a negative acknowledgement to the broker, and the broker will then redeliver the message. + +Messages can be negatively acknowledged one by one or cumulatively, depending on the subscription mode. + +In the exclusive and failover subscription modes, consumers only negatively acknowledge the last message they have received. + +In the shared and Key_Shared subscription modes, you can negatively acknowledge messages individually. + +### Acknowledgement timeout + +When a message is not consumed successfully, and you want to trigger the broker to redeliver the message automatically, you can adopt the unacknowledged message automatic re-delivery mechanism. When an acknowledgement timeout is specified, the client tracks the unacknowledged messages within the entire `acktimeout` time range and automatically sends a `redeliver unacknowledged messages` request to the broker. + +> Note +> Prefer negative acknowledgement over acknowledgement timeout. Negative acknowledgement controls the re-delivery of individual messages more precisely, and avoids invalid redeliveries when the message processing time exceeds the acknowledgement timeout. + +### Dead letter topic + +Dead letter topic enables you to consume new messages when some messages cannot be consumed successfully by a consumer.
In this mechanism, messages that fail to be consumed are stored in a separate topic, called the dead letter topic. You can decide how to handle messages in the dead letter topic. + +The following example shows how to enable dead letter topic in a Java client using the default dead letter topic: + +```java +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .build()) + .subscribe(); + +``` +The default dead letter topic uses this format: +``` +<topicname>-<subscriptionname>-DLQ +``` + +If you want to specify the name of the dead letter topic, use this Java client example: + +```java +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .deadLetterTopic("your-topic-name") + .build()) + .subscribe(); + +``` + + +Dead letter topic depends on message re-delivery. Messages are redelivered either due to [acknowledgement timeout](#acknowledgement-timeout) or [negative acknowledgement](#negative-acknowledgement). If you are going to use negative acknowledgement on a message, make sure it is negatively acknowledged before the acknowledgement timeout. + +> Note +> Currently, dead letter topic is enabled only in the shared subscription mode. + +## Topics + +As in other pub-sub systems, topics in Pulsar are named channels for transmitting messages from [producers](reference-terminology.md#producer) to [consumers](reference-terminology.md#consumer).
Topic names are URLs that have a well-defined structure: + +```http +{persistent|non-persistent}://tenant/namespace/topic +``` + +Topic name component | Description +:--------------------|:----------- +`persistent` / `non-persistent` | This identifies the type of topic. Pulsar supports two kinds of topics: [persistent](concepts-architecture-overview.md#persistent-storage) and [non-persistent](#non-persistent-topics) (persistent is the default, so if you don't specify a type the topic will be persistent). With persistent topics, all messages are durably [persisted](concepts-architecture-overview.md#persistent-storage) on disk (that means on multiple disks unless the broker is standalone), whereas data for [non-persistent](#non-persistent-topics) topics isn't persisted to storage disks. +`tenant` | The topic's tenant within the instance. Tenants are essential to multi-tenancy in Pulsar and can be spread across clusters. +`namespace` | The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the [namespace](#namespaces) level. Each tenant can have multiple namespaces. +`topic` | The final part of the name. Topic names are freeform and have no special meaning in a Pulsar instance. + + +> #### No need to explicitly create new topics +> You don't need to explicitly create topics in Pulsar. If a client attempts to write or receive messages to/from a topic that does not yet exist, Pulsar will automatically create that topic under the [namespace](#namespaces) provided in the [topic name](#topics). + + +## Namespaces + +A namespace is a logical nomenclature within a tenant. A tenant can create multiple namespaces via the [admin API](admin-api-namespaces.md#create). For instance, a tenant with different applications can create a separate namespace for each application. A namespace allows the application to create and manage a hierarchy of topics.
The name `my-tenant/app1` is a namespace for the application `app1` for `my-tenant`. You can create any number of [topics](#topics) under the namespace. + +## Subscription modes + +A subscription is a named configuration rule that determines how messages are delivered to consumers. There are four available subscription modes in Pulsar: [exclusive](#exclusive), [shared](#shared), [failover](#failover), and [Key_Shared](#key_shared). These modes are illustrated in the figure below. + +![Subscription modes](assets/pulsar-subscription-modes.png) + +### Exclusive + +In *exclusive* mode, only a single consumer is allowed to attach to the subscription. If more than one consumer attempts to subscribe to a topic using the same subscription, the consumer receives an error. + +In the diagram below, only **Consumer A-0** is allowed to consume messages. + +> Exclusive mode is the default subscription mode. + +![Exclusive subscriptions](assets/pulsar-exclusive-subscriptions.png) + +### Failover + +In *failover* mode, multiple consumers can attach to the same subscription. The consumers will be lexically sorted by the consumer's name and the first consumer will initially be the only one receiving messages. This consumer is called the *master consumer*. + +When the master consumer disconnects, all (non-acked and subsequent) messages will be delivered to the next consumer in line. + +In the diagram below, **Consumer-B-0** is the master consumer while **Consumer-B-1** would be the next in line to receive messages if **Consumer-B-0** disconnected. + +![Failover subscriptions](assets/pulsar-failover-subscriptions.png) + +### Shared + +In *shared* or *round robin* mode, multiple consumers can attach to the same subscription. Messages are delivered in a round robin distribution across consumers, and any given message is delivered to only one consumer. When a consumer disconnects, all the messages that were sent to it and not acknowledged will be rescheduled for sending to the remaining consumers.
+ +In the diagram below, **Consumer-C-1** and **Consumer-C-2** are able to subscribe to the topic, but **Consumer-C-3** and others could as well. + +> #### Limitations of shared mode +> There are two important things to be aware of when using shared mode: +> * Message ordering is not guaranteed. +> * You cannot use cumulative acknowledgment with shared mode. + +![Shared subscriptions](assets/pulsar-shared-subscriptions.png) + +### Key_Shared + +In *Key_Shared* mode, multiple consumers can attach to the same subscription. Messages are delivered in a distribution across consumers, and messages with the same key or same ordering key are delivered to only one consumer. No matter how many times the message is re-delivered, it is delivered to the same consumer. When a consumer connects or disconnects, the serving consumer changes for some message keys. + +> #### Limitations of Key_Shared mode +> There are two important things to be aware of when using Key_Shared mode: +> * You need to specify a key or orderingKey for messages. +> * You cannot use cumulative acknowledgment with Key_Shared mode. + +![Key_Shared subscriptions](assets/pulsar-key-shared-subscriptions.png) + +**Key_Shared subscription is a beta feature. You can disable it at broker.config.** + +## Multi-topic subscriptions + +When a consumer subscribes to a Pulsar topic, by default it subscribes to one specific topic, such as `persistent://public/default/my-topic`. As of Pulsar version 1.23.0-incubating, however, Pulsar consumers can simultaneously subscribe to multiple topics.
You can define a list of topics in two ways: + +* On the basis of a [**reg**ular **ex**pression](https://en.wikipedia.org/wiki/Regular_expression) (regex), for example `persistent://public/default/finance-.*` +* By explicitly defining a list of topics + +> When subscribing to multiple topics by regex, all topics must be in the same [namespace](#namespaces) + +When subscribing to multiple topics, the Pulsar client will automatically make a call to the Pulsar API to discover the topics that match the regex pattern/list and then subscribe to all of them. If any of the topics don't currently exist, the consumer will auto-subscribe to them once the topics are created. + +> #### No ordering guarantees +> When a consumer subscribes to multiple topics, all ordering guarantees normally provided by Pulsar on single topics do not hold. If your use case for Pulsar involves any strict ordering requirements, we would strongly recommend against using this feature. + +Here are some multi-topic subscription examples for Java: + +```java +import java.util.regex.Pattern; + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient pulsarClient = // Instantiate Pulsar client object + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(allTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer someTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(someTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); +``` + +For code examples, see: + +* [Java](client-libraries-java.md#multi-topic-subscriptions) + +## Partitioned topics + +Normal topics can be served only by a single broker, 
which limits the topic's maximum throughput. *Partitioned topics* are a special type of topic that can be handled by multiple brokers, which allows for much higher throughput. + +Behind the scenes, a partitioned topic is actually implemented as N internal topics, where N is the number of partitions. When publishing messages to a partitioned topic, each message is routed to one of several brokers. The distribution of partitions across brokers is handled automatically by Pulsar. + +The diagram below illustrates this: + +![](assets/partitioning.png) + +Here, the topic **Topic1** has five partitions (**P0** through **P4**) split across three brokers. Because there are more partitions than brokers, two brokers handle two partitions apiece, while the third handles only one (again, Pulsar handles this distribution of partitions automatically). + +Messages for this topic are broadcast to two consumers. The [routing mode](#routing-modes) determines which broker handles each partition, while the [subscription mode](#subscription-modes) determines which messages go to which consumers. + +Decisions about routing and subscription modes can be made separately in most cases. In general, throughput concerns should guide partitioning/routing decisions while subscription decisions should be guided by application semantics. + +There is no difference between partitioned topics and normal topics in terms of how subscription modes work, as partitioning only determines what happens between when a message is published by a producer and processed and acknowledged by a consumer. + +Partitioned topics need to be explicitly created via the [admin API](admin-api-overview.md). The number of partitions can be specified when creating the topic. + +### Routing modes + +When publishing to partitioned topics, you must specify a *routing mode*. The routing mode determines which partition---that is, which internal topic---each message should be published to.
+ +There are three {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} available: + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. While if a key is specified on the message, the partitioned producer will hash the key and assign message to a particular partition. This is the default mode. +`SinglePartition` | If no key is provided, the producer will randomly pick one single partition and publish all the messages into that partition. While if a key is specified on the message, the partitioned producer will hash the key and assign message to a particular partition. +`CustomPartition` | Use custom message router implementation that will be called to determine the partition for a particular message. User can create a custom routing mode by using the [Java client](client-libraries-java.md) and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. + +### Ordering guarantee + +The ordering of messages is related to MessageRoutingMode and Message Key. Usually, user would want an ordering of Per-key-partition guarantee. + +If there is a key attached to message, the messages will be routed to corresponding partitions based on the hashing scheme specified by {@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} in {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder}, when using either `SinglePartition` or `RoundRobinPartition` mode. 
+ +Ordering guarantee | Description | Routing Mode and Key +:------------------|:------------|:------------ +Per-key-partition | All the messages with the same key will be in order and be placed in same partition. | Use either `SinglePartition` or `RoundRobinPartition` mode, and Key is provided by each message. +Per-producer | All the messages from the same producer will be in order. | Use `SinglePartition` mode, and no Key is provided for each message. + +### Hashing scheme + +{@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} is an enum that represents sets of standard hashing functions available when choosing the partition to use for a particular message. + +There are 2 types of standard hashing functions available: `JavaStringHash` and `Murmur3_32Hash`. +The default hashing function for producer is `JavaStringHash`. +Note that `JavaStringHash` is not useful when producers can come from clients in multiple different languages; in this use case, it is recommended to use `Murmur3_32Hash`. + + + +## Non-persistent topics + + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](concepts-architecture-overview.md#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http +non-persistent://tenant/namespace/topic +``` + +> For more info on using non-persistent topics, see the [Non-persistent messaging cookbook](cookbooks-non-persistent.md).
+ +In non-persistent topics, brokers immediately deliver messages to all connected subscribers *without persisting them* in [BookKeeper](concepts-architecture-overview.md#persistent-storage). If a subscriber is disconnected, the broker will not be able to deliver those in-transit messages, and subscribers will never be able to receive those messages again. Eliminating the persistent storage step makes messaging on non-persistent topics slightly faster than on persistent topics in some cases, but with the caveat that some of the core benefits of Pulsar are lost. + +> With non-persistent topics, message data lives only in memory. If a message broker fails or message data can otherwise not be retrieved from memory, your message data may be lost. Use non-persistent topics only if you're *certain* that your use case requires it and can sustain it. + +By default, non-persistent topics are enabled on Pulsar brokers. You can disable them in the broker's [configuration](reference-configuration.md#broker-enableNonPersistentTopics). You can manage non-persistent topics using the [`pulsar-admin topics`](reference-pulsar-admin.md#topics-1) interface. + +### Performance + +Non-persistent messaging is usually faster than persistent messaging because brokers don't persist messages and immediately send acks back to the producer as soon as that message is delivered to connected brokers. Producers thus see comparatively low publish latency with non-persistent topics. + +### Client API + +Producers and consumers can connect to non-persistent topics in the same way as persistent topics, with the crucial difference that the topic name must start with `non-persistent`. All three subscription modes---[exclusive](#exclusive), [shared](#shared), and [failover](#failover)---are supported for non-persistent topics.
+ +Here's an example [Java consumer](client-libraries-java.md#consumers) for a non-persistent topic: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +String npTopic = "non-persistent://public/default/my-topic"; +String subscriptionName = "my-subscription-name"; + +Consumer consumer = client.newConsumer() + .topic(npTopic) + .subscriptionName(subscriptionName) + .subscribe(); +``` + +Here's an example [Java producer](client-libraries-java.md#producer) for the same non-persistent topic: + +```java +Producer producer = client.newProducer() + .topic(npTopic) + .create(); +``` + +## Message retention and expiry + +By default, Pulsar message brokers: + +* immediately delete *all* messages that have been acknowledged by a consumer, and +* [persistently store](concepts-architecture-overview.md#persistent-storage) all unacknowledged messages in a message backlog. + +Pulsar has two features, however, that enable you to override this default behavior: + +* Message **retention** enables you to store messages that have been acknowledged by a consumer +* Message **expiry** enables you to set a time to live (TTL) for messages that have not yet been acknowledged + +> All message retention and expiry is managed at the [namespace](#namespaces) level. For a how-to, see the [Message retention and expiry](cookbooks-retention-expiry.md) cookbook. + +The diagram below illustrates both concepts: + +![Message retention and expiry](assets/retention-expiry.png) + +With message retention, shown at the top, a retention policy applied to all topics in a namespace dictates that some messages are durably stored in Pulsar even though they've already been acknowledged. Acknowledged messages that are not covered by the retention policy are deleted. Without a retention policy, *all* of the acknowledged messages would be deleted.
+ +With message expiry, shown at the bottom, some messages are deleted, even though they haven't been acknowledged, because they've expired according to the TTL applied to the namespace (for example because a TTL of 5 minutes has been applied and the messages haven't been acknowledged but are 10 minutes old). + +## Message deduplication + +Message **duplication** occurs when a message is [persisted](concepts-architecture-overview.md#persistent-storage) by Pulsar more than once. Message ***de*duplication** is an optional Pulsar feature that prevents unnecessary message duplication by processing each message only once, *even if the message is received more than once*. + +The following diagram illustrates what happens when message deduplication is disabled vs. enabled: + +![Pulsar message deduplication](assets/message-deduplication.png) + + +Message deduplication is disabled in the scenario shown at the top. Here, a producer publishes message 1 on a topic; the message reaches a Pulsar broker and is [persisted](concepts-architecture-overview.md#persistent-storage) to BookKeeper. The producer then sends message 1 again (in this case due to some retry logic), and the message is received by the broker and stored in BookKeeper again, which means that duplication has occurred. + +In the second scenario at the bottom, the producer publishes message 1, which is received by the broker and persisted, as in the first scenario. When the producer attempts to publish the message again, however, the broker knows that it has already seen message 1 and thus does not persist the message. + +> Message deduplication is handled at the namespace level. For more instructions, see the [message deduplication cookbook](cookbooks-deduplication.md). + + +### Producer idempotency + +The other available approach to message deduplication is to ensure that each message is *only produced once*. This approach is typically called **producer idempotency**. 
The drawback of this approach is that it defers the work of message deduplication to the application. In Pulsar, this is handled at the [broker](reference-terminology.md#broker) level, which means that you don't need to modify your Pulsar client code. Instead, you only need to make administrative changes (see the [Managing message deduplication](cookbooks-deduplication.md) cookbook for a guide). + +### Deduplication and effectively-once semantics + +Message deduplication makes Pulsar an ideal messaging system to be used in conjunction with stream processing engines (SPEs) and other systems seeking to provide [effectively-once](https://streaml.io/blog/exactly-once) processing semantics. Messaging systems that don't offer automatic message deduplication require the SPE or other system to guarantee deduplication, which means that strict message ordering comes at the cost of burdening the application with the responsibility of deduplication. With Pulsar, strict ordering guarantees come at no application-level cost. + +> More in-depth information can be found in [this post](https://streaml.io/blog/pulsar-effectively-once/) on the [Streamlio blog](https://streaml.io/blog) + + diff --git a/site2/website/versioned_docs/version-2.4.2/concepts-tiered-storage.md b/site2/website/versioned_docs/version-2.4.2/concepts-tiered-storage.md new file mode 100644 index 0000000000000..5851f7d7fd910 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/concepts-tiered-storage.md @@ -0,0 +1,18 @@ +--- +id: version-2.4.2-concepts-tiered-storage +title: Tiered Storage +sidebar_label: Tiered Storage +original_id: concepts-tiered-storage +--- + +Pulsar's segment oriented architecture allows for topic backlogs to grow very large, effectively without limit. However, this can become expensive over time. + +One way to alleviate this cost is to use Tiered Storage. 
With tiered storage, older messages in the backlog can be moved from BookKeeper to a cheaper storage mechanism, while still allowing clients to access the backlog as if nothing had changed. + +![Tiered Storage](assets/pulsar-tiered-storage.png) + +> Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. + +Pulsar currently supports S3 and Google Cloud Storage (GCS) for [long term store](https://pulsar.apache.org/docs/en/cookbooks-tiered-storage/). Offloading to long term storage is triggered via a REST API or command line interface. The user passes in the amount of topic data they wish to retain on BookKeeper, and the broker will copy the backlog data to long term storage. The original data will then be deleted from BookKeeper after a configured delay (4 hours by default). + +> For a guide for setting up tiered storage, see the [Tiered storage cookbook](cookbooks-tiered-storage.md).
diff --git a/site2/website/versioned_docs/version-2.4.2/cookbooks-compaction.md b/site2/website/versioned_docs/version-2.4.2/cookbooks-compaction.md new file mode 100644 index 0000000000000..78bd1da2681cf --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/cookbooks-compaction.md @@ -0,0 +1,127 @@ +--- +id: version-2.4.2-cookbooks-compaction +title: Topic compaction +sidebar_label: Topic compaction +original_id: cookbooks-compaction +--- + +Pulsar's [topic compaction](concepts-topic-compaction.md#compaction) feature enables you to create **compacted** topics in which older, "obscured" entries are pruned from the topic, allowing for faster reads through the topic's history (which messages are deemed obscured/outdated/irrelevant will depend on your use case). + +To use compaction: + +* You need to give messages keys, as topic compaction in Pulsar takes place on a *per-key basis* (i.e. messages are compacted based on their key). For a stock ticker use case, the stock symbol---e.g. `AAPL` or `GOOG`---could serve as the key (more on this [below](#when)). Messages without keys will be left alone by the compaction process. +* Compaction can be configured to run [automatically](#automatic), or you can manually [trigger](#trigger) compaction using the Pulsar administrative API. +* Your consumers must be [configured](#config) to read from compacted topics ([Java consumers](#java), for example, have a `readCompacted` setting that must be set to `true`). If this configuration is not set, consumers will still be able to read from the non-compacted topic. + + +> Compaction only works on messages that have keys (as in the stock ticker example the stock symbol serves as the key for each message). Keys can thus be thought of as the axis along which compaction is applied. Messages that don't have keys are simply ignored by compaction. + +## When should I use compacted topics? 
{#when} + +The classic example of a topic that could benefit from compaction would be a stock ticker topic through which consumers can access up-to-date values for specific stocks. Imagine a scenario in which messages carrying stock value data use the stock symbol as the key (`GOOG`, `AAPL`, `TWTR`, etc.). Compacting this topic would give consumers on the topic two options: + +* They can read from the "original," non-compacted topic in case they need access to "historical" values, i.e. the entirety of the topic's messages. +* They can read from the compacted topic if they only want to see the most up-to-date messages. + +Thus, if you're using a Pulsar topic called `stock-values`, some consumers could have access to all messages in the topic (perhaps because they're performing some kind of number crunching of all values in the last hour) while the consumers used to power the real-time stock ticker only see the compacted topic (and thus aren't forced to process outdated messages). Which variant of the topic any given consumer pulls messages from is determined by the consumer's [configuration](#config). + +> One of the benefits of compaction in Pulsar is that you aren't forced to choose between compacted and non-compacted topics, as the compaction process leaves the original topic as-is and essentially adds an alternate topic. In other words, you can run compaction on a topic and consumers that need access to the non-compacted version of the topic will not be adversely affected. + + +## Configuring compaction to run automatically {#automatic} + +Tenant administrators can configure a policy for compaction at the namespace level. The policy specifies how large the topic backlog can grow before compaction is triggered. 
+ +For example, to trigger compaction when the backlog reaches 100MB: + +```bash +$ bin/pulsar-admin namespaces set-compaction-threshold \ + --threshold 100M my-tenant/my-namespace +``` + +Configuring the compaction threshold on a namespace will apply to all topics within that namespace. + +## Triggering compaction manually {#trigger} + +In order to run compaction on a topic, you need to use the [`topics compact`](reference-pulsar-admin.md#topics-compact) command for the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool. Here's an example: + +```bash +$ bin/pulsar-admin topics compact \ + persistent://my-tenant/my-namespace/my-topic +``` + +The `pulsar-admin` tool runs compaction via the Pulsar {@inject: rest:REST:/} API. To run compaction in its own dedicated process, i.e. *not* through the REST API, you can use the [`pulsar compact-topic`](reference-cli-tools.md#pulsar-compact-topic) command. Here's an example: + +```bash +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic +``` + +> Running compaction in its own process is recommended when you want to avoid interfering with the broker's performance. Broker performance should only be affected, however, when running compaction on topics with a large keyspace (i.e. when there are many keys on the topic). The first phase of the compaction process keeps a copy of each key in the topic, which can create memory pressure as the number of keys grows. Using the `pulsar-admin topics compact` command to run compaction through the REST API should present no issues in the overwhelming majority of cases; using `pulsar compact-topic` should correspondingly be considered an edge case. + +The `pulsar compact-topic` command communicates with [ZooKeeper](https://zookeeper.apache.org) directly. In order to establish communication with ZooKeeper, though, the `pulsar` CLI tool will need to have a valid [broker configuration](reference-configuration.md#broker).
You can either supply a proper configuration in `conf/broker.conf` or specify a non-default location for the configuration: + +```bash +$ bin/pulsar compact-topic \ + --broker-conf /path/to/broker.conf \ + --topic persistent://my-tenant/my-namespace/my-topic + +# If the configuration is in conf/broker.conf +$ bin/pulsar compact-topic \ + --topic persistent://my-tenant/my-namespace/my-topic +``` + +#### When should I trigger compaction? + +How often you [trigger compaction](#trigger) will vary widely based on the use case. If you want a compacted topic to be extremely speedy on read, then you should run compaction fairly frequently. + +## Consumer configuration + +Pulsar consumers and readers need to be configured to read from compacted topics. The sections below show you how to enable compacted topic reads for Pulsar's language clients. + +### Java + +In order to read from a compacted topic using a Java consumer, the `readCompacted` parameter must be set to `true`. Here's an example consumer for a compacted topic: + +```java +Consumer compactedTopicConsumer = client.newConsumer() + .topic("some-compacted-topic") + .readCompacted(true) + .subscribe(); +``` + +As mentioned above, topic compaction in Pulsar works on a *per-key basis*. That means that messages that you produce on compacted topics need to have keys (the content of the key will depend on your use case). Messages that don't have keys will be ignored by the compaction process.
Here's an example Pulsar message with a key: + +```java +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); +``` + +The example below shows a message with a key being produced on a compacted Pulsar topic: + +```java +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageBuilder; +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer compactedTopicProducer = client.newProducer() + .topic("some-compacted-topic") + .create(); + +Message msg = MessageBuilder.create() + .setContent(someByteArray) + .setKey("some-key") + .build(); + +compactedTopicProducer.send(msg); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/develop-bare-metal.md b/site2/website/versioned_docs/version-2.4.2/develop-bare-metal.md new file mode 100644 index 0000000000000..42125b05aba1d --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/develop-bare-metal.md @@ -0,0 +1,446 @@ +--- +id: version-2.4.2-deploy-bare-metal +title: Deploying a cluster on bare metal +sidebar_label: Bare metal +original_id: deploy-bare-metal +--- + + +> ### Tips +> +> 1. Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you're interested in experimenting with +> Pulsar or using it in a startup or on a single team, we recommend opting for a single cluster. If you do need to run a multi-cluster Pulsar instance, +> however, see the guide [here](deploy-bare-metal-multi-cluster.md). +> +> 2. 
If you want to use all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you need to download `apache-pulsar-io-connectors` +> package and make sure it is installed under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you +> have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +> +> 3. If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders` +> package and make sure it is installed under `offloaders` directory in the pulsar directory on every broker node. For more details of how to configure +> this feature, you can refer to this [Tiered storage cookbook](cookbooks-tiered-storage.md). + +Deploying a Pulsar cluster involves doing the following (in order): + +* Deploying a [ZooKeeper](#deploying-a-zookeeper-cluster) cluster (optional) +* Initializing [cluster metadata](#initializing-cluster-metadata) +* Deploying a [BookKeeper](#deploying-a-bookkeeper-cluster) cluster +* Deploying one or more Pulsar [brokers](#deploying-pulsar-brokers) + +## Preparation + +### Requirements + +> If you already have an existing zookeeper cluster and would like to reuse it, you don't need to prepare the machines +> for running ZooKeeper. + +To run Pulsar on bare metal, you are recommended to have: + +* At least 6 Linux machines or VMs + * 3 running [ZooKeeper](https://zookeeper.apache.org) + * 3 running a Pulsar broker, and a [BookKeeper](https://bookkeeper.apache.org) bookie +* A single [DNS](https://en.wikipedia.org/wiki/Domain_Name_System) name covering all of the Pulsar broker hosts + +> However if you don't have enough machines, or are trying out Pulsar in cluster mode (and expand the cluster later), +> you can even deploy Pulsar in one node, where it will run zookeeper, bookie and broker on the same machine.
+ +> If you don't have a DNS server, you can use multi-host in service URL instead. + +Each machine in your cluster will need to have [Java 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) or higher installed. + +Here's a diagram showing the basic setup: + +![alt-text](assets/pulsar-basic-setup.png) + +In this diagram, connecting clients need to be able to communicate with the Pulsar cluster using a single URL, in this case `pulsar-cluster.acme.com`, that abstracts over all of the message-handling brokers. Pulsar message brokers run on machines alongside BookKeeper bookies; brokers and bookies, in turn, rely on ZooKeeper. + +### Hardware considerations + +When deploying a Pulsar cluster, we have some basic recommendations that you should keep in mind when capacity planning. + +#### ZooKeeper + +For machines running ZooKeeper, we recommend using lighter-weight machines or VMs. Pulsar uses ZooKeeper only for periodic coordination- and configuration-related tasks, *not* for basic operations. If you're running Pulsar on [Amazon Web Services](https://aws.amazon.com/) (AWS), for example, a [t2.small](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/t2-instances.html) instance would likely suffice. + +#### Bookies & Brokers + +For machines running a bookie and a Pulsar broker, we recommend using more powerful machines. For an AWS deployment, for example, [i3.4xlarge](https://aws.amazon.com/blogs/aws/now-available-i3-instances-for-demanding-io-intensive-applications/) instances may be appropriate. 
On those machines we also recommend: + +* Fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) (for Pulsar brokers) +* Small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache (for BookKeeper bookies) + +## Installing the Pulsar binary package + +> You'll need to install the Pulsar binary package on *each machine in the cluster*, including machines running [ZooKeeper](#deploying-a-zookeeper-cluster) and [BookKeeper](#deploying-a-bookkeeper-cluster). + +To get started deploying a Pulsar cluster on bare metal, you'll need to download a binary tarball release in one of the following ways: + +* By clicking on the link directly below, which will automatically trigger a download: + * Pulsar {{pulsar:version}} binary release +* From the Pulsar [downloads page](pulsar:download_page_url) +* From the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) on [GitHub](https://github.com) +* Using [wget](https://www.gnu.org/software/wget): + +```bash +$ wget pulsar:binary_release_url +``` + +Once you've downloaded the tarball, untar it and `cd` into the resulting directory: + +```bash +$ tar xvzf apache-pulsar-{{pulsar:version}}-bin.tar.gz +$ cd apache-pulsar-{{pulsar:version}} +``` + +The untarred directory contains the following subdirectories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's [command-line tools](reference-cli-tools.md), such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`data` | The data storage directory used by ZooKeeper and BookKeeper. 
+`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`logs` | Logs created by the installation. + +## Installing Builtin Connectors (optional) + +> Since release `2.1.0-incubating`, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +> If you would like to enable those `builtin` connectors, you can follow the instructions as below; otherwise you can +> skip this section for now. + +To get started using builtin connectors, you'll need to download the connectors tarball release on every broker node in +one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar IO Connectors {{pulsar:version}} release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:connector_release_url/{connector}-{{pulsar:version}}.nar + ``` + +Once the nar file is downloaded, copy the file to directory `connectors` in the pulsar directory, +for example, if the connector file `pulsar-io-aerospike-{{pulsar:version}}.nar` is downloaded: + +```bash +$ mkdir connectors +$ mv pulsar-io-aerospike-{{pulsar:version}}.nar connectors + +$ ls connectors +pulsar-io-aerospike-{{pulsar:version}}.nar +... +``` + +## Installing Tiered Storage Offloaders (optional) + +> Since release `2.2.0`, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> If you would like to enable tiered storage feature, you can follow the instructions as below; otherwise you can +> skip this section for now. 
+ +To get started using tiered storage offloaders, you'll need to download the offloaders tarball release on every broker node in +one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar Tiered Storage Offloaders {{pulsar:version}} release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:offloader_release_url + ``` + +Once the tarball is downloaded, in the pulsar directory, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash +$ tar xvfz apache-pulsar-offloaders-{{pulsar:version}}-bin.tar.gz + +# you will find a directory named `apache-pulsar-offloaders-{{pulsar:version}}` in the pulsar directory +# then copy the offloaders + +$ mv apache-pulsar-offloaders-{{pulsar:version}}/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-{{pulsar:version}}.nar +``` + +For more details on how to configure the tiered storage feature, see the [Tiered storage cookbook](cookbooks-tiered-storage.md). + + +## Deploying a ZooKeeper cluster + +> If you already have an existing zookeeper cluster and would like to use it, you can skip this section. + +[ZooKeeper](https://zookeeper.apache.org) manages a variety of essential coordination- and configuration-related tasks for Pulsar. To deploy a Pulsar cluster you'll need to deploy ZooKeeper first (before all other components). We recommend deploying a 3-node ZooKeeper cluster. Pulsar does not make heavy use of ZooKeeper, so more lightweight machines or VMs should suffice for running ZooKeeper. + +To begin, add all ZooKeeper servers to the configuration specified in [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) (in the Pulsar directory you created [above](#installing-the-pulsar-binary-package)). 
Here's an example: + +```properties +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 +``` + +> If you have only one machine to deploy Pulsar, you just need to add one server entry in the configuration file. + +On each host, you need to specify the ID of the node in each node's `myid` file, which is in each server's `data/zookeeper` folder by default (this can be changed via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed info on `myid` and more. + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this: + +```bash +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid +``` + +On `zk2.us-west.example.com` the command would be `echo 2 > data/zookeeper/myid` and so on. + +Once each server has been added to the `zookeeper.conf` configuration and has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start zookeeper +``` + +> If you are planning to deploy zookeeper with bookie on the same node, you +> need to start zookeeper by using different stats port. + +Start zookeeper with [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool like: + +```bash +$ PULSAR_EXTRA_OPTS="-Dstats_server_port=8001" bin/pulsar-daemon start zookeeper +``` + +## Initializing cluster metadata + +Once you've deployed ZooKeeper for your cluster, there is some metadata that needs to be written to ZooKeeper for each cluster in your instance. It only needs to be written **once**. 
+ +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. This command can be run on any machine in your ZooKeeper cluster. Here's an example: + +```shell +$ bin/pulsar initialize-cluster-metadata \ + --cluster pulsar-cluster-1 \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2181 \ + --web-service-url http://pulsar.us-west.example.com:8080 \ + --web-service-url-tls https://pulsar.us-west.example.com:8443 \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650 \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +As you can see from the example above, the following needs to be specified: + +Flag | Description +:----|:----------- +`--cluster` | A name for the cluster +`--zookeeper` | A "local" ZooKeeper connection string for the cluster. This connection string only needs to include *one* machine in the ZooKeeper cluster. +`--configuration-store` | The configuration store connection string for the entire instance. As with the `--zookeeper` flag, this connection string only needs to include *one* machine in the ZooKeeper cluster. +`--web-service-url` | The web service URL for the cluster, plus a port. This URL should be a standard DNS name. The default port is 8080 (we don't recommend using a different port). +`--web-service-url-tls` | If you're using [TLS](security-tls-transport.md), you'll also need to specify a TLS web service URL for the cluster. The default port is 8443 (we don't recommend using a different port). +`--broker-service-url` | A broker service URL enabling interaction with the brokers in the cluster. This URL should use the same DNS name as the web service URL but should use the `pulsar` scheme instead. The default port is 6650 (we don't recommend using a different port). 
+`--broker-service-url-tls` | If you're using [TLS](security-tls-transport.md), you'll also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. The default port is 6651 (we don't recommend using a different port). + +## Deploying a BookKeeper cluster + +[BookKeeper](https://bookkeeper.apache.org) handles all persistent data storage in Pulsar. You will need to deploy a cluster of BookKeeper bookies to use Pulsar. We recommend running a **3-bookie BookKeeper cluster**. + +BookKeeper bookies can be configured using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important step in configuring bookies for our purposes here is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) is set to the connection string for the ZooKeeper cluster. Here's an example: + +```properties +zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +``` + +Once you've appropriately modified the `zkServers` parameter, you can provide any other configuration modifications you need. You can find a full listing of the available BookKeeper configuration parameters [here](reference-configuration.md#bookkeeper), although we would recommend consulting the [BookKeeper documentation](http://bookkeeper.apache.org/docs/latest/reference/config/) for a more in-depth guide. + +> ##### NOTES +> +> Since Pulsar 2.1.0 release, Pulsar introduces [stateful function](functions-develop.md#state-storage) for Pulsar Functions. If you would like to enable that feature, +> you need to enable table service on BookKeeper by setting following setting in `conf/bookkeeper.conf` file. +> +> ```conf +> extraServerComponents=org.apache.bookkeeper.stream.server.StreamStorageLifecycleComponent +> ``` + +Once you've applied the desired configuration in `conf/bookkeeper.conf`, you can start up a bookie on each of your BookKeeper hosts. 
You can start up each bookie either in the background, using [nohup](https://en.wikipedia.org/wiki/Nohup), or in the foreground. + +To start the bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start bookie +``` + +To start the bookie in the foreground: + +```bash +$ bin/bookkeeper bookie +``` + +You can verify that a bookie is working properly by running the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#shell) on it: + +```bash +$ bin/bookkeeper shell bookiesanity +``` + +This will create an ephemeral BookKeeper ledger on the local bookie, write a few entries, read them back, and finally delete the ledger. + +After you have started all the bookies, you can use the `simpletest` command for the [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to +verify that all the bookies in the cluster are up and running. + +```bash +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> +``` + +This command will create a `num-bookies` sized ledger on the cluster, write a few entries, and finally delete the ledger. + + +## Deploying Pulsar brokers + +Pulsar brokers are the last thing you need to deploy in your Pulsar cluster. Brokers handle Pulsar messages and provide Pulsar's administrative interface. We recommend running **3 brokers**, one for each machine that's already running a BookKeeper bookie. + +### Configuring Brokers + +The most important element of broker configuration is ensuring that each broker is aware of the ZooKeeper cluster that you've deployed. Make sure that the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) and [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameters are set correctly. In this case, since we only have 1 cluster and no configuration store setup, the `configurationStoreServers` will point to the same `zookeeperServers`. 
+ +```properties +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +configurationStoreServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +``` + +You also need to specify the cluster name (matching the name that you provided when [initializing the cluster's metadata](#initializing-cluster-metadata)): + +```properties +clusterName=pulsar-cluster-1 +``` + +In addition, you need to match the broker and web service ports provided when initializing the cluster's metadata (especially when using a different port from default): + +```properties +brokerServicePort=6650 +brokerServicePortTls=6651 +webServicePort=8080 +webServicePortTls=8443 +``` + +> If you deploy Pulsar in a one-node cluster, you should update the replication settings in `conf/broker.conf` to `1` +> +> ```properties +> # Number of bookies to use when creating a ledger +> managedLedgerDefaultEnsembleSize=1 +> +> # Number of copies to store for each message +> managedLedgerDefaultWriteQuorum=1 +> +> # Number of guaranteed copies (acks to wait before write is complete) +> managedLedgerDefaultAckQuorum=1 +> ``` + +### Enabling Pulsar Functions (optional) + +If you want to enable [Pulsar Functions](functions-overview.md), you can follow the instructions as below: + +1. Edit `conf/broker.conf` to enable functions worker, by setting `functionsWorkerEnabled` to `true`. + + ```conf + functionsWorkerEnabled=true + ``` + +2. Edit `conf/functions_worker.yml` and set `pulsarFunctionsCluster` to the cluster name that you provided when [initializing the cluster's metadata](#initializing-cluster-metadata). + + ```conf + pulsarFunctionsCluster: pulsar-cluster-1 + ``` + +If you would like to learn more options about deploying functions worker, please checkout [Deploy and manage functions worker](functions-worker.md). 
+ +### Starting Brokers + +You can then provide any other configuration changes that you'd like in the [`conf/broker.conf`](reference-configuration.md#broker) file. Once you've decided on a configuration, you can start up the brokers for your Pulsar cluster. Like ZooKeeper and BookKeeper, brokers can be started either in the foreground or in the background, using nohup. + +You can start a broker in the foreground using the [`pulsar broker`](reference-cli-tools.md#pulsar-broker) command: + +```bash +$ bin/pulsar broker +``` + +You can start a broker in the background using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start broker +``` + +Once you've successfully started up all the brokers you intend to use, your Pulsar cluster should be ready to go! + +## Connecting to the running cluster + +Once your Pulsar cluster is up and running, you should be able to connect with it using Pulsar clients. One such client is the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool, which is included with the Pulsar binary package. The `pulsar-client` tool can publish messages to and consume messages from Pulsar topics and thus provides a simple way to make sure that your cluster is running properly. + +To use the `pulsar-client` tool, first modify the client configuration file in [`conf/client.conf`](reference-configuration.md#client) in your binary package. You'll need to change the values for `webServiceUrl` and `brokerServiceUrl`, substituting `localhost` (which is the default), with the DNS name that you've assigned to your broker/bookie hosts. 
Here's an example: + +```properties +webServiceUrl=http://us-west.example.com:8080/ +brokerServiceUrl=pulsar://us-west.example.com:6650/ +``` + +Once you've done that, you can publish a message to a Pulsar topic: + +```bash +$ bin/pulsar-client produce \ + persistent://public/default/test \ + -n 1 \ + -m "Hello Pulsar" +``` + +> You may need to use a different cluster name in the topic if you specified a cluster name different from `pulsar-cluster-1`. + +This will publish a single message to the Pulsar topic. In addition, you can subscribe to the Pulsar topic in a different terminal before publishing messages as below: + +```bash +$ bin/pulsar-client consume \ + persistent://public/default/test \ + -n 100 \ + -s "consumer-test" \ + -t "Exclusive" +``` + +Once the message above has been successfully published to the topic, you should see it in the standard output: + +```bash +----- got message ----- +Hello Pulsar +``` + +## Running Functions + +> If you have [enabled](#enabling-pulsar-functions-optional) Pulsar Functions, you can also try out Pulsar Functions now. + +Create an ExclamationFunction `exclamation`. + +```bash +bin/pulsar-admin functions create \ + --jar examples/api-examples.jar \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --tenant public \ + --namespace default \ + --name exclamation +``` + +Check if the function is running as expected by [triggering](functions-deploying.md#triggering-pulsar-functions) the function. + +```bash +bin/pulsar-admin functions trigger --name exclamation --trigger-value "hello world" +``` + +You will see output as below: + +```shell +hello world! 
+``` diff --git a/site2/website/versioned_docs/version-2.4.2/functions-cli.md b/site2/website/versioned_docs/version-2.4.2/functions-cli.md new file mode 100644 index 0000000000000..5fcc092b1d90a --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-cli.md @@ -0,0 +1,196 @@ +--- +id: version-2.4.2-functions-cli +title: Pulsar Functions command line tool +sidebar_label: Reference: CLI +original_id: functions-cli +--- + +The following tables list Pulsar Functions command-line tools. You can learn Pulsar Functions modes, commands, and parameters. + +## localrun + +Run Pulsar Functions locally, rather than deploying it to the Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | false | +broker-service-url | The URL for the Pulsar broker. | | +classname | The class name of a Pulsar Function.| | +client-auth-params | Client authentication parameter. | | +client-auth-plugin | Client authentication plugin using which function-process can connect to broker. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). | | +hostname-verification-enabled | Enable hostname verification. 
| false +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package. | | +instance-id-offset | Start the instanceIds from this offset. | 0 +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. 
| | +tls-allow-insecure | Allow insecure tls connection. | false +tls-trust-cert-path | tls trust cert file path. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +use-tls | Use tls connection. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + + +## create + +Create and deploy a Pulsar Function in cluster mode. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | false | +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). 
It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). 
| | +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## delete + +Delete a Pulsar Function that is running on a Pulsar cluster. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## update + +Update a Pulsar Function that has been deployed to a Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | false +classname | The class name of a Pulsar Function. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime). | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package. 
| | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +update-auth-data | Whether or not to update the auth data. | false +user-config | User-defined config key/values. 
| | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## get + +Fetch information about a Pulsar Function. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## restart + +Restart function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (restart all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## stop + +Stops function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (stop all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## start + +Starts a stopped function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (start all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. 
| | \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/functions-debug.md b/site2/website/versioned_docs/version-2.4.2/functions-debug.md new file mode 100644 index 0000000000000..26f3f775bb12e --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-debug.md @@ -0,0 +1,448 @@ +--- +id: version-2.4.2-functions-debug +title: Debug Pulsar Functions +sidebar_label: How-to: Debug +original_id: functions-debug +--- + +You can use the following methods to debug Pulsar Functions: + +* [Use unit test](functions-debug.md#use-unit-test) +* [Debug with localrun mode](functions-debug.md#debug-with-localrun-mode) +* [Use log topic](functions-debug.md#use-log-topic) +* [Use Functions CLI](functions-debug.md#use-functions-cli) + +## Use unit test + +A Pulsar Function is a function with inputs and outputs, you can test a Pulsar Function in a similar way as you test any function. + +For example, if you have the following Pulsar Function: + +```java +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} +``` + +You can write a simple unit test to test Pulsar Function. + +```java +@Test +public void testJavaNativeExclamationFunction() { + JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); + String output = exclamation.apply("foo"); + Assert.assertEquals(output, "foo!"); +} +``` + +The following Pulsar Function implements the `org.apache.pulsar.functions.api.Function` interface. + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} +``` + +In this situation, you can write a unit test for this function as well. 
Remember to mock the `Context` parameter. The following is an example. + +```java +@Test +public void testExclamationFunction() { + ExclamationFunction exclamation = new ExclamationFunction(); + String output = exclamation.process("foo", mock(Context.class)); + Assert.assertEquals(output, "foo!"); +} +``` + +## Debug with localrun mode +When you run a Pulsar Function in localrun mode, it launches an instance of the Function on your local machine as a thread. + +In this mode, a Pulsar Function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. + +> Note +> Currently, debugging with localrun mode is only supported by Pulsar Functions written in Java. You need Pulsar version 2.4.0 or later to do the following. Even though localrun is available in versions earlier than Pulsar 2.4.0, you cannot debug with localrun mode programmatically or run Functions as threads. + +You can launch your function in the following manner. + +```java +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setName(functionName); +functionConfig.setInputs(Collections.singleton(sourceTopic)); +functionConfig.setClassName(ExclamationFunction.class.getName()); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setOutput(sinkTopic); + +LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); +localRunner.start(true); +``` + +So you can debug functions using an IDE easily. Set breakpoints and manually step through a function to debug with real data. + +The following example illustrates how to programmatically launch a function in localrun mode. 
+ +```java +public class ExclamationFunction implements Function { + + @Override + public String process(String s, Context context) throws Exception { + return s + "!"; + } + +public static void main(String[] args) throws Exception { + FunctionConfig functionConfig = new FunctionConfig(); + functionConfig.setName("exclamation"); + functionConfig.setInputs(Collections.singleton("input")); + functionConfig.setClassName(ExclamationFunction.class.getName()); + functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); + functionConfig.setOutput("output"); + + LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); + localRunner.start(false); +} +``` + +To use localrun mode programmatically, add the following dependency. + +```xml + + org.apache.pulsar + pulsar-functions-local-runner + ${pulsar.version} + + +``` + +For complete code samples, see [here](https://github.com/jerrypeng/pulsar-functions-demos/tree/master/debugging). + +> Note +> Debugging with localrun mode for Pulsar Functions written in other languages will be supported soon. + +## Use log topic + +In Pulsar Functions, you can generate log information defined in functions to a specified log topic. You can configure consumers to consume messages from a specified log topic to check the log information. 
+ +![Pulsar Functions core programming model](assets/pulsar-functions-overview.png) + +**Example** + +```text +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} +``` + +As shown in the example above, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired log information in a function using the `LOG` variable. Meanwhile, you need to specify the topic to which the log information is produced. + +**Example** + +``` +$ bin/pulsar-admin functions create \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs +``` + +## Use Functions CLI + +With [Pulsar Functions CLI](reference-pulsar-admin.md#functions), you can debug Pulsar Functions with the following subcommands: + +* `get` +* `status` +* `stats` +* `list` +* `trigger` + +> **Tip** +> +> For complete commands of **Pulsar Functions CLI**, see [here](reference-pulsar-admin.md#functions). + +### `get` + +Get information about a Pulsar Function. + +**Usage** + +```text +$ pulsar-admin functions get options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +> **Tip** +> +> `--fqfn` consists of `--name`, `--namespace` and `--tenant`, so you can specify either `--fqfn` or `--name`, `--namespace` and `--tenant`. 
+ +**Example** + +You can specify `--fqfn` to get information about a Pulsar Function. + +```text +$ ./bin/pulsar-admin functions get public/default/ExclamationFunctio6 +``` +Optionally, you can specify `--name`, `--namespace` and `--tenant` to get information about a Pulsar Function. + +```text +$ ./bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 +``` + +As shown below, the `get` command shows input, output, runtime, and other information about the _ExclamationFunctio6_ function. + +```text +{ + "tenant": "public", + "namespace": "default", + "name": "ExclamationFunctio6", + "className": "org.example.test.ExclamationFunction", + "inputSpecs": { + "persistent://public/default/my-topic-1": { + "isRegexPattern": false + } + }, + "output": "persistent://public/default/test-1", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": {}, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1 +} +``` + +### `status` + +Check the current status of a Pulsar Function. + +**Usage** + +```text +$ pulsar-admin functions status options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function
If `--instance-id` is not specified, the status of all instances is returned.
+|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```text +$ ./bin/pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ +``` + +As shown below, the `status` command shows the number of instances, running instances, the instance running under the _ExclamationFunctio6_ function, received messages, successfully processed messages, system exceptions, the average latency and so on. + +```text +{ + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 1, + "numSuccessfullyProcessed" : 1, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.8385, + "lastInvocationTime" : 1557734137987, + "workerId" : "c-standalone-fw-23ccc88ef29b-8080" + } + } ] +} +``` + +### `stats` + +Get the current stats of a Pulsar Function. + +**Usage** + +```text +$ pulsar-admin functions stats options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function.
If `--instance-id` is not specified, the stats of all instances are returned.
+|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```text +$ ./bin/pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ +``` + +The output is shown as follows: + +```text +{ + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "instances" : [ { + "instanceId" : 0, + "metrics" : { + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "userMetrics" : { } + } + } ] +} +``` + +### `list` + +List all Pulsar Functions running under a specific tenant and namespace. + +**Usage** + +```text +$ pulsar-admin functions list options +``` + +**Options** + +|Flag|Description +|---|--- +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```text +$ ./bin/pulsar-admin functions list \ + --tenant public \ + --namespace default +``` +As shown below, the `list` command returns three functions running under the _public_ tenant and the _default_ namespace. + +```text +ExclamationFunctio1 +ExclamationFunctio2 +ExclamationFunctio3 +``` + +### `trigger` + +Trigger a specified Pulsar Function with a supplied value. This command simulates the execution process of a Pulsar Function and verifies it. 
+ +**Usage** + +```text +$ pulsar-admin functions trigger options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. +|`--topic`|The topic name that a Pulsar Function consumes from. +|`--trigger-file`|The path to a file that contains the data to trigger a Pulsar Function. +|`--trigger-value`|The value to trigger a Pulsar Function. + +**Example** + +```text +$ ./bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + --topic persistent://public/default/my-topic-1 \ + --trigger-value "hello pulsar functions" +``` + +As shown below, the `trigger` command returns the following result: + +```text +This is my function! +``` + +> #### **Note** +> You must specify the [entire topic name](getting-started-pulsar.md#topic-names) when using the `--topic` option. Otherwise, the following error occurs. +> +>```text +>Function in trigger function has unidentified topic +> +>Reason: Function in trigger function has unidentified topic +>``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/functions-deploy.md b/site2/website/versioned_docs/version-2.4.2/functions-deploy.md new file mode 100644 index 0000000000000..c373e360dbdd1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-deploy.md @@ -0,0 +1,211 @@ +--- +id: version-2.4.2-functions-deploy +title: Deploy Pulsar Functions +sidebar_label: How-to: Deploy +original_id: functions-deploy +--- + +## Requirements + +To deploy and manage Pulsar Functions, you need to have a Pulsar cluster running. There are several options for this: + +* You can run a [standalone cluster](getting-started-standalone.md) locally on your own machine. 
+* You can deploy a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal.md), [DC/OS](deploy-dcos.md), and more. + +If you run a non-[standalone](reference-terminology.md#standalone) cluster, you need to obtain the service URL for the cluster. How you obtain the service URL depends on how you deploy your Pulsar cluster. + +If you want to deploy and trigger Python user-defined functions, you need to install [the pulsar python client](http://pulsar.apache.org/docs/en/client-libraries-python/) on all the machines running [functions workers](functions-worker.md). + +## Command-line interface + +Pulsar Functions are deployed and managed using the [`pulsar-admin functions`](reference-pulsar-admin.md#functions) interface, which contains commands such as [`create`](reference-pulsar-admin.md#functions-create) for deploying functions in [cluster mode](#cluster-mode), [`trigger`](reference-pulsar-admin.md#trigger) for [triggering](#triggering-pulsar-functions) functions, [`list`](reference-pulsar-admin.md#list-2) for listing deployed functions. + +To learn more commands, refer to [`pulsar-admin functions`](reference-pulsar-admin.md#functions). + +### Default arguments + +When managing Pulsar Functions, you need to specify a variety of information about functions, including tenant, namespace, input and output topics, and so on. However, some parameters have default values if you do not specify values for them. The following table lists the default values. + +Parameter | Default +:---------|:------- +Function name | You can specify any value for the class name (except org, library, or similar class names). For example, when you specify the flag `--classname org.example.MyFunction`, the function name is `MyFunction`. +Tenant | Derived from names of the input topics. 
If the input topics are under the `marketing` tenant, which means the topic names have the form `persistent://marketing/{namespace}/{topicName}`, the tenant is `marketing`. +Namespace | Derived from names of the input topics. If the input topics are under the `asia` namespace under the `marketing` tenant, which means the topic names have the form `persistent://marketing/asia/{topicName}`, then the namespace is `asia`. +Output topic | `{input topic}-{function name}-output`. For example, if an input topic name of a function is `incoming`, and the function name is `exclamation`, then the name of the output topic is `incoming-exclamation-output`. +Subscription type | For `at-least-once` and `at-most-once` [processing guarantees](functions-overview.md#processing-guarantees), the [`SHARED`](concepts-messaging.md#shared) mode is applied by default; for `effectively-once` guarantees, the [`FAILOVER`](concepts-messaging.md#failover) mode is applied. +Processing guarantees | [`ATLEAST_ONCE`](functions-overview.md#processing-guarantees) +Pulsar service URL | `pulsar://localhost:6650` + +### Example of default arguments + +Take the `create` command as an example. + +```bash +$ bin/pulsar-admin functions create \ + --jar my-pulsar-functions.jar \ + --classname org.example.MyFunction \ + --inputs my-function-input-topic1,my-function-input-topic2 +``` + +The function has default values for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`). + +## Local run mode + +If you run a Pulsar Function in **local run** mode, it runs on the machine from which you enter the commands (on your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, and so on). The following is a [`localrun`](reference-pulsar-admin.md#localrun) command example. 
+ +```bash +$ bin/pulsar-admin functions localrun \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 +``` + +By default, the function connects to a Pulsar cluster running on the same machine, via a local [broker](reference-terminology.md#broker) service URL of `pulsar://localhost:6650`. If you use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--broker-service-url` flag. The following is an example. + +```bash +$ bin/pulsar-admin functions localrun \ + --broker-service-url pulsar://my-cluster-host:6650 \ + # Other function parameters +``` + +## Cluster mode + +When you run a Pulsar Function in **cluster** mode, the function code is uploaded to a Pulsar broker and runs *alongside the broker* rather than in your [local environment](#local-run-mode). You can run a function in cluster mode using the [`create`](reference-pulsar-admin.md#create-1) command. + +```bash +$ bin/pulsar-admin functions create \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 +``` + +### Update functions in cluster mode + +You can use the [`update`](reference-pulsar-admin.md#update-1) command to update a Pulsar Function running in cluster mode. The following command updates the function created in the [cluster mode](#cluster-mode) section. + +```bash +$ bin/pulsar-admin functions update \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/new-input-topic \ + --output persistent://public/default/new-output-topic +``` + +### Parallelism + +Pulsar Functions run as processes or threads, which are called **instances**. When you run a Pulsar Function, it runs as a single instance by default. With one localrun command, you can only run a single instance of a function. 
If you want to run multiple instances, you can use localrun command multiple times. + +When you create a function, you can specify the *parallelism* of a function (the number of instances to run). You can set the parallelism factor using the `--parallelism` flag of the [`create`](reference-pulsar-admin.md#functions-create) command. + +```bash +$ bin/pulsar-admin functions create \ + --parallelism 3 \ + # Other function info +``` + +You can adjust the parallelism of an already created function using the [`update`](reference-pulsar-admin.md#update-1) interface. + +```bash +$ bin/pulsar-admin functions update \ + --parallelism 5 \ + # Other function +``` + +If you specify a function configuration via YAML, use the `parallelism` parameter. The following is a config file example. + +```yaml +# function-config.yaml +parallelism: 3 +inputs: +- persistent://public/default/input-1 +output: persistent://public/default/output-1 +# other parameters +``` + +The following is corresponding update command. + +```bash +$ bin/pulsar-admin functions update \ + --function-config-file function-config.yaml +``` + +### Function instance resources + +When you run Pulsar Functions in [cluster mode](#cluster-mode), you can specify the resources that are assigned to each function [instance](#parallelism). + +Resource | Specified as | Runtimes +:--------|:----------------|:-------- +CPU | The number of cores | Kubernetes +RAM | The number of bytes | Process, Docker +Disk space | The number of bytes | Docker + +The following function creation command allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function. + +```bash +$ bin/pulsar-admin functions create \ + --jar target/my-functions.jar \ + --classname org.example.functions.MyFunction \ + --cpu 8 \ + --ram 8589934592 \ + --disk 10737418240 +``` + +> #### Resources are *per instance* +> The resources that you apply to a given Pulsar Function are applied to each instance of the function. 
For example, if you apply 8 GB of RAM to a function with a parallelism of 5, you are applying 40 GB of RAM for the function in total. Make sure that you take the parallelism (the number of instances) factor into your resource calculations. + +## Trigger Pulsar Functions + +If a Pulsar Function is running in [cluster mode](#cluster-mode), you can **trigger** it at any time using the command line. Triggering a function means that you send a message with a specific value to the function and get the function output (if any) via the command line. + +> Triggering a function is to invoke a function by producing a message on one of the input topics. With the [`pulsar-admin functions trigger`](reference-pulsar-admin.md#trigger) command, you can send messages to functions without using the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library. + +To learn how to trigger a function, you can start with a Python function that returns a simple string based on the input. + +```python +# myfunc.py +def process(input): + return "This function has been triggered with a value of {0}".format(input) +``` + +You can run the function in [cluster mode](functions-deploy.md#cluster-mode). + +```bash +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name myfunc \ + --py myfunc.py \ + --classname myfunc \ + --inputs persistent://public/default/in \ + --output persistent://public/default/out +``` + +Then assign a consumer to listen on the output topic for messages from the `myfunc` function with the [`pulsar-client consume`](reference-cli-tools.md#consume) command. + +```bash +$ bin/pulsar-client consume persistent://public/default/out \ + --subscription-name my-subscription \ + --num-messages 0 # Listen indefinitely +``` + +And then you can trigger the function. 
+ +```bash +$ bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name myfunc \ + --trigger-value "hello world" +``` + +The consumer listening on the output topic produces something as follows in the log. + +``` +----- got message ----- +This function has been triggered with a value of hello world +``` + +> #### Topic info is not required +> In the `trigger` command, you only need to specify basic information about the function (tenant, namespace, and name). To trigger the function, you do not need to know the function input topics. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/functions-develop.md b/site2/website/versioned_docs/version-2.4.2/functions-develop.md new file mode 100644 index 0000000000000..1ad091403171d --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-develop.md @@ -0,0 +1,899 @@ +--- +id: version-2.4.2-functions-develop +title: Develop Pulsar Functions +sidebar_label: How-to: Develop +original_id: functions-develop +--- + +This tutorial walks you through how to develop Pulsar Functions. + +## Available APIs +In Java and Python, you have two options to write Pulsar Functions. In Go, you can use Pulsar Functions SDK for Go. + +Interface | Description | Use cases +:---------|:------------|:--------- +Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python). | Functions that do not require access to the function [context](#context). +Pulsar Function SDK for Java/Python/Go | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces. | Functions that require access to the function [context](#context). + +The language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, has no external dependencies. The following example is language-native function. 
+ + + +```Java +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). + + +```python +def process(input): + return "{}!".format(input) +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). + +> Note +> You can write Pulsar Functions in python2 or python3. However, Pulsar only looks for `python` as the interpreter. +> +> If you're running Pulsar Functions on an Ubuntu system that only supports python3, you might fail to start the functions. In this case, you can create a symlink. Your system will fail if you subsequently install any other package that depends on Python 2.x. A solution is under development in [Issue 5518](https://github.com/apache/pulsar/issues/5518). +> +> ```bash +> sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 +> ``` + + + +The following example uses Pulsar Functions SDK. + + +```Java +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). + + +```python +from pulsar import Function + +class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). 
+ + +```Go +package main + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func HandleRequest(ctx context.Context, in []byte) error{ + fmt.Println(string(in) + "!") + return nil +} + +func main() { + pf.Start(HandleRequest) +} +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-function-go/examples/inputFunc.go#L20-L36). + + + +## Schema registry +Pulsar has a built in schema registry and comes bundled with a variety of popular schema types(avro, json and protobuf). Pulsar Functions can leverage existing schema information from input topics and derive the input type. The schema registry applies for output topic as well. + +## SerDe +SerDe stands for **Ser**ialization and **De**serialization. Pulsar Functions uses SerDe when publishing data to and consuming data from Pulsar topics. How SerDe works by default depends on the language you use for a particular function. + + + +When you write Pulsar Functions in Java, the following basic Java types are built in and supported by default: + +* `String` +* `Double` +* `Integer` +* `Float` +* `Long` +* `Short` +* `Byte` + +To customize Java types, you need to implement the following interface. + +```java +public interface SerDe { + T deserialize(byte[] input); + byte[] serialize(T input); +} +``` + + +In Python, the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns. + +You can specify the SerDe when [creating](functions-deploy.md#cluster-mode) or [running](functions-deploy.md#local-run-mode) functions. 
+ +```bash +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name my_function \ + --py my_function.py \ + --classname my_function.MyFunction \ + --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ + --output-serde-classname Serde3 \ + --output output-topic-1 +``` + +This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. At the moment, all Pulsar Functions logic, include processing function and SerDe classes, must be contained within a single Python file. + +When using Pulsar Functions for Python, you have three SerDe options: + +1. You can use the [`IdentitySerde`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe means that this option is used. +2. You can use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. +3. You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. + +The table below shows when you should use each SerDe. 
+ +SerDe option | When to use +:------------|:----------- +`IdentitySerde` | When you work with simple types like strings, Booleans, integers. +`PickleSerDe` | When you work with complex, application-specific types and are comfortable with the "best effort" approach of `pickle`. +Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes. + + + +### Example +Imagine that you're writing Pulsar Functions that are processing tweet objects, you can refer to the following example of `Tweet` class. + + + + +```java +public class Tweet { + private String username; + private String tweetContent; + + public Tweet(String username, String tweetContent) { + this.username = username; + this.tweetContent = tweetContent; + } + + // Standard setters and getters +} +``` + +To pass `Tweet` objects directly between Pulsar Functions, you need to provide a custom SerDe class. In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`. + +```java +package com.example.serde; + +import org.apache.pulsar.functions.api.SerDe; + +import java.util.regex.Pattern; + +public class TweetSerde implements SerDe { + public Tweet deserialize(byte[] input) { + String s = new String(input); + String[] fields = s.split(Pattern.quote("|")); + return new Tweet(fields[0], fields[1]); + } + + public byte[] serialize(Tweet input) { + return "%s|%s".format(input.getUsername(), input.getTweetContent()).getBytes(); + } +} +``` + +To apply this customized SerDe to a particular Pulsar Function, you need to: + +* Package the `Tweet` and `TweetSerde` classes into a JAR. +* Specify a path to the JAR and SerDe class name when deploying the function. + +The following is an example of [`create`](reference-pulsar-admin.md#create-1) operation. 
+ +```bash +$ bin/pulsar-admin functions create \ + --jar /path/to/your.jar \ + --output-serde-classname com.example.serde.TweetSerde \ + # Other function attributes +``` + +> #### Custom SerDe classes must be packaged with your function JARs +> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. So you need to include your SerDe classes in your function JARs. If not, Pulsar returns an error. + + + +```python +class Tweet(object): + def __init__(self, username, tweet_content): + self.username = username + self.tweet_content = tweet_content +``` + +In order to use this class in Pulsar Functions, you have two options: + +1. You can specify `PickleSerDe`, which applies the [`pickle`](https://docs.python.org/3/library/pickle.html) library SerDe. +2. You can create your own SerDe class. The following is an example. + + ```python +from pulsar import SerDe + +class TweetSerDe(SerDe): + + def serialize(self, input): + return bytes("{0}|{1}".format(input.username, input.tweet_content)) + + def deserialize(self, input_bytes): + tweet_components = str(input_bytes).split('|') + return Tweet(tweet_components[0], tweet_components[1]) + ``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). + + + +In both languages, however, you can write custom SerDe logic for more complex, application-specific types. + +## Context +Java, Python and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function. + +* The name and ID of a Pulsar Function. +* The message ID of each message. Each Pulsar message is automatically assigned an ID. +* The key, event time, properties and partition key of each message. +* The name of the topic to which the message is sent. +* The names of all input topics as well as the output topic associated with the function.
+* The name of the class used for [SerDe](#serde). +* The [tenant](reference-terminology.md#tenant) and namespace associated with the function. +* The ID of the Pulsar Functions instance running the function. +* The version of the function. +* The [logger object](functions-develop.md#logger) used by the function, which can be used to create function log messages. +* Access to arbitrary [user configuration](#user-config) values supplied via the CLI. +* An interface for recording [metrics](#metrics). +* An interface for storing and retrieving state in [state storage](#state-storage). +* A function to publish new messages onto arbitrary topics. +* A function to ack the message being processed (if auto-ack is disabled). + + + +The [Context](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Context.java) interface provides a number of methods that you can use to access the function [context](#context). The various method signatures for the `Context` interface are listed as follows. 
+ +```java +public interface Context { + Record getCurrentRecord(); + Collection getInputTopics(); + String getOutputTopic(); + String getOutputSchemaType(); + String getTenant(); + String getNamespace(); + String getFunctionName(); + String getFunctionId(); + String getInstanceId(); + String getFunctionVersion(); + Logger getLogger(); + void incrCounter(String key, long amount); + long getCounter(String key); + void putState(String key, ByteBuffer value); + void deleteState(String key); + ByteBuffer getState(String key); + Map getUserConfigMap(); + Optional getUserConfigValue(String key); + Object getUserConfigValueOrDefault(String key, Object defaultValue); + void recordMetric(String metricName, double value); + CompletableFuture publish(String topicName, O object, String schemaOrSerdeClassName); + CompletableFuture publish(String topicName, O object); +} +``` + +The following example uses several methods available via the `Context` object. + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.stream.Collectors; + +public class ContextFunction implements Function { + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String inputTopics = context.getInputTopics().stream().collect(Collectors.joining(", ")); + String functionName = context.getFunctionName(); + + String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n", + input, + inputTopics); + + LOG.info(logMessage); + + String metricName = String.format("function-%s-messages-received", functionName); + context.recordMetric(metricName, 1); + + return null; + } +} +``` + + +``` +class ContextImpl(pulsar.Context): + def get_message_id(self): + ... + def get_message_key(self): + ... + def get_message_eventtime(self): + ... + def get_message_properties(self): + ... + def get_current_message_topic_name(self): + ... 
+ def get_partition_key(self): + ... + def get_function_name(self): + ... + def get_function_tenant(self): + ... + def get_function_namespace(self): + ... + def get_function_id(self): + ... + def get_instance_id(self): + ... + def get_function_version(self): + ... + def get_logger(self): + ... + def get_user_config_value(self, key): + ... + def get_user_config_map(self): + ... + def record_metric(self, metric_name, metric_value): + ... + def get_input_topics(self): + ... + def get_output_topic(self): + ... + def get_output_serde_class_name(self): + ... + def publish(self, topic_name, message, serde_class_name="serde.IdentitySerDe", + properties=None, compression_type=None, callback=None, message_conf=None): + ... + def ack(self, msgid, topic): + ... + def get_and_reset_metrics(self): + ... + def reset_metrics(self): + ... + def get_metrics(self): + ... + def incr_counter(self, key, amount): + ... + def get_counter(self, key): + ... + def del_counter(self, key): + ... + def put_state(self, key, value): + ... + def get_state(self, key): + ... 
+``` + + +``` +func (c *FunctionContext) GetInstanceID() int { + return c.instanceConf.instanceID +} + +func (c *FunctionContext) GetInputTopics() []string { + return c.inputTopics +} + +func (c *FunctionContext) GetOutputTopic() string { + return c.instanceConf.funcDetails.GetSink().Topic +} + +func (c *FunctionContext) GetFuncTenant() string { + return c.instanceConf.funcDetails.Tenant +} + +func (c *FunctionContext) GetFuncName() string { + return c.instanceConf.funcDetails.Name +} + +func (c *FunctionContext) GetFuncNamespace() string { + return c.instanceConf.funcDetails.Namespace +} + +func (c *FunctionContext) GetFuncID() string { + return c.instanceConf.funcID +} + +func (c *FunctionContext) GetFuncVersion() string { + return c.instanceConf.funcVersion +} + +func (c *FunctionContext) GetUserConfValue(key string) interface{} { + return c.userConfigs[key] +} + +func (c *FunctionContext) GetUserConfMap() map[string]interface{} { + return c.userConfigs +} +``` + +The following example uses several methods available via the `Context` object. + +``` +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func contextFunc(ctx context.Context) { + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } +} +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-function-go/examples/contextFunc.go#L29-L34). + + + +### User config +When you run or update Pulsar Functions created using SDK, you can pass arbitrary key/values to them from the command line with the `--user-config` flag. Key/values must be specified as JSON. The following function creation command passes a user configured key/value to a function.
+ +```bash +$ bin/pulsar-admin functions create \ + --name word-filter \ + # Other function configs + --user-config '{"forbidden-word":"rosebud"}' +``` + + + +The Java SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash +$ bin/pulsar-admin functions create \ + # Other function configs + --user-config '{"word-of-the-day":"verdure"}' +``` + +To access that value in a Java function: + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + Optional wotd = context.getUserConfigValue("word-of-the-day"); + if (wotd.isPresent()) { + LOG.info("The word of the day is {}", wotd.get()); + } else { + LOG.warn("No word of the day provided"); + } + return null; + } +} +``` + +The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line. + +You can also access the entire user config map or set a default value in case no value is present: + +```java +// Get the whole config map +Map allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); +``` + +> For all key/value pairs passed to Java functions, both the key *and* the value are `String`. To set the value to be a different type, you need to deserialize from the `String` type. + + +In Python function, you can access the configuration value like this.
+ +```python +from pulsar import Function + +class WordFilter(Function): + def process(self, input, context): + forbidden_word = context.get_user_config_value("forbidden-word") + + # Don't publish the message if it contains the user-supplied + # forbidden word + if forbidden_word in input: + pass + # Otherwise publish the message + else: + return input +``` + +The Python SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash +$ bin/pulsar-admin functions create \ + # Other function configs \ + --user-config '{"word-of-the-day":"verdure"}' +``` + +To access that value in a Python function: + +```python +from pulsar import Function + +class UserConfigFunction(Function): + def process(self, input, context): + logger = context.get_logger() + wotd = context.get_user_config_value('word-of-the-day') + if wotd is None: + logger.warn('No word of the day provided') + else: + logger.info("The word of the day is {0}".format(wotd)) +``` + + + +### Logger + + + +Pulsar Functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object that can be used to produce logs at the chosen log level. The following example logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`.
+ +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} +``` + +If you want your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. + +```bash +$ bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs +``` + +All logs produced by `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic. + + +Pulsar Functions that use the Python SDK have access to a logging object that can be used to produce logs at the chosen log level. The following example function logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```python +from pulsar import Function + +class LoggingFunction(Function): + def process(self, input, context): + logger = context.get_logger() + msg_id = context.get_message_id() + if 'danger' in input: + logger.warn("A warning was received in message {0}".format(msg_id)) + else: + logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) +``` + +If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. The following is an example.
+ +```bash +$ bin/pulsar-admin functions create \ + --py logging_function.py \ + --classname logging_function.LoggingFunction \ + --log-topic logging-function-logs \ + # Other function configs +``` + +All logs produced by `LoggingFunction` above can be accessed via the `logging-function-logs` topic. + + +The following Go Function example shows different log levels based on the function input. + +``` +import ( + "context" + + "github.com/apache/pulsar/pulsar-function-go/log" + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func loggerFunc(ctx context.Context, input []byte) { + if len(input) <= 100 { + log.Infof("This input has a length of: %d", len(input)) + } else { + log.Warnf("This input is getting too long! It has {%d} characters", len(input)) + } +} + +func main() { + pf.Start(loggerFunc) +} +``` + +When you use `logTopic` related functionalities in Go Function, import `github.com/apache/pulsar/pulsar-function-go/log`, and you do not have to use the `getLogger()` context object. + + + +## Metrics +Pulsar Functions can publish arbitrary metrics to the metrics interface which can be queried. + +> If a Pulsar Function uses the language-native interface for Java or Python, that function is not able to publish metrics and stats to Pulsar. + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `hit-count` key and a different metric for the `elevens-count` key every time the function processes a message.
+ +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class MetricRecorderFunction implements Function<Integer, Void> { + @Override + public Void process(Integer input, Context context) { + // Records the metric 1 every time a message arrives + context.recordMetric("hit-count", 1); + + // Records the metric only if the arriving number equals 11 + if (input == 11) { + context.recordMetric("elevens-count", 1); + } + + return null; + } +} +``` + +> For instructions on reading and using metrics, see the [Monitoring](deploy-monitoring.md) guide. + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `hit-count` key and a different metric for the `elevens-count` key every time the function processes a message. The following is an example. + +```python +from pulsar import Function + +class MetricRecorderFunction(Function): + def process(self, input, context): + context.record_metric('hit-count', 1) + + if input == 11: + context.record_metric('elevens-count', 1) +``` + + + +### Access metrics +To access metrics created by Pulsar Functions, refer to [Monitoring](deploy-monitoring.md) in Pulsar. + +## State storage +Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Pulsar installation, including the local standalone installation, includes deployment of BookKeeper bookies. + +Since Pulsar 2.1.0 release, Pulsar integrates with Apache BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store the `State` for functions. For example, a `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions State API. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data - counters are stored as 64-bit big-endian binary values.
Keys are scoped to an individual Pulsar Function, and shared between instances of that function. + +You can access states within Pulsar Functions using the `putState`, `getState`, `incrCounter`, `getCounter` and `deleteState` calls on the context object. You can also manage states using the [querystate](#query-state) and [putstate](#putstate) options to `pulsar-admin functions`. + +### API + + + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](functions-develop.md#context) object when you are using Java SDK functions. + +#### incrCounter + +```java + /** + * Increment the builtin distributed counter referred to by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); +``` + +Application can use `incrCounter` to change the counter of a given `key` by the given `amount`. + +#### getCounter + +```java + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); +``` + +Application can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`. + +Besides the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### putState + +```java + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); +``` + +#### getState + +```java + /** + * Retrieve the state value for the key. + * + * @param key name of the key + * @return the state value for the key. + */ + ByteBuffer getState(String key); +``` + +#### deleteState + +```java + /** + * Delete the state value for the key. + * + * @param key name of the key + */ + void deleteState(String key); +``` + +Counters and binary values share the same keyspace, so this deletes either type.
+ + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](#context) object when you are using Python SDK functions. + +#### incr_counter + +```python + def incr_counter(self, key, amount): + """incr the counter of a given key in the managed state""" +``` + +Application can use `incr_counter` to change the counter of a given `key` by the given `amount`. +If the `key` does not exist, a new key is created. + +#### get_counter + +```python + def get_counter(self, key): + """get the counter of a given key in the managed state""" +``` + +Application can use `get_counter` to retrieve the counter of a given `key` mutated by `incr_counter`. + +Besides the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### put_state + +```python + def put_state(self, key, value): + """update the value of a given key in the managed state""" +``` + +The key is a string, and the value is arbitrary binary data. + +#### get_state + +```python + def get_state(self, key): + """get the value of a given key in the managed state""" +``` + +#### del_counter + +```python + def del_counter(self, key): + """delete the counter of a given key in the managed state""" +``` + +Counters and binary values share the same keyspace, so this deletes either type. + + + +### Query State + +A Pulsar Function can use the [State API](#api) for storing state into Pulsar's state storage +and retrieving state back from Pulsar's state storage. Additionally, Pulsar provides +CLI commands for querying its state. + +```shell +$ bin/pulsar-admin functions querystate \ + --tenant <tenant> \ + --namespace <namespace> \ + --name <function-name> \ + --state-storage-url <bookkeeper-service-url> \ + --key <state-key> \ + [--watch] +``` + +If `--watch` is specified, the CLI will watch the value of the provided `state-key`.
+ +### Example + + + + +{@inject: github:`WordCountFunction`:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is a very good example +demonstrating on how Application can easily store `state` in Pulsar Functions. + +```java +public class WordCountFunction implements Function { + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); + return null; + } +} +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received `String` into multiple words using regex `\\.`. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incrCounter(key, amount)`). + + + +```python +from pulsar import Function + +class WordCount(Function): + def process(self, item, context): + for word in item.split(): + context.incr_counter(word, 1) +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received string into multiple words on space. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incr_counter(key, amount)`). + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/functions-overview.md b/site2/website/versioned_docs/version-2.4.2/functions-overview.md new file mode 100644 index 0000000000000..0119b65c8eace --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-overview.md @@ -0,0 +1,188 @@ +--- +id: version-2.4.2-functions-overview +title: Pulsar Functions overview +sidebar_label: Overview +original_id: functions-overview +--- + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics, +* apply a user-supplied processing logic to each message, +* publish the results of the computation to another topic. 
+ + +## Goals +With Pulsar Functions, you can create complex processing logic without deploying a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), [Apache Flink](https://flink.apache.org/)). Pulsar Functions are computing infrastructure of Pulsar messaging system. The core goal is tied to a series of other goals: + +* Developer productivity (language-native vs Pulsar Functions SDK functions) +* Easy troubleshooting +* Operational simplicity (no need for an external processing system) + +## Inspirations +Pulsar Functions are inspired by (and take cues from) several systems and paradigms: + +* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) +* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) + +Pulsar Functions can be described as + +* [Lambda](https://aws.amazon.com/lambda/)-style functions that are +* specifically designed to use Pulsar as a message bus. + +## Programming model +Pulsar Functions provide a wide range of functionality, and the core programming model is simple. Functions receive messages from one or more **input [topics](reference-terminology.md#topic)**. Each time a message is received, the function will complete the following tasks. 
+ + * Apply some processing logic to the input and write output to: + * An **output topic** in Pulsar + * [Apache BookKeeper](#state-storage) + * Write logs to a **log topic** (potentially for debugging purposes) + * Increment a [counter](#word-count-example) + +![Pulsar Functions core programming model](assets/pulsar-functions-overview.png) + +You can use Pulsar Functions to set up the following processing chain: + +* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous whitespace and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. +* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic +* Finally, a Python function listens for the `results` topic and writes the results to a MySQL table. + + +### Word count example + +If you implement the classic word count example using Pulsar Functions, it looks something like this: + +![Pulsar Functions word count example](assets/pulsar-functions-word-count.png) + +To write the function in Java with [Pulsar Functions SDK for Java](functions-develop#available-apis), you can write the function as follows. 
+ +```java +package org.example.functions; + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } +} +``` + +Bundle and build the JAR file to be deployed, and then deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash +$ bin/pulsar-admin functions create \ + --jar target/my-jar-with-dependencies.jar \ + --classname org.example.functions.WordCountFunction \ + --tenant public \ + --namespace default \ + --name word-count \ + --inputs persistent://public/default/sentences \ + --output persistent://public/default/count +``` + +### Content-based routing example + +Pulsar Functions are used in many cases. The following is a sophisticated example that involves content-based routing. + +For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. Or, if an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop.md#logger). The following is a visual representation. 
+ +![Pulsar Functions routing example](assets/pulsar-functions-routing-example.png) + +If you implement this routing functionality in Python, it looks something like this: + +```python +from pulsar import Function + +class RoutingFunction(Function): + def __init__(self): + self.fruits_topic = "persistent://public/default/fruits" + self.vegetables_topic = "persistent://public/default/vegetables" + + def is_fruit(self, item): + return item in ["apple", "orange", "pear", "other fruits..."] + + def is_vegetable(self, item): + return item in ["carrot", "lettuce", "radish", "other vegetables..."] + + def process(self, item, context): + if self.is_fruit(item): + context.publish(self.fruits_topic, item) + elif self.is_vegetable(item): + context.publish(self.vegetables_topic, item) + else: + warning = "The item {0} is neither a fruit nor a vegetable".format(item) + context.get_logger().warn(warning) +``` + +### Functions, messages and message types +Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. However, in languages that support typed interfaces (Java), you can write typed Functions, and bind messages to types in the following ways. +* [Schema Registry](functions-develop.md#schema-registry) +* [SerDe](functions-develop.md#serde) + + +## Fully Qualified Function Name (FQFN) +Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function tenant, namespace, and function name. FQFN looks like this: + +```http +tenant/namespace/name +``` + +FQFNs enable you to create multiple functions with the same name provided that they are in different namespaces. + +## Supported languages +Currently, you can write Pulsar Functions in Java, Python, and Go. For details, refer to [Develop Pulsar Functions](functions-develop.md). + +## Processing guarantees +Pulsar Functions provide three different messaging semantics that you can apply to any function.
+ +Delivery semantics | Description +:------------------|:------- +**At-most-once** delivery | Each message sent to the function is either processed once or not processed at all (hence "at most"). +**At-least-once** delivery | Each message sent to the function can be processed more than once (hence the "at least"). +**Effectively-once** delivery | Each message sent to the function will have one output associated with it. + + +### Apply processing guarantees to a function +You can set the processing guarantees for a Pulsar Function when you create the Function. The following [`pulsar-admin functions create`](reference-pulsar-admin.md#create-1) command applies effectively-once guarantees to the Function. + +```bash +$ bin/pulsar-admin functions create \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs +``` + +The available options are: + +* `ATMOST_ONCE` +* `ATLEAST_ONCE` +* `EFFECTIVELY_ONCE` + +The following command runs a function in the cluster mode with effectively-once guarantees applied. + +```bash +$ bin/pulsar-admin functions create \ + --name my-effectively-once-function \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs +``` + +> By default, Pulsar Functions provide at-least-once delivery guarantees. So if you create a function without supplying a value for the `--processing-guarantees` flag, the function provides at-least-once guarantees. + +### Update the processing guarantees of a function +You can change the processing guarantees applied to a function using the [`update`](reference-pulsar-admin.md#update-1) command. The following is an example.
+ +```bash +$ bin/pulsar-admin functions update \ + --processing-guarantees ATMOST_ONCE \ + # Other function configs +``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/functions-runtime.md b/site2/website/versioned_docs/version-2.4.2/functions-runtime.md new file mode 100644 index 0000000000000..44327ac22d020 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-runtime.md @@ -0,0 +1,142 @@ +--- +id: version-2.4.2-functions-runtime +title: Configure Functions runtime +sidebar_label: Setup: Configure Functions runtime +original_id: functions-runtime +--- + +Pulsar Functions support the following methods to run functions. + +- *Thread*: Invoke functions in threads in Functions Worker. +- *Process*: Invoke functions in processes forked by Functions Worker. +- *Kubernetes*: Submit functions as Kubernetes StatefulSets by Functions Worker. + +The differences of the thread and process modes are: +- Thread mode: when a function runs in thread mode, it runs on the same Java virtual machine (JVM) with Functions worker. +- Process mode: when a function runs in process mode, it runs on the same machine that Functions worker runs. + +## Configure thread runtime +It is easy to configure *Thread* runtime. In most cases, you do not need to configure anything. You can customize the thread group name with the following settings: + +```yaml +threadContainerFactory: + threadGroupName: "Your Function Container Group" +``` + +*Thread* runtime is only supported in Java function. + +## Configure process runtime +When you enable *Process* runtime, you do not need to configure anything. 
+ +```yaml +processContainerFactory: + # the directory for storing the function logs + logDirectory: + # change the jar location only when you put the java instance jar in a different location + javaInstanceJarLocation: + # change the python instance location only when you put the python instance jar in a different location + pythonInstanceLocation: + # change the extra dependencies location: + extraFunctionDependenciesDir: +``` + +*Process* runtime is supported in Java, Python, and Go functions. + +## Configure Kubernetes runtime + +It is easy to configure Kubernetes runtime. You can just uncomment the settings of `kubernetesContainerFactory` in the `functions_worker.yaml` file. The following is an example. + +```yaml +kubernetesContainerFactory: + # uri to kubernetes cluster, leave it to empty and it will use the kubernetes settings in function worker + k8Uri: + # the kubernetes namespace to run the function instances. it is `default`, if this setting is left to be empty + jobNamespace: + # the docker image to run function instance. by default it is `apachepulsar/pulsar` + pulsarDockerImageName: + # the root directory of pulsar home directory in `pulsarDockerImageName`. by default it is `/pulsar`. + # if you are using your own built image in `pulsarDockerImageName`, you need to set this setting accordingly + pulsarRootDir: + # this setting only takes effects if `k8Uri` is set to null. if your function worker is running as a k8 pod, + # setting this to true is let function worker to submit functions to the same k8s cluster as function worker + # is running. setting this to false if your function worker is not running as a k8 pod. 
+ submittingInsidePod: false + # setting the pulsar service url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar service url configured in worker service + pulsarServiceUrl: + # setting the pulsar admin url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar admin url configured in worker service + pulsarAdminUrl: + # the custom labels that function worker uses to select the nodes for pods + customLabels: + # the directory for dropping extra function dependencies + # if it is not an absolute path, it is relative to `pulsarRootDir` + extraFunctionDependenciesDir: + # Additional memory padding added on top of the memory requested by the function on a per-instance basis + percentMemoryPadding: 10 +``` + +If you have already run a Pulsar cluster on Kubernetes, you can keep the settings unchanged most of the time. + +However, if you enable RBAC when deploying your Pulsar cluster, make sure the service account you use for +running Functions Workers (or brokers, if Functions Workers run along with brokers) has permissions on the following +Kubernetes APIs. + +- services +- configmaps +- pods +- apps.statefulsets + +Otherwise, you will not be able to create any functions. The following is an example of the error message. + +```bash +22:04:27.696 [Timer-0] ERROR org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory - Error while trying to fetch configmap example-pulsar-4qvmb5gur3c6fc9dih0x1xn8b-function-worker-config at namespace pulsar +io.kubernetes.client.ApiException: Forbidden + at io.kubernetes.client.ApiClient.handleResponse(ApiClient.java:882) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.ApiClient.execute(ApiClient.java:798) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMapWithHttpInfo(CoreV1Api.java:23673) ~[io.kubernetes-client-java-api-2.0.0.jar:?] 
+ at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMap(CoreV1Api.java:23655) ~[io.kubernetes-client-java-api-2.0.0.jar:?] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory.fetchConfigMap(KubernetesRuntimeFactory.java:284) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory$1.run(KubernetesRuntimeFactory.java:275) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at java.util.TimerThread.mainLoop(Timer.java:555) [?:1.8.0_212] + at java.util.TimerThread.run(Timer.java:505) [?:1.8.0_212] +``` +If this happens, you need to grant the required permissions to the service account used for running Functions Workers. An example to grant permissions is shown below: a service account `functions-worker` is granted with permissions to access Kubernetes resources `services`, `configmaps`, `pods` and `apps.statefulsets`. + +```yaml +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: functions-worker +rules: +- apiGroups: [""] + resources: + - services + - configmaps + - pods + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: functions-worker +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: functions-worker +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: functions-worker +subjects: +- kind: ServiceAccount + name: functions-worker +``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/functions-worker.md b/site2/website/versioned_docs/version-2.4.2/functions-worker.md new file mode 100644 index 0000000000000..e0767453959b3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/functions-worker.md @@ -0,0 +1,243 @@ +--- +id: version-2.4.2-functions-worker +title: Deploy and manage 
functions worker +sidebar_label: Setup: Pulsar Functions Worker +original_id: functions-worker +--- + +Before using Pulsar Functions, you need to learn how to set up Pulsar Functions worker and how to [configure Functions runtime](functions-runtime.md). + +Pulsar `functions-worker` is a logical component to run Pulsar Functions in cluster mode. Two options are available, and you can select either of the two options based on your requirements. +- [run with brokers](#run-functions-worker-with-brokers) +- [run it separately](#run-functions-worker-separately) in a different broker + +> Note +> The `--- Service Urls---` lines in the following diagrams represent Pulsar service URLs that Pulsar client and admin use to connect to a Pulsar cluster. + +## Run Functions-worker with brokers + +The following diagram illustrates the deployment of functions-workers running along with brokers. + +![assets/functions-worker-corun.png](assets/functions-worker-corun.png) + +To enable functions-worker running as part of a broker, you need to set `functionsWorkerEnabled` to `true` in the `broker.conf` file. + +```conf +functionsWorkerEnabled=true +``` + +When you set `functionsWorkerEnabled` to `true`, it means that you start functions-worker as part of a broker. You need to configure the `conf/functions_worker.yml` file to customize your functions_worker. + +Before you run Functions-worker with brokers, you have to configure Functions-worker, and then start it with brokers. + +### Configure Functions-Worker to run with brokers +In this mode, since `functions-worker` is running as part of broker, most of the settings are already inherited from your broker configuration (for example, configurationStore settings, authentication settings, and so on). + +Pay attention to the following required settings when configuring functions-worker in this mode. + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`, which is good for standalone deployment. 
For production deployment, to ensure high availability, set it to be more than `2` . +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled on the BookKeeper cluster, configure the following BookKeeper authentication settings. + +- `bookkeeperClientAuthenticationPlugin`: the BookKeeper client authentication plugin name. +- `bookkeeperClientAuthenticationParametersName`: the BookKeeper client authentication plugin parameters name. +- `bookkeeperClientAuthenticationParameters`: the BookKeeper client authentication plugin parameters. + +### Start Functions-worker with broker + +Once you have configured the `functions_worker.yml` file, you can start or restart your broker. + +And then you can use the following command to verify if `functions-worker` is running well. + +```bash +curl :8080/admin/v2/worker/cluster +``` + +After entering the command above, a list of active function workers in the cluster is returned. The output is something similar as follows. + +```json +[{"workerId":"","workerHostname":"","port":8080}] +``` + +## Run Functions-worker separately + +This section illustrates how to run `functions-worker` as a separate process in separate machines. + +![assets/functions-worker-separated.png](assets/functions-worker-separated.png) + +> Note +In this mode, make sure `functionsWorkerEnabled` is set to `false`, so you won't start `functions-worker` with brokers by mistake. + +### Configure Functions-worker to run separately + +To run function-worker separately, you have to configure the following parameters. + +#### Worker parameters + +- `workerId`: The type is string. It is unique across clusters, used to identify a worker machine. +- `workerHostname`: The hostname of the worker machine. +- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. 
+- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. + +#### Function package parameter + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`. + +#### Function metadata parameter + +- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. +- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled for your broker cluster, you *should* configure the authentication plugin and parameters for the functions worker to communicate with the brokers. + +- `clientAuthenticationPlugin` +- `clientAuthenticationParameters` + +#### Security settings + +If you want to enable security on functions workers, you *should*: +- [Enable TLS transport encryption](#enable-tls-transport-encryption) +- [Enable Authentication Provider](#enable-authentication-provider) +- [Enable Authorization Provider](#enable-authorization-provider) + +##### Enable TLS transport encryption + +To enable TLS transport encryption, configure the following settings. + +``` +tlsEnabled: true +tlsCertificateFilePath: /path/to/functions-worker.cert.pem +tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem +tlsTrustCertsFilePath: /path/to/ca.cert.pem +``` + +For details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport.md). + +##### Enable Authentication Provider + +To enable authentication on Functions Worker, configure the following settings. +> Note +Substitute the *providers list* with the providers you want to enable. + +``` +authenticationEnabled: true +authenticationProviders: [ provider1, provider2 ] +``` + +For *SASL Authentication* provider, add `saslJaasClientAllowedIds` and `saslJaasBrokerSectionName` +under `properties` if needed. 
+ +``` +properties: + saslJaasClientAllowedIds: .*pulsar.* + saslJaasBrokerSectionName: Broker +``` + +For *Token Authentication* provider, add the necessary settings under `properties` if needed. +See [Token Authentication](security-token-admin.md) for more details. +``` +properties: + tokenSecretKey: file://my/secret.key + # If using public/private + # tokenPublicKey: file:///path/to/public.key +``` + +##### Enable Authorization Provider + +To enable authorization on Functions Worker, you need to configure `authorizationEnabled` and `configurationStoreServers`. The authorization provider connects to `configurationStoreServers` to receive namespace policies. + +```yaml +authorizationEnabled: true +configurationStoreServers: +``` + +You should also configure a list of superuser roles. The superuser roles are able to access any admin API. The following is a configuration example. + +```yaml +superUserRoles: + - role1 + - role2 + - role3 +``` + +#### BookKeeper Authentication + +If authentication is enabled on the BookKeeper cluster, you should configure the BookKeeper authentication settings as follows: + +- `bookkeeperClientAuthenticationPlugin`: the plugin name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParametersName`: the plugin parameters name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParameters`: the plugin parameters of BookKeeper client authentication. + +### Start Functions-worker + +Once you have finished configuring the `functions_worker.yml` configuration file, you can use the following command to start a `functions-worker`: + +```bash +bin/pulsar functions-worker +``` + +### Configure Proxies for Functions-workers + +When you are running `functions-worker` in a separate cluster, the admin rest endpoints are split into two clusters. 
`functions`, `function-worker`, `source` and `sink` endpoints are now served +by the `functions-worker` cluster, while all the other remaining endpoints are served by the broker cluster. +Hence you need to configure your `pulsar-admin` to use the right service URL accordingly. + +In order to address this inconvenience, you can start a proxy cluster for routing the admin rest requests accordingly. Hence you will have one central entry point for your admin service. + +If you already have a proxy cluster, continue reading. If you haven't set up a proxy cluster before, you can follow the [instructions](http://pulsar.apache.org/docs/en/administration-proxy/) to +start proxies. + +![assets/functions-worker-separated-proxy.png](assets/functions-worker-separated-proxy.png) + +To enable routing functions-related admin requests to `functions-worker` in a proxy, you can edit the `proxy.conf` file to modify the following settings: + +```conf +functionWorkerWebServiceURL= +functionWorkerWebServiceURLTLS= +``` + +## Compare the Run-with-Broker and Run-separately modes + +As described above, you can run Functions-worker with brokers, or run it separately. And it is more convenient to run functions-workers along with brokers. However, running functions-workers in a separate cluster provides better resource isolation for running functions in `Process` or `Thread` mode. + +To determine which mode to use for your case, refer to the following guidelines. + +Use the `Run-with-Broker` mode in the following cases: +- a) if resource isolation is not required when running functions in `Process` or `Thread` mode; +- b) if you configure the functions-worker to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). + +Use the `Run-separately` mode in the following cases: +- a) you don't have a Kubernetes cluster; +- b) if you want to run functions and brokers separately. 
+ +## Troubleshooting + +**Error message: Namespace missing local cluster name in clusters list** + +``` +Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] +``` + +The error message appears when either of the following cases occurs: +- a) a broker is started with `functionsWorkerEnabled=true`, but the `pulsarFunctionsCluster` is not set to the correct cluster in the `conf/functions_worker.yml` file; +- b) setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. + +**Workaround** + +If any of these cases happens, follow the instructions below to fix the problem: + +1. Get the current clusters list of `public/functions` namespace. + +```bash +bin/pulsar-admin namespaces get-clusters public/functions +``` + +2. Check if the cluster is in the clusters list. If the cluster is not in the list, add it to the list and update the clusters list. + +```bash +bin/pulsar-admin namespaces set-clusters --cluster=, public/functions +``` + +3. Set the correct cluster name in `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/getting-started-standalone.md b/site2/website/versioned_docs/version-2.4.2/getting-started-standalone.md new file mode 100644 index 0000000000000..33f72dc329f8b --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/getting-started-standalone.md @@ -0,0 +1,226 @@ +--- +id: version-2.4.2-standalone +title: Set up a standalone Pulsar locally +sidebar_label: Run Pulsar locally +original_id: standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. 
The standalone mode includes a Pulsar broker, the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> #### Pulsar in production? +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of the installation process. + +### System requirements + +Pulsar is currently available for **MacOS** and **Linux**. To use Pulsar, you need to install [Java 8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html). + +> #### Tip +> By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in `conf/pulsar_env.sh` file under `PULSAR_MEM`. This is extra options passed into JVM. + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar {{pulsar:version}} binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:binary_release_url + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash +$ tar xvfz apache-pulsar-{{pulsar:version}}-bin.tar.gz +$ cd apache-pulsar-{{pulsar:version}} +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md). 
+`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more. +`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) example. +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the `.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`logs` | Logs created by the installation. + +> #### Tip +> If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +> +> * [Install builtin connectors (optional)](#install-builtin-connectors-optional) +> * [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +> +> Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +### Install builtin connectors (optional) + +Since `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. 
+To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors {{pulsar:version}} release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:connector_release_url/{connector}-{{pulsar:version}}.nar + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. +For example, if you download the `pulsar-io-aerospike-{{pulsar:version}}.nar` connector file, enter the following commands: + +```bash +$ mkdir connectors +$ mv pulsar-io-aerospike-{{pulsar:version}}.nar connectors + +$ ls connectors +pulsar-io-aerospike-{{pulsar:version}}.nar +... +``` + +> #### Note +> +> * If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker +> (or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +> +> * If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or [DCOS](deploy-dcos.md)), +> you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +### Install tiered storage offloaders (optional) + +> #### Tip +> +> Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> To enable tiered storage feature, follow the instructions below; otherwise skip this section. 
+ +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders {{pulsar:version}} release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:offloader_release_url + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash +$ tar xvfz apache-pulsar-offloaders-{{pulsar:version}}-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-{{pulsar:version}}` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-{{pulsar:version}}/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-{{pulsar:version}}.nar +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +> #### Note +> +> * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +> +> * If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or [DCOS](deploy-dcos.md)), +> you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. 
+ +```bash +$ bin/pulsar standalone +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@95] - Configuration Store cache started +2017-06-01 14:46:29,192 - INFO - [main:AuthenticationService@61] - Authentication is disabled +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@108] - Pulsar WebSocket Service started +``` + +> #### Tip +> +> * The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. +You can also run the service as a background process using the `pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. 
+ +### Consume a message + +The following command consumes a message with the subscription name `first-subscription` to the `my-topic` topic: + +```bash +$ bin/pulsar-client consume my-topic -s "first-subscription" +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` +09:56:55.566 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.MultiTopicsConsumerImpl - [TopicsConsumerFakeTopicNamee2df9] [first-subscription] Success subscribe new topic my-topic in topics consumer, partitions: 4, allTopicPartitionsNumber: 4 +``` + +> #### Tip +> +> As you have noticed that we do not explicitly create the `my-topic` topic, to which we consume the message. When you consume a message to a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` +13:09:39.356 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +> #### Tip +> +> If the service runs as a background process using the `pulsar-daemon start standalone` command, then use the `pulsar-daemon stop standalone` command to stop the service. +> +> For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). 
diff --git a/site2/website/versioned_docs/version-2.4.2/io-cdc-canal.md b/site2/website/versioned_docs/version-2.4.2/io-cdc-canal.md new file mode 100644 index 0000000000000..72db9fc0e6774 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-cdc-canal.md @@ -0,0 +1,175 @@ +--- +id: version-2.4.2-io-cdc-canal +title: CDC Canal Connector +sidebar_label: CDC Canal Connector +original_id: io-cdc-canal +--- + +### Source Configuration Options + +The Configuration is mostly related to Canal task config. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `zkServers` | `false` | `127.0.0.1:2181` | `The address and port of the ZooKeeper server, used if the canal server is configured in cluster mode` | +| `batchSize` | `true` | `5120` | `Take 5120 records from the canal server in batches` | +| `username` | `false` | `` | `Canal server account, not MySQL` | +| `password` | `false` | `` | `Canal server password, not MySQL` | +| `cluster` | `false` | `false` | `Decide whether to open cluster mode based on canal server configuration, true: cluster mode, false: standalone mode` | +| `singleHostname` | `false` | `127.0.0.1` | `The address of canal server` | +| `singlePort` | `false` | `11111` | `The port of canal server` | + + +### Configuration Example + +Here is a JSON configuration example: + +```$json +{ + "zkServers": "127.0.0.1:2181", + "batchSize": "5120", + "destination": "example", + "username": "", + "password": "", + "cluster": false, + "singleHostname": "127.0.0.1", + "singlePort": "11111" +} +``` +You can also find the YAML example in this [file](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/resources/canal-mysql-source-config.yaml), which has content similar to the following: + +```$yaml +configs: + zkServers: "127.0.0.1:2181" + batchSize: "5120" + destination: "example" + username: "" + password: "" + cluster: false + singleHostname: "127.0.0.1" + singlePort: "11111" +``` + +### Usage example + +Here is a simple 
example to store MySQL change data using above example config. + +- Start a MySQL server + +```$bash +docker pull mysql:5.7 +docker run -d -it --rm --name pulsar-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=canal -e MYSQL_USER=mysqluser -e MYSQL_PASSWORD=mysqlpw mysql:5.7 +``` +- Modify configuration files mysqld.cnf + +``` +[mysqld] +pid-file = /var/run/mysqld/mysqld.pid +socket = /var/run/mysqld/mysqld.sock +datadir = /var/lib/mysql +#log-error = /var/log/mysql/error.log +# By default we only accept connections from localhost +#bind-address = 127.0.0.1 +# Disabling symbolic-links is recommended to prevent assorted security risks +symbolic-links=0 +log-bin=mysql-bin +binlog-format=ROW +server_id=1 +``` + +- Copy file to mysql server from local and restart mysql server +```$bash +docker cp mysqld.cnf pulsar-mysql:/etc/mysql/mysql.conf.d/ +docker restart pulsar-mysql +``` + +- Create test database in mysql server +```$bash +docker exec -it pulsar-mysql /bin/bash +mysql -h 127.0.0.1 -uroot -pcanal -e 'create database test;' +``` + +- Start canal server and connect mysql server + +``` +docker pull canal/canal-server:v1.1.2 +docker run -d -it --link pulsar-mysql -e canal.auto.scan=false -e canal.destinations=test -e canal.instance.master.address=pulsar-mysql:3306 -e canal.instance.dbUsername=root -e canal.instance.dbPassword=canal -e canal.instance.connectionCharset=UTF-8 -e canal.instance.tsdb.enable=true -e canal.instance.gtidon=false --name=pulsar-canal-server -p 8000:8000 -p 2222:2222 -p 11111:11111 -p 11112:11112 -m 4096m canal/canal-server:v1.1.2 +``` + +- Start pulsar standalone + +```$bash +docker pull apachepulsar/pulsar:2.4.2 +docker run -d -it --link pulsar-canal-server -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:2.4.2 bin/pulsar standalone +``` + +- Start pulsar-io in standalone + +- Config file canal-mysql-source-config.yaml + +```$yaml +configs: + zkServers: "" + batchSize: "5120" + destination: "test" + 
username: "" + password: "" + cluster: false + singleHostname: "pulsar-canal-server" + singlePort: "11111" +``` +- Consumer file pulsar-client.py for test +``` +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() +``` + +- Copy config file and test file to pulsar server + +```$bash +docker cp canal-mysql-source-config.yaml pulsar-standalone:/pulsar/conf/ +docker cp pulsar-client.py pulsar-standalone:/pulsar/ +``` + +- Download canal connector and start canal connector +```$bash +docker exec -it pulsar-standalone /bin/bash +wget http://apache.01link.hk/pulsar/pulsar-2.4.2/connectors/pulsar-io-canal-2.4.2.nar -P connectors +./bin/pulsar-admin sources localrun --archive ./connectors/pulsar-io-canal-2.4.2.nar --classname org.apache.pulsar.io.canal.CanalStringSource --tenant public --namespace default --name canal --destination-topic-name my-topic --source-config-file /pulsar/conf/canal-mysql-source-config.yaml --parallelism 1 +``` + +- Consumption data + +```$bash +docker exec -it pulsar-standalone /bin/bash +python pulsar-client.py +``` + +- Open another window for login mysql server + +```$bash +docker exec -it pulsar-mysql /bin/bash +mysql -h 127.0.0.1 -uroot -pcanal +``` +- Create table and insert, delete, update data in mysql server +``` +mysql> use test; +mysql> show tables; +mysql> CREATE TABLE IF NOT EXISTS `test_table`(`test_id` INT UNSIGNED AUTO_INCREMENT,`test_title` VARCHAR(100) NOT NULL, +`test_author` VARCHAR(40) NOT NULL, +`test_date` DATE,PRIMARY KEY ( `test_id` ))ENGINE=InnoDB DEFAULT CHARSET=utf8; +mysql> INSERT INTO test_table (test_title, test_author, test_date) VALUES("a", "b", NOW()); +mysql> UPDATE test_table SET test_title='c' WHERE test_title='a'; +mysql> DELETE FROM test_table WHERE test_title='c'; +``` + diff --git 
a/site2/website/versioned_docs/version-2.4.2/io-cdc-debezium.md b/site2/website/versioned_docs/version-2.4.2/io-cdc-debezium.md new file mode 100644 index 0000000000000..d49d99727f883 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-cdc-debezium.md @@ -0,0 +1,261 @@ +--- +id: version-2.4.2-io-cdc-debezium +title: CDC Debezium Connector +sidebar_label: CDC Debezium Connector +original_id: io-cdc-debezium +--- + +### Source Configuration Options + +The configuration is mostly related to the Debezium task config. In addition, you must provide the service URL of the Pulsar cluster and the names of the topics that are used to store offsets and history. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | `true` | `null` | A source task class that is implemented in Debezium. | +| `database.hostname` | `true` | `null` | The address of the Database server. | +| `database.port` | `true` | `null` | The port number of the Database server. | +| `database.user` | `true` | `null` | The name of the Database user that has the required privileges. | +| `database.password` | `true` | `null` | The password for the Database user that has the required privileges. | +| `database.server.id` | `true` | `null` | The connector’s identifier that must be unique within the Database cluster and similar to Database’s server-id configuration property. | +| `database.server.name` | `true` | `null` | The logical name of the Database server/cluster, which forms a namespace and is used in all the names of the Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | `false` | `null` | A list of all databases hosted by this server that this connector will monitor. This is optional, and there are other properties for listing the databases and tables to include or exclude from monitoring. 
| +| `key.converter` | `true` | `null` | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | `true` | `null` | The converter provided by Kafka Connect to convert record value. | +| `database.history` | `true` | `null` | The name of the database history class name. | +| `database.history.pulsar.topic` | `true` | `null` | The name of the database history topic where the connector will write and recover DDL statements. This topic is for internal use only and should not be used by consumers. | +| `database.history.pulsar.service.url` | `true` | `null` | Pulsar cluster service url for history topic. | +| `pulsar.service.url` | `true` | `null` | Pulsar cluster service url. | +| `offset.storage.topic` | `true` | `null` | Record the last committed offsets that the connector successfully completed. | + +## Example of MySQL + +We need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +Here is a JSON configuration example: + +```json +{ + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" +} +``` + +Optionally, you can create a `debezium-mysql-source-config.yaml` file, and copy the [contents] (https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. 
+ +```$yaml +tenant: "public" +namespace: "default" +name: "debezium-mysql-source" +topicName: "debezium-mysql-topic" +archive: "connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar" + +parallelism: 1 + +configs: + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: "offset-topic" +``` + +### Usage + +This example shows how to store the data changes of a MySQL table using the configuration file in the example above. + +1. Start a MySQL server with an example database, from which Debezium can capture changes. + + ```$bash + docker run -it --rm --name mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=debezium -e MYSQL_USER=mysqluser -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```$bash + bin/pulsar standalone + ``` + +3. Start pulsar debezium connector, with local run mode, and using above yaml config file. Please make sure that the nar file is available as configured in path `connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar`. 
+ + ```$bash + bin/pulsar-admin source localrun --source-config-file debezium-mysql-source-config.yaml + ``` + + ```$bash + bin/pulsar-admin source localrun --archive connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar --name debezium-mysql-source --destination-topic-name debezium-mysql-topic --tenant public --namespace default --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + ``` + +4. Subscribe to the topic for the `inventory.products` table. + + ``` + bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + +5. Start a MySQL CLI Docker container, and use it to make changes to the `products` table in the MySQL server. + + ```$bash + $docker run -it --rm --name mysqlterm --link mysql --rm mysql:5.7 sh -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + ``` + +6. This command opens the MySQL CLI. In this CLI, use the commands below to change the name of 2 items in the `products` table: + + ``` + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products ; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + ``` + + In the terminal tab where you subscribed to the topic, you can see that the 2 changes have been published to the products topic. 
+ +## Example of PostgreSQL + +We need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + + +Here is a JSON configuration example: + +```json +{ + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "postgres", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "schema.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" +} +``` + + +Optionally, you can create a `debezium-postgres-source-config.yaml` file, and copy the [contents] (https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the`debezium-postgres-source-config.yaml` file. + +```yaml +tenant: "public" +namespace: "default" +name: "debezium-postgres-source" +topicName: "debezium-postgres-topic" +archive: "connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar" + +parallelism: 1 + +configs: + ## config for pg, docker image: debezium/example-postgress:0.8 + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "postgres" + database.dbname: "postgres" + database.server.name: "dbserver1" + schema.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" +``` + +### Usage + +This example shows how to store the data changes of a PostgreSQL table using the configuration file in the example above. + + +1. Start a PostgreSQL server with an example database, from which Debezium can capture changes. + + ```$bash + docker pull debezium/example-postgres:0.8 + docker run -d -it --rm --name pulsar-postgresql -p 5432:5432 debezium/example-postgres:0.8 + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```$bash + bin/pulsar standalone + ``` + +3. Start the Pulsar Debezium connector in local run mode and use the JSON or YAML configuration file in the example above. 
Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar`. + + ```$bash + bin/pulsar-admin source localrun --source-config-file debezium-postgres-source-config.yaml + ``` + +Optionally, start Pulsar Debezium connector in local run mode and use the JSON config file in the example above. + + + ```$bash + bin/pulsar-admin source localrun --archive connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar --name debezium-postgres-source --destination-topic-name debezium-postgres-topic --tenant public --namespace default --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "postgres","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + ``` + + +4. PostgreSQL CLI appears after this command is executed. Use the commands below to update the `products` table. + + ```bash + docker exec -it pulsar-postgresql /bin/bash + ``` + + ``` + psql -U postgres postgres + postgres=# \c postgres; + You are now connected to database "postgres" as user "postgres". + postgres=# SET search_path TO inventory; + SET + postgres=# select * from products; + id | name | description | weight + -----+--------------------+---------------------------------------------------------+-------- + 102 | car battery | 12V car battery | 8.1 + 103 | 12-pack drill bits | 12-pack of drill bits with sizes ranging from #40 to #3 | 0.8 + 104 | hammer | 12oz carpenter's hammer | 0.75 + 105 | hammer | 14oz carpenter's hammer | 0.875 + 106 | hammer | 16oz carpenter's hammer | 1 + 107 | rocks | box of assorted rocks | 5.3 + 108 | jacket | water resistent black wind breaker | 0.1 + 109 | spare tire | 24 inch spare tire | 22.2 + 101 | 1111111111 | Small 2-wheel scooter | 3.14 + (9 rows) + + postgres=# UPDATE products SET name='1111111111' WHERE id=107; + UPDATE 1 + ``` + +5. 
Subscribe the topic for the `inventory.products` table. + + ``` + bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + + At this time, you will receive the following information: + + ```bash + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":107}}�{"schema":{"type":"struct","fields":[{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"before"},{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"after"},{"type":"struct","fields":[{"type":"string","optional":true,"field":"version"},{"type":"string","optional":true,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":false,"field":"db"},{"type":"int64","optional":true,"field":"ts_usec"},{"type":"int64","optional":true,"field":"txId"},{"type":"int64","optional":true,"field":"lsn"},{"type":"string","optional":true,"field":"schema"},{"type":"string","optional":true,"field":"table"},{"type":"boolean","optional":true,"default":false,"field":"snapshot"},{"type":"boolean","optional":true,"field":"last_snapshot_record"}],"optional":false,"name":"io.debezium.connector.postgresql.Source","field":"source"},{"type":"string","optional":false,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"before":{"id":107,"name":"rocks","de
scription":"box of assorted rocks","weight":5.3},"after":{"id":107,"name":"1111111111","description":"box of assorted rocks","weight":5.3},"source":{"version":"0.9.2.Final","connector":"postgresql","name":"dbserver1","db":"postgres","ts_usec":1559208957661080,"txId":577,"lsn":23862872,"schema":"inventory","table":"products","snapshot":false,"last_snapshot_record":null},"op":"u","ts_ms":1559208957692}} + ``` diff --git a/site2/website/versioned_docs/version-2.4.2/io-connectors.md b/site2/website/versioned_docs/version-2.4.2/io-connectors.md new file mode 100644 index 0000000000000..91b47756c8f04 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-connectors.md @@ -0,0 +1,31 @@ +--- +id: version-2.4.2-io-connectors +title: Builtin Connectors +sidebar_label: Builtin Connectors +original_id: io-connectors +--- + +Pulsar distribution includes a set of common connectors that have been packaged and tested with the rest of Apache Pulsar. +These connectors import and export data from some of the most commonly used data systems. Using any of these connectors is +as easy as writing a simple connector configuration and running the connector locally or submitting the connector to a +Pulsar Functions cluster. 
+ +- [Aerospike Sink Connector](io-aerospike.md) +- [Cassandra Sink Connector](io-cassandra.md) +- [Kafka Sink Connector](io-kafka.md#sink) +- [Kafka Source Connector](io-kafka.md#source) +- [Kinesis Sink Connector](io-kinesis.md#sink) +- [RabbitMQ Source Connector](io-rabbitmq.md#source) +- [RabbitMQ Sink Connector](io-rabbitmq.md#sink) +- [Twitter Firehose Source Connector](io-twitter.md) +- [CDC Source Connector based on Debezium](io-cdc.md) +- [Netty Source Connector](io-netty.md#source) +- [Hbase Sink Connector](io-hbase.md#sink) +- [ElasticSearch Sink Connector](io-elasticsearch.md#sink) +- [File Source Connector](io-file.md#source) +- [Hdfs Sink Connector](io-hdfs.md#sink) +- [MongoDB Sink Connector](io-mongo.md#sink) +- [Redis Sink Connector](io-redis.md#sink) +- [Solr Sink Connector](io-solr.md#sink) +- [InfluxDB Sink Connector](io-influxdb.md#sink) +- [JDBC Sink Connector](io-jdbc.md) diff --git a/site2/website/versioned_docs/version-2.4.2/io-debug.md b/site2/website/versioned_docs/version-2.4.2/io-debug.md new file mode 100644 index 0000000000000..6f996dce450e0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-debug.md @@ -0,0 +1,329 @@ +--- +id: version-2.4.2-io-debug +title: How to debug Pulsar connectors +sidebar_label: Debug +original_id: io-debug +--- +This guide explains how to debug connectors in localrun or cluster mode and gives a debugging checklist. +To better demonstrate how to debug Pulsar connectors, this guide takes a Mongo sink connector as an example. + +**Deploy a Mongo sink environment** +1. Start a Mongo service. + ```bash + docker pull mongo:4 + docker run -d -p 27017:27017 --name pulsar-mongo -v $PWD/data:/data/db mongo:4 + ``` +2. Create a DB and a collection. + ```bash + docker exec -it pulsar-mongo /bin/bash + mongo + > use pulsar + > db.createCollection('messages') + > exit + ``` +3. Start Pulsar standalone. 
+ ```bash + docker pull apachepulsar/pulsar:2.4.0 + docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --link pulsar-mongo --name pulsar-mongo-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + ``` +4. Configure the Mongo sink with the `mongo-sink-config.yaml` file. + ```bash + configs: + mongoUri: "mongodb://pulsar-mongo:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + ``` + ```bash + docker cp mongo-sink-config.yaml pulsar-mongo-standalone:/pulsar/ + ``` +5. Download the Mongo sink nar package. + ```bash + docker exec -it pulsar-mongo-standalone /bin/bash + curl -O http://apache.01link.hk/pulsar/pulsar-2.4.0/connectors/pulsar-io-mongo-2.4.0.nar + ``` +## Debug in localrun mode +Start the Mongo sink in localrun mode using the `localrun` command. +> #### Tip +> +> For more information about the `localrun` command, see [`localrun`](reference-connector-admin.md/#localrun-1). +```bash +./bin/pulsar-admin sinks localrun \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public --namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 +``` +### Use connector log +Use one of the following methods to get a connector log in localrun mode: +* After executing the `localrun` command, the **log is automatically printed on the console**. +* The log is located at: + + ```bash + logs/functions/tenant/namespace/function-name/function-name-instance-id.log + ``` + + **Example** + + The path of the Mongo sink connector is: + ```bash + logs/functions/public/default/pulsar-mongo-sink/pulsar-mongo-sink-0.log + ``` +To clearly explain the log information, here breaks down the large block of information into small blocks and add descriptions for each block. +* This piece of log information shows the storage path of the nar package after decompression. 
+ ``` + 08:21:54.132 [main] INFO org.apache.pulsar.common.nar.NarClassLoader - Created class loader with paths: [file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/, file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/, + ``` + > #### Tip + > + > If `class cannot be found` exception is thrown, check whether the nar file is decompressed in the folder `file:/tmp/pulsar-nar/pulsar-io-mongo-2.4.0.nar-unpacked/META-INF/bundled-dependencies/` or not. +* This piece of log information illustrates the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, resources, and so on, which can be used to **check whether the Mongo sink connector is configured correctly or not**. + ```bash + 08:21:55.390 [main] INFO org.apache.pulsar.functions.runtime.ThreadRuntime - ThreadContainer starting function with instance config InstanceConfig(instanceId=0, functionId=853d60a1-0c48-44d5-9a5c-6917386476b2, functionVersion=c2ce1458-b69e-4175-88c0-a0a856a2be8c, functionDetails=tenant: "public" + namespace: "default" + name: "pulsar-mongo-sink" + className: "org.apache.pulsar.functions.api.utils.IdentityFunction" + autoAck: true + parallelism: 1 + source { + typeClassName: "[B" + inputSpecs { + key: "test-mongo" + value { + } + } + cleanupSubscription: true + } + sink { + className: "org.apache.pulsar.io.mongodb.MongoSink" + configs: "{\"mongoUri\":\"mongodb://pulsar-mongo:27017\",\"database\":\"pulsar\",\"collection\":\"messages\",\"batchSize\":2,\"batchTimeMs\":500}" + typeClassName: "[B" + } + resources { + cpu: 1.0 + ram: 1073741824 + disk: 10737418240 + } + componentType: SINK + , maxBufferedTuples=1024, functionAuthenticationSpec=null, port=38459, clusterName=local) + ``` +* This piece of log information demonstrates the status of the connections to Mongo and configuration information. 
+ ```bash + 08:21:56.231 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.connection - Opened connection [connectionId{localValue:1, serverValue:8}] to pulsar-mongo:27017 + 08:21:56.326 [cluster-ClusterId{value='5d6396a3c9e77c0569ff00eb', description='null'}-pulsar-mongo:27017] INFO org.mongodb.driver.cluster - Monitor thread successfully connected to server with description ServerDescription{address=pulsar-mongo:27017, type=STANDALONE, state=CONNECTED, ok=true, version=ServerVersion{versionList=[4, 2, 0]}, minWireVersion=0, maxWireVersion=8, maxDocumentSize=16777216, logicalSessionTimeoutMinutes=30, roundTripTimeNanos=89058800} + ``` +* This piece of log information explains the configuration of consumers and clients, including the topic name, subscription name, subscription type, and so on. + ```bash + 08:21:56.719 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Starting Pulsar consumer perf with config: { + "topicNames" : [ "test-mongo" ], + "topicsPattern" : null, + "subscriptionName" : "public/default/pulsar-mongo-sink", + "subscriptionType" : "Shared", + "receiverQueueSize" : 1000, + "acknowledgementsGroupTimeMicros" : 100000, + "negativeAckRedeliveryDelayMicros" : 60000000, + "maxTotalReceiverQueueSizeAcrossPartitions" : 50000, + "consumerName" : null, + "ackTimeoutMillis" : 0, + "tickDurationMillis" : 1000, + "priorityLevel" : 0, + "cryptoFailureAction" : "CONSUME", + "properties" : { + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink", + "instance_id" : "0" + }, + "readCompacted" : false, + "subscriptionInitialPosition" : "Latest", + "patternAutoDiscoveryPeriod" : 1, + "regexSubscriptionMode" : "PersistentOnly", + "deadLetterPolicy" : null, + "autoUpdatePartitions" : true, + "replicateSubscriptionState" : false, + "resetIncludeHead" : false + } + 08:21:56.726 [pulsar-client-io-1-1] INFO 
org.apache.pulsar.client.impl.ConsumerStatsRecorderImpl - Pulsar client config: { + "serviceUrl" : "pulsar://localhost:6650", + "authPluginClassName" : null, + "authParams" : null, + "operationTimeoutMs" : 30000, + "statsIntervalSeconds" : 60, + "numIoThreads" : 1, + "numListenerThreads" : 1, + "connectionsPerBroker" : 1, + "useTcpNoDelay" : true, + "useTls" : false, + "tlsTrustCertsFilePath" : null, + "tlsAllowInsecureConnection" : false, + "tlsHostnameVerificationEnable" : false, + "concurrentLookupRequest" : 5000, + "maxLookupRequest" : 50000, + "maxNumberOfRejectedRequestPerConnection" : 50, + "keepAliveIntervalSeconds" : 30, + "connectionTimeoutMs" : 10000, + "requestTimeoutMs" : 60000, + "defaultBackoffIntervalNanos" : 100000000, + "maxBackoffIntervalNanos" : 30000000000 + } + ``` +## Debug in cluster mode +You can use the following methods to debug a connector in cluster mode: +* [Use connector log](#use-connector-log) +* [Use admin CLI](#use-admin-cli) +### Use connector log +In cluster mode, multiple connectors can run on a worker. To find the log path of a specified connector, use the `workerId` to locate the connector log. +### Use admin CLI +Pulsar admin CLI helps you debug Pulsar connectors with the following subcommands: +* [`get`](#get) + +* [`status`](#status) +* [`topics stats`](#topics-stats) + +**Create a Mongo sink** +```bash +./bin/pulsar-admin sinks create \ +--archive pulsar-io-mongo-2.4.0.nar \ +--tenant public \ +--namespace default \ +--inputs test-mongo \ +--name pulsar-mongo-sink \ +--sink-config-file mongo-sink-config.yaml \ +--parallelism 1 +``` +### `get` +Use the `get` command to get the basic information about the Mongo sink connector, such as tenant, namespace, name, parallelism, and so on. 
+```bash +./bin/pulsar-admin sinks get --tenant public --namespace default --name pulsar-mongo-sink +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-mongo-sink", + "className": "org.apache.pulsar.io.mongodb.MongoSink", + "inputSpecs": { + "test-mongo": { + "isRegexPattern": false + } + }, + "configs": { + "mongoUri": "mongodb://pulsar-mongo:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": 2.0, + "batchTimeMs": 500.0 + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} +``` +> #### Tip +> +> For more information about the `get` command, see [`get`](reference-connector-admin.md/#get-1). +### `status` +Use the `status` command to get the current status of the Mongo sink connector, such as the number of instances, the number of running instances, instanceId, workerId and so on. +```bash +./bin/pulsar-admin sinks status \ +--tenant public \ +--namespace default \ +--name pulsar-mongo-sink +{ +"numInstances" : 1, +"numRunning" : 1, +"instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-5d202832fd18-8080" + } +} ] +} +``` +> #### Tip +> +> For more information about the `status` command, see [`status`](reference-connector-admin.md/#status-1). +> +> If there are multiple connectors running on a worker, `workerId` can locate the worker on which the specified connector is running. +### `topics stats` +Use the `topics stats` command to get the stats for a topic and its connected producer and consumer, such as whether the topic has received messages or not, whether there is a backlog of messages or not, the available permits and other key information. 
All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. +```bash +./bin/pulsar-admin topics stats test-mongo +{ + "msgRateIn" : 0.0, + "msgThroughputIn" : 0.0, + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "averageMsgSize" : 0.0, + "storageSize" : 1, + "publishers" : [ ], + "subscriptions" : { + "public/default/pulsar-mongo-sink" : { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "msgBacklog" : 0, + "blockedSubscriptionOnUnackedMsgs" : false, + "msgDelayed" : 0, + "unackedMessages" : 0, + "type" : "Shared", + "msgRateExpired" : 0.0, + "consumers" : [ { + "msgRateOut" : 0.0, + "msgThroughputOut" : 0.0, + "msgRateRedeliver" : 0.0, + "consumerName" : "dffdd", + "availablePermits" : 999, + "unackedMessages" : 0, + "blockedConsumerOnUnackedMsgs" : false, + "metadata" : { + "instance_id" : "0", + "application" : "pulsar-sink", + "id" : "public/default/pulsar-mongo-sink" + }, + "connectedSince" : "2019-08-26T08:48:07.582Z", + "clientVersion" : "2.4.0", + "address" : "/172.17.0.3:57790" + } ], + "isReplicated" : false + } + }, + "replication" : { }, + "deduplicationStatus" : "Disabled" +} +``` +> #### Tip +> +> For more information about the `topic stats` command, see [`topic stats`](http://pulsar.apache.org/docs/en/pulsar-admin/#stats-1). +## Checklist +This checklist indicates the major areas to check when you debug connectors. It is a reminder of what to look for to ensure a thorough review and an evaluation tool to get the status of connectors. +* Does Pulsar start successfully? + +* Does the external service run normally? + +* Is the nar package complete? + +* Is the connector configuration file correct? + +* In localrun mode, run a connector and check the printed information (connector log) on the console. + +* In cluster mode: + + * Use the `get` command to get the basic information. + + * Use the `status` command to get the current status. 
+ * Use the `topics stats` command to get the stats for a specified topic and its connected producers and consumers. + + * Check the connector log. +* Enter into the external system and verify the result. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/io-develop.md b/site2/website/versioned_docs/version-2.4.2/io-develop.md new file mode 100644 index 0000000000000..c7ce456342559 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-develop.md @@ -0,0 +1,196 @@ +--- +id: version-2.4.2-io-develop +title: Develop Connectors +sidebar_label: Developing Connectors +original_id: io-develop +--- + +This guide describes how developers can write new connectors for Pulsar IO to move data +between Pulsar and other systems. It describes how to create a Pulsar IO connector. + +Pulsar IO connectors are specialized [Pulsar Functions](functions-overview.md). So writing +a Pulsar IO connector is as simple as writing a Pulsar function. Pulsar IO connectors come +in two flavors: {@inject: github:`Source`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java}, +which import data from another system, and {@inject: github:`Sink`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java}, +which export data to another system. For example, [KinesisSink](io-kinesis.md) would export +the messages of a Pulsar topic to a Kinesis stream, and [RabbitmqSource](io-rabbitmq.md) would import +the messages of a RabbitMQ queue to a Pulsar topic. + +### Developing + +#### Develop a source connector + +What you need to develop a source connector is to implement {@inject: github:`Source`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} +interface. + +First, you need to implement the {@inject: github:`open`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java#L33} method. This method will be called once when the source connector +is initialized. 
In this method, you can retrieve all the connector specific settings through +the passed `config` parameter, and initialize all the necessary resources. For example, a Kafka +connector can create the Kafka client in this `open` method. + +Besides the passed-in `config` object, the Pulsar runtime also provides a `SourceContext` for the +connector to access runtime resources for tasks like collecting metrics. The implementation can +save the `SourceContext` for further usage. + +```java + /** + * Open connector with configuration + * + * @param config initialization config + * @param sourceContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SourceContext sourceContext) throws Exception; +``` + +The main task for a Source implementor is to implement {@inject: github:`read`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java#L41} +method. + +```java + /** + * Reads the next message from source. + * If source does not have any new messages, this call should block. + * @return next message from source. The return result should never be null + * @throws Exception + */ + Record read() throws Exception; +``` + +The implementation should be blocking on this method if nothing to return. It should never return +`null`. The returned {@inject: github:`Record`:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java#L28} should encapsulate the information that is needed by +Pulsar IO runtime. + +This information includes: + +- *Topic Name*: _Optional_. If the record is originated from a Pulsar topic, it should be the Pulsar topic name. +- *Key*: _Optional_. If the record has a key associated with it. +- *Value*: _Required_. The actual data of this record. +- *Partition Id*: _Optional_. If the record is originated from a partitioned source, + return its partition id. 
The partition id will be used as part of the unique identifier + by Pulsar IO runtime to do message deduplication and achieve exactly-once processing guarantee. +- *Record Sequence*: _Optional_. If the record is originated from a sequential source, + return its record sequence. The record sequence will be used as part of the unique identifier + by Pulsar IO runtime to do message deduplication and achieve exactly-once processing guarantee. +- *Properties*: _Optional_. If the record carries user-defined properties, return those properties. + +Additionally, the implementation of the record should provide two methods: `ack` and `fail`. These +two methods will be used by the Pulsar IO connector to acknowledge the records that it has finished +processing and fail the records that it has failed to process. + +{@inject: github:`KafkaSource`:/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java} is a good example to follow. + +#### Develop a sink connector + +Developing a sink connector is as easy as developing a source connector. You just need to +implement the {@inject: github:`Sink`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} interface. + +Similarly, you first need to implement the {@inject: github:`open`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L36} method to initialize all the necessary resources +before implementing the {@inject: github:`write`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L44} method. + +```java + /** + * Open connector with configuration + * + * @param config initialization config + * @param sinkContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SinkContext sinkContext) throws Exception; +``` + +The main task for a Sink implementor is to implement the {@inject: github:`write`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L44} method. 
+ +```java + /** + * Write a message to Sink + * @param inputRecordContext Context of input record from the source + * @param record record to write to sink + * @throws Exception + */ + void write(Record record) throws Exception; +``` + +In the implementation of `write` method, the implementor can decide how to write the value and +the optional key to the actual source, and leverage all the provided information such as +`Partition Id`, `Record Sequence` for achieving different processing guarantees. The implementor +is also responsible for acknowledging records if it has successfully written them or failing +records if it has failed to write them. + +### Testing + +Testing connectors can be challenging because Pulsar IO connectors interact with two systems +that may be difficult to mock - Pulsar and the system the connector is connecting to. It is +recommended to write unit tests that specifically test the functionalities of the connector classes +while mocking the external services. + +Once you have written sufficient unit tests for your connector, we also recommend adding +separate integration tests to verify end-to-end functionality. In Pulsar, we are using +[testcontainers](https://www.testcontainers.org/) for all Pulsar integration tests. Pulsar IO +{@inject: github:`IntegrationTests`:/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io} are good examples to follow on integration testing your connectors. + +### Packaging + +Once you've developed and tested your connector, you must package it so that it can be submitted +to a [Pulsar Functions](functions-overview.md) cluster. The two approaches described +here work with Pulsar Functions' runtime. + +If you plan to package and distribute your connector for others to use, you are obligated to +properly license and copyright your own code and to adhere to the licensing and copyrights of +all libraries your code uses and that you include in your distribution. 
If you are using the +approach described in ["Creating a NAR package"](#creating-a-nar-package), the NAR plugin will +automatically create a `DEPENDENCIES` file in the generated NAR package, including the proper +licensing and copyrights of all libraries of your connector. + +#### Creating a NAR package + +The easiest approach to packaging a Pulsar IO connector is to create a NAR package using +[nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin). + +NAR stands for NiFi Archive. It is a custom packaging mechanism used by Apache NiFi, to provide +a bit of Java ClassLoader isolation. For more details, you can read this +[blog post](https://medium.com/hashmapinc/nifi-nar-files-explained-14113f7796fd) to understand +how NAR works. Pulsar uses the same mechanism for packaging all the [builtin connectors](io-connectors). + +All what you need is to include this [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin) in your maven project for your connector. For example: + +```xml + + + org.apache.nifi + nifi-nar-maven-plugin + 1.2.0 + + +``` + +The {@inject: github:`TwitterFirehose`:/pulsar-io/twitter} connector is a good example to follow. + +#### Creating an Uber JAR + +An alternative approach is to create an _uber JAR_ that contains all of the connector's JAR files +and other resource files. No directory internal structure is necessary. + +You can use [maven-shade-plugin](https://maven.apache.org/plugins/maven-shade-plugin/examples/includes-excludes.html) to create a Uber JAR. 
For example: + +```xml + + org.apache.maven.plugins + maven-shade-plugin + 3.1.1 + + + package + + shade + + + + + *:* + + + + + + +``` diff --git a/site2/website/versioned_docs/version-2.4.2/io-jdbc.md b/site2/website/versioned_docs/version-2.4.2/io-jdbc.md new file mode 100644 index 0000000000000..8526d5168906e --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-jdbc.md @@ -0,0 +1,24 @@ +--- +id: version-2.4.2-io-jdbc +title: JDBC Connector +sidebar_label: JDBC Connector +original_id: io-jdbc +--- + +## Sink + +The JDBC Sink Connector is used to pull messages from Pulsar topics and persist the messages to a MySQL or SQLite database. +Currently, INSERT, DELETE and UPDATE are supported. + +### Sink Configuration Options + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| userName | `false` | `` | Username used to connect to the database specified by `jdbcUrl`. | +| password | `false` | `` | Password used to connect to the database specified by `jdbcUrl`. | +| jdbcUrl | `true` | `` | The JDBC url of the database this connector connects to. | +| tableName | `true` | `` | The name of the table this connector writes messages to. | +| nonKey | `false` | `` | Fields used in update events. A comma-separated list. | +| key | `false` | `` | Fields used in where condition of update and delete Events. A comma-separated list. | +| timeoutMs | `false` | `500` | The jdbc operation timeout in milliseconds. | +| batchSize | `false` | `200` | The batch size of updates made to the database. 
| \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.4.2/io-netty.md b/site2/website/versioned_docs/version-2.4.2/io-netty.md new file mode 100644 index 0000000000000..567e6b6354a19 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-netty.md @@ -0,0 +1,148 @@ +--- +id: version-2.4.2-io-netty +title: Netty Tcp or Udp Connector +sidebar_label: Netty Tcp or Udp Connector +original_id: io-netty +--- + +## Source + +The Netty Source connector opens a port that accepts incoming data via the configured network protocol and publishes it to a user-defined Pulsar topic. +Also, this connector is suggested to be used in a containerized (e.g. k8s) deployment. +Otherwise, if the connector is running in process or thread mode, the instances may be conflicting on listening to ports. + +### Source Configuration Options + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `type` | `false` | `tcp` | The network protocol over which data is transmitted to netty. Valid values include HTTP, TCP, and UDP | +| `host` | `false` | `127.0.0.1` | The host name or address that the source instance listens on. | +| `port` | `false` | `10999` | The port that the source instance listens on. | +| `numberOfThreads` | `false` | `1` | The number of threads of Netty Tcp Server to accept incoming connections and handle the traffic of the accepted connections. 
| + + +### Configuration Example + +Here is a configuration Json example: + +```$json +{ + "type": "tcp", + "host": "127.0.0.1", + "port": "10911", + "numberOfThreads": "5" +} +``` +Here is a configuration Yaml example: + +```$yaml +configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 +``` + +### Usage example + + +- Start pulsar standalone + +```$bash +docker pull apachepulsar/pulsar:2.4.0 +docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone +``` + +- Start pulsar-io in standalone + +#### Tcp example + +- Config file netty-source-config.yaml + +```$yaml +configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 +``` + +- Copy configuration file to pulsar server + +```$bash +docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ +``` + +- Download netty connector and start netty connector + +```$bash +docker exec -it pulsar-netty-standalone /bin/bash +curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-2.4.0/connectors/pulsar-io-netty-2.4.0.nar +./bin/pulsar-admin sources localrun --archive pulsar-io-{{pulsar:version}}.nar --tenant public --namespace default --name netty --destination-topic-name netty-topic --source-config-file netty-source-config.yaml --parallelism 1 +``` + +- Consume data + +```$bash +docker exec -it pulsar-netty-standalone /bin/bash +./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 +``` + +- Open another window for send data to netty source + +```$bash +docker exec -it pulsar-netty-standalone /bin/bash +apt-get update +apt-get -y install telnet +root@1d19327b2c67:/pulsar# telnet 127.0.0.1 10999 +Trying 127.0.0.1... +Connected to 127.0.0.1. +Escape character is '^]'. 
+hello +world +``` + +- Verification results + +In the consumer window just opened, you can see the following data + +```bash +----- got message ----- +hello + +----- got message ----- +world +``` + +#### Http example + +- Config file netty-source-config.yaml + +```$yaml +configs: + type: "http" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 +``` +- Start netty source + +```$bash +docker exec -it pulsar-netty-standalone /bin/bash +./bin/pulsar-admin sources localrun --archive pulsar-io-{{pulsar:version}}.nar --tenant public --namespace default --name netty --destination-topic-name netty-topic --source-config-file netty-source-config.yaml --parallelism 1 +``` + +- Send data to netty source + +```bash +curl -X POST --data 'hello, world!' http://127.0.0.1:10999/ +``` + +- Verification results + +In the consumer window just opened, you can see the following data + +```bash +----- got message ----- +hello, world! +``` diff --git a/site2/website/versioned_docs/version-2.4.2/io-use.md b/site2/website/versioned_docs/version-2.4.2/io-use.md new file mode 100644 index 0000000000000..9ec17da0600a5 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/io-use.md @@ -0,0 +1,1505 @@ +--- +id: version-2.4.2-io-use +title: How to use Pulsar connectors +sidebar_label: Use +original_id: io-use +--- + +This guide describes how to use Pulsar connectors. + +## Install a connector + +Pulsar bundles several [builtin connectors](io-connectors.md) used to move data in and out of commonly used systems (such as databases and messaging systems). Optionally, you can create and use your desired non-builtin connectors. + +> #### Note +> +> When using a non-builtin connector, you need to specify the path of an archive file for the connector. + +To set up a builtin connector, follow +the instructions [here](getting-started-standalone.md#installing-builtin-connectors). 
+ +After the setup, the builtin connector is automatically discovered by Pulsar brokers (or function-workers), so no additional installation steps are required. + +## Configure a connector + +You can configure the following information: + +* [Configure a default storage location for a connector](#configure-a-default-storage-location-for-a-connector) + +* [Configure a connector with a YAML file](#configure-a-connector-with-a-yaml-file) + +### Configure a default storage location for a connector + +To configure a default folder for builtin connectors, set the `connectorsDirectory` parameter in the `./conf/functions_worker.yml` configuration file. + +**Example** + +Set the `./connectors` folder as the default storage location for builtin connectors. + +``` +######################## +# Connectors +######################## + +connectorsDirectory: ./connectors +``` + +### Configure a connector with a YAML file + +To configure a connector, you need to provide a YAML configuration file when creating a connector. + +The YAML configuration file tells Pulsar where to locate connectors and how to connect connectors with Pulsar topics. + +**Example 1** + +Below is a YAML configuration file of a Cassandra sink, which tells Pulsar: + +* Which Cassandra cluster to connect to + +* What is the `keyspace` and `columnFamily` to be used in Cassandra for collecting data + +* How to map Pulsar messages into Cassandra table key and columns + +```shell +tenant: public +namespace: default +name: cassandra-test-sink +... +# cassandra specific config +configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" +``` + +**Example 2** + +Below is a YAML configuration file of a Kafka source. 
+ +```shell +configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: "false" +``` + +**Example 3** + +Below is a YAML configuration file of a MySQL JDBC sink. + +```shell +configs: + userName: "root" + password: "jdbc" + jdbcUrl: "jdbc:mysql://127.0.0.1:3306/test_jdbc" + tableName: "test_jdbc" +``` + +## Get available connectors + +Before starting using connectors, you can perform the following operations: + +* [Reload connectors](#reload) + +* [Get a list of available connectors](#get-available-connectors) + +### `reload` + +If you add or delete a nar file in a connector folder, reload the available builtin connector before using it. + +#### Source + +Use the `reload` subcommand. + +```shell +$ pulsar-admin sources reload +``` + +For more information, see [`here`](reference-connector-admin/#reload). + +#### Sink + +Use the `reload` subcommand. + +```shell +$ pulsar-admin sinks reload +``` + +For more information, see [`here`](reference-connector-admin/#reload-1). + +### `available` + +After reloading connectors (optional), you can get a list of available connectors. + +#### Source + +Use the `available-sources` subcommand. + +```shell +$ pulsar-admin sources available-sources +``` + +#### Sink + +Use the `available-sinks` subcommand. + +```shell +$ pulsar-admin sinks available-sinks +``` + +## Run a connector + +To run a connector, you can perform the following operations: + +* [Create a connector](#create) + +* [Start a connector](#start) + +* [Run a connector locally](#localrun) + +### `create` + +You can create a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Create a source connector. + + + + + +Use the `create` subcommand. + +``` +$ pulsar-admin sources create options +``` + +For more information, see [here](reference-connector-admin.md#create). 
+ + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/registerSource} + + + +* Create a source connector with a **local file**. + + ```java + void createSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + ``` + + **Parameter** + + |Name|Description + |---|--- + `sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#createSource-SourceConfig-java.lang.String-). + +* Create a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + void createSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sourceConfig` | The source configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSourceWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#createSourceWithUrl-SourceConfig-java.lang.String-). + + + +#### Sink + +Create a sink connector. + + + + + +Use the `create` subcommand. + +``` +$ pulsar-admin sinks create options +``` + +For more information, see [here](reference-connector-admin.md#create-1). + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/registerSink} + + + +* Create a sink connector with a **local file**. 
+ + ```java + void createSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + ``` + + **Parameter** + + |Name|Description + |---|--- + `sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#createSink-SinkConfig-java.lang.String-). + +* Create a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + void createSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + Parameter| Description + |---|--- + `sinkConfig` | The sink configuration object + `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`createSinkWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#createSinkWithUrl-SinkConfig-java.lang.String-). + + + +### `start` + +You can start a connector using **Admin CLI** or **REST API**. + +#### Source + +Start a source connector. + + + + + +Use the `start` subcommand. + +``` +$ pulsar-admin sources start options +``` + +For more information, see [here](reference-connector-admin.md#start). + + + +* Start **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/start|operation/startSource} + +* Start a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/start|operation/startSource} + + + +#### Sink + +Start a sink connector. 
+ + + + + +Use the `start` subcommand. + +``` +$ pulsar-admin sinks start options +``` + +For more information, see [here](reference-connector-admin.md#start-1). + + + +* Start **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/start|operation/startSink} + +* Start a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/start|operation/startSink} + + + +### `localrun` + +You can run a connector locally rather than deploying it on a Pulsar cluster using **Admin CLI**. + +#### Source + +Run a source connector locally. + + + + + +Use the `localrun` subcommand. + +``` +$ pulsar-admin sources localrun options +``` + +For more information, see [here](reference-connector-admin.md#localrun). + + + +#### Sink + +Run a sink connector locally. + + + + + +Use the `localrun` subcommand. + +``` +$ pulsar-admin sinks localrun options +``` + +For more information, see [here](reference-connector-admin.md#localrun-1). + + + +## Monitor a connector + +To monitor a connector, you can perform the following operations: + +* [Get the information of a connector](#get) + +* [Get the list of all running connectors](#list) + +* [Get the current status of a connector](#status) + +### `get` + +You can get the information of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the information of a source connector. + + + + + +Use the `get` subcommand. + +``` +$ pulsar-admin sources get options +``` + +For more information, see [here](reference-connector-admin.md#get). 
+ + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/getSourceInfo} + + + +```java +SourceConfig getSource(String tenant, + String namespace, + String source) + throws PulsarAdminException +``` + +**Example** + +This is a sourceConfig. + +```java +{ + "tenant": "tenantName", + "namespace": "namespaceName", + "name": "sourceName", + "className": "className", + "topicName": "topicName", + "configs": {}, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} +``` + +This is a sourceConfig example. + +``` +{ + "tenant": "public", + "namespace": "default", + "name": "debezium-mysql-source", + "className": "org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource", + "topicName": "debezium-mysql-topic", + "configs": { + "database.user": "debezium", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.port": "3306", + "database.hostname": "localhost", + "database.password": "dbz", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.whitelist": "inventory", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "database.history.pulsar.topic": "history-topic2" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + } +} +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException.NotFoundException` | Cluster doesn't exist +`PulsarAdminException` | Unexpected error + +For more information, see 
[`getSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSource-java.lang.String-java.lang.String-java.lang.String-). + + + +#### Sink + +Get the information of a sink connector. + + + + + +Use the `get` subcommand. + +``` +$ pulsar-admin sinks get options +``` + +For more information, see [here](reference-connector-admin.md#get-1). + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/getSinkInfo} + + + +```java +SinkConfig getSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException +``` + +**Example** + +This is a sinkConfig. + +``` +{ +"tenant": "tenantName", +"namespace": "namespaceName", +"name": "sinkName", +"className": "className", +"inputSpecs": { +"topicName": { + "isRegexPattern": false +} +}, +"configs": {}, +"parallelism": 1, +"processingGuarantees": "ATLEAST_ONCE", +"retainOrdering": false, +"autoAck": true +} +``` + +This is a sinkConfig example. + +``` +{ +"tenant": "public", +"namespace": "default", +"name": "pulsar-mysql-jdbc-sink", +"className": "org.apache.pulsar.io.jdbc.JdbcAutoSchemaSink", +"inputSpecs": { +"pulsar-mysql-jdbc-sink-topic": { + "isRegexPattern": false +} +}, +"configs": { +"password": "jdbc", +"jdbcUrl": "jdbc:mysql://127.0.0.1:3306/pulsar_mysql_jdbc_sink", +"userName": "root", +"tableName": "pulsar_mysql_jdbc_sink" +}, +"parallelism": 1, +"processingGuarantees": "ATLEAST_ONCE", +"retainOrdering": false, +"autoAck": true +} +``` + +**Parameter description** + +Name| Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +For more information, see [`getSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSink-java.lang.String-java.lang.String-java.lang.String-). + + + +### `list` + +You can get the list of all running connectors using **Admin CLI**, **REST API** or **JAVA admin API**. 
+ +#### Source + +Get the list of all running source connectors. + + + + + +Use the `list` subcommand. + +``` +$ pulsar-admin sources list options +``` + +For more information, see [here](reference-connector-admin.md#list). + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/|operation/listSources} + + + +```java +List listSources(String tenant, + String namespace) + throws PulsarAdminException +``` + +**Response example** + +```java +["f1", "f2", "f3"] +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#listSources-java.lang.String-java.lang.String-). + + + +#### Sink + +Get the list of all running sink connectors. + + + + + +Use the `list` subcommand. + +``` +$ pulsar-admin sinks list options +``` + +For more information, see [here](reference-connector-admin.md#list-1). + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/|operation/listSinks} + + + +```java +List listSinks(String tenant, + String namespace) + throws PulsarAdminException +``` + +**Response example** + +```java +["f1", "f2", "f3"] +``` + +**Exception** + +Exception name | Description +|---|--- +`PulsarAdminException.NotAuthorizedException` | You don't have the admin permission +`PulsarAdminException` | Unexpected error + +For more information, see [`listSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#listSinks-java.lang.String-java.lang.String-). + + + +### `status` + +You can get the current status of a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Get the current status of a source connector. + + + + + +Use the `status` subcommand. 
+ +``` +$ pulsar-admin sources status options +``` + +For more information, see [here](reference-connector-admin.md#status). + + + +* Get the current status of **all** source connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/status|operation/getSourceStatus} + +* Get the current status of a **specified** source connector. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/status|operation/getSourceStatus} + + + +* Get the current status of **all** source connectors. + + ```java + SourceStatus getSourceStatus(String tenant, + String namespace, + String source) + throws PulsarAdminException + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + Name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSourceStatus-java.lang.String-java.lang.String-java.lang.String-). + +* Get the current status of a **specified** source connector. + + ```java + SourceStatus.SourceInstanceStatus.SourceInstanceStatusData getSourceStatus(String tenant, + String namespace, + String source, + int id) + throws PulsarAdminException + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `id` | Source instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSourceStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#getSourceStatus-java.lang.String-java.lang.String-java.lang.String-int-). + + + +#### Sink + +Get the current status of a Pulsar sink connector. 
+ + + + + +Use the `status` subcommand. + +``` +$ pulsar-admin sinks status options +``` + +For more information, see [here](reference-connector-admin.md#status-1). + + + +* Get the current status of **all** sink connectors. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/status|operation/getSinkStatus} + +* Get the current status of a **specified** sink connector. + + Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/status|operation/getSinkInstanceStatus} + + + +* Get the current status of **all** sink connectors. + + ```java + SinkStatus getSinkStatus(String tenant, + String namespace, + String sink) + throws PulsarAdminException + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatus`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-). + +* Get the current status of a **specified** sink connector. + + ```java + SinkStatus.SinkInstanceStatus.SinkInstanceStatusData getSinkStatus(String tenant, + String namespace, + String sink, + int id) + throws PulsarAdminException + ``` + + **Parameter** + + Parameter| Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `id` | Sink instanceID + + **Exception** + + Exception name | Description + |---|--- + `PulsarAdminException` | Unexpected error + + For more information, see [`getSinkStatusWithInstanceID`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#getSinkStatus-java.lang.String-java.lang.String-java.lang.String-int-). 
+ + + +## Update a connector + +### `update` + +You can update a running connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Update a running Pulsar source connector. + + + + + +Use the `update` subcommand. + +``` +$ pulsar-admin sources update options +``` + +For more information, see [here](reference-connector-admin.md#update). + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/updateSource} + + + +* Update a running source connector with a **local file**. + + ```java + void updateSource(SourceConfig sourceConfig, + String fileName) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sourceConfig` | The source configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#updateSource-SourceConfig-java.lang.String-). + +* Update a source connector using a **remote file** with a URL from which fun-pkg can be downloaded. + + ```java + void updateSourceWithUrl(SourceConfig sourceConfig, + String pkgUrl) + throws PulsarAdminException + ``` + + Supported URLs are `http` and `file`. 
+ + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sourceConfig` | The source configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + +For more information, see [`createSourceWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#updateSourceWithUrl-SourceConfig-java.lang.String-). + + + +#### Sink + +Update a running Pulsar sink connector. + + + + + +Use the `update` subcommand. + +``` +$ pulsar-admin sinks update options +``` + +For more information, see [here](reference-connector-admin.md#update-1). + + + +Send a `PUT` request to this endpoint: {@inject: endpoint|PUT|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/updateSink} + + + +* Update a running sink connector with a **local file**. + + ```java + void updateSink(SinkConfig sinkConfig, + String fileName) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + |`sinkConfig` | The sink configuration object + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + | `PulsarAdminException.NotFoundException` | Cluster doesn't exist + | `PulsarAdminException` | Unexpected error + + For more information, see [`updateSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSink-SinkConfig-java.lang.String-). + +* Update a sink connector using a **remote file** with a URL from which fun-pkg can be downloaded. 
+ + ```java + void updateSinkWithUrl(SinkConfig sinkConfig, + String pkgUrl) + throws PulsarAdminException + ``` + + Supported URLs are `http` and `file`. + + **Example** + + * HTTP: http://www.repo.com/fileName.jar + + * File: file:///dir/fileName.jar + + **Parameter** + + | Name | Description + |---|--- + | `sinkConfig` | The sink configuration object + | `pkgUrl` | URL from which pkg can be downloaded + + **Exception** + + |Name|Description| + |---|--- + |`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission + |`PulsarAdminException.NotFoundException` | Cluster doesn't exist + |`PulsarAdminException` | Unexpected error + +For more information, see [`updateSinkWithUrl`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#updateSinkWithUrl-SinkConfig-java.lang.String-). + + + +## Stop a connector + +### `stop` + +You can stop a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Stop a source connector. + + + + + +Use the `stop` subcommand. + +``` +$ pulsar-admin sources stop options +``` + +For more information, see [here](reference-connector-admin.md#stop). + + + +* Stop **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/stop|operation/stopSource} + +* Stop a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/stop|operation/stopSource} + + + +* Stop **all** source connectors.
+ + ```java + void stopSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** source connector. + + ```java + void stopSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#stopSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + +#### Sink + +Stop a sink connector. + + + + + +Use the `stop` subcommand. + +``` +$ pulsar-admin sinks stop options +``` + +For more information, see [here](reference-connector-admin.md#stop-1). + + + +* Stop **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/stop|operation/stopSink} + +* Stop a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/stop|operation/stopSink} + + + +* Stop **all** sink connectors.
+ + ```java + void stopSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-). + +* Stop a **specified** sink connector. + + ```java + void stopSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`stopSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#stopSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + +## Restart a connector + +### `restart` + +You can restart a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Restart a source connector. + + + + + +Use the `restart` subcommand. + +``` +$ pulsar-admin sources restart options +``` + +For more information, see [here](reference-connector-admin.md#restart). + + + +* Restart **all** source connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/restart|operation/restartSource} + +* Restart a **specified** source connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sources/:tenant/:namespace/:sourceName/:instanceId/restart|operation/restartSource} + + + +* Restart **all** source connectors.
+ + ```java + void restartSource(String tenant, + String namespace, + String source) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** source connector. + + ```java + void restartSource(String tenant, + String namespace, + String source, + int instanceId) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `source` | Source name + `instanceId` | Source instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#restartSource-java.lang.String-java.lang.String-java.lang.String-int-). + + + +#### Sink + +Restart a sink connector. + + + + + +Use the `restart` subcommand. + +``` +$ pulsar-admin sinks restart options +``` + +For more information, see [here](reference-connector-admin.md#restart-1). + + + +* Restart **all** sink connectors. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/restart|operation/restartSink} + +* Restart a **specified** sink connector. + + Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v3/sinks/:tenant/:namespace/:sinkName/:instanceId/restart|operation/restartSink} + + + +* Restart all Pulsar sink connectors.
+ + ```java + void restartSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-). + +* Restart a **specified** sink connector. + + ```java + void restartSink(String tenant, + String namespace, + String sink, + int instanceId) + throws PulsarAdminException + ``` + + **Parameter** + + | Name | Description + |---|--- + `tenant` | Tenant name + `namespace` | Namespace name + `sink` | Sink name + `instanceId` | Sink instanceID + + **Exception** + + |Name|Description| + |---|--- + | `PulsarAdminException` | Unexpected error + + For more information, see [`restartSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#restartSink-java.lang.String-java.lang.String-java.lang.String-int-). + + + +## Delete a connector + +### `delete` + +You can delete a connector using **Admin CLI**, **REST API** or **JAVA admin API**. + +#### Source + +Delete a source connector. + + + + + +Use the `delete` subcommand. + +``` +$ pulsar-admin sources delete options +``` + +For more information, see [here](reference-connector-admin.md#delete). + + + +Delete a Pulsar source connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sources/:tenant/:namespace/:sourceName|operation/deregisterSource} + + + +Delete a source connector.
+ +```java +void deleteSource(String tenant, + String namespace, + String source) + throws PulsarAdminException +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`source` | Source name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSource`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Source.html#deleteSource-java.lang.String-java.lang.String-java.lang.String-). + + + +#### Sink + +Delete a sink connector. + + + + + +Use the `delete` subcommand. + +``` +$ pulsar-admin sinks delete options +``` + +For more information, see [here](reference-connector-admin.md#delete-1). + + + +Delete a sink connector. + +Send a `DELETE` request to this endpoint: {@inject: endpoint|DELETE|/admin/v3/sinks/:tenant/:namespace/:sinkName|operation/deregisterSink} + + + +Delete a Pulsar sink connector. + +```java +void deleteSink(String tenant, + String namespace, + String sink) + throws PulsarAdminException +``` + +**Parameter** + +| Name | Description +|---|--- +`tenant` | Tenant name +`namespace` | Namespace name +`sink` | Sink name + +**Exception** + +|Name|Description| +|---|--- +|`PulsarAdminException.NotAuthorizedException`| You don't have the admin permission +| `PulsarAdminException.NotFoundException` | Cluster doesn't exist +| `PulsarAdminException.PreconditionFailedException` | Cluster is not empty +| `PulsarAdminException` | Unexpected error + +For more information, see [`deleteSink`](https://pulsar.apache.org/api/admin/org/apache/pulsar/client/admin/Sink.html#deleteSink-java.lang.String-java.lang.String-java.lang.String-).
+ + diff --git a/site2/website/versioned_docs/version-2.4.2/reference-configuration.md b/site2/website/versioned_docs/version-2.4.2/reference-configuration.md new file mode 100644 index 0000000000000..9d47e7a74288c --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/reference-configuration.md @@ -0,0 +1,494 @@ +--- +id: version-2.4.2-reference-configuration +title: Pulsar configuration +sidebar_label: Pulsar configuration +original_id: reference-configuration +--- + + + + +Pulsar configuration can be managed either via a series of configuration files contained in the [`conf`](https://github.com/apache/pulsar/tree/master/conf) directory of a Pulsar [installation](getting-started-standalone.md) + +* [BookKeeper](#bookkeeper) +* [Broker](#broker) +* [Client](#client) +* [Service discovery](#service-discovery) +* [Log4j](#log4j) +* [Log4j shell](#log4j-shell) +* [Standalone](#standalone) +* [WebSocket](#websocket) +* [ZooKeeper](#zookeeper) + +## BookKeeper + +BookKeeper is a replicated log storage system that Pulsar uses for durable storage of all messages. + + +|Name|Description|Default| +|---|---|---| +|bookiePort|The port on which the bookie server listens.|3181| +|allowLoopback|Whether the bookie is allowed to use a loopback interface as its primary interface (i.e. the interface used to establish its identity). By default, loopback interfaces are not allowed as the primary interface. Using a loopback interface as the primary interface usually indicates a configuration error. For example, it’s fairly common in some VPS setups to not configure a hostname or to have the hostname resolve to `127.0.0.1`. If this is the case, then all bookies in the cluster will establish their identities as `127.0.0.1:3181` and only one will be able to join the cluster. For VPSs configured like this, you should explicitly set the listening interface.|false| +|listeningInterface|The network interface on which the bookie listens. 
If not set, the bookie will listen on all interfaces.|eth0| +|journalDirectory|The directory where Bookkeeper outputs its write-ahead log (WAL)|data/bookkeeper/journal| +|ledgerDirectories|The directory where Bookkeeper outputs ledger snapshots. This could define multiple directories to store snapshots separated by comma, for example `ledgerDirectories=/tmp/bk1-data,/tmp/bk2-data`. Ideally, ledger dirs and the journal dir are each in a different device, which reduces the contention between random I/O and sequential write. It is possible to run with a single disk, but performance will be significantly lower.|data/bookkeeper/ledgers| +|ledgerManagerType|The type of ledger manager used to manage how ledgers are stored, managed, and garbage collected. See [BookKeeper Internals](http://bookkeeper.apache.org/docs/latest/getting-started/concepts) for more info.|hierarchical| +|zkLedgersRootPath|The root ZooKeeper path used to store ledger metadata. This parameter is used by the ZooKeeper-based ledger manager as a root znode to store all ledgers.|/ledgers| +|ledgerStorageClass|Ledger storage implementation class|org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage| +|entryLogFilePreallocationEnabled|Enable or disable entry logger preallocation|true| +|logSizeLimit|Max file size of the entry logger, in bytes. A new entry log file will be created when the old one reaches the file size limitation.|2147483648| +|minorCompactionThreshold|Threshold of minor compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a minor compaction. If set to less than zero, the minor compaction is disabled.|0.2| +|minorCompactionInterval|Time interval to run minor compaction, in seconds. If set to less than zero, the minor compaction is disabled.|3600| +|majorCompactionThreshold|The threshold of major compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a major compaction. 
Those entry log files whose remaining size percentage is still higher than the threshold will never be compacted. If set to less than zero, the major compaction is disabled.|0.5| +|majorCompactionInterval|The time interval to run major compaction, in seconds. If set to less than zero, the major compaction is disabled.|86400| +|compactionMaxOutstandingRequests|Sets the maximum number of entries that can be compacted without flushing. When compacting, the entries are written to the entrylog and the new offsets are cached in memory. Once the entrylog is flushed the index is updated with the new offsets. This parameter controls the number of entries added to the entrylog before a flush is forced. A higher value for this parameter means more memory will be used for offsets. Each offset consists of 3 longs. This parameter should not be modified unless you’re fully aware of the consequences.|100000| +|compactionRate|The rate at which compaction will read entries, in adds per second.|1000| +|isThrottleByBytes|Throttle compaction by bytes or by entries.|false| +|compactionRateByEntries|The rate at which compaction will read entries, in adds per second.|1000| +|compactionRateByBytes|Set the rate at which compaction will re-add entries. The unit is bytes added per second.|1000000| +|journalMaxSizeMB|Max file size of journal file, in megabytes. A new journal file will be created when the old one reaches the file size limitation.|2048| +|journalMaxBackups|The max number of old journal files to keep.
Keeping a number of old journal files would help data recovery in special cases.|5| +|journalPreAllocSizeMB|How much space to pre-allocate at a time in the journal.|16| +|journalWriteBufferSizeKB|The size of the write buffers used for the journal.|64| +|journalRemoveFromPageCache|Whether pages should be removed from the page cache after force write.|true| +|journalAdaptiveGroupWrites|Whether to group journal force writes, which optimizes group commit for higher throughput.|true| +|journalMaxGroupWaitMSec|The maximum latency to impose on a journal write to achieve grouping.|1| +|journalAlignmentSize|All the journal writes and commits should be aligned to given size|4096| +|journalBufferedWritesThreshold|Maximum writes to buffer to achieve grouping|524288| +|journalFlushWhenQueueEmpty|If we should flush the journal when journal queue is empty|false| +|numJournalCallbackThreads|The number of threads that should handle journal callbacks|8| +|rereplicationEntryBatchSize|The number of max entries to keep in fragment for re-replication|5000| +|gcWaitTime|How long the interval to trigger next garbage collection, in milliseconds. Since garbage collection is running in background, too frequent gc will hurt performance. It is better to give a higher number of gc interval if there is enough disk capacity.|900000| +|gcOverreplicatedLedgerWaitTime|How long the interval to trigger next garbage collection of overreplicated ledgers, in milliseconds. This should not be run very frequently since we read the metadata for all the ledgers on the bookie from zk.|86400000| +|flushInterval|How long the interval to flush ledger index pages to disk, in milliseconds. Flushing index files will introduce much random disk I/O. If separating journal dir and ledger dirs each on different devices, flushing would not affect performance. But if putting journal dir and ledger dirs on same device, performance degrades significantly on too frequent flushing.
You can consider increment flush interval to get better performance, but you need to pay more time on bookie server restart after failure.|60000| +|bookieDeathWatchInterval|Interval to watch whether bookie is dead or not, in milliseconds|1000| +|zkServers|A list of one or more servers on which zookeeper is running. The server list can be comma separated values, for example: zkServers=zk1:2181,zk2:2181,zk3:2181.|localhost:2181| +|zkTimeout|ZooKeeper client session timeout in milliseconds Bookie server will exit if it received SESSION_EXPIRED because it was partitioned off from ZooKeeper for more than the session timeout JVM garbage collection, disk I/O will cause SESSION_EXPIRED. Increment this value could help avoiding this issue|30000| +|serverTcpNoDelay|This setting is used to enable/disable Nagle’s algorithm, which is a means of improving the efficiency of TCP/IP networks by reducing the number of packets that need to be sent over the network. If you are sending many small messages, such that more than one can fit in a single IP packet, setting server.tcpnodelay to false to enable Nagle algorithm can provide better performance.|true| +|openFileLimit|Max number of ledger index files could be opened in bookie server If number of ledger index files reaches this limitation, bookie server started to swap some ledgers from memory to disk. Too frequent swap will affect performance. You can tune this number to gain performance according your requirements.|0| +|pageSize|Size of an index page in ledger cache, in bytes A larger index page can improve performance writing page to disk, which is efficient when you have small number of ledgers and these ledgers have similar number of entries.
If you have large number of ledgers and each ledger has fewer entries, smaller index page would improve memory usage.|8192| +|pageLimit|How many index pages provided in ledger cache If number of index pages reaches this limitation, bookie server starts to swap some ledgers from memory to disk. You can increment this value when you found swap became more frequent. But make sure pageLimit*pageSize should not be more than JVM max memory limitation, otherwise you would get OutOfMemoryException. In general, incrementing pageLimit, using smaller index page would gain better performance in large number of ledgers with fewer entries case If pageLimit is -1, bookie server will use 1/3 of JVM memory to compute the limitation of number of index pages.|0| +|readOnlyModeEnabled|If all ledger directories configured are full, then support only read requests for clients. If “readOnlyModeEnabled=true” then on all ledger disks full, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shutdown. By default this will be disabled.|true| +|diskUsageThreshold|For each ledger dir, maximum disk space which can be used. Default is 0.95f. i.e. 95% of disk can be used at most after which nothing will be written to that partition. If all ledger dir partitions are full, then bookie will turn to readonly mode if ‘readOnlyModeEnabled=true’ is set, else it will shutdown. Valid values should be in between 0 and 1 (exclusive).|0.95| +|diskCheckInterval|Disk check interval in milliseconds, interval to check the ledger dirs usage.|10000| +|auditorPeriodicCheckInterval|Interval at which the auditor will do a check of all ledgers in the cluster. By default this runs once a week. The interval is set in seconds. To disable the periodic check completely, set this to 0.
Note that periodic checking will put extra load on the cluster, so it should not be run more frequently than once a day.|604800| +|auditorPeriodicBookieCheckInterval|The interval between auditor bookie checks. The auditor bookie check, checks ledger metadata to see which bookies should contain entries for each ledger. If a bookie which should contain entries is unavailable, then the ledger containing that entry is marked for recovery. Setting this to 0 disables the periodic check. Bookie checks will still run when a bookie fails. The interval is specified in seconds.|86400| +|numAddWorkerThreads|number of threads that should handle write requests. if zero, the writes would be handled by netty threads directly.|0| +|numReadWorkerThreads|number of threads that should handle read requests. if zero, the reads would be handled by netty threads directly.|8| +|maxPendingReadRequestsPerThread|If read workers threads are enabled, limit the number of pending requests, to avoid the executor queue to grow indefinitely.|2500| +|readBufferSizeBytes|The number of bytes we should use as capacity for BufferedReadChannel.|4096| +|writeBufferSizeBytes|The number of bytes used as capacity for the write buffer|65536| +|useHostNameAsBookieID|Whether the bookie should use its hostname to register with the coordination service (e.g.: zookeeper service). When false, bookie will use its ipaddress for the registration.|false| +|statsProviderClass||org.apache.bookkeeper.stats.prometheus.PrometheusMetricsProvider| +|prometheusStatsHttpPort||8000| +|dbStorage_writeCacheMaxSizeMb|Size of Write Cache. Memory is allocated from JVM direct memory. Write cache is used to buffer entries before flushing into the entry log For good performance, it should be big enough to hold a substantial amount of entries in the flush interval|25% of direct memory| +|dbStorage_readAheadCacheMaxSizeMb|Size of Read cache. Memory is allocated from JVM direct memory.
This read cache is pre-filled doing read-ahead whenever a cache miss happens|25% of direct memory| +|dbStorage_readAheadCacheBatchSize|How many entries to pre-fill in cache after a read cache miss|1000| +|dbStorage_rocksDB_blockCacheSize|Size of RocksDB block-cache. For best performance, this cache should be big enough to hold a significant portion of the index database which can reach ~2GB in some cases|10% of direct memory| +|dbStorage_rocksDB_writeBufferSizeMB||64| +|dbStorage_rocksDB_sstSizeInMB||64| +|dbStorage_rocksDB_blockSize||65536| +|dbStorage_rocksDB_bloomFilterBitsPerKey||10| +|dbStorage_rocksDB_numLevels||-1| +|dbStorage_rocksDB_numFilesInLevel0||4| +|dbStorage_rocksDB_maxSizeInLevel1MB||256| + + + +## Broker + +Pulsar brokers are responsible for handling incoming messages from producers, dispatching messages to consumers, replicating data between clusters, and more. + +|Name|Description|Default| +|---|---|---| +|enablePersistentTopics| Whether persistent topics are enabled on the broker |true| +|enableNonPersistentTopics| Whether non-persistent topics are enabled on the broker |true| +|functionsWorkerEnabled| Whether the Pulsar Functions worker service is enabled in the broker |false| +|zookeeperServers| Zookeeper quorum connection string || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| Broker data port |6650| +|brokerServicePortTls| Broker data port for TLS |6651| +|webServicePort| Port to use to server HTTP request |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|webSocketServiceEnabled| Enable the WebSocket API service in broker |false| +|bindAddress| Hostname or IP address the service binds on, default is 0.0.0.0. |0.0.0.0| +|advertisedAddress| Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. 
|| +|clusterName| Name of the cluster to which this broker belongs to || +|brokerDeduplicationEnabled| Sets the default behavior for message deduplication in the broker. If enabled, the broker will reject messages that were already stored in the topic. This setting can be overridden on a per-namespace basis. |false| +|brokerDeduplicationMaxNumberOfProducers| The maximum number of producers for which information will be stored for deduplication purposes. |10000| +|brokerDeduplicationEntriesInterval| The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). |1000| +|brokerDeduplicationProducerInactivityTimeoutMinutes| The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. |360| +|zooKeeperSessionTimeoutMillis| Zookeeper session timeout in milliseconds |30000| +|brokerShutdownTimeoutMs| Time to wait for broker graceful shutdown. After this time elapses, the process will be killed |60000| +|backlogQuotaCheckEnabled| Enable backlog quota check. 
Enforces action on topic when the quota is reached |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the quota |60| +|backlogQuotaDefaultLimitGB| Default per-topic backlog quota limit |10| +|allowAutoTopicCreation| Enable topic auto creation if new producer or consumer connected |true| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics |60| +|messageExpiryCheckIntervalInMinutes| How frequently to proactively check and purge expired messages |5| +|brokerServiceCompactionMonitorIntervalInSeconds| Interval between checks to see if topics with compaction policies need to be compacted |60| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable check for minimum allowed client library version |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| Path for the file used to determine the rotation status for the broker when responding to service discovery health checks || +|preferLaterVersions| If true, (and ModularLoadManagerImpl is being used), the load manager will attempt to use only brokers running the latest software version (to minimize impact to bundles) |false| +|tlsEnabled| Enable TLS |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate file || +|tlsAllowInsecureConnection| Accept untrusted TLS certificate from client |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. 
Example:- ```TLSv1.2```, ```TLSv1.1```, ```TLSv1``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymmetric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify which of the token's claims will be used as the authentication "principal" or "role". The default "sub" claim will be used if this is left blank || +|maxUnackedMessagesPerConsumer| Max number of unacknowledged messages allowed to receive messages by a consumer on a shared subscription. Broker will stop sending messages to consumer once, this limit reaches until consumer starts acknowledging messages back. Using a value of 0, is disabling unackedMessage limit check and consumer can receive messages without any restriction |50000| +|maxUnackedMessagesPerSubscription| Max number of unacknowledged messages allowed per shared subscription. Broker will stop dispatching messages to all consumers of the subscription once this limit reaches until consumer starts acknowledging messages back and unack count reaches to limit/2.
Using a value of 0, is disabling unackedMessage-limit check and dispatcher can dispatch messages without any restriction |200000| +|subscriptionRedeliveryTrackerEnabled| Enable subscription message redelivery tracker |true| +|maxConcurrentLookupRequest| Max number of concurrent lookup request broker allows to throttle heavy incoming lookup traffic |50000| +|maxConcurrentTopicLoadRequest| Max number of concurrent topic loading request broker allows to control number of zk-operations |5000| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names || +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics || +|brokerClientAuthenticationPlugin| Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters || +|brokerClientAuthenticationParameters||| +|athenzDomainNames| Supported Athenz provider domain names(comma separated) for authentication || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to use when connecting to bookies || +|bookkeeperClientAuthenticationParametersName| BookKeeper auth plugin implementation specifics parameters name and values || +|bookkeeperClientAuthenticationParameters||| +|bookkeeperClientTimeoutInSeconds| Timeout for BK add / read operations |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time Using a value of 0, is disabling the speculative reads |0| +|bookkeeperClientHealthCheckEnabled| Enable bookies health check. Bookies that have more than the configured number of failure within the interval will be quarantined for some time.
During this period, new ledgers won’t be created on these bookies |true| +|bookkeeperClientHealthCheckIntervalSeconds||60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval||5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds ||1800| +|bookkeeperClientRackawarePolicyEnabled| Enable rack-aware bookie selection policy. BK will chose bookies from different racks when forming a new bookie ensemble |true| +|bookkeeperClientRegionawarePolicyEnabled| Enable region-aware bookie selection policy. BK will chose bookies from different regions and racks when forming a new bookie ensemble. If enabled, the value of bookkeeperClientRackawarePolicyEnabled is ignored |false| +|bookkeeperClientReorderReadSequenceEnabled| Enable/disable reordering read sequence on reading entries. |false| +|bookkeeperClientIsolationGroups| Enable bookie isolation by specifying a list of bookie groups to choose from. Any bookie outside the specified groups will not be used by the broker || +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +|bookkeeperEnableStickyReads | Enable/disable having read operations for a ledger to be sticky to a single bookie. If this flag is enabled, the client will use one single bookie (by preference) to read all entries for a ledger. | true | +|managedLedgerDefaultEnsembleSize| Number of bookies to use when creating a ledger |2| +|managedLedgerDefaultWriteQuorum| Number of copies to store for each message |2| +|managedLedgerDefaultAckQuorum| Number of guaranteed copies (acks to wait before write is complete) |2| +|managedLedgerCacheSizeMB| Amount of memory to use for caching data payload in managed ledger. 
This memory is allocated from JVM direct memory and it’s shared across all the topics running in the same broker. By default, uses 1/5th of available direct memory || +|managedLedgerCacheCopyEntries| Whether we should make a copy of the entry payloads when inserting in cache| false| +|managedLedgerCacheEvictionWatermark| Threshold to which bring down the cache level when eviction is triggered |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. | 1000| +|managedLedgerDefaultMarkDeleteRateLimit| Rate limit the amount of writes per second generated by consumer acking the messages |1.0| +|managedLedgerMaxEntriesPerLedger| Max number of entries to append to a ledger before triggering a rollover. A ledger rollover is triggered on these conditions:
  • Either the max rollover time has been reached
  • or max entries have been written to the ledger and at least min-time has passed
|50000| +|managedLedgerMinLedgerRolloverTimeMinutes| Minimum time between ledger rollover for a topic |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| Maximum time before forcing a ledger rollover for a topic |240| +|managedLedgerCursorMaxEntriesPerLedger| Max number of entries to append to a cursor ledger |50000| +|managedLedgerCursorRolloverTimeInSeconds| Max time before triggering a rollover on a cursor ledger |14400| +|managedLedgerMaxUnackedRangesToPersist| Max number of “acknowledgment holes” that are going to be persistently stored. When acknowledging out of order, a consumer will leave holes that are supposed to be quickly filled by acking all the messages. The information of which messages are acknowledged is persisted by compressing in “ranges” of messages that were acknowledged. After the max number of ranges is reached, the information will only be tracked in memory and messages will be redelivered in case of crashes. |1000| +|autoSkipNonRecoverableData| Skip reading non-recoverable/unreadable data-ledger under managed-ledger’s list.It helps when data-ledgers gets corrupted at bookkeeper and managed-cursor is stuck at that ledger. |false| +|loadBalancerEnabled| Enable load balancer |true| +|loadBalancerPlacementStrategy| Strategy to assign a new bundle weightedRandomSelection || +|loadBalancerReportUpdateThresholdPercentage| Percentage of change to trigger load report update |10| +|loadBalancerReportUpdateMaxIntervalMinutes| maximum interval to update load report |15| +|loadBalancerHostUsageCheckIntervalMinutes| Frequency of report to collect |1| +|loadBalancerSheddingIntervalMinutes| Load shedding interval. 
Broker periodically checks whether some traffic should be offload from some over-loaded broker to other under-loaded brokers |30| +|loadBalancerSheddingGracePeriodMinutes| Prevent the same topics to be shed and moved to other broker more that once within this timeframe |30| +|loadBalancerBrokerMaxTopics| Usage threshold to allocate max number of topics to broker |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| Usage threshold to determine a broker as under-loaded |1| +|loadBalancerBrokerOverloadedThresholdPercentage| Usage threshold to determine a broker as over-loaded |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| Interval to update namespace bundle resource quotat |15| +|loadBalancerBrokerComfortLoadLevelPercentage| Usage threshold to determine a broker is having just right level of load |65| +|loadBalancerAutoBundleSplitEnabled| enable/disable namespace bundle auto split |false| +|loadBalancerNamespaceBundleMaxTopics| maximum topics in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxSessions| maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxMsgRate| maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered |100| +|loadBalancerNamespaceMaximumBundles| maximum number of bundles in a namespace |128| +|replicationMetricsEnabled| Enable replication metrics |true| +|replicationConnectionsPerBroker| Max number of connections to open for each broker in a remote cluster More connections host-to-host lead to better throughput over high-latency links. 
|16| +|replicationProducerQueueSize| Replicator producer queue size |1000| +|replicatorPrefix| Replicator prefix used for replicator producer name and cursor name pulsar.repl|| +|replicationTlsEnabled| Enable TLS when talking with other clusters to replicate messages |false| +|defaultRetentionTimeInMinutes| Default message retention time || +|defaultRetentionSizeInMB| Default retention size |0| +|keepAliveIntervalSeconds| How often to check whether the connections are still alive |30| +|loadManagerClassName| Name of load manager to use |org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl| +|managedLedgerOffloadDriver| Driver to use to offload old data to long term storage (Possible values: S3) || +|managedLedgerOffloadMaxThreads| Maximum number of thread pool threads for ledger offloading |2| +|s3ManagedLedgerOffloadRegion| For Amazon S3 ledger offload, AWS region || +|s3ManagedLedgerOffloadBucket| For Amazon S3 ledger offload, Bucket to place offloaded ledger into || +|s3ManagedLedgerOffloadServiceEndpoint| For Amazon S3 ledger offload, Alternative endpoint to connect to (useful for testing) || +|s3ManagedLedgerOffloadMaxBlockSizeInBytes| For Amazon S3 ledger offload, Max block size in bytes. (64MB by default, 5MB minimum) |67108864| +|s3ManagedLedgerOffloadReadBufferSizeInBytes| For Amazon S3 ledger offload, Read buffer size in bytes (1MB by default) |1048576| +|s3ManagedLedgerOffloadRole| For Amazon S3 ledger offload, provide a role to assume before writing to s3 || +|s3ManagedLedgerOffloadRoleSessionName| For Amazon S3 ledger offload, provide a role session name when using a role |pulsar-s3-offload| + + + + +## Client + +The [`pulsar-client`](reference-cli-tools.md#pulsar-client) CLI tool can be used to publish messages to Pulsar and consume messages from Pulsar topics. This tool can be used in lieu of a client library. + +|Name|Description|Default| +|---|---|---| +|webServiceUrl| The web URL for the cluster. 
|http://localhost:8080/| +|brokerServiceUrl| The Pulsar protocol URL for the cluster. |pulsar://localhost:6650/| +|authPlugin| The authentication plugin. || +|authParams| The authentication parameters for the cluster, as a comma-separated string. || +|useTls| Whether or not TLS authentication will be enforced in the cluster. |false| +|tlsAllowInsecureConnection||| +|tlsTrustCertsFilePath||| + + +## Service discovery + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| Zookeeper quorum connection string (comma-separated) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|zookeeperSessionTimeoutMs| ZooKeeper session timeout |30000| +|servicePort| Port to use to server binary-proto request |6650| +|servicePortTls| Port to use to server binary-proto-tls request |6651| +|webServicePort| Port that discovery service listen on |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|bindOnLocalhost| Control whether to bind directly on localhost rather than on normal hostname |false| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names (comma-separated) || +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics (comma-separated) || +|tlsEnabled| Enable TLS |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || + + + +## Log4j + + +|Name|Default| +|---|---| +|pulsar.root.logger| WARN,CONSOLE| +|pulsar.log.dir| logs| +|pulsar.log.file| pulsar.log| +|log4j.rootLogger| ${pulsar.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| 
+|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ISO8601} - %-5p - [%t:%C{1}@%L] - %m%n| +|log4j.appender.ROLLINGFILE| org.apache.log4j.DailyRollingFileAppender| +|log4j.appender.ROLLINGFILE.Threshold| DEBUG| +|log4j.appender.ROLLINGFILE.File| ${pulsar.log.dir}/${pulsar.log.file}| +|log4j.appender.ROLLINGFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.ROLLINGFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n| +|log4j.appender.TRACEFILE| org.apache.log4j.FileAppender| +|log4j.appender.TRACEFILE.Threshold| TRACE| +|log4j.appender.TRACEFILE.File| pulsar-trace.log| +|log4j.appender.TRACEFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.TRACEFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L][%x] - %m%n| + + +## Log4j shell + +|Name|Default| +|---|---| +|bookkeeper.root.logger| ERROR,CONSOLE| +|log4j.rootLogger| ${bookkeeper.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ABSOLUTE} %-5p %m%n| +|log4j.logger.org.apache.zookeeper| ERROR| +|log4j.logger.org.apache.bookkeeper| ERROR| +|log4j.logger.org.apache.bookkeeper.bookie.BookieShell| INFO| + + +## Standalone + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| The quorum connection string for local ZooKeeper || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| The port on which the standalone broker listens for connections |6650| +|webServicePort| THe port used by the standalone broker for HTTP requests |8080| +|bindAddress| The hostname or IP address on which the standalone service binds |0.0.0.0| +|advertisedAddress| The hostname or IP address that the standalone service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. 
|| +|clusterName| The name of the cluster that this broker belongs to. |standalone| +|zooKeeperSessionTimeoutMillis| The ZooKeeper session timeout, in milliseconds. |30000| +|brokerShutdownTimeoutMs| The time to wait for graceful broker shutdown. After this time elapses, the process will be killed. |60000| +|backlogQuotaCheckEnabled| Enable the backlog quota check, which enforces a specified action when the quota is reached. |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the backlog quota. |60| +|backlogQuotaDefaultLimitGB| The default per-topic backlog quota limit. |10| +|ttlDurationDefaultInSeconds| Default ttl for namespaces if ttl is not already configured at namespace policies. |0| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics, in seconds. |60| +|messageExpiryCheckIntervalInMinutes| How often to proactively check and purged expired messages. |5| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable checks for minimum allowed client library version. |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| The path for the file used to determine the rotation status for the broker when responding to service discovery health checks |/usr/local/apache/htdocs| +|maxUnackedMessagesPerConsumer| The maximum number of unacknowledged messages allowed to be received by consumers on a shared subscription. The broker will stop sending messages to a consumer once this limit is reached or until the consumer begins acknowledging messages. A value of 0 disables the unacked message limit check and thus allows consumers to receive messages without any restrictions. 
|50000| +|maxUnackedMessagesPerSubscription| The same as above, except per subscription rather than per consumer. |200000| +|authenticationEnabled| Enable authentication for the broker. |false| +|authenticationProviders| A comma-separated list of class names for authentication providers. |false| +|authorizationEnabled| Enforce authorization in brokers. |false| +|superUserRoles| Role names that are treated as “superusers.” Superusers are authorized to perform all admin tasks. || +|brokerClientAuthenticationPlugin| The authentication settings of the broker itself. Used when the broker connects to other brokers either in the same cluster or from other clusters. || +|brokerClientAuthenticationParameters| The parameters that go along with the plugin specified using brokerClientAuthenticationPlugin. || +|athenzDomainNames| Supported Athenz authentication provider domain names as a comma-separated list. || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to be used when connecting to bookies (BookKeeper servers). || +|bookkeeperClientAuthenticationParametersName| BookKeeper authentication plugin implementation parameters and values. || +|bookkeeperClientAuthenticationParameters| Parameters associated with the bookkeeperClientAuthenticationParametersName || +|bookkeeperClientTimeoutInSeconds| Timeout for BookKeeper add and read operations. |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads. |0| +|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. |true| +|bookkeeperClientHealthCheckIntervalSeconds| The time interval, in seconds, at which health checks are performed. New ledgers are not created during health checks. |60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval| Error threshold for health checks. 
|5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds| If bookies have more than the allowed number of failures within the time interval specified by bookkeeperClientHealthCheckIntervalSeconds |1800| +|bookkeeperClientRackawarePolicyEnabled| |true| +|bookkeeperClientRegionawarePolicyEnabled| |false| +|bookkeeperClientReorderReadSequenceEnabled| |false| +|bookkeeperClientIsolationGroups||| +|managedLedgerDefaultEnsembleSize| |1| +|managedLedgerDefaultWriteQuorum| |1| +|managedLedgerDefaultAckQuorum| |1| +|managedLedgerCacheSizeMB| |1024| +|managedLedgerCacheEvictionWatermark| |0.9| +|managedLedgerDefaultMarkDeleteRateLimit| |0.1| +|managedLedgerMaxEntriesPerLedger| |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| |240| +|managedLedgerCursorMaxEntriesPerLedger| |50000| +|managedLedgerCursorRolloverTimeInSeconds| |14400| +|autoSkipNonRecoverableData| |false| +|loadBalancerEnabled| |false| +|loadBalancerPlacementStrategy| |weightedRandomSelection| +|loadBalancerReportUpdateThresholdPercentage| |10| +|loadBalancerReportUpdateMaxIntervalMinutes| |15| +|loadBalancerHostUsageCheckIntervalMinutes| |1| +|loadBalancerSheddingIntervalMinutes| |30| +|loadBalancerSheddingGracePeriodMinutes| |30| +|loadBalancerBrokerMaxTopics| |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| |1| +|loadBalancerBrokerOverloadedThresholdPercentage| |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| |15| +|loadBalancerBrokerComfortLoadLevelPercentage| |65| +|loadBalancerAutoBundleSplitEnabled| |false| +|loadBalancerNamespaceBundleMaxTopics| |1000| +|loadBalancerNamespaceBundleMaxSessions| |1000| +|loadBalancerNamespaceBundleMaxMsgRate| |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| |100| +|loadBalancerNamespaceMaximumBundles| |128| +|replicationMetricsEnabled| |true| +|replicationConnectionsPerBroker| |16| +|replicationProducerQueueSize| |1000| +|defaultRetentionTimeInMinutes| |0| +|defaultRetentionSizeInMB| |0| 
+|keepAliveIntervalSeconds| |30| + + + + + +## WebSocket + +|Name|Description|Default| +|---|---|---| +|configurationStoreServers ||| +|zooKeeperSessionTimeoutMillis| |30000| +|serviceUrl||| +|serviceUrlTls||| +|brokerServiceUrl||| +|brokerServiceUrlTls||| +|webServicePort||8080| +|webServicePortTls||8443| +|bindAddress||0.0.0.0| +|clusterName ||| +|authenticationEnabled||false| +|authenticationProviders||| +|authorizationEnabled||false| +|superUserRoles ||| +|brokerClientAuthenticationPlugin||| +|brokerClientAuthenticationParameters||| +|tlsEnabled||false| +|tlsAllowInsecureConnection||false| +|tlsCertificateFilePath||| +|tlsKeyFilePath ||| +|tlsTrustCertsFilePath||| + + +## Pulsar proxy + +The [Pulsar proxy](concepts-architecture-overview.md#pulsar-proxy) can be configured in the `conf/proxy.conf` file. + + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000| +|servicePort| The port to use for server binary Protobuf requests |6650| +|servicePortTls| The port to use to server binary Protobuf TLS requests |6651| +|statusFilePath| Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks || +|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false| +|authenticateMetricsEndpoint| Whether the '/metrics' endpoint requires authentication. Defaults to true. 'authenticationEnabled' must also be set for this to take effect. 
|true| +|authenticationProviders| Authentication provider name list (a comma-separated list of class names) || +|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false| +|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider| +|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers || +|superUserRoles| Role names that are treated as “super-users,” meaning that they will be able to perform all admin || +|forwardAuthorizationCredentials| Whether client authorization credentials are forwared to the broker for re-authorization. Authentication must be enabled via authenticationEnabled=true for this to take effect. |false| +|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000| +|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |50000| +|tlsEnabledInProxy| Whether TLS is enabled for the proxy |false| +|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file || +|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false| +|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn’t trusted. 
|false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.2```, ```TLSv1.1```, ```TLSv1``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymettric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || + +## ZooKeeper + +ZooKeeper handles a broad range of essential configuration- and coordination-related tasks for Pulsar. The default configuration file for ZooKeeper is in the `conf/zookeeper.conf` file in your Pulsar installation. The following parameters are available: + + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. 
|10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server will listen for connections. |2181| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3| +|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read this guide before enabling auto purge. |1| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + + + +In addition to the parameters in the table above, configuring ZooKeeper for Pulsar involves adding +a `server.N` line to the `conf/zookeeper.conf` file for each node in the ZooKeeper cluster, where `N` is the number of the ZooKeeper node. 
Here's an example for a three-node ZooKeeper cluster: + +```properties +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 +``` + +> We strongly recommend consulting the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) for a more thorough and comprehensive introduction to ZooKeeper configuration diff --git a/site2/website/versioned_docs/version-2.4.2/reference-connector-admin.md b/site2/website/versioned_docs/version-2.4.2/reference-connector-admin.md new file mode 100644 index 0000000000000..8fb090a539a99 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/reference-connector-admin.md @@ -0,0 +1,601 @@ +--- +id: version-2.4.2-reference-connector-admin +title: Connector Admin CLI +sidebar_label: Connector Admin CLI +original_id: reference-connector-admin +--- + +The `pulsar-admin` tool helps you manage Pulsar connectors. + +## `sources` + +An interface for managing Pulsar IO sources (ingress data into Pulsar). + +```bash +$ pulsar-admin sources subcommands +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sources` + +* `reload` + + +### `create` + +Submit a Pulsar IO source connector to run in a Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sources create options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the source.
Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. + +### `update` + +Update an already submitted Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources update options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the source.
Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. +| `--update-auth-data` | Whether or not to update the auth data.
**Default value: false.** + + +### `delete` + +Delete a Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources delete options +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `get` + +Get the information about a Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources get options +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `status` + +Check the current status of a Pulsar Source. + +#### Usage + +```bash +$ pulsar-admin sources status options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source ID.
If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `list` + +List all running Pulsar IO source connectors. + +#### Usage + +```bash +$ pulsar-admin sources list options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `stop` + +Stop a source instance. + +#### Usage + +```bash +$ pulsar-admin sources stop options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `start` + +Start a source instance. + +#### Usage + +```bash +$ pulsar-admin sources start options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `restart` + +Restart a source instance. + +#### Usage + +```bash +$ pulsar-admin sources restart options +``` + +#### Options +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `localrun` + +Run a Pulsar IO source connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sources localrun options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the Source.
It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The source's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--deserialization-classname`|The SerDe classname for the source. +|`--destination-topic-name`|The Pulsar topic to which data is sent. +|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
**Default value: false**. +|`--name`|The source’s name.| +|`--namespace`|The source’s namespace.| +|`--parallelism`|The source’s parallelism factor, that is, the number of source instances to run.| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the source.
Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +| `-st`, `--schema-type` | The schema type.
Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +|`--source-config`|Source config key/values. +|`--source-config-file`|The path to a YAML config file specifying the source’s configuration. +|`--source-type`|The source's connector provider. +|`--tenant`|The source’s tenant. +|`--tls-allow-insecure`|Allow insecure tls connection.
**Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +|`--use-tls`|Use tls connection.
**Default value: false**. + +### `available-sources` + +Get the list of Pulsar IO connector sources supported by Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sources available-sources +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash +$ pulsar-admin sources reload +``` + +## `sinks` + +An interface for managing Pulsar IO sinks (egress data from Pulsar). + +```bash +$ pulsar-admin sinks subcommands +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sinks` + +* `reload` + + +### `create` + +Submit a Pulsar IO sink connector to run in a Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sinks create options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the sink.
Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
`--inputs` and `--topics-pattern` are mutually exclusive.
Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). + +### `update` + +Update a Pulsar IO sink connector. + +#### Usage + +```bash +$ pulsar-admin sinks update options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the sink.
Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
`--inputs` and `--topics-pattern` are mutually exclusive.
Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +| `--update-auth-data` | Whether or not to update the auth data.
**Default value: false.** + +### `delete` + +Delete a Pulsar IO sink connector. + +#### Usage + +```bash +$ pulsar-admin sinks delete options +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `get` + +Get the information about a Pulsar IO sink connector. + +#### Usage + +```bash +$ pulsar-admin sinks get options +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `status` + +Check the current status of a Pulsar sink. + +#### Usage + +```bash +$ pulsar-admin sinks status options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink ID.
If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `list` + +List all running Pulsar IO sink connectors. + +#### Usage + +```bash +$ pulsar-admin sinks list options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `stop` + +Stop a sink instance. + +#### Usage + +```bash +$ pulsar-admin sinks stop options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `start` + +Start a sink instance. + +#### Usage + +```bash +$ pulsar-admin sinks start options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `restart` + +Restart a sink instance. + +#### Usage + +```bash +$ pulsar-admin sinks restart options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `localrun` + +Run a Pulsar IO sink connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sinks localrun options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The sink's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime). +| `--custom-schema-inputs` | The map of input topics to Schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
**Default value: false**. +| `-i`, `--inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name`|The sink’s name.| +|`--namespace`|The sink’s namespace.| +|`--parallelism`|The sink’s parallelism factor, that is, the number of sink instances to run.| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the sink.
Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--retain-ordering` | Sink consumes and sinks messages in order. +|`--sink-config`|sink config key/values. +|`--sink-config-file`|The path to a YAML config file specifying the sink’s configuration. +|`--sink-type`|The sink's connector provider. +|`--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +|`--tenant`|The sink’s tenant. +| `--timeout-ms` | The message timeout in milliseconds. +|`--tls-allow-insecure`|Allow insecure tls connection.
**Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
`--inputs` and `--topics-pattern` are mutually exclusive.
Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +|`--use-tls`|Use tls connection.
**Default value: false**. + +### `available-sinks` + +Get the list of Pulsar IO connector sinks supported by Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sinks available-sinks +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash +$ pulsar-admin sinks reload +``` + diff --git a/site2/website/versioned_docs/version-2.4.2/reference-metrics.md b/site2/website/versioned_docs/version-2.4.2/reference-metrics.md new file mode 100644 index 0000000000000..28c404fc82a91 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/reference-metrics.md @@ -0,0 +1,244 @@ +--- +id: version-2.4.2-reference-metrics +title: Pulsar Metrics +sidebar_label: Pulsar Metrics +original_id: reference-metrics +--- + + + +Pulsar exposes metrics in Prometheus format that can be collected and used for monitoring the health of the cluster. + +* [ZooKeeper](#zookeeper) +* [BookKeeper](#bookkeeper) +* [Broker](#broker) + +## Overview + +The metrics exposed by Pulsar are in Prometheus format. The types of metrics are: + +- [Counter](https://prometheus.io/docs/concepts/metric_types/#counter): a cumulative metric that represents a single monotonically increasing counter whose value can only increase or be reset to zero on restart. +- [Gauge](https://prometheus.io/docs/concepts/metric_types/#gauge): a *gauge* is a metric that represents a single numerical value that can arbitrarily go up and down. +- [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram): a histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. +- [Summary](https://prometheus.io/docs/concepts/metric_types/#summary): similar to a histogram, a summary samples observations (usually things like request durations and response sizes). While it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window. 
+ +## ZooKeeper + +The ZooKeeper metrics are exposed under "/metrics" at port 8000. You can use a different port +by configuring the `stats_server_port` system property. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| zookeeper_server_znode_count | Gauge | The number of z-nodes stored. | +| zookeeper_server_data_size_bytes | Gauge | The total size of all of z-nodes stored. | +| zookeeper_server_connections | Gauge | The number of currently opened connections. | +| zookeeper_server_watches_count | Gauge | The number of watchers registered. | +| zookeeper_server_ephemerals_count | Gauge | The number of ephemeral z-nodes. | + +### Request metrics + +| Name | Type | Description | +|---|---|---| +| zookeeper_server_requests | Counter | The total number of requests received by a particular server. | +| zookeeper_server_requests_latency_ms | Summary | The requests latency calculated in milliseconds.
Available labels: *type* (write, read).
  • *write*: the requests that write data to ZooKeeper.
  • *read*: the requests that read data from ZooKeeper.
| + +## BookKeeper + +The BookKeeper metrics are exposed under "/metrics" at port 8000. You can change the port by updating `prometheusStatsHttpPort` +in `bookkeeper.conf` configuration file. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| bookie_SERVER_STATUS | Gauge | The server status for bookie server.
  • 1: the bookie is running in writable mode.
  • 0: the bookie is running in readonly mode.
| +| bookkeeper_server_ADD_ENTRY_count | Counter | The total number of ADD_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_count | Counter | The total number of READ_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_WRITE_BYTES | Counter | The total number of bytes written to the bookie. | +| bookie_READ_BYTES | Counter | The total number of bytes read from the bookie. | +| bookkeeper_server_ADD_ENTRY_REQUEST | Histogram | The histogram of request latency of ADD_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_REQUEST | Histogram | The histogram of request latency of READ_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | + +### Journal metrics + +| Name | Type | Description | +|---|---|---| +| bookie_journal_JOURNAL_SYNC_count | Counter | The total number of journal fsync operations happening at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_journal_JOURNAL_QUEUE_SIZE | Gauge | The total number of requests pending in the journal queue. | +| bookie_journal_JOURNAL_FORCE_WRITE_QUEUE_SIZE | Gauge | The total number of force write (fsync) requests pending in the force-write queue. | +| bookie_journal_JOURNAL_CB_QUEUE_SIZE | Gauge | The total number of callbacks pending in the callback queue. | +| bookie_journal_JOURNAL_ADD_ENTRY | Histogram | The histogram of request latency of adding entries to the journal. | +| bookie_journal_JOURNAL_SYNC | Histogram | The histogram of fsync latency of syncing data to the journal disk. | + +### Storage metrics + +| Name | Type | Description | +|---|---|---| +| bookie_ledgers_count | Gauge | The total number of ledgers stored in the bookie. 
| +| bookie_entries_count | Gauge | The total number of entries stored in the bookie. | +| bookie_write_cache_size | Gauge | The bookie write cache size (in bytes). | +| bookie_read_cache_size | Gauge | The bookie read cache size (in bytes). | +| bookie_DELETED_LEDGER_COUNT | Counter | The total number of ledgers deleted since the bookie has started. | +| bookie_ledger_writable_dirs | Gauge | The number of writable directories in the bookie. | + +## Broker + +The broker metrics are exposed under "/metrics" at port 8080. You can change the port by updating `webServicePort` to a different port +in `broker.conf` configuration file. + +All the metrics exposed by a broker are labelled with `cluster=${pulsar_cluster}`. The value of `${pulsar_cluster}` is the pulsar cluster +name you configured in `broker.conf`. + +Broker has the following kinds of metrics: + +* [Namespace metrics](#namespace-metrics) + * [Replication metrics](#replication-metrics) +* [Topic metrics](#topic-metrics) + * [Replication metrics](#replication-metrics-1) +* [Subscription metrics](#subscription-metrics) +* [Consumer metrics](#consumer-metrics) + +### Namespace metrics + +> Namespace metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `false`. + +All the namespace metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_topics_count | Gauge | The number of Pulsar topics of the namespace owned by this broker. | +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the namespace served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the namespace connected to this broker. 
| +| pulsar_consumers_count | Gauge | The number of active consumers of the namespace connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the namespace coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the namespace going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the namespace coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the namespace going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this namespace owned by this broker (bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this namespace owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this namespace offloaded to the tiered storage (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this namespace (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this namespace (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a namespace that the storage write latency is smaller with a given threshold.
Available thresholds:
  • pulsar_storage_write_latency_le_0_5: <= 0.5ms
  • pulsar_storage_write_latency_le_1: <= 1ms
  • pulsar_storage_write_latency_le_5: <= 5ms
  • pulsar_storage_write_latency_le_10: <= 10ms
  • pulsar_storage_write_latency_le_20: <= 20ms
  • pulsar_storage_write_latency_le_50: <= 50ms
  • pulsar_storage_write_latency_le_100: <= 100ms
  • pulsar_storage_write_latency_le_200: <= 200ms
  • pulsar_storage_write_latency_le_1000: <= 1s
  • pulsar_storage_write_latency_le_overflow: > 1s
| +| pulsar_entry_size_le_* | Histogram | The entry rate of a namespace that the entry size is smaller with a given threshold.
Available thresholds:
  • pulsar_entry_size_le_128: <= 128 bytes
  • pulsar_entry_size_le_512: <= 512 bytes
  • pulsar_entry_size_le_1_kb: <= 1 KB
  • pulsar_entry_size_le_2_kb: <= 2 KB
  • pulsar_entry_size_le_4_kb: <= 4 KB
  • pulsar_entry_size_le_16_kb: <= 16 KB
  • pulsar_entry_size_le_100_kb: <= 100 KB
  • pulsar_entry_size_le_1_mb: <= 1 MB
  • pulsar_entry_size_le_overflow: > 1 MB
| + +#### Replication metrics + +If a namespace is configured to be replicated between multiple Pulsar clusters, the corresponding replication metrics will also be exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics will also be labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the namespace replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the namespace replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the namespace replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the namespace replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the namespace replicating to remote cluster (messages). | + +### Topic metrics + +> Topic metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to true. + +All the topic metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. + +| Name | Type | Description | +|---|---|---| +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the topic served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the topic connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the topic connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the topic coming into this broker (messages/second). 
| +| pulsar_rate_out | Gauge | The total message rate of the topic going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the topic coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the topic going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this topic owned by this broker (bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this topic owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this topic offloaded to the tiered storage (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this topic (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this topic (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a topic that the storage write latency is smaller with a given threshold.
Available thresholds:
  • pulsar_storage_write_latency_le_0_5: <= 0.5ms
  • pulsar_storage_write_latency_le_1: <= 1ms
  • pulsar_storage_write_latency_le_5: <= 5ms
  • pulsar_storage_write_latency_le_10: <= 10ms
  • pulsar_storage_write_latency_le_20: <= 20ms
  • pulsar_storage_write_latency_le_50: <= 50ms
  • pulsar_storage_write_latency_le_100: <= 100ms
  • pulsar_storage_write_latency_le_200: <= 200ms
  • pulsar_storage_write_latency_le_1000: <= 1s
  • pulsar_storage_write_latency_le_overflow: > 1s
| +| pulsar_entry_size_le_* | Histogram | The entry rate of a topic that the entry size is smaller with a given threshold.
Available thresholds:
  • pulsar_entry_size_le_128: <= 128 bytes
  • pulsar_entry_size_le_512: <= 512 bytes
  • pulsar_entry_size_le_1_kb: <= 1 KB
  • pulsar_entry_size_le_2_kb: <= 2 KB
  • pulsar_entry_size_le_4_kb: <= 4 KB
  • pulsar_entry_size_le_16_kb: <= 16 KB
  • pulsar_entry_size_le_100_kb: <= 100 KB
  • pulsar_entry_size_le_1_mb: <= 1 MB
  • pulsar_entry_size_le_overflow: > 1 MB
| + +#### Replication metrics + +If a namespace that a topic belongs to is configured to be replicated between multiple Pulsar clusters, the corresponding replication metrics will also be exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics will also be labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the topic replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the topic replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the topic replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the topic replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the topic replicating to remote cluster (messages). | + + +### Subscription metrics + +> Subscription metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to true. + +All the subscription metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. + +| Name | Type | Description | +|---|---|---| +| pulsar_subscription_back_log | Gauge | The total backlog of a subscription (messages). | +| pulsar_subscription_delayed | Gauge | The total number of messages are delayed to be dispatched for a subscription (messages). 
| +| pulsar_subscription_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_subscription_unacked_messages | Gauge | The total number of unacknowledged messages of a subscription (messages). | +| pulsar_subscription_blocked_on_unacked_messages | Gauge | Indicate whether a subscription is blocked on unacknowledged messages or not.
  • 1 means the subscription is blocked on waiting unacknowledged messages to be acked.
  • 0 means the subscription is not blocked on waiting unacknowledged messages to be acked.
| +| pulsar_subscription_msg_rate_out | Gauge | The total message dispatch rate for a subscription (messages/second). | +| pulsar_subscription_msg_throughput_out | Gauge | The total message dispatch throughput for a subscription (bytes/second). | + +### Consumer metrics + +> Consumer metrics are only exposed when both `exposeTopicLevelMetricsInPrometheus` and `exposeConsumerLevelMetricsInPrometheus` +> are set to true. + +All the consumer metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. +- *consumer_name*: `consumer_name=${consumer_name}`. `${consumer_name}` is the topic consumer name. +- *consumer_id*: `consumer_id=${consumer_id}`. `${consumer_id}` is the topic consumer id. + +| Name | Type | Description | +|---|---|---| +| pulsar_consumer_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_consumer_unacked_messages | Gauge | The total number of unacknowledged messages of a consumer (messages). | +| pulsar_consumer_blocked_on_unacked_messages | Gauge | Indicate whether a consumer is blocked on unacknowledged messages or not.
  • 1 means the consumer is blocked on waiting unacknowledged messages to be acked.
  • 0 means the consumer is not blocked on waiting unacknowledged messages to be acked.
| +| pulsar_consumer_msg_rate_out | Gauge | The total message dispatch rate for a consumer (messages/second). | +| pulsar_consumer_msg_throughput_out | Gauge | The total message dispatch throughput for a consumer (bytes/second). | +| pulsar_consumer_available_permits | Gauge | The available permits for a consumer. | + +## Monitor + +You can [set up a Prometheus instance](https://prometheus.io/) to collect all the metrics exposed at Pulsar components and set up +[Grafana](https://grafana.com/) dashboards to display the metrics and monitor your Pulsar cluster. + +The following are some Grafana dashboards examples: + +- [pulsar-grafana](http://pulsar.apache.org/docs/en/deploy-monitoring/#grafana): A grafana dashboard that displays metrics collected in Prometheus for Pulsar clusters running on Kubernetes. +- [apache-pulsar-grafana-dashboard](https://github.com/streamnative/apache-pulsar-grafana-dashboard): A collection of grafana dashboard templates for different Pulsar components running on both Kubernetes and on-premise machines. diff --git a/site2/website/versioned_docs/version-2.4.2/reference-pulsar-admin.md b/site2/website/versioned_docs/version-2.4.2/reference-pulsar-admin.md new file mode 100644 index 0000000000000..358d93118a7b8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/reference-pulsar-admin.md @@ -0,0 +1,2095 @@ +--- +id: version-2.4.2-pulsar-admin +title: Pulsar admin CLI +sidebar_label: Pulsar Admin CLI +original_id: pulsar-admin +--- + +The `pulsar-admin` tool enables you to manage Pulsar installations, including clusters, brokers, namespaces, tenants, and more.
+ +Usage +```bash +$ pulsar-admin command +``` + +Commands +* `broker-stats` +* `brokers` +* `clusters` +* `functions` +* `namespaces` +* `ns-isolation-policy` +* `sources` + + For more information, see [here](reference-connector-admin.md#sources) +* `sinks` + + For more information, see [here](reference-connector-admin.md#sinks) +* `topics` +* `tenants` +* `resource-quotas` +* `schemas` + +## `broker-stats` + +Operations to collect broker statistics + +```bash +$ pulsar-admin broker-stats subcommand +``` + +Subcommands +* `allocator-stats` +* `topics(destinations)` +* `mbeans` +* `monitoring-metrics` +* `load-report` + + +### `allocator-stats` + +Dump allocator stats + +Usage +```bash +$ pulsar-admin broker-stats allocator-stats allocator-name +``` + +### `topics(destinations)` + +Dump topic stats + +Usage +```bash +$ pulsar-admin broker-stats topics options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + +### `mbeans` + +Dump Mbean stats + +Usage +```bash +$ pulsar-admin broker-stats mbeans options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `monitoring-metrics` + +Dump metrics for monitoring + +Usage +```bash +$ pulsar-admin broker-stats monitoring-metrics options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `load-report` + +Dump broker load-report + +Usage +```bash +$ pulsar-admin broker-stats load-report +``` + + +## `brokers` + +Operations about brokers + +```bash +$ pulsar-admin brokers subcommand +``` + +Subcommands +* `list` +* `namespaces` +* `update-dynamic-config` +* `list-dynamic-config` +* `get-all-dynamic-config` +* `get-internal-config` +* `get-runtime-config` +* `healthcheck` + +### `list` +List active brokers of the cluster + +Usage +```bash +$ pulsar-admin brokers list cluster-name +``` + +### `namespaces` +List namespaces owned by the broker + +Usage 
+```bash +$ pulsar-admin brokers namespaces cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--url`|The URL for the broker|| + + +### `update-dynamic-config` +Update a broker's dynamic service configuration + +Usage +```bash +$ pulsar-admin brokers update-dynamic-config options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| +|`--value`|Value for the configuration parameter value specified using the `--config` flag|| + + +### `list-dynamic-config` +Get list of updatable configuration name + +Usage +```bash +$ pulsar-admin brokers list-dynamic-config +``` + +### `get-all-dynamic-config` +Get all overridden dynamic-configuration values + +Usage +```bash +$ pulsar-admin brokers get-all-dynamic-config +``` + +### `get-internal-config` +Get internal configuration information + +Usage +```bash +$ pulsar-admin brokers get-internal-config +``` + +### `get-runtime-config` +Get runtime configuration values + +Usage +```bash +$ pulsar-admin brokers get-runtime-config +``` + +### `healthcheck` +Run a health check against the broker + +Usage +```bash +$ pulsar-admin brokers healthcheck +``` + + +## `clusters` +Operations about clusters + +Usage +```bash +$ pulsar-admin clusters subcommand +``` + +Subcommands +* `get` +* `create` +* `update` +* `delete` +* `list` +* `update-peer-clusters` +* `get-peer-clusters` +* `get-failure-domain` +* `create-failure-domain` +* `update-failure-domain` +* `delete-failure-domain` +* `list-failure-domains` + + +### `get` +Get the configuration data for the specified cluster + +Usage +```bash +$ pulsar-admin clusters get cluster-name +``` + +### `create` +Provisions a new cluster. This operation requires Pulsar super-user privileges. 
+ +Usage +```bash +$ pulsar-admin clusters create cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `update` +Update the configuration for a cluster + +Usage +```bash +$ pulsar-admin clusters update cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `delete` +Deletes an existing cluster + +Usage +```bash +$ pulsar-admin clusters delete cluster-name +``` + +### `list` +List the existing clusters + +Usage +```bash +$ pulsar-admin clusters list +``` + +### `update-peer-clusters` +Update peer cluster names + +Usage +```bash +$ pulsar-admin clusters update-peer-clusters cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--peer-clusters`|Comma separated peer cluster names (Pass empty string "" to delete list)|| + +### `get-peer-clusters` +Get list of peer clusters + +Usage +```bash +$ pulsar-admin clusters get-peer-clusters +``` + +### `get-failure-domain` +Get the configuration brokers of a failure domain + +Usage +```bash +$ pulsar-admin clusters get-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `create-failure-domain` +Create a new failure domain for a cluster (updates it if already created) + +Usage +```bash +$ pulsar-admin clusters create-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated broker list|| +|`--domain-name`|The failure domain 
name, which is a logical domain under a Pulsar cluster|| + +### `update-failure-domain` +Update failure domain for a cluster (creates a new one if not exist) + +Usage +```bash +$ pulsar-admin clusters update-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `delete-failure-domain` +Delete an existing failure domain + +Usage +```bash +$ pulsar-admin clusters delete-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `list-failure-domains` +List the existing failure domains for a cluster + +Usage +```bash +$ pulsar-admin clusters list-failure-domains cluster-name +``` + + +## `functions` + +A command-line interface for Pulsar Functions + +Usage +```bash +$ pulsar-admin functions subcommand +``` + +Subcommands +* `localrun` +* `create` +* `delete` +* `update` +* `get` +* `restart` +* `stop` +* `start` +* `status` +* `stats` +* `list` +* `querystate` +* `trigger` + + +### `localrun` +Run the Pulsar Function locally (rather than deploying it to the Pulsar cluster) + + +Usage +```bash +$ pulsar-admin functions localrun options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| 
+|`--broker-service-url `|The URL of the Pulsar broker|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--client-auth-params`|Client authentication param|| +|`--client-auth-plugin`|Client authentication plugin using which function-process can connect to broker|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--hostname-verification-enabled`|Enable hostname verification|false| +|`--instance-id-offset`|Start the instanceIds from this offset|0| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--state-storage-service-url`|The URL for the state storage service (by default Apache BookKeeper)|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from the list of topics under a namespace that match the pattern. `--inputs` and `--topics-pattern` are mutually exclusive. Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--tls-allow-insecure`|Allow insecure tls connection|false| +|`--tls-trust-cert-path`|The tls trust cert file path|| +|`--use-tls`|Use tls connection|false| + + +### `create` +Create a Pulsar Function in cluster mode (i.e. 
deploy it on a Pulsar cluster) + +Usage +``` +$ pulsar-admin functions create options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from the list of topics under a namespace that match the pattern. `--inputs` and `--topics-pattern` are mutually exclusive. Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--timeout-ms`|The message timeout in milliseconds|| + + +### `delete` +Delete a Pulsar Function that's running on a Pulsar cluster + +Usage +```bash +$ pulsar-admin functions delete options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `update` +Update a Pulsar Function that's been deployed to a Pulsar cluster + +Usage +```bash +$ pulsar-admin functions update options +``` + + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per 
function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from the list of topics under a namespace that match the pattern. `--inputs` and `--topics-pattern` are mutually exclusive. Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--timeout-ms`|The message timeout in milliseconds|| + + +### `get` +Fetch information about a Pulsar Function + +Usage +```bash +$ pulsar-admin functions get options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `restart` +Restart function instance + +Usage +```bash +$ pulsar-admin functions restart options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (restart all 
instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stop` +Stops function instance + +Usage +```bash +$ pulsar-admin functions stop options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (stop all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `start` +Starts a stopped function instance + +Usage +```bash +$ pulsar-admin functions start options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (start all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `status` +Check the current status of a Pulsar Function + +Usage +```bash +$ pulsar-admin functions status options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-status of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stats` +Get the current stats of a Pulsar Function + +Usage +```bash +$ pulsar-admin functions stats options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-stats of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + +### `list` 
+List all of the Pulsar Functions running under a specific tenant and namespace + +Usage +```bash +$ pulsar-admin functions list options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `querystate` +Fetch the current state associated with a Pulsar Function running in cluster mode + +Usage +```bash +$ pulsar-admin functions querystate options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`-k`, `--key`|The key for the state you want to fetch|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`-w`, `--watch`|Watch for changes in the value associated with a key for a Pulsar Function|false| + + +### `trigger` +Triggers the specified Pulsar Function with a supplied value + +Usage +```bash +$ pulsar-admin functions trigger options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`--topic`|The specific topic name that the function consumes from that you want to inject the data to|| +|`--trigger-file`|The path to the file that contains the data with which you'd like to trigger the function|| +|`--trigger-value`|The value with which you want to trigger the function|| + + +## `namespaces` + +Operations for managing namespaces + + +```bash +$ pulsar-admin namespaces subcommand +``` + +Subcommands +* `list` +* `topics` +* `policies` +* `create` +* `delete` +* `set-deduplication` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `grant-subscription-permission` +* `revoke-subscription-permission` +* `set-clusters` +* `get-clusters` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` 
+* `set-persistence` +* `get-message-ttl` +* `set-message-ttl` +* `get-anti-affinity-group` +* `set-anti-affinity-group` +* `get-anti-affinity-namespaces` +* `delete-anti-affinity-group` +* `get-retention` +* `set-retention` +* `unload` +* `split-bundle` +* `set-dispatch-rate` +* `get-dispatch-rate` +* `set-subscribe-rate` +* `get-subscribe-rate` +* `set-subscription-dispatch-rate` +* `get-subscription-dispatch-rate` +* `clear-backlog` +* `unsubscribe` +* `set-encryption-required` +* `set-subscription-auth-mode` +* `get-max-producers-per-topic` +* `set-max-producers-per-topic` +* `get-max-consumers-per-topic` +* `set-max-consumers-per-topic` +* `get-max-consumers-per-subscription` +* `set-max-consumers-per-subscription` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `get-offload-threshold` +* `set-offload-threshold` +* `get-offload-deletion-lag` +* `set-offload-deletion-lag` +* `clear-offload-deletion-lag` +* `get-schema-autoupdate-strategy` +* `set-schema-autoupdate-strategy` + + +### `list` +Get the namespaces for a tenant + +Usage +```bash +$ pulsar-admin namespaces list tenant-name +``` + +### `topics` +Get the list of topics for a namespace + +Usage +```bash +$ pulsar-admin namespaces topics tenant/namespace +``` + +### `policies` +Get the configuration policies of a namespace + +Usage +```bash +$ pulsar-admin namespaces policies tenant/namespace +``` + +### `create` +Create a new namespace + +Usage +```bash +$ pulsar-admin namespaces create tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-b`, `--bundles`|The number of bundles to activate|0| +|`-c`, `--clusters`|List of clusters this namespace will be assigned|| + + +### `delete` +Deletes a namespace. 
The namespace needs to be empty + +Usage +```bash +$ pulsar-admin namespaces delete tenant/namespace +``` + +### `set-deduplication` +Enable or disable message deduplication on a namespace + +Usage +```bash +$ pulsar-admin namespaces set-deduplication tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified namespace|false| +|`--disable`, `-d`|Disable message deduplication on the specified namespace|false| + + +### `permissions` +Get the permissions on a namespace + +Usage +```bash +$ pulsar-admin namespaces permissions tenant/namespace +``` + +### `grant-permission` +Grant permissions on a namespace + +Usage +```bash +$ pulsar-admin namespaces grant-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions on a namespace + +Usage +```bash +$ pulsar-admin namespaces revoke-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| + +### `grant-subscription-permission` +Grant permissions to access subscription admin-api + +Usage +```bash +$ pulsar-admin namespaces grant-subscription-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--roles`|The client roles to which to grant the permissions (comma separated roles)|| +|`--subscription`|The subscription name for which permission will be granted to roles|| + +### `revoke-subscription-permission` +Revoke permissions to access subscription admin-api + +Usage +```bash +$ pulsar-admin namespaces revoke-subscription-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| 
+|`--subscription`|The subscription name for which permission will be revoked from the role|| + +### `set-clusters` +Set replication clusters for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-clusters tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--clusters`|Replication clusters ID list (comma-separated values)|| + + +### `get-clusters` +Get replication clusters for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-clusters tenant/namespace +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-backlog-quotas tenant/namespace +``` + +### `set-backlog-quota` +Set a backlog quota policy for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-backlog-quota tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. 
The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`|| + +Example +```bash +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limit 2G \ +--policy producer_request_hold +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a namespace + +Usage +```bash +$ pulsar-admin namespaces remove-backlog-quota tenant/namespace +``` + +### `get-persistence` +Get the persistence policies for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-persistence tenant/namespace +``` + +### `set-persistence` +Set the persistence policies for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-persistence tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-a`, `--bookkeeper-ack-quorum`|The number of acks (guaranteed copies) to wait for each entry|0| +|`-e`, `--bookkeeper-ensemble`|The number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + + +### `get-message-ttl` +Get the message TTL for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-message-ttl tenant/namespace +``` + +### `set-message-ttl` +Set the message TTL for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-message-ttl tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL in seconds|0| + +### `get-anti-affinity-group` +Get Anti-affinity group name for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-anti-affinity-group tenant/namespace +``` + +### `set-anti-affinity-group` +Set Anti-affinity group name for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-anti-affinity-group tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-g`, `--group`|Anti-affinity group name|| + +### 
`get-anti-affinity-namespaces` +Get Anti-affinity namespaces grouped with the given anti-affinity group name + +Usage +```bash +$ pulsar-admin namespaces get-anti-affinity-namespaces options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--cluster`|Cluster name|| +|`-g`, `--group`|Anti-affinity group name|| +|`-p`, `--tenant`|Tenant is only used for authorization. Client has to be admin of any of the tenant to access this api|| + +### `delete-anti-affinity-group` +Remove Anti-affinity group name for a namespace + +Usage +```bash +$ pulsar-admin namespaces delete-anti-affinity-group tenant/namespace +``` + +### `get-retention` +Get the retention policy for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-retention tenant/namespace +``` + +### `set-retention` +Set the retention policy for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-retention tenant/namespace +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|The retention size limits (for example 10M, 16G or 3T). 0 means no retention and -1 means infinite size retention|| +|`-t`, `--time`|The retention time in minutes, hours, days, or weeks. Examples: 100m, 13h, 2d, 5w. 0 means no retention and -1 means infinite time retention|| + + +### `unload` +Unload a namespace or namespace bundle from the current serving broker. + +Usage +```bash +$ pulsar-admin namespaces unload tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| + +### `split-bundle` +Split a namespace-bundle from the current serving broker + +Usage +```bash +$ pulsar-admin namespaces split-bundle tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 
0x00000000_0xffffffff)|| +|`-u`, `--unload`|Unload newly split bundles after splitting old bundle|false| + +### `set-dispatch-rate` +Set message-dispatch-rate for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-dispatch-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in second type (default 1 second will be overwrite if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-dispatch-rate` +Get configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage +```bash +$ pulsar-admin namespaces get-dispatch-rate tenant/namespace +``` + +### `set-subscribe-rate` +Set subscribe-rate per consumer for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-subscribe-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-sr`, `--subscribe-rate`|The subscribe rate (default -1 will be overwrite if not passed)|-1| +|`-st`, `--subscribe-rate-period`|The subscribe rate period in second type (default 30 second will be overwrite if not passed)|30| + +### `get-subscribe-rate` +Get configured subscribe-rate per consumer for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces get-subscribe-rate tenant/namespace +``` + +### `set-subscription-dispatch-rate` +Set subscription message-dispatch-rate for all subscription of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-subscription-dispatch-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwrite if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in second type 
(default 1 second will be overwrite if not passed)|1| +|`-md`, `--sub-msg-dispatch-rate`|The message dispatch rate (default -1 will be overwrite if not passed)|-1| + +### `get-subscription-dispatch-rate` +Get subscription configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage +```bash +$ pulsar-admin namespaces get-subscription-dispatch-rate tenant/namespace +``` + +### `clear-backlog` +Clear the backlog for a namespace + +Usage +```bash +$ pulsar-admin namespaces clear-backlog tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-force`, `--force`|Whether to force a clear backlog without prompt|false| +|`-s`, `--sub`|The subscription name|| + + +### `unsubscribe` +Unsubscribe the given subscription on all destinations on a namespace + +Usage +```bash +$ pulsar-admin namespaces unsubscribe tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-s`, `--sub`|The subscription name|| + +### `set-encryption-required` +Enable or disable message encryption required for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-encryption-required tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable message encryption required|false| +|`-e`, `--enable`|Enable message encryption required|false| + +### `set-subscription-auth-mode` +Set subscription auth mode on a namespace + +Usage +```bash +$ pulsar-admin namespaces set-subscription-auth-mode tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-m`, `--subscription-auth-mode`|Subscription authorization mode for Pulsar policies. 
Valid options are: [None, Prefix]|| + +### `get-max-producers-per-topic` +Get maxProducersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-max-producers-per-topic tenant/namespace +``` + +### `set-max-producers-per-topic` +Set maxProducersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-max-producers-per-topic tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-p`, `--max-producers-per-topic`|maxProducersPerTopic for a namespace|0| + +### `get-max-consumers-per-topic` +Get maxConsumersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-max-consumers-per-topic tenant/namespace +``` + +### `set-max-consumers-per-topic` +Set maxConsumersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-max-consumers-per-topic tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-topic`|maxConsumersPerTopic for a namespace|0| + +### `get-max-consumers-per-subscription` +Get maxConsumersPerSubscription for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-max-consumers-per-subscription tenant/namespace +``` + +### `set-max-consumers-per-subscription` +Set maxConsumersPerSubscription for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-max-consumers-per-subscription tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-subscription`|maxConsumersPerSubscription for a namespace|0| + + +### `get-compaction-threshold` +Get compactionThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-compaction-threshold tenant/namespace +``` + +### `set-compaction-threshold` +Set compactionThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-compaction-threshold tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-t`, `--threshold`|Maximum number of bytes 
in a topic backlog before compaction is triggered (eg: 10M, 16G, 3T). 0 disables automatic compaction|0| + + +### `get-offload-threshold` +Get offloadThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-offload-threshold tenant/namespace +``` + +### `set-offload-threshold` +Set offloadThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-offload-threshold tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|Maximum number of bytes stored in the pulsar cluster for a topic before data will start being automatically offloaded to longterm storage (eg: 10M, 16G, 3T, 100). Negative values disable automatic offload. 0 triggers offloading as soon as possible.|-1| + +### `get-offload-deletion-lag` +Get offloadDeletionLag, in minutes, for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-offload-deletion-lag tenant/namespace +``` + +### `set-offload-deletion-lag` +Set offloadDeletionLag for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-offload-deletion-lag tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-l`, `--lag`|Duration to wait after offloading a ledger segment, before deleting the copy of that segment from cluster local storage. 
(eg: 10m, 5h, 3d, 2w).|-1| + +### `clear-offload-deletion-lag` +Clear offloadDeletionLag for a namespace + +Usage +```bash +$ pulsar-admin namespaces clear-offload-deletion-lag tenant/namespace +``` + +### `get-schema-autoupdate-strategy` +Get the schema auto-update strategy for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-schema-autoupdate-strategy tenant/namespace +``` + +### `set-schema-autoupdate-strategy` +Set the schema auto-update strategy for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-schema-autoupdate-strategy tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--compatibility`|Compatibility level required for new schemas created via a Producer. Possible values (Full, Backward, Forward, None).|Full| +|`-d`, `--disabled`|Disable automatic schema updates.|false| + + +## `ns-isolation-policy` +Operations for managing namespace isolation policies. + +Usage +```bash +$ pulsar-admin ns-isolation-policy subcommand +``` + +Subcommands +* `set` +* `get` +* `list` +* `delete` +* `brokers` +* `broker` + +### `set` +Create/update a namespace isolation policy for a cluster. This operation requires Pulsar superuser privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy set cluster-name policy-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--auto-failover-policy-params`|Comma-separated name=value auto failover policy parameters|[]| +|`--auto-failover-policy-type`|Auto failover policy type name. Currently available options: min_available.|[]| +|`--namespaces`|Comma-separated namespaces regex list|[]| +|`--primary`|Comma-separated primary broker regex list|[]| +|`--secondary`|Comma-separated secondary broker regex list|[]| + + +### `get` +Get the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges. 
+ +Usage +```bash +$ pulsar-admin ns-isolation-policy get cluster-name policy-name +``` + +### `list` +List all namespace isolation policies of a cluster. This operation requires Pulsar superuser privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy list cluster-name +``` + +### `delete` +Delete namespace isolation policy of a cluster. This operation requires superuser privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy delete +``` + +### `brokers` +List all brokers with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy brokers cluster-name +``` + +### `broker` +Get broker with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy broker cluster-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--broker`|Broker name to get namespace-isolation policies attached to it|| + +## `topics` +Operations for managing Pulsar topics (both persistent and non persistent) + +Usage +```bash +$ pulsar-admin topics subcommand +``` + +Subcommands +* `compact` +* `compaction-status` +* `offload` +* `offload-status` +* `create-partitioned-topic` +* `delete-partitioned-topic` +* `create` +* `get-partitioned-topic-metadata` +* `update-partitioned-topic` +* `list` +* `list-in-bundle` +* `terminate` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `lookup` +* `bundle-range` +* `delete` +* `unload` +* `subscriptions` +* `unsubscribe` +* `stats` +* `stats-internal` +* `info-internal` +* `partitioned-stats` +* `skip` +* `skip-all` +* `clear-backlog` +* `expire-messages` +* `expire-messages-all-subscriptions` +* `peek-messages` +* `reset-cursor` + + +### `compact` +Run compaction on the specified topic (persistent topics only) + +Usage +``` +$ pulsar-admin topics compact persistent://tenant/namespace/topic +``` + +### 
`compaction-status` +Check the status of a topic compaction (persistent topics only) + +Usage +```bash +$ pulsar-admin topics compaction-status persistent://tenant/namespace/topic +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-w`, `--wait-complete`|Wait for compaction to complete|false| + + +### `offload` +Trigger offload of data from a topic to long-term storage (e.g. Amazon S3) + +Usage +```bash +$ pulsar-admin topics offload persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--size-threshold`|The maximum amount of data to keep in BookKeeper for the specific topic|| + + +### `offload-status` +Check the status of data offloading from a topic to long-term storage + +Usage +```bash +$ pulsar-admin topics offload-status persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-w`, `--wait-complete`|Wait for offloading to complete|false| + + +### `create-partitioned-topic` +Create a partitioned topic. A partitioned topic must be created before producers can publish to it. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to avoid generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +Usage +```bash +$ pulsar-admin topics create-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `delete-partitioned-topic` +Delete a partitioned topic. This will also delete all the partitions of the topic if they exist. 
+ +Usage +```bash +$ pulsar-admin topics delete-partitioned-topic {persistent|non-persistent} +``` + +### `create` +Creates a non-partitioned topic. A non-partitioned topic must explicitly be created by the user if allowAutoTopicCreation or createIfMissing is disabled. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +Usage +```bash +$ pulsar-admin topics create {persistent|non-persistent}://tenant/namespace/topic +``` + +### `get-partitioned-topic-metadata` +Get the partitioned topic metadata. If the topic is not created or is a non-partitioned topic, this will return an empty topic with zero partitions. + +Usage +```bash +$ pulsar-admin topics get-partitioned-topic-metadata {persistent|non-persistent}://tenant/namespace/topic +``` + +### `update-partitioned-topic` +Update existing non-global partitioned topic. New updating number of partitions must be greater than existing number of partitions. 
+ +Usage +```bash +$ pulsar-admin topics update-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `list` +Get the list of topics under a namespace + +Usage +``` +$ pulsar-admin topics list tenant/cluster/namespace +``` + +### `list-in-bundle` +Get a list of non-persistent topics present under a namespace bundle + +Usage +``` +$ pulsar-admin topics list-in-bundle tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-b`, `--bundle`|The bundle range|| + + +### `terminate` +Terminate a topic (disallow further messages from being published on the topic) + +Usage +```bash +$ pulsar-admin topics terminate {persistent|non-persistent}://tenant/namespace/topic +``` + +### `permissions` +Get the permissions on a topic. Retrieve the effective permissions for a destination. These permissions are defined by the permissions set at the namespace level combined (union) with any eventual specific permissions set on the topic. + +Usage +```bash +$ pulsar-admin topics permissions topic +``` + +### `grant-permission` +Grant a new permission to a client role on a single topic + +Usage +```bash +$ pulsar-admin topics grant-permission {persistent|non-persistent}://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions to a client role on a single topic. If the permission was not set at the topic level, but rather at the namespace level, this operation will return an error (HTTP status code 412). 
+ +Usage +```bash +$ pulsar-admin topics revoke-permission topic +``` + +### `lookup` +Look up a topic from the current serving broker + +Usage +```bash +$ pulsar-admin topics lookup topic +``` + +### `bundle-range` +Get the namespace bundle which contains the given topic + +Usage +```bash +$ pulsar-admin topics bundle-range topic +``` + +### `delete` +Delete a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to the topic. + +Usage +```bash +$ pulsar-admin topics delete topic +``` + +### `unload` +Unload a topic + +Usage +```bash +$ pulsar-admin topics unload topic +``` + +### `subscriptions` +Get the list of subscriptions on the topic + +Usage +```bash +$ pulsar-admin topics subscriptions topic +``` + +### `unsubscribe` +Delete a durable subscriber from a topic + +Usage +```bash +$ pulsar-admin topics unsubscribe topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to delete|| + + +### `stats` +Get the stats for the topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage +```bash +$ pulsar-admin topics stats topic +``` + +### `stats-internal` +Get the internal stats for the topic + +Usage +```bash +$ pulsar-admin topics stats-internal topic +``` + +### `info-internal` +Get the internal metadata info for the topic + +Usage +```bash +$ pulsar-admin topics info-internal topic +``` + +### `partitioned-stats` +Get the stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. 
+ +Usage +```bash +$ pulsar-admin topics partitioned-stats topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--per-partition`|Get per-partition stats|false| + + +### `skip` +Skip some messages for the subscription + +Usage +```bash +$ pulsar-admin topics skip topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages to skip|0| +|`-s`, `--subscription`|The subscription on which to skip messages|| + + +### `skip-all` +Skip all the messages for the subscription + +Usage +```bash +$ pulsar-admin topics skip-all topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to clear|| + +### `clear-backlog` + +Clear backlog (skip all the messages) for the subscription. + +Usage + +```bash +$ pulsar-admin topics clear-backlog topic options +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to clear|| + +### `expire-messages` +Expire messages that are older than the given expiry time (in seconds) for the subscription. + +Usage +```bash +$ pulsar-admin topics expire-messages topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| +|`-s`, `--subscription`|The subscription to skip messages on|| + + +### `expire-messages-all-subscriptions` +Expire messages older than the given expiry time (in seconds) for all subscriptions + +Usage +```bash +$ pulsar-admin topics expire-messages-all-subscriptions topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| + + +### `peek-messages` +Peek some messages for the subscription. 
+ +Usage +```bash +$ pulsar-admin topics peek-messages topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages|0| +|`-s`, `--subscription`|Subscription to get messages from|| + + +### `reset-cursor` +Reset position for subscription to closest to timestamp + +Usage +```bash +$ pulsar-admin topics reset-cursor topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|Subscription to reset position on|| +|`-t`, `--time`|The time, in minutes, to reset back to (or minutes, hours, days, weeks, etc.). Examples: `100m`, `3h`, `2d`, `5w`.|| + + + +## `tenants` +Operations for managing tenants + +Usage +```bash +$ pulsar-admin tenants subcommand +``` + +Subcommands +* `list` +* `get` +* `create` +* `update` +* `delete` + +### `list` +List the existing tenants + +Usage +```bash +$ pulsar-admin tenants list +``` + +### `get` +Gets the configuration of a tenant + +Usage +```bash +$ pulsar-admin tenants get tenant-name +``` + +### `create` +Creates a new tenant + +Usage +```bash +$ pulsar-admin tenants create tenant-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + +### `update` +Updates a tenant + +Usage +```bash +$ pulsar-admin tenants update tenant-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + + +### `delete` +Deletes an existing tenant + +Usage +```bash +$ pulsar-admin tenants delete tenant-name +``` + + +## `resource-quotas` +Operations for managing resource quotas + +Usage +```bash +$ pulsar-admin resource-quotas subcommand +``` + +Subcommands +* `get` +* `set` +* `reset-namespace-bundle-quota` + + +### `get` +Get the resource quota for a specified namespace bundle, or default quota if no 
namespace/bundle is specified. + +Usage +```bash +$ pulsar-admin resource-quotas get options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + +### `set` +Set the resource quota for the specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage +```bash +$ pulsar-admin resource-quotas set options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bi`, `--bandwidthIn`|The expected inbound bandwidth (in bytes/second)|0| +|`-bo`, `--bandwidthOut`|Expected outbound bandwidth (in bytes/second)|0| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-d`, `--dynamic`|Allow to be dynamically re-calculated (or not)|false| +|`-mem`, `--memory`|Expected memory usage (in megabytes)|0| +|`-mi`, `--msgRateIn`|Expected incoming messages per second|0| +|`-mo`, `--msgRateOut`|Expected outgoing messages per second|0| +|`-n`, `--namespace`|The namespace as tenant/namespace, for example my-tenant/my-ns. Must be specified together with -b/--bundle.|| + + +### `reset-namespace-bundle-quota` +Reset the specified namespace bundle's resource quota to a default value. + +Usage +```bash +$ pulsar-admin resource-quotas reset-namespace-bundle-quota options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + + +## `schemas` +Operations related to Schemas associated with Pulsar topics. 
+ +Usage +``` +$ pulsar-admin schemas subcommand +``` + +Subcommands +* `upload` +* `delete` +* `get` +* `extract` + + +### `upload` +Upload the schema definition for a topic + +Usage +```bash +$ pulsar-admin schemas upload persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--filename`|The path to the schema definition file. An example schema file is available under conf directory.|| + + +### `delete` +Delete the schema definition associated with a topic + +Usage +```bash +$ pulsar-admin schemas delete persistent://tenant/namespace/topic +``` + + +### `get` +Retrieve the schema definition associated with a topic (at a given version if version is supplied). + +Usage +```bash +$ pulsar-admin schemas get persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--version`|The version of the schema definition to retrieve for a topic.|| + +### `extract` +Provide the schema definition for a topic via Java class name contained in a JAR file + +Usage +```bash +$ pulsar-admin schemas extract persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--classname`|The Java class name|| +|`-j`, `--jar`|A path to the JAR file which contains the above Java class|| +|`-t`, `--type`|The type of the schema (avro or json)|| + + diff --git a/site2/website/versioned_docs/version-2.4.2/schema-evolution-compatibility.md b/site2/website/versioned_docs/version-2.4.2/schema-evolution-compatibility.md new file mode 100644 index 0000000000000..73389bc39d972 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/schema-evolution-compatibility.md @@ -0,0 +1,784 @@ +--- +id: version-2.4.2-schema-evolution-compatibility +title: Schema evolution and compatibility +sidebar_label: Schema evolution and compatibility +original_id: schema-evolution-compatibility +--- + +Normally, schemas do not stay the same over a long period of time. 
Instead, they undergo evolutions to satisfy new needs. + +This chapter examines how Pulsar schema evolves and what Pulsar schema compatibility check strategies are. + +## Schema evolution + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +Each `SchemaInfo` stored with a topic has a version. The version is used to manage the schema changes happening within a topic. + +The message produced with `SchemaInfo` is tagged with a schema version. When a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and use the correct schema information to deserialize data. + +### What is schema evolution? + +Schemas store the details of attributes and types. To satisfy new business requirements, you inevitably need to update schemas over time, which is called **schema evolution**. + +Any schema changes affect downstream consumers. Schema evolution ensures that the downstream consumers can seamlessly handle data encoded with both old schemas and new schemas. + +### How Pulsar schema should evolve? + +The answer is the Pulsar schema compatibility check strategy. It determines how Pulsar compares old schemas with new schemas in topics. + +For more information, see [Schema compatibility check strategy](#schema-compatibility-check-strategy). + +### How does Pulsar support schema evolution? + +1. When a producer/consumer/reader connects to a broker, the broker deploys the schema compatibility checker configured by `schemaRegistryCompatibilityCheckers` to enforce schema compatibility check. + + The schema compatibility checker is one instance per schema type. + + Currently, Avro and JSON have their own compatibility checkers, while all the other schema types share the default compatibility checker which disables schema evolution. + +2. The producer/consumer/reader sends its client `SchemaInfo` to the broker. + +3. 
The broker knows the schema type and locates the schema compatibility checker for that type. + +4. The broker uses the checker to check if the `SchemaInfo` is compatible with the latest schema of the topic by applying its compatibility check strategy. + + Currently, the compatibility check strategy is configured at the namespace level and applied to all the topics within that namespace. + +## Schema compatibility check strategy + +Pulsar has 8 schema compatibility check strategies, which are summarized in the following table. + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +Compatibility check strategy + + + +Definition + + + +Changes allowed + + + +Check against which schema + + + +Upgrade first + +
+ +`ALWAYS_COMPATIBLE` + + + +Disable schema compatibility check. + + + +All changes are allowed + + + +All previous versions + + + +Any order + +
+ +`ALWAYS_INCOMPATIBLE` + + + +Disable schema evolution. + + + +All changes are disabled + + + +None + + + +None + +
+ +`BACKWARD` + + + +Consumers using the schema V3 can process data written by producers using the schema V3 or V2. + + + +* Add optional fields + +* Delete fields + + + +Latest version + + + +Consumers + +
+ +`BACKWARD_TRANSITIVE` + + + +Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. + + + +* Add optional fields + +* Delete fields + + + +All previous versions + + + +Consumers + +
+ +`FORWARD` + + + +Consumers using the schema V3 or V2 can process data written by producers using the schema V3. + + + +* Add fields + +* Delete optional fields + + + +Latest version + + + +Producers + +
+ +`FORWARD_TRANSITIVE` + + + +Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. + + + +* Add fields + +* Delete optional fields + + + +All previous versions + + + +Producers + +
+ +`FULL` + + + +Backward and forward compatible between the schema V3 and V2. + + + +* Modify optional fields + + + +Latest version + + + +Any order + +
+ +`FULL_TRANSITIVE` + + + +Backward and forward compatible among the schema V3, V2, and V1. + + + +* Modify optional fields + + + +All previous versions + + + +Any order + +
+ +### ALWAYS_COMPATIBLE and ALWAYS_INCOMPATIBLE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +Compatibility check strategy + + + +Definition + + + +Note + +
+ +`ALWAYS_COMPATIBLE` + + + +Disable schema compatibility check. + + + +None + +
+ +`ALWAYS_INCOMPATIBLE` + + + +Disable schema evolution, that is, any schema change is rejected. + + + +* For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`. + +* For Avro and JSON, the default schema compatibility check strategy is `FULL`. + +
+ +#### Example + +* Example 1 + + In some situations, an application needs to store events of several different types in the same Pulsar topic. + + In particular, when developing a data model in an `Event Sourcing` style, you might have several kinds of events that affect the state of an entity. + + For example, for a user entity, there are `userCreated`, `userAddressChanged` and `userEnquiryReceived` events. The application requires that those events are always read in the same order. + + Consequently, those events need to go in the same Pulsar partition to maintain order. This application can use `ALWAYS_COMPATIBLE` to allow different kinds of events to co-exist in the same topic. + +* Example 2 + + Sometimes we also make incompatible changes. + + For example, you are modifying a field type from `string` to `int`. + + In this case, you need to: + + * Upgrade all producers and consumers to the new schema versions at the same time. + + * Optionally, create a new topic and start migrating applications to use the new topic and the new schema, avoiding the need to handle two incompatible versions in the same topic. + +### BACKWARD and BACKWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`BACKWARD` | Consumers using the new schema can process data written by producers using the **last schema**. | The consumers using the schema V3 can process data written by producers using the schema V3 or V2. | +`BACKWARD_TRANSITIVE` | Consumers using the new schema can process data written by producers using **all previous schemas**. | The consumers using the schema V3 can process data written by producers using the schema V3, V2, or V1. | + +#### Example + +* Example 1 + + Remove a field. 
+ + A consumer constructed to process events without one field can process events written with the old schema containing the field, and the consumer will ignore that field. + +* Example 2 + + You want to load all Pulsar data into a Hive data warehouse and run SQL queries against the data. + + The same SQL queries must continue to work even if the data is changed. To support it, you can evolve the schemas using the `BACKWARD` strategy. + +### FORWARD and FORWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`FORWARD` | Consumers using the **last schema** can process data written by producers using a new schema, even though they may not be able to use the full capabilities of the new schema. | The consumers using the schema V3 or V2 can process data written by producers using the schema V3. | +`FORWARD_TRANSITIVE` | Consumers using **all previous schemas** can process data written by producers using a new schema. | The consumers using the schema V3, V2, or V1 can process data written by producers using the schema V3. | + +#### Example + +* Example 1 + + Add a field. + + In most data formats, consumers written to process events without new fields can continue doing so even when they receive new events containing new fields. + +* Example 2 + + If a consumer has application logic tied to a full version of a schema, the application logic may not be updated instantly when the schema evolves. + + In this case, you need to project data with a new schema onto an old schema that the application understands. + + Consequently, you can evolve the schemas using the `FORWARD` strategy to ensure that the old schema can process data encoded with the new schema. 
+ +### FULL and FULL_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +Compatibility check strategy + + + +Definition + + + +Description + + + +Note + +
+ +`FULL` + + + +Schemas are both backward and forward compatible, which means: + +Consumers using the last schema can process data written by producers using the new schema. + +AND + +Consumers using the new schema can process data written by producers using the last schema. + + + +Consumers using the schema V3 can process data written by producers using the schema V3 or V2. + +AND + +Consumers using the schema V3 or V2 can process data written by producers using the schema V3. + + + +* For Avro and JSON, the default schema compatibility check strategy is `FULL`. + +* For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`. + +
+ +`FULL_TRANSITIVE` + + + +The new schema is backward and forward compatible with all previously registered schemas. + + + +Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. + +AND + +Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. + + + +None + +
+ +#### Example + +In some data formats, for example, Avro, you can define fields with default values. Consequently, adding or removing a field with a default value is a fully compatible change. + +## Order of upgrading clients + +The order of upgrading client applications is determined by the compatibility check strategy. + +For example, the producers using schemas to write data to Pulsar and the consumers using schemas to read data from Pulsar. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +Compatibility check strategy + + + +Upgrade first + + + +Description + +
+ +`ALWAYS_COMPATIBLE` + + + +Any order + + + +The compatibility check is disabled. + +Consequently, you can upgrade the producers and consumers in **any order**. + +
+ +`ALWAYS_INCOMPATIBLE` + + + +None + + + +The schema evolution is disabled. + +
+ +* `BACKWARD` + +* `BACKWARD_TRANSITIVE` + + + +Consumers + + + +There is no guarantee that consumers using the old schema can read data produced using the new schema. + +Consequently, **upgrade all consumers first**, and then start producing new data. + +
+ +* `FORWARD` + +* `FORWARD_TRANSITIVE` + + + +Producers + + + +There is no guarantee that consumers using the new schema can read data produced using the old schema. + +Consequently, **upgrade all producers first** to use the new schema and ensure that the data already produced using the old schemas are not available to consumers, and then upgrade the consumers. + +
+ +* `FULL` + +* `FULL_TRANSITIVE` + + + +Any order + + + +It is guaranteed that consumers using the old schema can read data produced using the new schema and consumers using the new schema can read data produced using the old schema. + +Consequently, you can upgrade the producers and consumers in **any order**. + +
diff --git a/site2/website/versioned_docs/version-2.4.2/schema-get-started.md b/site2/website/versioned_docs/version-2.4.2/schema-get-started.md new file mode 100644 index 0000000000000..7ccc4a4a3dacc --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/schema-get-started.md @@ -0,0 +1,91 @@ +--- +id: version-2.4.2-schema-get-started +title: Get started +sidebar_label: Get started +original_id: schema-get-started +--- + +This chapter introduces Pulsar schemas and explains why they are important. + +## Schema Registry + +Type safety is extremely important in any application built around a message bus like Pulsar. + +Producers and consumers need some kind of mechanism for coordinating types at the topic level to avoid various potential problems, for example, serialization and deserialization issues. + +Applications typically adopt one of the following approaches to guarantee type safety in messaging. Both approaches are available in Pulsar, and you're free to adopt one or the other or to mix and match on a per-topic basis. + +### Client-side approach + +Producers and consumers are responsible for not only serializing and deserializing messages (which consist of raw bytes) but also "knowing" which types are being transmitted via which topics. + +If a producer is sending temperature sensor data on the topic `topic-1`, consumers of that topic will run into trouble if they attempt to parse that data as moisture sensor readings. + +Producers and consumers can send and receive messages consisting of raw byte arrays and leave all type safety enforcement to the application on an "out-of-band" basis. + +### Server-side approach + +Producers and consumers inform the system which data types can be transmitted via the topic. + +With this approach, the messaging system enforces type safety and ensures that producers and consumers remain synced. + +Pulsar has a built-in **schema registry** that enables clients to upload data schemas on a per-topic basis. 
Those schemas dictate which data types are recognized as valid for that topic. + +## Why use schema + +When a schema is enabled, Pulsar does parse data, it takes bytes as inputs and sends bytes as outputs. While data has meaning beyond bytes, you need to parse data and might encounter parse exceptions which mainly occur in the following situations: + +* The field does not exist + +* The field type has changed (for example, `string` is changed to `int`) + +There are a few methods to prevent and overcome these exceptions. For example, you can catch exceptions when parsing errors occur, which makes code hard to maintain; or you can adopt a schema management system that performs schema evolution without breaking downstream applications and enforces type safety to the maximum extent in the language you are using. The solution is Pulsar Schema. + +Pulsar schema enables you to use language-specific types of data when constructing and handling messages from simple types like `string` to more complex application-specific types. + +**Example** + +You can use the _User_ class to define the messages sent to Pulsar topics. + +``` +public class User { + String name; + int age; +} +``` + +When constructing a producer with the _User_ class, you can specify a schema or not as below. + +### Without schema + +If you construct a producer without specifying a schema, then the producer can only produce messages of type `byte[]`. If you have a POJO class, you need to serialize the POJO into bytes before sending messages. + +**Example** + +``` +Producer producer = client.newProducer() + .topic(topic) + .create(); +User user = new User("Tom", 28); +byte[] message = … // serialize the `user` by yourself; +producer.send(message); +``` +### With schema + +If you construct a producer with a schema specified, then you can send a class to a topic directly without worrying about how to serialize POJOs into bytes. 
+ +**Example** + +This example constructs a producer with the _JSONSchema_, and you can send the _User_ class to topics directly without worrying about how to serialize it into bytes. + +``` +Producer producer = client.newProducer(JSONSchema.of(User.class)) + .topic(topic) + .create(); +User user = new User("Tom", 28); +producer.send(user); +``` + +### Summary + +When constructing a producer with a schema, you do not need to serialize messages into bytes; instead, Pulsar schema does this job in the background. diff --git a/site2/website/versioned_docs/version-2.4.2/schema-manage.md b/site2/website/versioned_docs/version-2.4.2/schema-manage.md new file mode 100644 index 0000000000000..3eed501a3de8f --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/schema-manage.md @@ -0,0 +1,786 @@ +--- +id: version-2.4.2-schema-manage +title: Manage schema +sidebar_label: Manage schema +original_id: schema-manage +--- + +This guide demonstrates the ways to manage schemas: + +* Automatically + + * [Schema AutoUpdate](#schema-autoupdate) + +* Manually + + * [Schema manual management](#schema-manual-management) + + * [Custom schema storage](#custom-schema-storage) + +## Schema AutoUpdate + +If a schema passes the schema compatibility check, Pulsar producer automatically updates this schema to the topic it produces by default. + +### AutoUpdate for producer + +For a producer, the `AutoUpdate` happens in the following cases: + +* If a **topic doesn’t have a schema**, Pulsar registers a schema automatically. + +* If a **topic has a schema**: + + * If a **producer doesn’t carry a schema**: + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **disabled** in the namespace to which the topic belongs, the producer is allowed to connect to the topic and produce data. + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **enabled** in the namespace to which the topic belongs, the producer is rejected and disconnected. 
+ + * If a **producer carries a schema**: + + A broker performs the compatibility check based on the configured compatibility check strategy of the namespace to which the topic belongs. + + * If it is a new schema and it passes the compatibility check, the broker registers a new schema automatically for the topic. + + * If the schema does not pass the compatibility check, the broker does not register a schema. + +![AutoUpdate Producer](assets/schema-autoupdate-producer.png) + +### AutoUpdate for consumer + +For a consumer, the `AutoUpdate` happens in the following cases: + +* If a **consumer connects to a topic without a schema** (which means the consumer receives raw bytes), the consumer can connect to the topic successfully without doing any compatibility check. + +* If a **consumer connects to a topic with a schema**: + + * If the **topic is idle** (no producers, no entries, no other consumers and no registered schemas), the broker registers a schema for the topic automatically. + + * If the **topic is not idle**, the broker verifies if the schema provided by the consumer is compatible with the registered schema of the topic. + + * If the **schema passes the compatibility check**, the consumer can connect to the topic and receive messages. + + * If the **schema does not pass the compatibility check**, the consumer is rejected and disconnected. + +![AutoUpdate Consumer](assets/schema-autoupdate-consumer.png) + +### Manage AutoUpdate strategy + +You can use the `pulsar-admin` command to manage the `AutoUpdate` strategy as below: + +* [Disable AutoUpdate](#disable-autoupdate) + +* [Adjust compatibility](#adjust-compatibility) + +#### Disable AutoUpdate + +To disable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-autoupdate-strategy --disabled tenant/namespace +``` + +Once the `AutoUpdate` is disabled, you can only register a new schema using the `pulsar-admin` command. 
+ +#### Adjust compatibility + +To adjust the schema compatibility level on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-autoupdate-strategy --compatibility tenant/namespace +``` + +### Schema validation + +By default, `schemaValidationEnforced` is **disabled** for producers: + +* This means a producer without a schema can produce any kind of messages to a topic with schemas, which may result in producing trash data to the topic. + +* This allows non-java language clients that don’t support schema can produce messages to a topic with schemas. + +However, if you want a stronger guarantee on the topics with schemas, you can enable `schemaValidationEnforced` across the whole cluster or on a per-namespace basis. + +#### Enable schema validation + +To enable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-validation-enforce --enable tenant/namespace +``` + +#### Disable schema validation + +To disable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-validation-enforce --disable tenant/namespace +``` + +## Schema manual management + +To manage schemas, you can use one of the following methods. + + + + + + + + + + + + + + + + + + +
MethodDescription
+ +**Admin CLI** + +You can use the `pulsar-admin` tool to manage Pulsar schemas, brokers, clusters, sources, sinks, topics, tenants and so on. + +For more information about how to use the `pulsar-admin` tool, see [here](reference-pulsar-admin.md). +
+ +**REST API** + + +Pulsar exposes schema related management API in Pulsar’s admin RESTful API. You can access the admin RESTful endpoint directly to manage schemas. + +For more information about how to use the Pulsar REST API, see [here](http://pulsar.apache.org/admin-rest-api/). +
+ +**Java Admin API** + Pulsar provides Java admin library.
+ +### Upload a schema + +To upload (register) a new schema for a topic, you can use one of the following methods. + + + + + +Use the `upload` subcommand. + +```bash +$ pulsar-admin schemas upload --filename +``` + +The `schema-definition-file` is in JSON format. + +```json +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} +``` + +The `schema-definition-file` includes the following fields: + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`type` + + The schema type.
+ +`schema` + + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
+ + `properties` + The additional properties associated with the schema.
+ +Here are examples of the `schema-definition-file` for a JSON schema. + +**Example 1** + +```json +{ + "type": "JSON", + "schema": "{\"type\":\"record\",\"name\":\"User\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"file1\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"file2\",\"type\":\"string\",\"default\":null},{\"name\":\"file3\",\"type\":[\"null\",\"string\"],\"default\":\"dfdf\"}]}", + "properties": {} +} +``` + +**Example 2** + +```json +{ + "type": "STRING", + "schema": "", + "properties": { + "key1": "value1" + } +} +``` + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/uploadSchema} + +The post payload is in JSON format. + +```json +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} +``` + +The post payload includes the following fields: + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`type` + + The schema type.
+ +`schema` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
+ + `properties` + The additional properties associated with the schema.
+ + + +```java +void createSchema(String topic, PostSchemaPayload schemaPayload) +``` + +The `PostSchemaPayload` includes the following fields: + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`type` + + The schema type.
+ +`schema` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
+ + `properties` + The additional properties associated with the schema.
+ +Here is an example of `PostSchemaPayload`: + +```java +PulsarAdmin admin = …; + +PostSchemaPayload payload = new PostSchemaPayload(); +payload.setType("INT8"); +payload.setSchema(""); + +admin.createSchema("my-tenant/my-ns/my-topic", payload); +``` + + +### Get a schema (latest) + +To get the latest schema for a topic, you can use one of the following methods. + + + + + +Use the `get` subcommand. + +```bash +$ pulsar-admin schemas get + +{ + "version": 0, + "type": "String", + "timestamp": 0, + "data": "string", + "properties": { + "property1": "string", + "property2": "string" + } +} +``` + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/getSchema} + +Here is an example of a response, which is returned in JSON format. + +```json +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} +``` + +The response includes the following fields: + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`version` + + The schema version, which is a long number.
+ +`type` + + The schema type.
+ +`timestamp` + + The timestamp of creating this version of schema.
+ +`data` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
+ + `properties` + The additional properties associated with the schema.
+ + + +```java +SchemaInfo getSchema(String topic) +``` + +The `SchemaInfo` includes the following fields: + + + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`name` + + The schema name.
+ +`type` + + The schema type.
+ +`schema` + +A byte array of the schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this byte array should be empty. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition converted to a byte array. +
+ + `properties` + The additional properties associated with the schema.
+ +Here is an example of `SchemaInfo`: + +```java +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic"); +``` + + + +### Get a schema (specific) + +To get a specific version of a schema, you can use one of the following methods. + + + + + +Use the `get` subcommand. + +```bash +$ pulsar-admin schemas get --version= +``` + + + +Send a `GET` request to a schema endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema/:version|operation/getSchema} + +Here is an example of a response, which is returned in JSON format. + +```json +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} +``` + +The response includes the following fields: + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`version` + + The schema version, which is a long number.
+ +`type` + + The schema type.
+ +`timestamp` + + The timestamp of creating this version of schema.
+ +`data` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
+ + `properties` + The additional properties associated with the schema.
+ + + +```java +SchemaInfo getSchema(String topic, long version) +``` + +The `SchemaInfo` includes the following fields: + + + + + + + + + + + + + + + + + + + + + + +
FieldDescription
+ +`name` + The schema name.
+ +`type` + The schema type.
+ +`schema` + +A byte array of the schema definition data, which is encoded in UTF 8. + +* If the schema is a **primitive** schema, this byte array should be empty. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition converted to a byte array. +
+ + `properties` + The additional properties associated with the schema.
+ +Here is an example of `SchemaInfo`: + +```java +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic", 1L); +``` + + + +### Extract a schema + +To provide a schema via a topic, you can use the following method. + + + + + +Use the `extract` subcommand. + +```bash +$ pulsar-admin schemas extract --classname --jar --type +``` + + + +### Delete a schema + +To delete a schema for a topic, you can use one of the following methods. + +> #### Note +> +> In any case, the **delete** action deletes **all versions** of a schema registered for a topic. + + + + + +Use the `delete` subcommand. + +```bash +$ pulsar-admin schemas delete +``` + + + +Send a `DELETE` request to a schema endpoint: {@inject: endpoint|DELETE|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/deleteSchema} + +Here is an example of a response, which is returned in JSON format. + +```json +{ + "version": "", +} +``` + +The response includes the following field: + +Field | Description | +---|---| +`version` | The schema version, which is a long number. | + + + +```java +void deleteSchema(String topic) +``` + +Here is an example of deleting a schema. + +```java +PulsarAdmin admin = …; + +admin.deleteSchema("my-tenant/my-ns/my-topic"); +``` + + + +## Custom schema storage + +By default, Pulsar stores various data types of schemas in [Apache BookKeeper](https://bookkeeper.apache.org) deployed alongside Pulsar. + +However, you can use another storage system if needed. 
+ +### Implement + +To use a non-default (non-BookKeeper) storage system for Pulsar schemas, you need to implement the following Java interfaces: + +* [SchemaStorage interface](#schemastorage-interface) + +* [SchemaStorageFactory interface](#schemastoragefactory-interface) + +#### SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} +``` + +> #### Tip +> +> For a complete example of **schema storage** implementation, see [BookKeeperSchemaStorage](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. + +#### SchemaStorageFactory interface + +The `SchemaStorageFactory` interface has the following method: + +```java +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} +``` + +> Tip +> +> For a complete example of **schema storage factory** implementation, see [BookKeeperSchemaStorageFactory](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +### Deploy + +To use your custom schema storage implementation, perform the following steps. + +1. Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. 
+ +2. Add the JAR file to the `lib` folder in your Pulsar binary or source distribution. + +3. Change the `schemaRegistryStorageClassName` configuration in `broker.conf` to your custom factory class. + +4. Start Pulsar. diff --git a/site2/website/versioned_docs/version-2.4.2/schema-understand.md b/site2/website/versioned_docs/version-2.4.2/schema-understand.md new file mode 100644 index 0000000000000..e8290b83f371d --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/schema-understand.md @@ -0,0 +1,576 @@ +--- +id: version-2.4.2-schema-understand +title: Understand schema +sidebar_label: Understand schema +original_id: schema-understand +--- + +This chapter explains the basic concepts of Pulsar schema, focuses on the topics of particular importance, and provides additional background. + +## SchemaInfo + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +The `SchemaInfo` is stored and enforced on a per-topic basis and cannot be stored at the namespace or tenant level. + +A `SchemaInfo` consists of the following fields: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +Field + + + +Description + +
+ +`name` + + + +Schema name (a string). + +
+ +`type` + + + +Schema type, which determines how to interpret the schema data. + +* Predefined schema: see [here](schema-understand.md#schema-type). + +* Customized schema: it is left as an empty string. + +
+ +`schema`(`payload`) + + + +Schema data, which is a sequence of 8-bit unsigned bytes and schema-type specific. + +
+ +`properties` + + + +User-defined properties, stored as a string/string map. + +Applications can use this bag to carry any application-specific logic. + +Possible properties might be the Git hash associated with the schema or an environment string like `dev` or `prod`. + +
+ +**Example** + +This is the `SchemaInfo` of a string. + +```text +{ + "name": "test-string-schema", + "type": "STRING", + "schema": "", + "properties": {} +} +``` + +## Schema type + +Pulsar supports various schema types, which are mainly divided into two categories: + +* Primitive type + +* Complex type + +### Primitive type + +Currently, Pulsar supports the following primitive types: + +| Primitive Type | Description | +|---|---| +| `BOOLEAN` | A binary value | +| `INT8` | An 8-bit signed integer | +| `INT16` | A 16-bit signed integer | +| `INT32` | A 32-bit signed integer | +| `INT64` | A 64-bit signed integer | +| `FLOAT` | A single-precision (32-bit) IEEE 754 floating-point number | +| `DOUBLE` | A double-precision (64-bit) IEEE 754 floating-point number | +| `BYTES` | A sequence of 8-bit unsigned bytes | +| `STRING` | A Unicode character sequence | +| `TIMESTAMP` (`DATE`, `TIME`) | A logical type that represents a specific instant in time with millisecond precision.
It stores the number of milliseconds since `January 1, 1970, 00:00:00 GMT` as an `INT64` value | + +For primitive types, Pulsar does not store any schema data in `SchemaInfo`. The `type` in `SchemaInfo` is used to determine how to serialize and deserialize the data. + +Some of the primitive schema implementations can use `properties` to store implementation-specific tunable settings. For example, a `string` schema can use `properties` to store the encoding charset to serialize and deserialize strings. + +The conversions between **Pulsar schema types** and **language-specific primitive types** are as below. + +| Schema Type | Java Type| Python Type | Go Type | +|---|---|---|---| +| BOOLEAN | boolean | bool | bool | +| INT8 | byte | | int8 | +| INT16 | short | | int16 | +| INT32 | int | | int32 | +| INT64 | long | | int64 | +| FLOAT | float | float | float32 | +| DOUBLE | double | float | float64| +| BYTES | byte[], ByteBuffer, ByteBuf | bytes | []byte | +| STRING | string | str | string| +| TIMESTAMP | java.sql.Timestamp | | | +| TIME | java.sql.Time | | | +| DATE | java.util.Date | | | + +**Example** + +This example demonstrates how to use a string schema. + +1. Create a producer with a string schema and send messages. + + ```text + Producer producer = client.newProducer(Schema.STRING).create(); + producer.newMessage().value("Hello Pulsar!").send(); + ``` + +2. Create a consumer with a string schema and receive messages. + + ```text + Consumer consumer = client.newConsumer(Schema.STRING).create(); + consumer.receive(); + ``` + +### Complex type + +Currently, Pulsar supports the following complex types: + +| Complex Type | Description | +|---|---| +| `keyvalue` | Represents a complex type of a key/value pair. | +| `struct` | Supports **AVRO**, **JSON**, and **Protobuf**. | + +#### keyvalue + +`Keyvalue` schema helps applications define schemas for both key and value. 
+ +For `SchemaInfo` of `keyvalue` schema, Pulsar stores the `SchemaInfo` of key schema and the `SchemaInfo` of value schema together. + +Pulsar provides two methods to encode a key/value pair in messages: + +* `INLINE` + +* `SEPARATED` + +Users can choose the encoding type when constructing the key/value schema. + +##### INLINE + +Key/value pairs will be encoded together in the message payload. + +##### SEPARATED + +Key will be encoded in the message key and the value will be encoded in the message payload. + +**Example** + +This example shows how to construct a key/value schema and then use it to produce and consume messages. + +1. Construct a key/value schema with `INLINE` encoding type. + + ```text + Schema<KeyValue<Integer, String>> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.INLINE + ); + ``` + +2. Optionally, construct a key/value schema with `SEPARATED` encoding type. + + ```text + Schema<KeyValue<Integer, String>> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + ``` + +3. Produce messages using a key/value schema. + + ```text + Schema<KeyValue<Integer, String>> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Producer<KeyValue<Integer, String>> producer = client.newProducer(kvSchema) + .topic(TOPIC) + .create(); + + final int key = 100; + final String value = "value-100"; + + // send the key/value message + producer.newMessage() + .value(new KeyValue<>(key, value)) + .send(); + ``` + +4. Consume messages using a key/value schema. + + ``` + Schema<KeyValue<Integer, String>> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Consumer<KeyValue<Integer, String>> consumer = client.newConsumer(kvSchema) + ... 
+ .topic(TOPIC) + .subscriptionName(SubscriptionName).subscribe(); + + // receive key/value pair + Message> msg = consumer.receive(); + KeyValue kv = msg.getValue(); + ``` + +#### struct + +Pulsar uses [Avro Specification](http://avro.apache.org/docs/current/spec.html) to declare the schema definition for `struct` schema. + +This allows Pulsar: + +* to use same tools to manage schema definitions + +* to use different serialization/deserialization methods to handle data + +There are two methods to use `struct` schema: + +* `static` + +* `generic` + +##### static + +You can predefine the `struct` schema, and it can be a POJO in Java, a `struct` in Go, or classes generated by Avro or Protobuf tools. + +**Example** + +Pulsar gets the schema definition from the predefined `struct` using an Avro library. The schema definition is the schema data stored as a part of the `SchemaInfo`. + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```text + public class User { + String name; + int age; + } + ``` + +2. Create a producer with a `struct` schema and send messages. + + ```text + Producer producer = client.newProducer(Schema.AVRO(User.class)).create(); + producer.newMessage().value(User.builder().userName("pulsar-user").userId(1L).build()).send(); + ``` + +3. Create a consumer with a `struct` schema and receive messages + + ```text + Consumer consumer = client.newConsumer(Schema.AVRO(User.class)).create(); + User user = consumer.receive(); + ``` + +##### generic + +Sometimes applications do not have pre-defined structs, and you can use this method to define schema and access data. + +You can define the `struct` schema using the `GenericSchemaBuilder`, generate a generic struct using `GenericRecordBuilder` and consume messages into `GenericRecord`. + +**Example** + +1. Use `RecordSchemaBuilder` to build a schema. 
+ + ```text + RecordSchemaBuilder recordSchemaBuilder = SchemaBuilder.record("schemaName"); + recordSchemaBuilder.field("intField").type(SchemaType.INT32); + SchemaInfo schemaInfo = recordSchemaBuilder.build(SchemaType.AVRO); + + Producer<GenericRecord> producer = client.newProducer(Schema.generic(schemaInfo)).create(); + ``` + +2. Use `RecordBuilder` to build the struct records. + + ```text + producer.newMessage().value(schema.newRecordBuilder() + .set("intField", 32) + .build()).send(); + ``` + +### Auto Schema + +If you don't know the schema type of a Pulsar topic in advance, you can use AUTO schema to produce or consume generic records to or from brokers. + +| Auto Schema Type | Description | +|---|---| +| `AUTO_PRODUCE` | This is useful for transferring data **from a producer to a Pulsar topic that has a schema**. | +| `AUTO_CONSUME` | This is useful for transferring data **from a Pulsar topic that has a schema to a consumer**. | + +#### AUTO_PRODUCE + +`AUTO_PRODUCE` schema helps a producer validate whether the bytes sent by the producer are compatible with the schema of a topic. + +**Example** + +Suppose that: + +* You have a producer processing messages from a Kafka topic _K_. + +* You have a Pulsar topic _P_, and you do not know its schema type. + +* Your application reads the messages from _K_ and writes the messages to _P_. + +In this case, you can use `AUTO_PRODUCE` to verify whether the bytes produced by _K_ can be sent to _P_ or not. + +```text +Producer<byte[]> pulsarProducer = client.newProducer(Schema.AUTO_PRODUCE()) + … + .create(); + +byte[] kafkaMessageBytes = … ; + +pulsarProducer.send(kafkaMessageBytes); +``` + +#### AUTO_CONSUME + +`AUTO_CONSUME` schema helps a Pulsar topic validate whether the bytes sent by a Pulsar topic are compatible with a consumer, that is, the Pulsar topic deserializes messages into language-specific objects using the `SchemaInfo` retrieved from broker-side. + +Currently, `AUTO_CONSUME` only supports **AVRO** and **JSON** schemas. 
It deserializes messages into `GenericRecord`. + +**Example** + +Suppose that: + +* You have a Pulsar topic _P_. + +* You have a consumer (for example, MySQL) receiving messages from the topic _P_. + +* You application reads the messages from _P_ and writes the messages to MySQL. + +In this case, you can use `AUTO_CONSUME` to verify whether the bytes produced by _P_ can be sent to MySQL or not. + +```text +Consumer pulsarConsumer = client.newConsumer(Schema.AUTO_CONSUME()) + … + .subscribe(); + +Message msg = consumer.receive() ; +GenericRecord record = msg.getValue(); +… +``` + +## Schema version + +Each `SchemaInfo` stored with a topic has a version. Schema version manages schema changes happening within a topic. + +Messages produced with a given `SchemaInfo` is tagged with a schema version, so when a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and then use the `SchemaInfo` to deserialize data. + +Schemas are versioned in succession. Schema storage happens in a broker that handles the associated topics so that version assignments can be made. + +Once a version is assigned/fetched to/for a schema, all subsequent messages produced by that producer are tagged with the appropriate version. + +**Example** + +The following example illustrates how the schema version works. + +Suppose that a Pulsar [Java client](client-libraries-java.md) created using the code below attempts to connect to Pulsar and begins to send messages: + +```text +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-data") + .sendTimeout(3, TimeUnit.SECONDS) + .create(); +``` + +The table below lists the possible scenarios when this connection attempt occurs and what happens in each scenario: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ScenarioWhat happens
+ +* No schema exists for the topic. + + + +(1) The producer is created using the given schema. + +(2) Since no existing schema is compatible with the `SensorReading` schema, the schema is transmitted to the broker and stored. + +(3) Any consumer created using the same schema or topic can consume messages from the `sensor-data` topic. + +
+ +* A schema already exists. + +* The producer connects using the same schema that is already stored. + + + +(1) The schema is transmitted to the broker. + +(2) The broker determines that the schema is compatible. + +(3) The broker attempts to store the schema in [BookKeeper](concepts-architecture-overview.md#persistent-storage) but then determines that it's already stored, so it is used to tag produced messages. + +
+ +* A schema already exists. + +* The producer connects using a new schema that is compatible. + + + +(1) The schema is transmitted to the broker. + +(2) The broker determines that the schema is compatible and stores the new schema as the current version (with a new version number). + +
+ +## How does schema work + +Pulsar schemas are applied and enforced at the **topic** level (schemas cannot be applied at the namespace or tenant level). + +Producers and consumers upload schemas to brokers, so Pulsar schemas work on the producer side and the consumer side. + +### Producer side + +This diagram illustrates how does schema work on the Producer side. + +![Schema works at the producer side](assets/schema-producer.png) + +1. The application uses a schema instance to construct a producer instance. + + The schema instance defines the schema for the data being produced using the producer instance. + + Take AVRO as an example, Pulsar extract schema definition from the POJO class and construct the `SchemaInfo` that the producer needs to pass to a broker when it connects. + +2. The producer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker looks up the schema in the schema storage to check if it is already a registered schema. + +4. If yes, the broker skips the schema validation since it is a known schema, and returns the schema version to the producer. + +5. If no, the broker validates the schema based on the schema compatibility check strategy defined for the topic. + +6. If the schema is compatible, the broker stores it and returns the schema version to the producer. + + All the messages produced by this producer are tagged with the schema version. + +7. If the schema is incompatible, the broker rejects it. + +### Consumer side + +This diagram illustrates how does Schema work on the consumer side. + +![Schema works at the consumer side](assets/schema-consumer.png) + +1. The application uses a schema instance to construct a consumer instance. + + The schema instance defines the schema that the consumer uses for decoding messages received from a broker. + +2. The consumer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. 
The broker looks up the schema in the schema storage to check if it is already a registered schema. + +4. If yes, the broker skips the schema validation since it is a known schema, and returns the schema version to the consumer. + +5. If no, the broker validates the schema based on the schema compatibility check strategy defined for the topic. + +6. If the schema is compatible, the broker stores it and returns the schema version to the consumer. + +7. If the schema is incompatible, the consumer will be disconnected. + +8. The consumer receives the messages from the broker. + + If the schema used by the consumer supports schema versioning (for example, AVRO schema), the consumer fetches the `SchemaInfo` of the version tagged in messages, and uses the passed-in schema and the schema tagged in messages to decode the messages. diff --git a/site2/website/versioned_docs/version-2.4.2/security-athenz.md b/site2/website/versioned_docs/version-2.4.2/security-athenz.md new file mode 100644 index 0000000000000..186e451e8e6bc --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-athenz.md @@ -0,0 +1,93 @@ +--- +id: version-2.4.2-security-athenz +title: Authentication using Athenz +sidebar_label: Authentication using Athenz +original_id: security-athenz +--- + +[Athenz](https://github.com/yahoo/athenz) is a role-based authentication/authorization system. In Pulsar, you can use Athenz role tokens (also known as *z-tokens*) to establish the identity of the client. + +## Athenz authentication settings + +A [decentralized Athenz system](https://github.com/yahoo/athenz/blob/master/docs/dev_decentralized_access.md) contains an [authori**Z**ation **M**anagement **S**ystem](https://github.com/yahoo/athenz/blob/master/docs/setup_zms.md) (ZMS) server and an [authori**Z**ation **T**oken **S**ystem](https://github.com/yahoo/athenz/blob/master/docs/setup_zts.md) (ZTS) server. + +To begin, you need to set up Athenz service access control.
You need to create domains for the *provider* (which provides some resources to other services with some authentication/authorization policies) and the *tenant* (which is provisioned to access some resources in a provider). In this case, the provider corresponds to the Pulsar service itself and the tenant corresponds to each application using Pulsar (typically, a [tenant](reference-terminology.md#tenant) in Pulsar). + +### Create the tenant domain and service + +On the [tenant](reference-terminology.md#tenant) side, you need to do the following things: + +1. Create a domain, such as `shopping` +2. Generate a private/public key pair +3. Create a service, such as `some_app`, on the domain with the public key + +Note that you need to specify the private key generated in step 2 when the Pulsar client connects to the [broker](reference-terminology.md#broker) (see client configuration examples for [Java](client-libraries-java.md#tls-authentication) and [C++](client-libraries-cpp.md#tls-authentication)). + +For more specific steps involving the Athenz UI, refer to [Example Service Access Control Setup](https://github.com/yahoo/athenz/blob/master/docs/example_service_athenz_setup.md#client-tenant-domain). + +### Create the provider domain and add the tenant service to some role members + +On the provider side, you need to do the following things: + +1. Create a domain, such as `pulsar` +2. Create a role +3. Add the tenant service to members of the role + +Note that you can specify any action and resource in step 2 since they are not used on Pulsar. In other words, Pulsar uses the Athenz role token only for authentication, *not* for authorization. + +For more specific steps involving UI, refer to [Example Service Access Control Setup](https://github.com/yahoo/athenz/blob/master/docs/example_service_athenz_setup.md#server-provider-domain).
+ +## Configure the broker for Athenz + +> ### TLS encryption +> +> Note that when you are using Athenz as an authentication provider, you should use TLS encryption +> as it can protect role tokens from being intercepted and reused. (for more details involving TLS encryption see [Architecture - Data Model](https://github.com/yahoo/athenz/blob/master/docs/data_model.md)). + +In the `conf/broker.conf` configuration file in your Pulsar installation, you need to provide the class name of the Athenz authentication provider as well as a comma-separated list of provider domain names. + +```properties +# Add the Athenz auth provider +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderAthenz +athenzDomainNames=pulsar + +# Enable TLS +tlsEnabled=true +tlsCertificateFilePath=/path/to/broker-cert.pem +tlsKeyFilePath=/path/to/broker-key.pem + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +brokerClientAuthenticationParameters={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} +``` + +> A full listing of parameters is available in the `conf/broker.conf` file, you can also find the default +> values for those parameters in [Broker Configuration](reference-configuration.md#broker).
+ +## Configure clients for Athenz + +For more information on Pulsar client authentication using Athenz, see the following language-specific docs: + +* [Java client](client-libraries-java.md#athenz) + +## Configure CLI tools for Athenz + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following authentication parameters to the `conf/client.conf` config file to use Athenz with CLI tools of Pulsar: + +```properties +# URL for the broker +serviceUrl=https://broker.example.com:8443/ + +# Set Athenz auth plugin and its parameters +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationAthenz +authParams={"tenantDomain":"shopping","tenantService":"some_app","providerDomain":"pulsar","privateKey":"file:///path/to/private.pem","keyId":"v1"} + +# Enable TLS +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/cacert.pem +``` diff --git a/site2/website/versioned_docs/version-2.4.2/security-authorization.md b/site2/website/versioned_docs/version-2.4.2/security-authorization.md new file mode 100644 index 0000000000000..950bfa9644b6b --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-authorization.md @@ -0,0 +1,100 @@ +--- +id: version-2.4.2-security-authorization +title: Authentication and authorization in Pulsar +sidebar_label: Authorization and ACLs +original_id: security-authorization +--- + + +In Pulsar, the [authentication provider](security-overview.md#authentication-providers) is responsible for properly identifying clients and associating the clients with [role tokens](security-overview.md#role-tokens). If you only enable authentication, an authenticated role token has the ability to access all resources in the cluster. 
*Authorization* is the process that determines *what* clients are able to do. + +The role tokens with the most privileges are the *superusers*. The *superusers* can create and destroy tenants, along with having full access to all tenant resources. + +When a superuser creates a [tenant](reference-terminology.md#tenant), that tenant is assigned an admin role. A client with the admin role token can then create, modify and destroy namespaces, and grant and revoke permissions to *other role tokens* on those namespaces. + +## Broker and Proxy Setup + +### Enable authorization and assign superusers +You can enable the authorization and assign the superusers in the broker ([`conf/broker.conf`](reference-configuration.md#broker)) configuration files. + +```properties +authorizationEnabled=true +superUserRoles=my-super-user-1,my-super-user-2 +``` + +> A full list of parameters is available in the `conf/broker.conf` file. +> You can also find the default values for those parameters in [Broker Configuration](reference-configuration.md#broker). + +Typically, you can not only use superuser roles for administrators and clients but also for broker-to-broker authorization. When you use [geo-replication](concepts-replication.md), every broker needs to be able to publish to all the other topics of clusters. + +You can also enable the authorization for the proxy in the proxy configuration file (`conf/proxy.conf`). Once you enable the authorization on the proxy, the proxy does an additional authorization check before forwarding the request to a broker. The broker still checks the authorization of the request when the broker receives the forwarded request. + +### Proxy Roles + +By default, the broker treats the connection between a proxy and the broker as a normal user connection. The broker authenticates the user as the role configured in `proxy.conf`(see ["Enable TLS Authentication on Proxies"](security-tls-authentication.md#enable-tls-authentication-on-proxies)). 
However, when the user connects to the cluster through a proxy, the user rarely wants this behavior. The user expects to be able to interact with the cluster as the role for which they have authenticated with the proxy. + +Pulsar uses *proxy roles* to enable this. Proxy roles are specified in the broker configuration file, [`conf/broker.conf`](reference-configuration.md#broker). If a client that is authenticated with a broker is one of its ```proxyRoles```, all requests from that client must also carry information about the role of the client that is authenticated with the proxy. This information is called the *original principal*. If the *original principal* is missing, the client is not able to access anything. + +You must authorize both the *proxy role* and the *original principal* to access a resource before that resource can be accessed via the proxy. Administrators can take two approaches to authorize the *proxy role* and the *original principal*. + +The more secure approach is to grant access to the proxy roles each time you grant access to a resource. For example, if you have a proxy role named `proxy1`, when the superuser creates a tenant, you should specify `proxy1` as one of the admin roles. When a role is granted permissions to produce or consume from a namespace, if that client wants to produce or consume through a proxy, you should also grant `proxy1` the same permissions. + +Another approach is to make the proxy role a superuser. This allows the proxy to access all resources. The client still needs to authenticate with the proxy, and all requests made through the proxy have their role downgraded to the *original principal* of the authenticated client. However, if the proxy is compromised, a bad actor could get full access to your cluster. + +You can specify the roles as proxy roles in [`conf/broker.conf`](reference-configuration.md#broker).
+ +```properties +proxyRoles=my-proxy-role + +# if you want to allow superusers to use the proxy (see above) +superUserRoles=my-super-user-1,my-super-user-2,my-proxy-role +``` + +## Administer tenants + +Pulsar [instance](reference-terminology.md#instance) administrators or some kind of self-service portal typically provisions a Pulsar [tenant](reference-terminology.md#tenant). + +You can manage tenants using the [`pulsar-admin`](reference-pulsar-admin.md) tool. + +### Create a new tenant + +The following is an example tenant creation command: + +```shell +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east +``` + +This command creates a new tenant `my-tenant` that is allowed to use the clusters `us-west` and `us-east`. + +A client that successfully identifies itself as having the role `my-admin-role` is allowed to perform all administrative tasks on this tenant. + +The structure of topic names in Pulsar reflects the hierarchy between tenants, clusters, and namespaces: + +```shell +persistent://tenant/namespace/topic +``` + +### Manage permissions + +You can use [Pulsar Admin Tools](admin-api-permissions.md) for managing permission in Pulsar. 
+ +### Pulsar admin authentication + +```java +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("http://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .build(); +``` + +To use TLS: + +```java +PulsarAdmin admin = PulsarAdmin.builder() + .serviceHttpUrl("https://broker:8080") + .authentication("com.org.MyAuthPluginClass", "param1:value1") + .tlsTrustCertsFilePath("/path/to/trust/cert") + .build(); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/security-kerberos.md b/site2/website/versioned_docs/version-2.4.2/security-kerberos.md new file mode 100644 index 0000000000000..7bd985c6105d5 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-kerberos.md @@ -0,0 +1,391 @@ +--- +id: version-2.4.2-security-kerberos +title: Authentication using Kerberos +sidebar_label: Authentication using Kerberos +original_id: security-kerberos +--- + +[Kerberos](https://web.mit.edu/kerberos/) is a network authentication protocol. By using secret-key cryptography, [Kerberos](https://web.mit.edu/kerberos/) is designed to provide strong authentication for client applications and server applications. + +In Pulsar, you can use Kerberos with [SASL](https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer) as a choice for authentication. And Pulsar uses the [Java Authentication and Authorization Service (JAAS)](https://en.wikipedia.org/wiki/Java_Authentication_and_Authorization_Service) for SASL configuration. You need to provide JAAS configurations for Kerberos authentication. + +This document introduces how to configure `Kerberos` with `SASL` between Pulsar clients and brokers and how to configure Kerberos for Pulsar proxy in detail. + +## Configuration for Kerberos between Client and Broker + +### Prerequisites + +To begin, you need to set up (or already have) a [Key Distribution Center(KDC)](https://en.wikipedia.org/wiki/Key_distribution_center). 
Also you need to configure and run the [Key Distribution Center(KDC)](https://en.wikipedia.org/wiki/Key_distribution_center) in advance. + +If your organization already uses a Kerberos server (for example, by using `Active Directory`), you do not have to install a new server for Pulsar. If your organization does not use a Kerberos server, you need to install one. Your Linux vendor might have packages for `Kerberos`. On how to install and configure Kerberos, refer to [Ubuntu](https://help.ubuntu.com/community/Kerberos), +[Redhat](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Managing_Smart_Cards/installing-kerberos.html). + +Note that if you use Oracle Java, you need to download JCE policy files for your Java version and copy them to the `$JAVA_HOME/jre/lib/security` directory. + +#### Kerberos principals + +If you use the existing Kerberos system, ask your Kerberos administrator for a principal for each broker in your cluster and for every operating system user that accesses Pulsar with Kerberos authentication (via clients and tools). + +If you have installed your own Kerberos system, you can create these principals with the following commands: + +```shell +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" +``` + +Note that *Kerberos* requires that all your hosts can be resolved with their FQDNs. + +The first part of Broker principal (for example, `broker` in `broker/{hostname}@{REALM}`) is the `serverType` of each host.
The suggested values of `serverType` are `broker` (host machine runs service Pulsar Broker) and `proxy` (host machine runs service Pulsar Proxy). + +#### Configure how to connect to KDC + +You need to enter the command below to specify the path to the `krb5.conf` file for the client side and the broker side. The content of `krb5.conf` file indicates the default Realm and KDC information. See [JDK’s Kerberos Requirements](https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/KerberosReq.html) for more details. + +```shell +-Djava.security.krb5.conf=/etc/pulsar/krb5.conf +``` +Here is an example of the krb5.conf file: + +In the configuration file, `EXAMPLE.COM` is the default realm; `kdc = localhost:62037` is the kdc server url for realm `EXAMPLE.COM `: + +``` +[libdefaults] + default_realm = EXAMPLE.COM + +[realms] + EXAMPLE.COM = { + kdc = localhost:62037 + } +``` + +Usually machines configured with kerberos already have a system wide configuration and this configuration is optional. + +#### JAAS configuration file + +You need JAAS configuration file for the client side and the broker side. JAAS configuration file provides the section of information that is used to connect KDC. Here is an example named `pulsar_jaas.conf`: + +``` + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; +``` + +You need to set the `JAAS` configuration file path as JVM parameter for client and broker. For example: + +```shell + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf +``` + +In the `pulsar_jaas.conf` file above + +1. 
`PulsarBroker` is a section name in the JAAS file that each broker uses. This section tells the broker which principal to use inside Kerberos and the location of the keytab where the principal is stored. `PulsarBroker` allows the broker to use the keytab specified in this section. +2. `PulsarClient` is a section name in the JAAS file that each client uses. This section tells the client which principal to use inside Kerberos and the location of the keytab where the principal is stored. `PulsarClient` allows the client to use the keytab specified in this section. + The following example also reuses this `PulsarClient` section in both the Pulsar internal admin configuration and in CLI command of `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`. You can also add different sections for different use cases. + +You can have 2 separate JAAS configuration files: +* the file for a broker that has sections of both `PulsarBroker` and `PulsarClient`; +* the file for a client that only has a `PulsarClient` section. + + +### Kerberos configuration for Brokers + +#### Configure the `broker.conf` file + + In the `broker.conf` file, set Kerberos related configurations.
+ + - Set `authenticationEnabled` to `true`; + - Set `authenticationProviders` to choose `AuthenticationProviderSasl`; + - Set `saslJaasClientAllowedIds` regex for principal that is allowed to connect to broker; + - Set `saslJaasBrokerSectionName` that corresponds to the section in JAAS configuration file for broker; + + To make Pulsar internal admin client work properly, you need to set the configuration in the `broker.conf` file as below: + - Set `brokerClientAuthenticationPlugin` to client plugin `AuthenticationSasl`; + - Set `brokerClientAuthenticationParameters` to value in JSON string `{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}`, in which `PulsarClient` is the section name in the `pulsar_jaas.conf` file, and `"serverType":"broker"` indicates that the internal admin client connects to a Pulsar Broker; + + Here is an example: + +``` +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker + +## Authentication settings of the broker itself. Used when the broker connects to other brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} +``` + +#### Set Broker JVM parameter + + Set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. +```shell + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf +``` +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_env.sh) + +You must ensure that the operating system user who starts broker can reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. 
+ +### Kerberos configuration for clients + +#### Java Client and Java Admin Client + +In client application, include `pulsar-client-auth-sasl` in your project dependency. + +``` + + org.apache.pulsar + pulsar-client-auth-sasl + ${pulsar.version} + +``` + +Configure the authentication type to use `AuthenticationSasl`, and also provide the authentication parameters to it. + +You need 2 parameters: +- `saslJaasClientSectionName`. This parameter corresponds to the section in JAAS configuration file for client; +- `serverType`. This parameter stands for whether this client connects to broker or proxy. And client uses this parameter to know which server side principal should be used. + +When you authenticate between client and broker with the setting in above JAAS configuration file, we need to set `saslJaasClientSectionName` to `PulsarClient` and set `serverType` to `broker`. + +The following is an example of creating a Java client: + + ```java + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "broker"); + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME +``` + +You must ensure that the operating system user who starts pulsar client can 
reach the keytabs configured in the `pulsar_jaas.conf` file and kdc server in the `krb5.conf` file. + +#### Configure CLI tools + +If you use a command-line tool (such as `bin/pulsar-client`, `bin/pulsar-perf` and `bin/pulsar-admin`), you need to perform the following steps: + +Step 1. Add the settings below to your `client.conf`. +```shell +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +authParams={"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"} +``` + + +Step 2. Enter the command below to set JVM parameters for JAAS configuration file and krb5 configuration file with additional options. +```shell + -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf +``` + +You can add this at the end of `PULSAR_EXTRA_OPTS` in the file [`pulsar_tools_env.sh`](https://github.com/apache/pulsar/blob/master/conf/pulsar_tools_env.sh), +or add this line `OPTS="$OPTS -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf "` directly to the CLI tool script. + +The meaning of configurations is the same as the meaning of configurations in Java client section. + +## Kerberos configuration for working with Pulsar Proxy + +With the above configuration, client and broker can do authentication using Kerberos. + +A client that connects to Pulsar Proxy is a little different. Pulsar Proxy (as a SASL Server in Kerberos) authenticates Client (as a SASL client in Kerberos) first; and then Pulsar broker authenticates Pulsar Proxy. + +Now in comparison with the above configuration between client and broker, we show you how to configure Pulsar Proxy as follows. + +### Create principal for Pulsar Proxy in Kerberos + +You need to add new principals for Pulsar Proxy comparing with the above configuration. If you already have principals for client and broker, you only need to add the proxy principal here.
+ +```shell +### add Principals for Pulsar Proxy +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey proxy/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{proxy-keytabname}.keytab proxy/{hostname}@{REALM}" +### add Principals for broker +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey broker/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{broker-keytabname}.keytab broker/{hostname}@{REALM}" +### add Principals for client +sudo /usr/sbin/kadmin.local -q 'addprinc -randkey client/{hostname}@{REALM}' +sudo /usr/sbin/kadmin.local -q "ktadd -k /etc/security/keytabs/{client-keytabname}.keytab client/{hostname}@{REALM}" +``` + +### Add a section in JAAS configuration file for Pulsar Proxy + +In comparision with the above configuration, add a new section for Pulsar Proxy in JAAS configuration file. + +Here is an example named `pulsar_jaas.conf`: + +``` + PulsarBroker { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; + + PulsarProxy { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarproxy.keytab" + principal="proxy/localhost@EXAMPLE.COM"; +}; + + PulsarClient { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarclient.keytab" + principal="client/localhost@EXAMPLE.COM"; +}; +``` + +### Proxy client configuration + +Pulsar client configuration is similar with client and broker configuration, except that you need to set `serverType` to `proxy` instead of `broker`, for the reason that you need to do the Kerberos authentication between client and proxy. 
+ + ```java + System.setProperty("java.security.auth.login.config", "/etc/pulsar/pulsar_jaas.conf"); + System.setProperty("java.security.krb5.conf", "/etc/pulsar/krb5.conf"); + + Map authParams = Maps.newHashMap(); + authParams.put("saslJaasClientSectionName", "PulsarClient"); + authParams.put("serverType", "proxy"); // ** here is the different ** + + Authentication saslAuth = AuthenticationFactory + .create(org.apache.pulsar.client.impl.auth.AuthenticationSasl.class.getName(), authParams); + + PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://my-broker.com:6650") + .authentication(saslAuth) + .build(); + ``` + +> The first two lines in the example above are hard coded, alternatively, you can set additional JVM parameters for JAAS and krb5 configuration file when you run the application like below: + +``` +java -cp -Djava.security.auth.login.config=/etc/pulsar/pulsar_jaas.conf -Djava.security.krb5.conf=/etc/pulsar/krb5.conf $APP-jar-with-dependencies.jar $CLASSNAME +``` + +### Kerberos configuration for Pulsar proxy service + +In the `proxy.conf` file, set Kerberos related configuration. Here is an example: +```shell +## related to authenticate client. +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarProxy + +## related to be authenticated by broker +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationSasl +brokerClientAuthenticationParameters={"saslJaasClientSectionName":"PulsarProxy", "serverType":"broker"} +forwardAuthorizationCredentials=true +``` + +The first part relates to authenticating between client and Pulsar Proxy. In this phase, client works as SASL client, while Pulsar Proxy works as SASL server. + +The second part relates to authenticating between Pulsar Proxy and Pulsar Broker. 
In this phase, Pulsar Proxy works as SASL client, while Pulsar Broker works as SASL server. + +### Broker side configuration. + +The broker side configuration file is the same with the above `broker.conf`, you do not need special configuration for Pulsar Proxy. + +``` +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderSasl +saslJaasClientAllowedIds=.*client.* +saslJaasBrokerSectionName=PulsarBroker +``` + +## Regarding authorization and role token + +For Kerberos authentication, we usually use the authenticated principal as the role token for Pulsar authorization. For more information of authorization in Pulsar, see [security authorization](security-authorization.md). + +If you enable 'authorizationEnabled', you need to set `superUserRoles` in `broker.conf` that corresponds to the name registered in kdc. + +For example: +```bash +superUserRoles=client/{clientIp}@EXAMPLE.COM +``` + +## Regarding authentication between ZooKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Zookeeper. According to [ZooKeeper document](https://cwiki.apache.org/confluence/display/ZOOKEEPER/Client-Server+mutual+authentication), you need these settings in `conf/zookeeper.conf`: + +``` +authProvider.1=org.apache.zookeeper.server.auth.SASLAuthenticationProvider +requireClientAuthScheme=sasl +``` + +Enter the following commands to add a section of `Client` configurations in the file `pulsar_jaas.conf`, which Pulsar Broker uses: + +``` + Client { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with ZooKeeper. 
+ +## Regarding authentication between BookKeeper and Broker + +Pulsar Broker acts as a Kerberos client when you authenticate with Bookie. According to [BookKeeper document](http://bookkeeper.apache.org/docs/latest/security/sasl/), you need to add `bookkeeperClientAuthenticationPlugin` parameter in `broker.conf`: + +``` +bookkeeperClientAuthenticationPlugin=org.apache.bookkeeper.sasl.SASLClientProviderFactory +``` + +In this setting, `SASLClientProviderFactory` creates a BookKeeper SASL client in a Broker, and the Broker uses the created SASL client to authenticate with a Bookie node. + +Enter the following commands to add a section of `BookKeeper` configurations in the `pulsar_jaas.conf` that Pulsar Broker uses: + +``` + BookKeeper { + com.sun.security.auth.module.Krb5LoginModule required + useKeyTab=true + storeKey=true + useTicketCache=false + keyTab="/etc/security/keytabs/pulsarbroker.keytab" + principal="broker/localhost@EXAMPLE.COM"; +}; +``` + +In this setting, the principal of Pulsar Broker and keyTab file indicates the role of Broker when you authenticate with Bookie. diff --git a/site2/website/versioned_docs/version-2.4.2/security-overview.md b/site2/website/versioned_docs/version-2.4.2/security-overview.md new file mode 100644 index 0000000000000..f623e50007356 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-overview.md @@ -0,0 +1,38 @@ +--- +id: version-2.4.2-security-overview +title: Pulsar Security Overview +sidebar_label: Overview +original_id: security-overview +--- + +As the central message bus for a business, Apache Pulsar is frequently used for storing mission-critical data. Therefore, enabling security features in Pulsar is crucial. + +By default, Pulsar configures no encryption, authentication, or authorization. Any client can communicate to Apache Pulsar via plain text service URLs. So we must ensure that Pulsar accessing via these plain text service URLs is restricted to trusted clients only. 
In such cases, you can use network segmentation and/or authorization ACLs to restrict access to trusted IPs. If you use neither, the cluster is wide open and anyone can access it. + +Pulsar supports a pluggable authentication mechanism. Pulsar clients use this mechanism to authenticate with brokers and proxies. You can also configure Pulsar to support multiple authentication sources. + +You should secure the service components in your Apache Pulsar deployment. + +## Role Tokens + +In Pulsar, a *role* is a string, like `admin` or `app1`, which can represent one or more clients. You can use roles to control permission for clients to produce or consume from certain topics, administer the configuration for tenants, and so on. + +Apache Pulsar uses the [Authentication Provider](#authentication-providers) to establish the identity of a client and then assign a *role token* to that client. This role token is then used for [Authorization and ACLs](security-authorization.md) to determine what the client is authorized to do.
+ +## Authentication Providers + +Currently Pulsar supports the following authentication providers: + +- [TLS Authentication](security-tls-authentication.md) +- [Athenz](security-athenz.md) +- [Kerberos](security-kerberos.md) +- JSON Web Token Authentication + +## Contents + +- [Encryption](security-tls-transport.md) and [Authentication](security-tls-authentication.md) using TLS +- [Authentication using Athenz](security-athenz.md) +- [Authentication using Kerberos](security-kerberos.md) +- [Authorization and ACLs](security-authorization.md) +- [End-to-End Encryption](security-encryption.md) + diff --git a/site2/website/versioned_docs/version-2.4.2/security-tls-authentication.md b/site2/website/versioned_docs/version-2.4.2/security-tls-authentication.md new file mode 100644 index 0000000000000..2372fd9cf6c48 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-tls-authentication.md @@ -0,0 +1,156 @@ +--- +id: version-2.4.2-security-tls-authentication +title: Authentication using TLS +sidebar_label: Authentication using TLS +original_id: security-tls-authentication +--- + +## TLS authentication overview + +TLS authentication is an extension of [TLS transport encryption](security-tls-transport.md). Not only servers have keys and certs that the client uses to verify the identity of servers, clients also have keys and certs that the server uses to verify the identity of clients. You must have TLS transport encryption configured on your cluster before you can use TLS authentication. This guide assumes you already have TLS transport encryption configured. + +### Create client certificates + +Client certificates are generated using the certificate authority. Server certificates are also generated with the same certificate authority. + +The biggest difference between client certs and server certs is that the **common name** for the client certificate is the **role token** which that client is authenticated as. 
+ +First, you need to enter the following command to generate the key: + +```bash +$ openssl genrsa -out admin.key.pem 2048 +``` + +Similar to the broker, the client expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so you need to convert it by entering the following command: + +```bash +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in admin.key.pem -out admin.key-pk8.pem -nocrypt +``` + +Next, enter the command below to generate the certificate request. When you are asked for a **common name**, enter the **role token** that you want this key pair to authenticate a client as. + +```bash +$ openssl req -config openssl.cnf \ + -key admin.key.pem -new -sha256 -out admin.csr.pem +``` +> Note +> If openssl.cnf is not specified, read [Certificate authority](http://pulsar.apache.org/docs/en/security-tls-transport/#certificate-authority) to get the openssl.cnf. + +Then, enter the command below to sign the request with the certificate authority. Note that the client cert uses the **usr_cert** extension, which allows the cert to be used for client authentication. + +```bash +$ openssl ca -config openssl.cnf -extensions usr_cert \ + -days 1000 -notext -md sha256 \ + -in admin.csr.pem -out admin.cert.pem +``` + +You can get a cert, `admin.cert.pem`, and a key, `admin.key-pk8.pem` from this command. With `ca.cert.pem`, clients can use this cert and this key to authenticate themselves to brokers and proxies as the role token ``admin``. + +> Note +> If the "unable to load CA private key" error occurs and the reason of this error is "No such file or directory: /etc/pki/CA/private/cakey.pem" in this step, try the command below: +> +> ```bash +> $ cd /etc/pki/tls/misc/CA +> $ ./CA -newca +> ``` +> +> to generate `cakey.pem`.
+ +## Enable TLS authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#broker-configuration): + +```properties +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# operations and publish/consume from all topics +superUserRoles=admin + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/my-ca/admin.cert.pem,tlsKeyFile:/path/my-ca/admin.key-pk8.pem +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem +``` + +## Enable TLS authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#proxy-configuration): + +The proxy should have its own client key pair for connecting to brokers. You need to configure the role token for this key pair in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. + +```properties +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/to/proxy.cert.pem,tlsKeyFile:/path/to/proxy.key-pk8.pem +``` + +## Client configuration + +When you use TLS authentication, client connects via TLS transport. 
You need to configure the client to use ```https://``` and port 8443 for the web service URL, ```pulsar+ssl://``` and port 6651 for the broker service URL. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use TLS authentication with the CLI tools of Pulsar: + +```properties +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +authParams=tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem +``` + +### Java client + +```java +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .authentication("org.apache.pulsar.client.impl.auth.AuthenticationTls", + "tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem") + .build(); +``` + +### Python client + +```python +from pulsar import Client, AuthenticationTLS + +auth = AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) +``` + +### C++ client + +```c++ +#include <pulsar/Client.h> + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); + +pulsar::AuthenticationPtr auth =
pulsar::AuthTls::create("/path/to/my-role.cert.pem", + "/path/to/my-role.key-pk8.pem"); +config.setAuth(auth); + +pulsar::Client client("pulsar+ssl://broker.example.com:6651/", config); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/security-tls-transport.md b/site2/website/versioned_docs/version-2.4.2/security-tls-transport.md new file mode 100644 index 0000000000000..210b08cc7e8c4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-tls-transport.md @@ -0,0 +1,230 @@ +--- +id: version-2.4.2-security-tls-transport +title: Transport Encryption using TLS +sidebar_label: Transport Encryption using TLS +original_id: security-tls-transport +--- + +## TLS overview + +By default, Apache Pulsar clients communicate with the Apache Pulsar service in plain text. This means that all data is sent in the clear. You can use TLS to encrypt this traffic to protect the traffic from the snooping of a man-in-the-middle attacker. + +You can also configure TLS for both encryption and authentication. Use this guide to configure just TLS transport encryption and refer to [here](security-tls-authentication.md) for TLS authentication configuration. Alternatively, you can use [another authentication mechanism](security-athenz.md) on top of TLS transport encryption. + +> Note that enabling TLS may impact the performance due to encryption overhead. + +## TLS concepts + +TLS is a form of [public key cryptography](https://en.wikipedia.org/wiki/Public-key_cryptography). Using key pairs consisting of a public key and a private key can perform the encryption. The public key encrypts the messages and the private key decrypts the messages. + +To use TLS transport encryption, you need two kinds of key pairs, **server key pairs** and a **certificate authority**. + +You can use a third kind of key pair, **client key pairs**, for [client authentication](security-tls-authentication.md).
+ +You should store the **certificate authority** private key in a very secure location (a fully encrypted, disconnected, air gapped computer). As for the certificate authority public key, the **trust cert**, you can freely share it. + +For both client and server key pairs, the administrator first generates a private key and a certificate request, then uses the certificate authority private key to sign the certificate request, and finally generates a certificate. This certificate is the public key for the server/client key pair. + +For TLS transport encryption, the clients can use the **trust cert** to verify that the server has a key pair that the certificate authority signed when the clients are talking to the server. A man-in-the-middle attacker does not have access to the certificate authority, so they couldn't create a server with such a key pair. + +For TLS authentication, the server uses the **trust cert** to verify that the client has a key pair that the certificate authority signed. The common name of the **client cert** is then used as the client's role token (see [Overview](security-overview.md)). + +## Create TLS certificates + +Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), [server certificate](#server-certificate), and [client certificate](#client-certificate). + +Follow the guide below to set up a certificate authority. You can also refer to plenty of resources on the internet for more details. We recommend [this guide](https://jamielinux.com/docs/openssl-certificate-authority/index.html) for your detailed reference. + +### Certificate authority + +1. Create the certificate for the CA. You can use the CA to sign both the broker and client certificates. This ensures that each party will trust the others. You should store the CA in a very secure location (ideally completely disconnected from networks, air gapped, and fully encrypted). + +2.
Enter the following command to create a directory for your CA, and place [this openssl configuration file](https://github.com/apache/pulsar/tree/master/site2/website/static/examples/openssl.cnf) in the directory. You may want to modify the default answers for company name and department in the configuration file. Export the location of the CA directory to the environment variable, CA_HOME. The configuration file uses this environment variable to find the rest of the files and directories that the CA needs. + +```bash +$ mkdir my-ca +$ cd my-ca +$ wget https://raw.githubusercontent.com/apache/pulsar/master/site2/website/static/examples/openssl.cnf +$ export CA_HOME=$(pwd) +``` + +3. Enter the commands below to create the necessary directories, keys and certs. + +```bash +$ mkdir certs crl newcerts private +$ chmod 700 private/ +$ touch index.txt +$ echo 1000 > serial +$ openssl genrsa -aes256 -out private/ca.key.pem 4096 +$ chmod 400 private/ca.key.pem +$ openssl req -config openssl.cnf -key private/ca.key.pem \ + -new -x509 -days 7300 -sha256 -extensions v3_ca \ + -out certs/ca.cert.pem +$ chmod 444 certs/ca.cert.pem +``` + +4. After you answer the question prompts, CA-related files are stored in the `./my-ca` directory. Within that directory: + +* `certs/ca.cert.pem` is the public certificate. This public certificate is meant to be distributed to all parties involved. +* `private/ca.key.pem` is the private key. You only need it when you are signing a new certificate for either broker or clients and you must safely guard this private key. + +### Server certificate + +Once you have created a CA certificate, you can create certificate requests and sign them with the CA. + +The following commands ask you a few questions and then create the certificates. When you are asked for the common name, you should match the hostname of the broker. You can also use a wildcard to match a group of broker hostnames, for example, `*.broker.usw.example.com`.
This ensures that multiple machines can reuse the same certificate. + +> #### Tips +> +> Sometimes matching the hostname is not possible or makes no sense, +> such as when you create the brokers with random hostnames, or you +> plan to connect to the hosts via their IP. In these cases, you +> should configure the client to disable TLS hostname verification. For more +> details, you can see [the host verification section in client configuration](#hostname-verification). + +1. Enter the command below to generate the key. + +```bash +$ openssl genrsa -out broker.key.pem 2048 +``` + +The broker expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so enter the following command to convert it. + +```bash +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in broker.key.pem -out broker.key-pk8.pem -nocrypt +``` + +2. Enter the following command to generate the certificate request. + +```bash +$ openssl req -config openssl.cnf \ + -key broker.key.pem -new -sha256 -out broker.csr.pem +``` + +3. Sign it with the certificate authority by entering the command below. + +```bash +$ openssl ca -config openssl.cnf -extensions server_cert \ + -days 1000 -notext -md sha256 \ + -in broker.csr.pem -out broker.cert.pem +``` + +At this point, you have a cert, `broker.cert.pem`, and a key, `broker.key-pk8.pem`, which you can use along with `ca.cert.pem` to configure TLS transport encryption for your broker and proxy nodes. + +## Broker Configuration + +To configure a Pulsar [broker](reference-terminology.md#broker) to use TLS transport encryption, you need to make some changes to `broker.conf`, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md).
+ +Add these values to the configuration file (substituting the appropriate certificate paths where necessary): + +```properties +tlsEnabled=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem +``` + +> You can find a full list of parameters available in the `conf/broker.conf` file, +> as well as the default values for those parameters, in [Broker Configuration](reference-configuration.md#broker) +> +### TLS Protocol Version and Cipher + +You can configure the broker (and proxy) to require specific TLS protocol versions and ciphers for TLS negotiation. You can use the TLS protocol versions and ciphers to stop clients from requesting downgraded TLS protocol versions or ciphers that may have weaknesses. + +Both the TLS protocol versions and cipher properties can take multiple values, separated by commas. The possible values for protocol version and ciphers depend on the TLS provider that you are using. Pulsar uses OpenSSL if the OpenSSL is available, but if the OpenSSL is not available, Pulsar falls back to the JDK implementation. + +```properties +tlsProtocols=TLSv1.2,TLSv1.1 +tlsCiphers=TLS_DH_RSA_WITH_AES_256_GCM_SHA384,TLS_DH_RSA_WITH_AES_256_CBC_SHA +``` + +OpenSSL currently supports ```SSL2```, ```SSL3```, ```TLSv1```, ```TLSv1.1``` and ```TLSv1.2``` for the protocol version. You can acquire a list of supported ciphers from the openssl ciphers command, i.e. ```openssl ciphers -tls_v2```. + +For JDK 8, you can obtain a list of supported values from the documentation: +- [TLS protocol](https://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#SSLContext) +- [Ciphers](https://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#ciphersuites) + +## Proxy Configuration + +Proxies need to configure TLS in two directions, for clients connecting to the proxy, and for the proxy connecting to brokers.
+ +```properties +# For clients connecting to the proxy +tlsEnabledInProxy=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +# For the proxy to connect to brokers +tlsEnabledWithBroker=true +brokerClientTrustCertsFilePath=/path/to/ca.cert.pem +``` + +## Client configuration + +When you enable the TLS transport encryption, you need to configure the client to use ```https://``` and port 8443 for the web service URL, and ```pulsar+ssl://``` and port 6651 for the broker service URL. + +As the server certificate that you generated above does not belong to any of the default trust chains, you also need to either specify the path to the **trust cert** (recommended), or tell the client to allow untrusted server certs. + +#### Hostname verification + +Hostname verification is a TLS security feature whereby a client can refuse to connect to a server if the "CommonName" does not match the hostname to which the client is connecting. By default, Pulsar clients disable hostname verification, as it requires that each broker has a DNS record and a unique cert. + +Moreover, as the administrator has full control of the certificate authority, a bad actor is unlikely to be able to pull off a man-in-the-middle attack. "allowInsecureConnection" allows the client to connect to servers whose cert has not been signed by an approved CA. The client disables "allowInsecureConnection" by default, and you should always disable "allowInsecureConnection" in production environments. As long as you disable "allowInsecureConnection", a man-in-the-middle attack requires that the attacker has access to the CA. + +One scenario where you may want to enable hostname verification is where you have multiple proxy nodes behind a VIP, and the VIP has a DNS record, for example, pulsar.mycompany.com.
In this case, you can generate a TLS cert with pulsar.mycompany.com as the "CommonName," and then enable hostname verification on the client. + +The examples below show hostname verification being disabled for the Java client, though you can omit this as the client disables the hostname verification by default. C++/python clients do not allow configuring this at the moment. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-cli-tools#pulsar-admin), [`pulsar-perf`](reference-cli-tools#pulsar-perf), and [`pulsar-client`](reference-cli-tools#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use TLS transport with the CLI tools of Pulsar: + +```properties +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +tlsEnableHostnameVerification=false +``` + +### Java client + +```java +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .enableTlsHostnameVerification(false) // false by default, in any case + .allowTlsInsecureConnection(false) // false by default, in any case + .build(); +``` + +### Python client + +```python +from pulsar import Client + +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False) # defaults to false from v2.2.0 onwards +``` + +### C++ client + +```c++ +#include <pulsar/Client.h> + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); // defaults to false from v2.2.0 onwards + +pulsar::Client
client("pulsar+ssl://broker.example.com:6651/", config); +``` diff --git a/site2/website/versioned_docs/version-2.4.2/security-token-client.md b/site2/website/versioned_docs/version-2.4.2/security-token-client.md new file mode 100644 index 0000000000000..3ccabe4306d14 --- /dev/null +++ b/site2/website/versioned_docs/version-2.4.2/security-token-client.md @@ -0,0 +1,123 @@ +--- +id: version-2.4.2-security-token-client +title: Client Authentication using tokens +sidebar_label: Client Authentication using tokens +original_id: security-token-client +--- + +## Token Authentication Overview + +Pulsar supports authenticating clients using security tokens that are based on +[JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +You can use tokens to identify a Pulsar client and associate with some "principal" (or "role") that +is permitted to do some actions (for example, publish messages to a topic or consume messages from a topic). + +The administrator (or some automated service) typically gives a user a token string. + +The compact representation of a signed JWT is a string that looks like the following: + +``` +eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY +``` + +The application specifies the token when you are creating the client instance. An alternative is to pass a "token supplier" (a function that returns the token when the client library needs one). + +See [Token authentication admin](security-token-admin.md) for a reference on how to enable token +authentication on a Pulsar cluster. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation.
+ +You need to add the following parameters to that file to use the token authentication with CLI tools of Pulsar: + +```properties +webServiceUrl=http://broker.example.com:8080/ +brokerServiceUrl=pulsar://broker.example.com:6650/ +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +authParams=token:eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY +``` + +The token string can also be read from a file, e.g.: + +``` +authParams=file:///path/to/token/file +``` + +### Java client + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")) + .build(); +``` + +Similarly, one can also pass a `Supplier`: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://broker.example.com:6650/") + .authentication( + AuthenticationFactory.token(() -> { + // Read token from custom source + return readToken(); + })) + .build(); +``` + +### Python client + +```python +from pulsar import Client, AuthenticationToken + +client = Client('pulsar://broker.example.com:6650/', + authentication=AuthenticationToken('eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY')) +``` + +Alternatively, with a supplier: + +```python + +def read_token(): + with open('/path/to/token.txt') as tf: + return tf.read().strip() + +client = Client('pulsar://broker.example.com:6650/', + authentication=AuthenticationToken(read_token)) +``` + +### Go client + + +```go +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication: NewAuthenticationToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY"), +}) +``` + +Alternatively, with a supplier: + +```go +client, err := NewClient(ClientOptions{ + URL: "pulsar://localhost:6650", + Authentication:
NewAuthenticationTokenSupplier(func () string { + // Read token from custom source + return readToken() + }), +}) +``` + +### C++ client + +```c++ +#include <pulsar/Client.h> + +pulsar::ClientConfiguration config; +config.setAuth(pulsar::AuthToken::createWithToken("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY")); + +pulsar::Client client("pulsar://broker.example.com:6650/", config); +``` diff --git a/site2/website/versioned_sidebars/version-2.4.2-sidebars.json b/site2/website/versioned_sidebars/version-2.4.2-sidebars.json new file mode 100644 index 0000000000000..b71224d4ab2af --- /dev/null +++ b/site2/website/versioned_sidebars/version-2.4.2-sidebars.json @@ -0,0 +1,142 @@ +{ + "version-2.4.2-docs": { + "Get started": [ + "version-2.4.2-pulsar-2.0", + "version-2.4.2-standalone", + "version-2.4.2-standalone-docker", + "version-2.4.2-client-libraries" + ], + "Concepts and Architecture": [ + "version-2.4.2-concepts-overview", + "version-2.4.2-concepts-messaging", + "version-2.4.2-concepts-architecture-overview", + "version-2.4.2-concepts-clients", + "version-2.4.2-concepts-replication", + "version-2.4.2-concepts-multi-tenancy", + "version-2.4.2-concepts-authentication", + "version-2.4.2-concepts-topic-compaction", + "version-2.4.2-concepts-tiered-storage", + "version-2.4.2-concepts-schema-registry" + ], + "Pulsar Schema": [ + "version-2.4.2-schema-get-started", + "version-2.4.2-schema-understand", + "version-2.4.2-schema-evolution-compatibility", + "version-2.4.2-schema-manage" + ], + "Pulsar Functions": [ + "version-2.4.2-functions-overview", + "version-2.4.2-functions-worker", + "version-2.4.2-functions-runtime", + "version-2.4.2-functions-develop", + "version-2.4.2-functions-debug", + "version-2.4.2-functions-deploy", + "version-2.4.2-functions-configure", + "version-2.4.2-functions-monitor", + "version-2.4.2-functions-secure", + "version-2.4.2-functions-troubleshoot", + "version-2.4.2-functions-cli" + ], + "Pulsar IO": [ +
"version-2.4.2-io-overview", + "version-2.4.2-io-quickstart", + "version-2.4.2-io-use", + "version-2.4.2-io-managing", + "version-2.4.2-io-debug", + "version-2.4.2-io-connectors", + "version-2.4.2-io-develop", + "version-2.4.2-io-cdc" + ], + "Pulsar SQL": [ + "version-2.4.2-sql-overview", + "version-2.4.2-sql-getting-started", + "version-2.4.2-sql-deployment-configurations" + ], + "Deployment": [ + "version-2.4.2-deploy-aws", + "version-2.4.2-deploy-kubernetes", + "version-2.4.2-deploy-bare-metal", + "version-2.4.2-deploy-bare-metal-multi-cluster", + "version-2.4.2-deploy-dcos", + "version-2.4.2-deploy-monitoring" + ], + "Administration": [ + "version-2.4.2-administration-zk-bk", + "version-2.4.2-administration-geo", + "version-2.4.2-administration-dashboard", + "version-2.4.2-administration-stats", + "version-2.4.2-administration-load-balance", + "version-2.4.2-administration-proxy", + "version-2.4.2-administration-upgrade" + ], + "Security": [ + "version-2.4.2-security-overview", + "version-2.4.2-security-tls-transport", + "version-2.4.2-security-tls-authentication", + "version-2.4.2-security-token-client", + "version-2.4.2-security-token-admin", + "version-2.4.2-security-athenz", + "version-2.4.2-security-kerberos", + "version-2.4.2-security-authorization", + "version-2.4.2-security-encryption", + "version-2.4.2-security-extending" + ], + "Client libraries": [ + "version-2.4.2-client-libraries-java", + "version-2.4.2-client-libraries-go", + "version-2.4.2-client-libraries-python", + "version-2.4.2-client-libraries-cpp", + "version-2.4.2-client-libraries-websocket" + ], + "Admin API": [ + "version-2.4.2-admin-api-overview", + "version-2.4.2-admin-api-clusters", + "version-2.4.2-admin-api-tenants", + "version-2.4.2-admin-api-brokers", + "version-2.4.2-admin-api-namespaces", + "version-2.4.2-admin-api-permissions", + "version-2.4.2-admin-api-persistent-topics", + "version-2.4.2-admin-api-non-persistent-topics", + "version-2.4.2-admin-api-partitioned-topics", + 
"version-2.4.2-admin-api-schemas", + "version-2.4.2-admin-api-functions" + ], + "Adaptors": [ + "version-2.4.2-adaptors-kafka", + "version-2.4.2-adaptors-spark", + "version-2.4.2-adaptors-storm" + ], + "Cookbooks": [ + "version-2.4.2-cookbooks-tiered-storage", + "version-2.4.2-cookbooks-compaction", + "version-2.4.2-cookbooks-deduplication", + "version-2.4.2-cookbooks-non-persistent", + "version-2.4.2-cookbooks-partitioned", + "version-2.4.2-cookbooks-retention-expiry", + "version-2.4.2-cookbooks-encryption", + "version-2.4.2-cookbooks-message-queue", + "version-2.4.2-cookbooks-bookkeepermetadata" + ], + "Development": [ + "version-2.4.2-develop-tools", + "version-2.4.2-develop-binary-protocol", + "version-2.4.2-develop-schema", + "version-2.4.2-develop-load-manager", + "version-2.4.2-develop-cpp" + ], + "Reference": [ + "version-2.4.2-reference-terminology", + "version-2.4.2-reference-cli-tools", + "version-2.4.2-pulsar-admin", + "version-2.4.2-reference-connector-admin", + "version-2.4.2-reference-configuration", + "version-2.4.2-reference-metrics" + ] + }, + "version-2.4.2-docs-other": { + "First Category": [ + "version-2.4.2-doc4", + "version-2.4.2-doc5" + ] + } +} diff --git a/site2/website/versions.json b/site2/website/versions.json index c9fa92c002886..385b06c37a49d 100644 --- a/site2/website/versions.json +++ b/site2/website/versions.json @@ -1,5 +1,6 @@ [ "2.4.1", + "2.4.2", "2.4.0", "2.3.2", "2.3.1",