From a75cd7d27bffec02a9bf7982c5d673a5c29e8324 Mon Sep 17 00:00:00 2001 From: Sijie Guo Date: Fri, 17 Jan 2020 17:35:32 +0800 Subject: [PATCH] [website] Update website to include release 2.5.0 (#6075) --- site2/website/releases.json | 1 + .../version-2.5.0/adaptors-kafka.md | 264 ++ .../admin-api-non-partitioned-topics.md | 104 + .../admin-api-partitioned-topics.md | 376 +++ .../admin-api-persistent-topics.md | 661 +++++ .../version-2.5.0/admin-api-schemas.md | 7 + .../version-2.5.0/administration-dashboard.md | 63 + .../version-2.5.0/administration-geo.md | 158 ++ .../administration-load-balance.md | 182 ++ .../version-2.5.0/administration-proxy.md | 105 + .../administration-pulsar-manager.md | 133 + .../version-2.5.0/administration-stats.md | 64 + .../version-2.5.0/administration-upgrade.md | 151 ++ .../version-2.5.0/administration-zk-bk.md | 322 +++ .../version-2.5.0/client-libraries-cpp.md | 178 ++ .../version-2.5.0/client-libraries-go.md | 493 ++++ .../version-2.5.0/client-libraries-java.md | 825 ++++++ .../version-2.5.0/client-libraries-node.md | 402 +++ .../version-2.5.0/client-libraries-python.md | 249 ++ .../version-2.5.0/concepts-clients.md | 82 + .../version-2.5.0/concepts-messaging.md | 445 ++++ .../version-2.5.0/concepts-overview.md | 31 + .../version-2.5.0/concepts-tiered-storage.md | 18 + .../version-2.5.0/cookbooks-deduplication.md | 121 + .../cookbooks-retention-expiry.md | 291 +++ .../version-2.5.0/cookbooks-tiered-storage.md | 296 +++ .../version-2.5.0/deploy-aws.md | 224 ++ .../deploy-bare-metal-multi-cluster.md | 426 ++++ .../version-2.5.0/deploy-bare-metal.md | 461 ++++ .../version-2.5.0/deploy-dcos.md | 183 ++ .../version-2.5.0/deploy-kubernetes.md | 394 +++ .../version-2.5.0/deploy-monitoring.md | 90 + .../version-2.5.0/functions-cli.md | 198 ++ .../version-2.5.0/functions-debug.md | 455 ++++ .../version-2.5.0/functions-develop.md | 983 ++++++++ .../version-2.5.0/functions-metrics.md | 7 + .../version-2.5.0/functions-overview.md | 200 ++ 
.../version-2.5.0/functions-runtime.md | 173 ++ .../version-2.5.0/functions-worker.md | 242 ++ .../version-2.5.0/getting-started-clients.md | 59 + .../version-2.5.0/getting-started-docker.md | 161 ++ .../getting-started-standalone.md | 226 ++ .../version-2.5.0/io-aerospike-sink.md | 26 + .../version-2.5.0/io-canal-source.md | 203 ++ .../version-2.5.0/io-cassandra-sink.md | 54 + .../version-2.5.0/io-cdc-debezium.md | 475 ++++ .../versioned_docs/version-2.5.0/io-cdc.md | 26 + .../versioned_docs/version-2.5.0/io-cli.md | 601 +++++ .../version-2.5.0/io-connectors.md | 189 ++ .../version-2.5.0/io-debezium-source.md | 350 +++ .../version-2.5.0/io-develop.md | 230 ++ .../version-2.5.0/io-elasticsearch-sink.md | 50 + .../version-2.5.0/io-file-source.md | 138 + .../version-2.5.0/io-flume-sink.md | 52 + .../version-2.5.0/io-flume-source.md | 52 + .../version-2.5.0/io-hbase-sink.md | 64 + .../version-2.5.0/io-hdfs2-sink.md | 54 + .../version-2.5.0/io-hdfs3-sink.md | 54 + .../version-2.5.0/io-influxdb-sink.md | 62 + .../version-2.5.0/io-jdbc-sink.md | 57 + .../version-2.5.0/io-kafka-sink.md | 69 + .../version-2.5.0/io-kafka-source.md | 171 ++ .../version-2.5.0/io-kinesis-sink.md | 73 + .../version-2.5.0/io-kinesis-source.md | 77 + .../version-2.5.0/io-mongo-sink.md | 52 + .../version-2.5.0/io-netty-source.md | 205 ++ .../version-2.5.0/io-overview.md | 136 + .../version-2.5.0/io-quickstart.md | 824 ++++++ .../version-2.5.0/io-rabbitmq-sink.md | 81 + .../version-2.5.0/io-rabbitmq-source.md | 78 + .../version-2.5.0/io-redis-sink.md | 70 + .../version-2.5.0/io-solr-sink.md | 61 + .../version-2.5.0/io-twitter-source.md | 28 + .../version-2.5.0/io-twitter.md | 7 + .../version-2.5.0/reference-cli-tools.md | 731 ++++++ .../version-2.5.0/reference-configuration.md | 500 ++++ .../reference-connector-admin.md | 7 + .../version-2.5.0/reference-metrics.md | 246 ++ .../version-2.5.0/reference-pulsar-admin.md | 2213 +++++++++++++++++ .../schema-evolution-compatibility.md | 953 +++++++ 
.../version-2.5.0/schema-get-started.md | 91 + .../version-2.5.0/schema-manage.md | 809 ++++++ .../version-2.5.0/schema-understand.md | 592 +++++ .../version-2.5.0/security-encryption.md | 176 ++ .../version-2.5.0/security-extending.md | 194 ++ .../version-2.5.0/security-overview.md | 31 + .../security-tls-authentication.md | 175 ++ .../version-2.5.0/security-tls-transport.md | 243 ++ .../version-2.5.0/security-token-admin.md | 159 ++ .../sql-deployment-configurations.md | 156 ++ .../version-2.5.0/sql-getting-started.md | 144 ++ .../version-2.5.0/sql-overview.md | 18 + .../version-2.5.0/sql-rest-api.md | 186 ++ .../version-2.5.0-sidebars.json | 145 ++ site2/website/versions.json | 1 + 95 files changed, 22943 insertions(+) create mode 100644 site2/website/versioned_docs/version-2.5.0/adaptors-kafka.md create mode 100644 site2/website/versioned_docs/version-2.5.0/admin-api-non-partitioned-topics.md create mode 100644 site2/website/versioned_docs/version-2.5.0/admin-api-partitioned-topics.md create mode 100644 site2/website/versioned_docs/version-2.5.0/admin-api-persistent-topics.md create mode 100644 site2/website/versioned_docs/version-2.5.0/admin-api-schemas.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-dashboard.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-geo.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-load-balance.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-proxy.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-pulsar-manager.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-stats.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-upgrade.md create mode 100644 site2/website/versioned_docs/version-2.5.0/administration-zk-bk.md create mode 100644 site2/website/versioned_docs/version-2.5.0/client-libraries-cpp.md create mode 100644 
site2/website/versioned_docs/version-2.5.0/client-libraries-go.md create mode 100644 site2/website/versioned_docs/version-2.5.0/client-libraries-java.md create mode 100644 site2/website/versioned_docs/version-2.5.0/client-libraries-node.md create mode 100644 site2/website/versioned_docs/version-2.5.0/client-libraries-python.md create mode 100644 site2/website/versioned_docs/version-2.5.0/concepts-clients.md create mode 100644 site2/website/versioned_docs/version-2.5.0/concepts-messaging.md create mode 100644 site2/website/versioned_docs/version-2.5.0/concepts-overview.md create mode 100644 site2/website/versioned_docs/version-2.5.0/concepts-tiered-storage.md create mode 100644 site2/website/versioned_docs/version-2.5.0/cookbooks-deduplication.md create mode 100644 site2/website/versioned_docs/version-2.5.0/cookbooks-retention-expiry.md create mode 100644 site2/website/versioned_docs/version-2.5.0/cookbooks-tiered-storage.md create mode 100644 site2/website/versioned_docs/version-2.5.0/deploy-aws.md create mode 100644 site2/website/versioned_docs/version-2.5.0/deploy-bare-metal-multi-cluster.md create mode 100644 site2/website/versioned_docs/version-2.5.0/deploy-bare-metal.md create mode 100644 site2/website/versioned_docs/version-2.5.0/deploy-dcos.md create mode 100644 site2/website/versioned_docs/version-2.5.0/deploy-kubernetes.md create mode 100644 site2/website/versioned_docs/version-2.5.0/deploy-monitoring.md create mode 100644 site2/website/versioned_docs/version-2.5.0/functions-cli.md create mode 100644 site2/website/versioned_docs/version-2.5.0/functions-debug.md create mode 100644 site2/website/versioned_docs/version-2.5.0/functions-develop.md create mode 100644 site2/website/versioned_docs/version-2.5.0/functions-metrics.md create mode 100644 site2/website/versioned_docs/version-2.5.0/functions-overview.md create mode 100644 site2/website/versioned_docs/version-2.5.0/functions-runtime.md create mode 100644 
site2/website/versioned_docs/version-2.5.0/functions-worker.md create mode 100644 site2/website/versioned_docs/version-2.5.0/getting-started-clients.md create mode 100644 site2/website/versioned_docs/version-2.5.0/getting-started-docker.md create mode 100644 site2/website/versioned_docs/version-2.5.0/getting-started-standalone.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-aerospike-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-canal-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-cassandra-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-cdc-debezium.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-cdc.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-cli.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-connectors.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-debezium-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-develop.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-elasticsearch-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-file-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-flume-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-flume-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-hbase-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-hdfs2-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-hdfs3-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-influxdb-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-jdbc-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-kafka-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-kafka-source.md create mode 100644 
site2/website/versioned_docs/version-2.5.0/io-kinesis-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-kinesis-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-mongo-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-netty-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-overview.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-quickstart.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-rabbitmq-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-rabbitmq-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-redis-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-solr-sink.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-twitter-source.md create mode 100644 site2/website/versioned_docs/version-2.5.0/io-twitter.md create mode 100644 site2/website/versioned_docs/version-2.5.0/reference-cli-tools.md create mode 100644 site2/website/versioned_docs/version-2.5.0/reference-configuration.md create mode 100644 site2/website/versioned_docs/version-2.5.0/reference-connector-admin.md create mode 100644 site2/website/versioned_docs/version-2.5.0/reference-metrics.md create mode 100644 site2/website/versioned_docs/version-2.5.0/reference-pulsar-admin.md create mode 100644 site2/website/versioned_docs/version-2.5.0/schema-evolution-compatibility.md create mode 100644 site2/website/versioned_docs/version-2.5.0/schema-get-started.md create mode 100644 site2/website/versioned_docs/version-2.5.0/schema-manage.md create mode 100644 site2/website/versioned_docs/version-2.5.0/schema-understand.md create mode 100644 site2/website/versioned_docs/version-2.5.0/security-encryption.md create mode 100644 site2/website/versioned_docs/version-2.5.0/security-extending.md create mode 100644 site2/website/versioned_docs/version-2.5.0/security-overview.md create mode 100644 
site2/website/versioned_docs/version-2.5.0/security-tls-authentication.md create mode 100644 site2/website/versioned_docs/version-2.5.0/security-tls-transport.md create mode 100644 site2/website/versioned_docs/version-2.5.0/security-token-admin.md create mode 100644 site2/website/versioned_docs/version-2.5.0/sql-deployment-configurations.md create mode 100644 site2/website/versioned_docs/version-2.5.0/sql-getting-started.md create mode 100644 site2/website/versioned_docs/version-2.5.0/sql-overview.md create mode 100644 site2/website/versioned_docs/version-2.5.0/sql-rest-api.md create mode 100644 site2/website/versioned_sidebars/version-2.5.0-sidebars.json diff --git a/site2/website/releases.json b/site2/website/releases.json index 0f4da24423a69..01d2789fc80cc 100644 --- a/site2/website/releases.json +++ b/site2/website/releases.json @@ -1,5 +1,6 @@ [ "2.4.2", + "2.5.0", "2.4.1", "2.4.0", "2.3.2", diff --git a/site2/website/versioned_docs/version-2.5.0/adaptors-kafka.md b/site2/website/versioned_docs/version-2.5.0/adaptors-kafka.md new file mode 100644 index 0000000000000..753a011d7e422 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/adaptors-kafka.md @@ -0,0 +1,264 @@ +--- +id: version-2.5.0-adaptors-kafka +title: Pulsar adaptor for Apache Kafka +sidebar_label: Kafka client wrapper +original_id: adaptors-kafka +--- + + +Pulsar provides an easy option for applications that are currently written using the [Apache Kafka](http://kafka.apache.org) Java client API. + +## Using the Pulsar Kafka compatibility wrapper + +In an existing application, change the regular Kafka client dependency and replace it with the Pulsar Kafka wrapper. Remove the following dependency in `pom.xml`: + +```xml + + org.apache.kafka + kafka-clients + 0.10.2.1 + +``` + +Then include this dependency for the Pulsar Kafka wrapper: + +```xml + + org.apache.pulsar + pulsar-client-kafka + {{pulsar:version}} + +``` + +With the new dependency, the existing code works without any changes. 
You need to adjust the configuration so that it points the +producers and consumers to the Pulsar service rather than Kafka, and uses a particular +Pulsar topic. + +## Using the Pulsar Kafka compatibility wrapper together with existing Kafka client + +When migrating from Kafka to Pulsar, the application might use the original Kafka client +and the Pulsar Kafka wrapper together during migration. You should consider using the +unshaded Pulsar Kafka client wrapper. + +```xml +<dependency> + <groupId>org.apache.pulsar</groupId> + <artifactId>pulsar-client-kafka-original</artifactId> + <version>{{pulsar:version}}</version> +</dependency> +``` + +When using this dependency, construct producers using `org.apache.kafka.clients.producer.PulsarKafkaProducer` +instead of `org.apache.kafka.clients.producer.KafkaProducer`, and consumers using `org.apache.kafka.clients.consumer.PulsarKafkaConsumer` instead of `org.apache.kafka.clients.consumer.KafkaConsumer`. + +## Producer example + +```java +// Topic needs to be a regular Pulsar topic +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); + +props.put("key.serializer", IntegerSerializer.class.getName()); +props.put("value.serializer", StringSerializer.class.getName()); + +Producer<Integer, String> producer = new KafkaProducer<>(props); + +for (int i = 0; i < 10; i++) { + producer.send(new ProducerRecord<Integer, String>(topic, i, "hello-" + i)); + log.info("Message {} sent successfully", i); +} + +producer.close(); +``` + +## Consumer example + +```java +String topic = "persistent://public/default/my-topic"; + +Properties props = new Properties(); +// Point to a Pulsar service +props.put("bootstrap.servers", "pulsar://localhost:6650"); +props.put("group.id", "my-subscription-name"); +props.put("enable.auto.commit", "false"); +props.put("key.deserializer", IntegerDeserializer.class.getName()); +props.put("value.deserializer", StringDeserializer.class.getName()); + +Consumer<Integer, String> consumer = new KafkaConsumer<>(props); +consumer.subscribe(Arrays.asList(topic)); + +while (true) { +
ConsumerRecords records = consumer.poll(100); + records.forEach(record -> { + log.info("Received record: {}", record); + }); + + // Commit last offset + consumer.commitSync(); +} +``` + +## Complete Examples + +You can find the complete producer and consumer examples +[here](https://github.com/apache/pulsar/tree/master/pulsar-client-kafka-compat/pulsar-client-kafka-tests/src/test/java/org/apache/pulsar/client/kafka/compat/examples). + +## Compatibility matrix + +Currently the Pulsar Kafka wrapper supports most of the operations offered by the Kafka API. + +#### Producer + +APIs: + +| Producer Method | Supported | Notes | +|:------------------------------------------------------------------------------|:----------|:-------------------------------------------------------------------------| +| `Future send(ProducerRecord record)` | Yes | | +| `Future send(ProducerRecord record, Callback callback)` | Yes | | +| `void flush()` | Yes | | +| `List partitionsFor(String topic)` | No | | +| `Map metrics()` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | + +Properties: + +| Config property | Supported | Notes | +|:----------------------------------------|:----------|:------------------------------------------------------------------------------| +| `acks` | Ignored | Durability and quorum writes are configured at the namespace level | +| `auto.offset.reset` | Yes | Will have a default value of `latest` if user does not give specific setting. | +| `batch.size` | Ignored | | +| `bootstrap.servers` | Yes | | +| `buffer.memory` | Ignored | | +| `client.id` | Ignored | | +| `compression.type` | Yes | Allows `gzip` and `lz4`. No `snappy`. 
| +| `connections.max.idle.ms` | Yes | Only support up to 2,147,483,647,000(Integer.MAX_VALUE * 1000) ms of idle time| +| `interceptor.classes` | Yes | | +| `key.serializer` | Yes | | +| `linger.ms` | Yes | Controls the group commit time when batching messages | +| `max.block.ms` | Ignored | | +| `max.in.flight.requests.per.connection` | Ignored | In Pulsar ordering is maintained even with multiple requests in flight | +| `max.request.size` | Ignored | | +| `metric.reporters` | Ignored | | +| `metrics.num.samples` | Ignored | | +| `metrics.sample.window.ms` | Ignored | | +| `partitioner.class` | Yes | | +| `receive.buffer.bytes` | Ignored | | +| `reconnect.backoff.ms` | Ignored | | +| `request.timeout.ms` | Ignored | | +| `retries` | Ignored | Pulsar client retries with exponential backoff until the send timeout expires. | +| `send.buffer.bytes` | Ignored | | +| `timeout.ms` | Yes | | +| `value.serializer` | Yes | | + + +#### Consumer + +The following table lists consumer APIs. + +| Consumer Method | Supported | Notes | +|:--------------------------------------------------------------------------------------------------------|:----------|:------| +| `Set assignment()` | No | | +| `Set subscription()` | Yes | | +| `void subscribe(Collection topics)` | Yes | | +| `void subscribe(Collection topics, ConsumerRebalanceListener callback)` | No | | +| `void assign(Collection partitions)` | No | | +| `void subscribe(Pattern pattern, ConsumerRebalanceListener callback)` | No | | +| `void unsubscribe()` | Yes | | +| `ConsumerRecords poll(long timeoutMillis)` | Yes | | +| `void commitSync()` | Yes | | +| `void commitSync(Map offsets)` | Yes | | +| `void commitAsync()` | Yes | | +| `void commitAsync(OffsetCommitCallback callback)` | Yes | | +| `void commitAsync(Map offsets, OffsetCommitCallback callback)` | Yes | | +| `void seek(TopicPartition partition, long offset)` | Yes | | +| `void seekToBeginning(Collection partitions)` | Yes | | +| `void seekToEnd(Collection partitions)` 
| Yes | | +| `long position(TopicPartition partition)` | Yes | | +| `OffsetAndMetadata committed(TopicPartition partition)` | Yes | | +| `Map metrics()` | No | | +| `List partitionsFor(String topic)` | No | | +| `Map> listTopics()` | No | | +| `Set paused()` | No | | +| `void pause(Collection partitions)` | No | | +| `void resume(Collection partitions)` | No | | +| `Map offsetsForTimes(Map timestampsToSearch)` | No | | +| `Map beginningOffsets(Collection partitions)` | No | | +| `Map endOffsets(Collection partitions)` | No | | +| `void close()` | Yes | | +| `void close(long timeout, TimeUnit unit)` | Yes | | +| `void wakeup()` | No | | + +Properties: + +| Config property | Supported | Notes | +|:--------------------------------|:----------|:------------------------------------------------------| +| `group.id` | Yes | Maps to a Pulsar subscription name | +| `max.poll.records` | Yes | | +| `max.poll.interval.ms` | Ignored | Messages are "pushed" from broker | +| `session.timeout.ms` | Ignored | | +| `heartbeat.interval.ms` | Ignored | | +| `bootstrap.servers` | Yes | Needs to point to a single Pulsar service URL | +| `enable.auto.commit` | Yes | | +| `auto.commit.interval.ms` | Ignored | With auto-commit, acks are sent immediately to broker | +| `partition.assignment.strategy` | Ignored | | +| `auto.offset.reset` | Yes | Only support earliest and latest. | +| `fetch.min.bytes` | Ignored | | +| `fetch.max.bytes` | Ignored | | +| `fetch.max.wait.ms` | Ignored | | +| `interceptor.classes` | Yes | | +| `metadata.max.age.ms` | Ignored | | +| `max.partition.fetch.bytes` | Ignored | | +| `send.buffer.bytes` | Ignored | | +| `receive.buffer.bytes` | Ignored | | +| `client.id` | Ignored | | + + +## Customize Pulsar configurations + +You can configure Pulsar authentication provider directly from the Kafka properties. 
+ +### Pulsar client properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.authentication.class`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-org.apache.pulsar.client.api.Authentication-) | | Configure to auth provider. For example, `org.apache.pulsar.client.impl.auth.AuthenticationTls`.| +| [`pulsar.authentication.params.map`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.util.Map-) | | Map which represents parameters for the Authentication-Plugin. | +| [`pulsar.authentication.params.string`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setAuthentication-java.lang.String-java.lang.String-) | | String which represents parameters for the Authentication-Plugin, for example, `key1:val1,key2:val2`. | +| [`pulsar.use.tls`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTls-boolean-) | `false` | Enable TLS transport encryption. | +| [`pulsar.tls.trust.certs.file.path`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsTrustCertsFilePath-java.lang.String-) | | Path for the TLS trust certificate store. | +| [`pulsar.tls.allow.insecure.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setTlsAllowInsecureConnection-boolean-) | `false` | Accept self-signed certificates from brokers. | +| [`pulsar.operation.timeout.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setOperationTimeout-int-java.util.concurrent.TimeUnit-) | `30000` | General operations timeout. 
| +| [`pulsar.stats.interval.seconds`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setStatsInterval-long-java.util.concurrent.TimeUnit-) | `60` | Pulsar client lib stats printing interval. | +| [`pulsar.num.io.threads`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setIoThreads-int-) | `1` | The number of Netty IO threads to use. | +| [`pulsar.connections.per.broker`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConnectionsPerBroker-int-) | `1` | The maximum number of connection to each broker. | +| [`pulsar.use.tcp.nodelay`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setUseTcpNoDelay-boolean-) | `true` | TCP no-delay. | +| [`pulsar.concurrent.lookup.requests`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setConcurrentLookupRequest-int-) | `50000` | The maximum number of concurrent topic lookups. | +| [`pulsar.max.number.rejected.request.per.connection`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientConfiguration.html#setMaxNumberOfRejectedRequestPerConnection-int-) | `50` | The threshold of errors to forcefully close a connection. | +| [`pulsar.keepalive.interval.ms`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ClientBuilder.html#keepAliveInterval-int-java.util.concurrent.TimeUnit-)| `30000` | Keep alive interval for each client-broker-connection. | + + +### Pulsar producer properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.producer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setProducerName-java.lang.String-) | | Specify the producer name. 
| +| [`pulsar.producer.initial.sequence.id`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setInitialSequenceId-long-) | | Specify baseline for sequence ID of this producer. | +| [`pulsar.producer.max.pending.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessages-int-) | `1000` | Set the maximum size of the message queue pending to receive an acknowledgment from the broker. | +| [`pulsar.producer.max.pending.messages.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setMaxPendingMessagesAcrossPartitions-int-) | `50000` | Set the maximum number of pending messages across all the partitions. | +| [`pulsar.producer.batching.enabled`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingEnabled-boolean-) | `true` | Control whether automatic batching of messages is enabled for the producer. | +| [`pulsar.producer.batching.max.messages`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ProducerConfiguration.html#setBatchingMaxMessages-int-) | `1000` | The maximum number of messages in a batch. | + + +### Pulsar consumer Properties + +| Config property | Default | Notes | +|:---------------------------------------|:--------|:---------------------------------------------------------------------------------------| +| [`pulsar.consumer.name`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setConsumerName-java.lang.String-) | | Specify the consumer name. | +| [`pulsar.consumer.receiver.queue.size`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setReceiverQueueSize-int-) | 1000 | Set the size of the consumer receiver queue. 
| +| [`pulsar.consumer.acknowledgments.group.time.millis`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#acknowledgmentGroupTime-long-java.util.concurrent.TimeUnit-) | 100 | Set the maximum amount of group time for consumers to send the acknowledgments to the broker. | +| [`pulsar.consumer.total.receiver.queue.size.across.partitions`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerConfiguration.html#setMaxTotalReceiverQueueSizeAcrossPartitions-int-) | 50000 | Set the maximum size of the total receiver queue across partitions. | +| [`pulsar.consumer.subscription.topics.mode`](http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/ConsumerBuilder.html#subscriptionTopicsMode-Mode-) | PersistentOnly | Set the subscription topic mode for consumers. | diff --git a/site2/website/versioned_docs/version-2.5.0/admin-api-non-partitioned-topics.md b/site2/website/versioned_docs/version-2.5.0/admin-api-non-partitioned-topics.md new file mode 100644 index 0000000000000..b5da301259d80 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/admin-api-non-partitioned-topics.md @@ -0,0 +1,104 @@ +--- +id: version-2.5.0-admin-api-non-partitioned-topics +title: Managing non-partitioned topics +sidebar_label: Non-Partitioned topics +original_id: admin-api-non-partitioned-topics +--- + + +You can use Pulsar's [admin API](admin-api-overview.md) to create and manage non-partitioned topics. + +In all of the instructions and commands below, the topic name structure is: + +```shell +persistent://tenant/namespace/topic +``` + +## Non-Partitioned topics resources + +### Create + +Non-partitioned topics in Pulsar must be explicitly created. When creating a new non-partitioned topic you +need to provide a name for the topic. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. 
+> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +#### pulsar-admin + +You can create non-partitioned topics using the [`create`](reference-pulsar-admin.md#create-3) +command and specifying the topic name as an argument. +Here's an example: + +```shell +$ bin/pulsar-admin topics create \ + persistent://my-tenant/my-namespace/my-topic +``` + +> #### Note +> +> A non-partitioned topic whose name contains the suffix '-partition-' followed by a numeric value, such as +> 'xyz-topic-partition-10', can only be created if a partitioned topic with the same base name (in this case 'xyz-topic') already exists and has +> more partitions than that numeric value (in this case at least 11, because the partition index starts from 0). Otherwise, creation of such a topic fails. + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic|operation/createNonPartitionedTopic} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.topics().createNonPartitionedTopic(topicName); +``` + +### Delete + +#### pulsar-admin + +Non-partitioned topics can be deleted using the +[`delete`](reference-pulsar-admin.md#delete-4) command, specifying the topic by name: + +```shell +$ bin/pulsar-admin topics delete \ + persistent://my-tenant/my-namespace/my-topic +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/persistent/:tenant/:namespace/:topic|operation/deleteTopic} + +#### Java + +```java +admin.topics().delete(persistentTopic); +``` + +### List + +It provides a list of topics existing under a given namespace.
+ +#### pulsar-admin + +```shell +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace|operation/getList} + +#### Java + +```java +admin.topics().getList(namespace); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/admin-api-partitioned-topics.md b/site2/website/versioned_docs/version-2.5.0/admin-api-partitioned-topics.md new file mode 100644 index 0000000000000..85fb787a1ba08 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/admin-api-partitioned-topics.md @@ -0,0 +1,376 @@ +--- +id: version-2.5.0-admin-api-partitioned-topics +title: Managing partitioned topics +sidebar_label: Partitioned topics +original_id: admin-api-partitioned-topics +--- + + +You can use Pulsar's [admin API](admin-api-overview.md) to create and manage partitioned topics. + +In all of the instructions and commands below, the topic name structure is: + +```shell +persistent://tenant/namespace/topic +``` + +## Partitioned topics resources + +### Create + +Partitioned topics in Pulsar must be explicitly created. When creating a new partitioned topic you +need to provide a name for the topic as well as the desired number of partitions. + +> #### Note +> +> By default, 60 seconds after creation, topics are considered inactive and are deleted automatically to prevent them from generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker).
+ +#### pulsar-admin + +You can create partitioned topics using the [`create-partitioned-topic`](reference-pulsar-admin.md#create-partitioned-topic) +command and specifying the topic name as an argument and the number of partitions using the `-p` or `--partitions` flag. + +Here's an example: + +```shell +$ bin/pulsar-admin topics create-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 4 +``` + +> #### Note +> +> If there already exists a non partitioned topic with suffix '-partition-' followed by numeric value like +> 'xyz-topic-partition-10', then you can not create partitioned topic with name 'xyz-topic' as the partitions +> of the partitioned topic could override the existing non partitioned topic. You have to delete that non +> partitioned topic first then create the partitioned topic. + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/createPartitionedTopic} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +int numPartitions = 4; +admin.persistentTopics().createPartitionedTopic(topicName, numPartitions); +``` + +### Create missed partitions + +Try to create the missing partitions of a partitioned topic. The partitions of a partitioned topic have to be created explicitly; +this operation can be used to repair partitions when topic auto-creation is disabled. + +#### pulsar-admin + +You can create missed partitions using the [`create-missed-partitions`](reference-pulsar-admin.md#create-missed-partitions) +command and specifying the topic name as an argument.
+ +Here's an example: + +```shell +$ bin/pulsar-admin topics create-missed-partitions \ + persistent://my-tenant/my-namespace/my-topic +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic|operation/createMissedPartitions} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().createMissedPartitions(topicName); +``` + +### Get metadata + +Partitioned topics have metadata associated with them that you can fetch as a JSON object. +The following metadata fields are currently available: + +Field | Meaning +:-----|:------- +`partitions` | The number of partitions into which the topic is divided + +#### pulsar-admin + +You can see the number of partitions in a partitioned topic using the +[`get-partitioned-topic-metadata`](reference-pulsar-admin.md#get-partitioned-topic-metadata) +subcommand. Here's an example: + +```shell +$ pulsar-admin topics get-partitioned-topic-metadata \ + persistent://my-tenant/my-namespace/my-topic +{ + "partitions": 4 +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/getPartitionedMetadata} + +#### Java + +```java +String topicName = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getPartitionedTopicMetadata(topicName); +``` + +### Update + +You can update the number of partitions on an existing partitioned topic +*if* the topic is non-global. To update, the new number of partitions must be greater +than the existing number. + +Decrementing the number of partitions would require deleting the topic, which is not supported in Pulsar. + +Already created partitioned producers and consumers will automatically find the newly created partitions. + +#### pulsar-admin + +Partitioned topics can be updated using the +[`update-partitioned-topic`](reference-pulsar-admin.md#update-partitioned-topic) command.
+ +```shell +$ pulsar-admin topics update-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic \ + --partitions 8 +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/updatePartitionedTopic} + +#### Java + +```java +admin.persistentTopics().updatePartitionedTopic(persistentTopic, numPartitions); +``` + +### Delete + +#### pulsar-admin + +Partitioned topics can be deleted using the +[`delete-partitioned-topic`](reference-pulsar-admin.md#delete-partitioned-topic) command, specifying the topic by name: + +```shell +$ bin/pulsar-admin topics delete-partitioned-topic \ + persistent://my-tenant/my-namespace/my-topic +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/persistent/:tenant/:namespace/:topic/partitions|operation/deletePartitionedTopic} + +#### Java + +```java +admin.persistentTopics().delete(persistentTopic); +``` + +### List + +It provides a list of persistent topics existing under a given namespace. + +#### pulsar-admin + +```shell +$ pulsar-admin topics list tenant/namespace +persistent://tenant/namespace/topic1 +persistent://tenant/namespace/topic2 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace|operation/getPartitionedTopicList} + +#### Java + +```java +admin.persistentTopics().getList(namespace); +``` + +### Stats + +It shows current statistics of a given partitioned topic.
Here's an example payload: + +```json +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} +``` + +The following stats are available: + +|Stat|Description| +|----|-----------| +|msgRateIn|The sum of all local and replication publishers’ publish rates in messages per second| +|msgThroughputIn|Same as msgRateIn but in bytes per second instead of messages per second| +|msgRateOut|The sum of all local and replication consumers’ dispatch rates in messages per second| +|msgThroughputOut|Same as msgRateOut but in bytes per second instead of messages per second| +|averageMsgSize|Average message size, in bytes, from this publisher within the last interval| +|storageSize|The sum of the ledgers’ storage size for this topic| +|publishers|The list of all local publishers into the topic. 
There can be anywhere from zero to thousands.| +|producerId|Internal identifier for this producer on this topic| +|producerName|Internal identifier for this producer, generated by the client library| +|address|IP address and source port for the connection of this producer| +|connectedSince|Timestamp this producer was created or last reconnected| +|subscriptions|The list of all local subscriptions to the topic| +|my-subscription|The name of this subscription (client defined)| +|msgBacklog|The count of messages in backlog for this subscription| +|type|This subscription type| +|msgRateExpired|The rate at which messages were discarded instead of dispatched from this subscription due to TTL| +|consumers|The list of connected consumers for this subscription| +|consumerName|Internal identifier for this consumer, generated by the client library| +|availablePermits|The number of messages this consumer has space for in the client library’s listen queue. A value of 0 means the client library’s queue is full and receive() isn’t being called. A nonzero value means this consumer is ready to be dispatched messages.| +|replication|This section gives the stats for cross-colo replication of this topic| +|replicationBacklog|The outbound replication backlog in messages| +|connected|Whether the outbound replicator is connected| +|replicationDelayInSeconds|How long the oldest message has been waiting to be sent through the connection, if connected is true| +|inboundConnection|The IP and port of the broker in the remote cluster’s publisher connection to this broker| +|inboundConnectedSince|The TCP connection being used to publish messages to the remote cluster. 
If there are no local publishers connected, this connection is automatically closed after a minute.| + +#### pulsar-admin + +The stats for the partitioned topic and its connected producers and consumers can be fetched by using the +[`partitioned-stats`](reference-pulsar-admin.md#partitioned-stats) command, specifying the topic by name: + +```shell +$ pulsar-admin topics partitioned-stats \ + persistent://test-tenant/namespace/topic \ + --per-partition +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/partitioned-stats|operation/getPartitionedStats} + +#### Java + +```java +admin.persistentTopics().getStats(persistentTopic); +``` + +### Internal stats + +It shows detailed statistics of a topic. + +|Stat|Description| +|----|-----------| +|entriesAddedCounter|Messages published since this broker loaded this topic| +|numberOfEntries|Total number of messages being tracked| +|totalSize|Total storage size in bytes of all messages| +|currentLedgerEntries|Count of messages written to the ledger currently open for writing| +|currentLedgerSize|Size in bytes of messages written to ledger currently open for writing| +|lastLedgerCreatedTimestamp|Time when last ledger was created| +|lastLedgerCreationFailureTimestamp|time when last ledger was failed| +|waitingCursorsCount|How many cursors are caught up and waiting for a new message to be published| +|pendingAddEntriesCount|How many messages have (asynchronous) write requests we are waiting on completion| +|lastConfirmedEntry|The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger has been opened or is currently being opened but has no entries written yet.| +|state|The state of the cursor ledger. Open means we have a cursor ledger for saving updates of the markDeletePosition.| +|ledgers|The ordered list of all ledgers for this topic holding its messages| +|cursors|The list of all cursors on this topic. 
There will be one for every subscription you saw in the topic stats.| +|markDeletePosition|The ack position: the last message the subscriber acknowledged receiving| +|readPosition|The latest position of subscriber for reading message| +|waitingReadOp|This is true when the subscription has read the latest message published to the topic and is waiting on new messages to be published.| +|pendingReadOps|The counter for how many outstanding read requests to the BookKeepers we have in progress| +|messagesConsumedCounter|Number of messages this cursor has acked since this broker loaded this topic| +|cursorLedger|The ledger being used to persistently store the current markDeletePosition| +|cursorLedgerLastEntry|The last entryid used to persistently store the current markDeletePosition| +|individuallyDeletedMessages|If Acks are being done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position| +|lastLedgerSwitchTimestamp|The last time the cursor ledger was rolled over| + + +```json +{ + "entriesAddedCounter": 20449518, + "numberOfEntries": 3233, + "totalSize": 331482, + "currentLedgerEntries": 3233, + "currentLedgerSize": 331482, + "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825", + "lastLedgerCreationFailureTimestamp": null, + "waitingCursorsCount": 1, + "pendingAddEntriesCount": 0, + "lastConfirmedEntry": "324711539:3232", + "state": "LedgerOpened", + "ledgers": [ + { + "ledgerId": 324711539, + "entries": 0, + "size": 0 + } + ], + "cursors": { + "my-subscription": { + "markDeletePosition": "324711539:3133", + "readPosition": "324711539:3233", + "waitingReadOp": true, + "pendingReadOps": 0, + "messagesConsumedCounter": 20449501, + "cursorLedger": 324702104, + "cursorLedgerLastEntry": 21, + "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]", + "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313", + "state": "Open" + } + } +} +``` + +#### pulsar-admin + +The internal 
stats for the partitioned topic can be fetched by using the +[`stats-internal`](reference-pulsar-admin.md#stats-internal) command, specifying the topic by name: + +```shell +$ pulsar-admin topics stats-internal \ + persistent://test-tenant/namespace/topic +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/internalStats|operation/getInternalStats} + +#### Java + +```java +admin.persistentTopics().getInternalStats(persistentTopic); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/admin-api-persistent-topics.md b/site2/website/versioned_docs/version-2.5.0/admin-api-persistent-topics.md new file mode 100644 index 0000000000000..fa44304f2da3e --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/admin-api-persistent-topics.md @@ -0,0 +1,661 @@ +--- +id: version-2.5.0-admin-api-persistent-topics +title: Managing persistent topics +sidebar_label: Persistent topics +original_id: admin-api-persistent-topics +--- + +Persistent helps to access topic which is a logical endpoint for publishing and consuming messages. Producers publish messages to the topic and consumers subscribe to the topic, to consume messages published to the topic. + +In all of the instructions and commands below, the topic name structure is: + + +```shell +persistent://tenant/namespace/topic +``` + +## Persistent topics resources + +### List of topics + +It provides a list of persistent topics exist under a given namespace. + +#### pulsar-admin + +List of topics can be fetched using [`list`](../../reference/CliTools#list) command. + +```shell +$ pulsar-admin persistent list \ + my-tenant/my-namespace +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace|operation/getList} + +#### Java + +```java +String namespace = "my-tenant/my-namespace"; +admin.persistentTopics().getList(namespace); +``` + +### Grant permission + +It grants permissions on a client role to perform specific actions on a given topic. 
+ +#### pulsar-admin + +Permission can be granted using [`grant-permission`](../../reference/CliTools#grant-permission) command. + +```shell +$ pulsar-admin persistent grant-permission \ + --actions produce,consume --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic/permissions/:role|operation/grantPermissionsOnTopic} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +Set<AuthAction> actions = Sets.newHashSet(AuthAction.produce, AuthAction.consume); +admin.persistentTopics().grantPermission(topic, role, actions); +``` + +### Get permission + +Permission can be fetched using [`permissions`](../../reference/CliTools#permissions) command. + +#### pulsar-admin + +```shell +$ pulsar-admin persistent permissions \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/permissions|operation/getPermissionsOnTopic} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getPermissions(topic); +``` + +### Revoke permission + +It revokes a permission which was granted on a client role. + +#### pulsar-admin + +Permission can be revoked using [`revoke-permission`](../../reference/CliTools#revoke-permission) command.
+ +```shell +$ pulsar-admin persistent revoke-permission \ + --role application1 \ + persistent://test-tenant/ns1/tp1 \ + +{ + "application1": [ + "consume", + "produce" + ] +} +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/persistent/:tenant/:namespace/:topic/permissions/:role|operation/revokePermissionsOnTopic} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String role = "test-role"; +admin.persistentTopics().revokePermissions(topic, role); +``` + +### Delete topic + +It deletes a topic. The topic cannot be deleted if there's any active subscription or producers connected to it. + +#### pulsar-admin + +Topic can be deleted using [`delete`](../../reference/CliTools#delete) command. + +```shell +$ pulsar-admin persistent delete \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/persistent/:tenant/:namespace/:topic|operation/deleteTopic} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().delete(topic); +``` + +### Unload topic + +It unloads a topic. + +#### pulsar-admin + +Topic can be unloaded using [`unload`](../../reference/CliTools#unload) command. + +```shell +$ pulsar-admin persistent unload \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|PUT|/admin/v2/persistent/:tenant/:namespace/:topic/unload|operation/unloadTopic} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().unload(topic); +``` + +### Get stats + +It shows current statistics of a given non-partitioned topic. 
+ + - **msgRateIn**: The sum of all local and replication publishers' publish rates in messages per second + + - **msgThroughputIn**: Same as above, but in bytes per second instead of messages per second + + - **msgRateOut**: The sum of all local and replication consumers' dispatch rates in messages per second + + - **msgThroughputOut**: Same as above, but in bytes per second instead of messages per second + + - **averageMsgSize**: The average size in bytes of messages published within the last interval + + - **storageSize**: The sum of the ledgers' storage size for this topic. Space used to store the messages for the topic + + - **publishers**: The list of all local publishers into the topic. There can be zero or thousands + + - **msgRateIn**: Total rate of messages published by this publisher in messages per second + + - **msgThroughputIn**: Total throughput of the messages published by this publisher in bytes per second + + - **averageMsgSize**: Average message size in bytes from this publisher within the last interval + + - **producerId**: Internal identifier for this producer on this topic + + - **producerName**: Internal identifier for this producer, generated by the client library + + - **address**: IP address and source port for the connection of this producer + + - **connectedSince**: Timestamp this producer was created or last reconnected + + - **subscriptions**: The list of all local subscriptions to the topic + + - **my-subscription**: The name of this subscription (client defined) + + - **msgRateOut**: Total rate of messages delivered on this subscription (msg/s) + + - **msgThroughputOut**: Total throughput delivered on this subscription (bytes/s) + + - **msgBacklog**: Number of messages in the subscription backlog + + - **type**: This subscription type + + - **msgRateExpired**: The rate at which messages were discarded instead of dispatched from this subscription due to TTL + + - **consumers**: The list of connected consumers for this subscription + + 
- **msgRateOut**: Total rate of messages delivered to the consumer (msg/s) + + - **msgThroughputOut**: Total throughput delivered to the consumer (bytes/s) + + - **consumerName**: Internal identifier for this consumer, generated by the client library + + - **availablePermits**: The number of messages this consumer has space for in the client library's listen queue. A value of 0 means the client library's queue is full and receive() isn't being called. A nonzero value means this consumer is ready to be dispatched messages. + + - **unackedMessages**: Number of unacknowledged messages for the consumer + + - **blockedConsumerOnUnackedMsgs**: Flag to verify if the consumer is blocked due to reaching threshold of unacked messages + + - **replication**: This section gives the stats for cross-colo replication of this topic + + - **msgRateIn**: Total rate of messages received from the remote cluster (msg/s) + + - **msgThroughputIn**: Total throughput received from the remote cluster (bytes/s) + + - **msgRateOut**: Total rate of messages delivered to the replication-subscriber (msg/s) + + - **msgThroughputOut**: Total throughput delivered to the replication-subscriber (bytes/s) + + - **msgRateExpired**: Total rate of messages expired (msg/s) + + - **replicationBacklog**: Number of messages pending to be replicated to remote cluster + + - **connected**: Whether the outbound replicator is connected + + - **replicationDelayInSeconds**: How long the oldest message has been waiting to be sent through the connection, if connected is true + + - **inboundConnection**: The IP and port of the broker in the remote cluster's publisher connection to this broker + + - **inboundConnectedSince**: The TCP connection being used to publish messages to the remote cluster. If there are no local publishers connected, this connection is automatically closed after a minute.
+ + - **outboundConnection**: Address of outbound replication connection + + - **outboundConnectedSince**: Timestamp of establishing outbound connection + +```json +{ + "msgRateIn": 4641.528542257553, + "msgThroughputIn": 44663039.74947473, + "msgRateOut": 0, + "msgThroughputOut": 0, + "averageMsgSize": 1232439.816728665, + "storageSize": 135532389160, + "publishers": [ + { + "msgRateIn": 57.855383881403576, + "msgThroughputIn": 558994.7078932219, + "averageMsgSize": 613135, + "producerId": 0, + "producerName": null, + "address": null, + "connectedSince": null + } + ], + "subscriptions": { + "my-topic_subscription": { + "msgRateOut": 0, + "msgThroughputOut": 0, + "msgBacklog": 116632, + "type": null, + "msgRateExpired": 36.98245516804671, + "consumers": [] + } + }, + "replication": {} +} +``` + +#### pulsar-admin + +Topic stats can be fetched using [`stats`](../../reference/CliTools#stats) command. + +```shell +$ pulsar-admin persistent stats \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/stats|operation/getStats} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getStats(topic); +``` + +### Get internal stats + +It shows detailed statistics of a topic. 
+ + - **entriesAddedCounter**: Messages published since this broker loaded this topic + + - **numberOfEntries**: Total number of messages being tracked + + - **totalSize**: Total storage size in bytes of all messages + + - **currentLedgerEntries**: Count of messages written to the ledger currently open for writing + + - **currentLedgerSize**: Size in bytes of messages written to ledger currently open for writing + + - **lastLedgerCreatedTimestamp**: time when last ledger was created + + - **lastLedgerCreationFailureTimestamp:** time when last ledger was failed + + - **waitingCursorsCount**: How many cursors are "caught up" and waiting for a new message to be published + + - **pendingAddEntriesCount**: How many messages have (asynchronous) write requests we are waiting on completion + + - **lastConfirmedEntry**: The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger has been opened or is currently being opened but has no entries written yet. + + - **state**: The state of this ledger for writing. LedgerOpened means we have a ledger open for saving published messages. + + - **ledgers**: The ordered list of all ledgers for this topic holding its messages + + - **ledgerId**: Id of this ledger + + - **entries**: Total number of entries belong to this ledger + + - **size**: Size of messages written to this ledger (in bytes) + + - **offloaded**: Whether this ledger is offloaded + + - **cursors**: The list of all cursors on this topic. There will be one for every subscription you saw in the topic stats. + + - **markDeletePosition**: All of messages before the markDeletePosition are acknowledged by the subscriber. + + - **readPosition**: The latest position of subscriber for reading message + + - **waitingReadOp**: This is true when the subscription has read the latest message published to the topic and is waiting on new messages to be published. 
+ + - **pendingReadOps**: The counter for how many outstanding read requests to the BookKeepers we have in progress + + - **messagesConsumedCounter**: Number of messages this cursor has acked since this broker loaded this topic + + - **cursorLedger**: The ledger being used to persistently store the current markDeletePosition + + - **cursorLedgerLastEntry**: The last entryid used to persistently store the current markDeletePosition + + - **individuallyDeletedMessages**: If Acks are being done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position + + - **lastLedgerSwitchTimestamp**: The last time the cursor ledger was rolled over + + - **state**: The state of the cursor ledger: Open means we have a cursor ledger for saving updates of the markDeletePosition. + +```json +{ + "entriesAddedCounter": 20449518, + "numberOfEntries": 3233, + "totalSize": 331482, + "currentLedgerEntries": 3233, + "currentLedgerSize": 331482, + "lastLedgerCreatedTimestamp": "2016-06-29 03:00:23.825", + "lastLedgerCreationFailureTimestamp": null, + "waitingCursorsCount": 1, + "pendingAddEntriesCount": 0, + "lastConfirmedEntry": "324711539:3232", + "state": "LedgerOpened", + "ledgers": [ + { + "ledgerId": 324711539, + "entries": 0, + "size": 0 + } + ], + "cursors": { + "my-subscription": { + "markDeletePosition": "324711539:3133", + "readPosition": "324711539:3233", + "waitingReadOp": true, + "pendingReadOps": 0, + "messagesConsumedCounter": 20449501, + "cursorLedger": 324702104, + "cursorLedgerLastEntry": 21, + "individuallyDeletedMessages": "[(324711539:3134‥324711539:3136], (324711539:3137‥324711539:3140], ]", + "lastLedgerSwitchTimestamp": "2016-06-29 01:30:19.313", + "state": "Open" + } + } +} +``` + + +#### pulsar-admin + +Topic internal-stats can be fetched using [`stats-internal`](../../reference/CliTools#stats-internal) command. 
+ +```shell +$ pulsar-admin persistent stats-internal \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/internalStats|operation/getInternalStats} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getInternalStats(topic); +``` + +### Peek messages + +It peeks N messages for a specific subscription of a given topic. + +#### pulsar-admin + + +```shell +$ pulsar-admin persistent peek-messages \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ + +Message ID: 315674752:0 +Properties: { "X-Pulsar-publish-time" : "2015-07-13 17:40:28.451" } +msg-payload +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subName/position/:messagePosition|operation/peekNthMessage} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.persistentTopics().peekMessages(topic, subName, numMessages); +``` + +### Skip messages + +It skips N messages for a specific subscription of a given topic. + +#### pulsar-admin + + +```shell +$ pulsar-admin persistent skip \ + --count 10 --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subName/skip/:numMessages|operation/skipMessages} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +int numMessages = 1; +admin.persistentTopics().skipMessages(topic, subName, numMessages); +``` + +### Skip all messages + +It skips all old messages for a specific subscription of a given topic. 
+ +#### pulsar-admin + + +```shell +$ pulsar-admin persistent skip-all \ + --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subName/skip_all|operation/skipAllMessages} + +[More info](../../reference/RestApi#/admin/persistent/:tenant/:namespace/:topic/subscription/:subName/skip_all) + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +admin.persistentTopics().skipAllMessages(topic, subName); +``` + +### Reset cursor + +It resets a subscription’s cursor position back to the position which was recorded X minutes before. It essentially calculates time and position of cursor at X minutes before and resets it at that position. + +#### pulsar-admin + + +```shell +$ pulsar-admin persistent reset-cursor \ + --subscription my-subscription --time 10 \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subName/resetcursor/:timestamp|operation/resetCursor} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subName = "my-subscription"; +long timestamp = 2342343L; +admin.persistentTopics().resetCursor(topic, subName, timestamp); +``` + +### Lookup of topic + +It locates broker url which is serving the given topic.
+ +#### pulsar-admin + + +```shell +$ pulsar-admin persistent lookup \ + persistent://test-tenant/ns1/tp1 \ + + "pulsar://broker1.org.com:4480" +``` + +#### REST API + +{@inject: endpoint|GET|/lookup/v2/topic/persistent/:tenant/:namespace/:topic|/} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().lookupDestination(topic); +``` + +### Get bundle + +It gives the range of the bundle which contains the given topic. + +#### pulsar-admin + + +```shell +$ pulsar-admin persistent bundle-range \ + persistent://test-tenant/ns1/tp1 \ + + "0x00000000_0xffffffff" +``` + +#### REST API + +{@inject: endpoint|GET|/lookup/v2/topic/:topic_domain/:tenant/:namespace/:topic/bundle|/} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.lookup().getBundleRange(topic); +``` + + +### Get subscriptions + +It shows all subscription names for a given topic. + +#### pulsar-admin + +```shell +$ pulsar-admin persistent subscriptions \ + persistent://test-tenant/ns1/tp1 \ + + my-subscription +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/subscriptions|operation/getSubscriptions} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getSubscriptions(topic); +``` + +### Unsubscribe + +It removes a subscription which is no longer processing messages.
+ +#### pulsar-admin + + +```shell +$ pulsar-admin persistent unsubscribe \ + --subscription my-subscription \ + persistent://test-tenant/ns1/tp1 \ +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/persistent/:tenant/:namespace/:topic/subscription/:subscription|operation/deleteSubscription} + +#### Java + +```java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +String subscriptionName = "my-subscription"; +admin.persistentTopics().deleteSubscription(topic, subscriptionName); +``` + +### Last Message Id + +It gives the last committed message ID for a persistent topic. It is available in 2.3.0 and later. + +```shell +pulsar-admin topics last-message-id topic-name +``` + +#### REST API +{@inject: endpoint|GET|/admin/v2/persistent/:tenant/:namespace/:topic/lastMessageId|operation/getLastMessageId} + +#### Java + +```Java +String topic = "persistent://my-tenant/my-namespace/my-topic"; +admin.persistentTopics().getLastMessageId(topic); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/admin-api-schemas.md b/site2/website/versioned_docs/version-2.5.0/admin-api-schemas.md new file mode 100644 index 0000000000000..015fbea037760 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/admin-api-schemas.md @@ -0,0 +1,7 @@ +--- +id: version-2.5.0-admin-api-schemas +title: Managing Schemas +sidebar_label: Schemas +original_id: admin-api-schemas +--- + diff --git a/site2/website/versioned_docs/version-2.5.0/administration-dashboard.md b/site2/website/versioned_docs/version-2.5.0/administration-dashboard.md new file mode 100644 index 0000000000000..04a91abdc5002 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-dashboard.md @@ -0,0 +1,63 @@ +--- +id: version-2.5.0-administration-dashboard +title: Pulsar dashboard +sidebar_label: Dashboard +original_id: administration-dashboard +--- + +> Note +> Pulsar dashboard is deprecated. 
If you want to manage and monitor the stats of your topics, use [Pulsar Manager](administration-pulsar-manager.md).
+ +Pulsar dashboard is a web application that enables users to monitor current stats for all [topics](reference-terminology.md#topic) in tabular form. + +The dashboard is a data collector that polls stats from all the brokers in a Pulsar instance (across multiple clusters) and stores all the information in a [PostgreSQL](https://www.postgresql.org/) database. + +You can use the [Django](https://www.djangoproject.com) web app to render the collected data. + +## Install + +The easiest way to use the dashboard is to run it inside a [Docker](https://www.docker.com/products/docker) container. + +```shell +$ SERVICE_URL=http://broker.example.com:8080/ +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + apachepulsar/pulsar-dashboard:{{pulsar:version}} +``` + +You can find the {@inject: github:`Dockerfile`:/dashboard/Dockerfile} in the `dashboard` directory and build an image from scratch as well: + +```shell +$ docker build -t apachepulsar/pulsar-dashboard dashboard +``` + +If token authentication is enabled: +> Provided token should have super-user access. +```shell +$ SERVICE_URL=http://broker.example.com:8080/ +$ JWT_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c +$ docker run -p 80:80 \ + -e SERVICE_URL=$SERVICE_URL \ + -e JWT_TOKEN=$JWT_TOKEN \ + apachepulsar/pulsar-dashboard +``` + +You need to specify only one service URL for a Pulsar cluster. Internally, the collector figures out all the existing clusters and the brokers from where it needs to pull the metrics. If you connect the dashboard to Pulsar running in standalone mode, the URL is `http://<broker-ip>:8080` by default. `<broker-ip>` is the IP address or hostname of the machine running Pulsar standalone. The IP address or hostname should be accessible from the Docker instance running the dashboard. + +Once the Docker container runs, the web dashboard is accessible via `localhost` or whichever host that Docker uses. 
+ +> The `SERVICE_URL` that the dashboard uses needs to be reachable from inside the Docker container + +If the Pulsar service runs in standalone mode in `localhost`, the `SERVICE_URL` has to +be the IP of the machine. + +Similarly, given the Pulsar standalone advertises itself with localhost by default, you need to +explicitly set the advertised address to the host IP. For example: + +```shell +$ bin/pulsar standalone --advertised-address 1.2.3.4 +``` + +### Known issues + +Currently, only Pulsar Token [authentication](security-overview.md#authentication-providers) is supported. \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/administration-geo.md b/site2/website/versioned_docs/version-2.5.0/administration-geo.md new file mode 100644 index 0000000000000..e46714191f893 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-geo.md @@ -0,0 +1,158 @@ +--- +id: version-2.5.0-administration-geo +title: Pulsar geo-replication +sidebar_label: Geo-replication +original_id: administration-geo +--- + +*Geo-replication* is the replication of persistently stored message data across multiple clusters of a Pulsar instance. + +## How geo-replication works + +The diagram below illustrates the process of geo-replication across Pulsar clusters: + +![Replication Diagram](assets/geo-replication.png) + +In this diagram, whenever **P1**, **P2**, and **P3** producers publish messages to the **T1** topic on **Cluster-A**, **Cluster-B**, and **Cluster-C** clusters respectively, those messages are instantly replicated across clusters. Once the messages are replicated, **C1** and **C2** consumers can consume those messages from their respective clusters. + +Without geo-replication, **C1** and **C2** consumers are not able to consume messages that **P3** producer publishes. + +## Geo-replication and Pulsar properties + +You must enable geo-replication on a per-tenant basis in Pulsar. 
You can enable geo-replication between clusters only when a tenant is created that allows access to both clusters. + +Although geo-replication must be enabled between two clusters, actually geo-replication is managed at the namespace level. You must complete the following tasks to enable geo-replication for a namespace: + +* [Enable geo-replication namespaces](#enable-geo-replication-namespaces) +* Configure that namespace to replicate across two or more provisioned clusters + +Any message published on *any* topic in that namespace is replicated to all clusters in the specified set. + +## Local persistence and forwarding + +When messages are produced on a Pulsar topic, messages are first persisted in the local cluster, and then forwarded asynchronously to the remote clusters. + +In normal cases, when connectivity issues are none, messages are replicated immediately, at the same time as they are dispatched to local consumers. Typically, the network [round-trip time](https://en.wikipedia.org/wiki/Round-trip_delay_time) (RTT) between the remote regions defines end-to-end delivery latency. + +Applications can create producers and consumers in any of the clusters, even when the remote clusters are not reachable (like during a network partition). + +> #### Subscriptions are local to a cluster +> While producers and consumers can publish to and consume from any cluster in a Pulsar instance, subscriptions are local to the clusters in which the subscriptions are created and cannot be transferred between clusters. If you do need to transfer a subscription, you need to create a new subscription in the desired cluster. + +In the aforementioned example, the **T1** topic is replicated among three clusters, **Cluster-A**, **Cluster-B**, and **Cluster-C**. + +All messages produced in any of the three clusters are delivered to all subscriptions in other clusters. In this case, **C1** and **C2** consumers receive all messages that **P1**, **P2**, and **P3** producers publish. 
Ordering is still guaranteed on a per-producer basis. + +## Configure replication + +As stated in the [Geo-replication and Pulsar properties](#geo-replication-and-pulsar-properties) section, geo-replication in Pulsar is managed at the [tenant](reference-terminology.md#tenant) level. + +### Grant permissions to properties + +To replicate to a cluster, the tenant needs permission to use that cluster. You can grant permission to the tenant when you create the tenant or grant it later. + +Specify all the intended clusters when you create a tenant: + +```shell +$ bin/pulsar-admin tenants create my-tenant \ + --admin-roles my-admin-role \ + --allowed-clusters us-west,us-east,us-cent +``` + +To update permissions of an existing tenant, use `update` instead of `create`. + +### Enable geo-replication namespaces + +You can create a namespace with the following command sample. + +```shell +$ bin/pulsar-admin namespaces create my-tenant/my-namespace +``` + +Initially, the namespace is not assigned to any cluster. You can assign the namespace to clusters using the `set-clusters` subcommand: + +```shell +$ bin/pulsar-admin namespaces set-clusters my-tenant/my-namespace \ + --clusters us-west,us-east,us-cent +``` + +You can change the replication clusters for a namespace at any time, without disruption to ongoing traffic. Replication channels are immediately set up or stopped in all clusters as soon as the configuration changes. + +### Use topics with geo-replication + +Once you create a geo-replication namespace, any topics that producers or consumers create within that namespace are replicated across clusters. Typically, each application uses the `serviceUrl` for the local cluster. + +#### Selective replication + +By default, messages are replicated to all clusters configured for the namespace. You can restrict replication selectively by specifying a replication list for a message, and then that message is replicated only to the subset in the replication list. 
+ +The following is an example for the [Java API](client-libraries-java.md). Note the use of the `setReplicationClusters` method when you construct the {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} object: + +```java +List<String> restrictReplicationTo = Arrays.asList( + "us-west", + "us-east" +); + +Producer<byte[]> producer = client.newProducer() + .topic("some-topic") + .create(); + +producer.newMessage() + .value("my-payload".getBytes()) + .setReplicationClusters(restrictReplicationTo) + .send(); +``` + +#### Topic stats + +Topic-specific statistics for geo-replication topics are available via the [`pulsar-admin`](reference-pulsar-admin.md) tool and {@inject: rest:REST:/} API: + +```shell +$ bin/pulsar-admin persistent stats persistent://my-tenant/my-namespace/my-topic +``` + +Each cluster reports its own local stats, including the incoming and outgoing replication rates and backlogs. + +#### Delete a geo-replication topic + +Given that geo-replication topics exist in multiple regions, directly deleting a geo-replication topic is not possible. Instead, you should rely on automatic topic garbage collection. + +In Pulsar, a topic is automatically deleted when the topic meets the following three conditions: +- no producers or consumers are connected to it; +- no subscriptions to it; +- no more messages are kept for retention. +For geo-replication topics, each region uses a fault-tolerant mechanism to decide when deleting the topic locally is safe. + +You can explicitly disable topic garbage collection by setting `brokerDeleteInactiveTopicsEnabled` to `false` in your [broker configuration](reference-configuration.md#broker). + +To delete a geo-replication topic, close all producers and consumers on the topic, and delete all of its local subscriptions in every replication cluster. When Pulsar determines that no valid subscription for the topic remains across the system, it will garbage collect the topic. 
+ +## Replicated subscriptions + +Pulsar supports replicated subscriptions, so you can keep subscription state in sync, within a sub-second timeframe, in the context of a topic that is being asynchronously replicated across multiple geographical regions. + +In case of failover, a consumer can restart consuming from the failure point in a different cluster. + +### Enable replicated subscription + +Replicated subscription is disabled by default. You can enable replicated subscription when creating a consumer. + +```java +Consumer<String> consumer = client.newConsumer(Schema.STRING) + .topic("my-topic") + .subscriptionName("my-subscription") + .replicateSubscriptionState(true) + .subscribe(); +``` + +### Advantages + + * It is easy to implement the logic. + * You can choose to enable or disable replicated subscription. + * When you enable it, the overhead is low, and it is easy to configure. + * When you disable it, the overhead is zero. + +### Limitations + +When you enable replicated subscription, you're creating a consistent distributed snapshot to establish an association between message ids from different clusters. The snapshots are taken periodically. The default value is `1 second`. It means that a consumer failing over to a different cluster can potentially receive 1 second of duplicates. You can also configure the frequency of the snapshot in the `broker.conf` file. 
diff --git a/site2/website/versioned_docs/version-2.5.0/administration-load-balance.md b/site2/website/versioned_docs/version-2.5.0/administration-load-balance.md new file mode 100644 index 0000000000000..7fa35fe40d38e --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-load-balance.md @@ -0,0 +1,182 @@ +--- +id: version-2.5.0-administration-load-balance +title: Pulsar load balance +sidebar_label: Load balance +original_id: administration-load-balance +--- + +## Load balance across Pulsar brokers + +Pulsar is a horizontally scalable messaging system, so the traffic +in a logical cluster must be spread across all the available Pulsar brokers as evenly as possible, which is a core requirement. + +You can use multiple settings and tools to control the traffic distribution which require a bit of context to understand how the traffic is managed in Pulsar. Though, in most cases, the core requirement mentioned above is true out of the box and you should not worry about it. + +## Pulsar load manager architecture + +The following part introduces the basic architecture of the Pulsar load manager. + +### Assign topics to brokers dynamically + +Topics are dynamically assigned to brokers based on the load conditions of all brokers in the cluster. + +When a client starts using new topics that are not assigned to any broker, a process is triggered to choose the best suited broker to acquire ownership of these topics according to the load conditions. + +In case of partitioned topics, different partitions are assigned to different brokers. Here "topic" means either a non-partitioned topic or one partition of a topic. + +The assignment is "dynamic" because the assignment changes quickly. For example, if the broker owning the topic crashes, the topic is reassigned immediately to another broker. Another scenario is that the broker owning the topic becomes overloaded. In this case, the topic is reassigned to a less loaded broker. 
+ +The stateless nature of brokers makes the dynamic assignment possible, so you can quickly expand or shrink the cluster based on usage. + +#### Assignment granularity + +The assignment of topics or partitions to brokers is not done at the topics or partitions level, but done at the Bundle level (a higher level). The reason is to amortize the amount of information that you need to keep track of. Based on CPU, memory, traffic load and other indexes, topics are assigned to a particular broker dynamically. + +Instead of individual topic or partition assignment, each broker takes ownership of a subset of the topics for a namespace. This subset is called a "*bundle*" and effectively this subset is a sharding mechanism. + +The namespace is the "administrative" unit: many config knobs or operations are done at the namespace level. + +For assignment, a namespace is sharded into a list of "bundles", with each bundle comprising +a portion of overall hash range of the namespace. + +Topics are assigned to a particular bundle by taking the hash of the topic name and checking which +bundle the hash falls into. + +Each bundle is independent of the others and thus is independently assigned to different brokers. + +### Create namespaces and bundles + +When you create a new namespace, the new namespace is set to use the default number of bundles. You can set this in `conf/broker.conf`: + +```properties +# When a namespace is created without specifying the number of bundle, this +# value will be used as the default +defaultNumberOfNamespaceBundles=4 +``` + +You can either change the system default, or override it when you create a new namespace: + +```shell +$ bin/pulsar-admin namespaces create my-tenant/my-namespace --clusters us-west --bundles 16 +``` + +With this command, you create a namespace with 16 initial bundles. Therefore the topics for this namespace can immediately be spread across up to 16 brokers. 
+ +In general, if you know the expected traffic and number of topics in advance, you had better start with a reasonable number of bundles instead of waiting for the system to auto-correct the distribution. + +On the same note, it is beneficial to start with more bundles than the number of brokers, because of the hashing nature of the distribution of topics into bundles. For example, for a namespace with 1000 topics, using something like 64 bundles achieves a good distribution of traffic across 16 brokers. + +### Unload topics and bundles + +You can "unload" a topic in Pulsar with admin operation. Unloading means to close the topics, +release ownership and reassign the topics to a new broker, based on current load. + +When unloading happens, the client experiences a small latency blip, typically in the order of tens of milliseconds, while the topic is reassigned. + +Unloading is the mechanism that the load-manager uses to perform the load shedding, but you can also trigger the unloading manually, for example to correct the assignments and redistribute traffic even before having any broker overloaded. + +Unloading a topic has no effect on the assignment, but just closes and reopens the particular topic: + +```shell +pulsar-admin topics unload persistent://tenant/namespace/topic +``` + +To unload all topics for a namespace and trigger reassignments: + +```shell +pulsar-admin namespaces unload tenant/namespace +``` + +### Split namespace bundles + +Since the load for the topics in a bundle might change over time, or predicting upfront might just be hard, brokers can split bundles into two. The new smaller bundles can be reassigned to different brokers. + +The splitting happens based on some tunable thresholds. Any existing bundle that exceeds any of the threshold is a candidate to be split. By default the newly split bundles are also immediately offloaded to other brokers, to facilitate the traffic distribution. 
+ +```properties +# enable/disable namespace bundle auto split +loadBalancerAutoBundleSplitEnabled=true + +# enable/disable automatic unloading of split bundles +loadBalancerAutoUnloadSplitBundlesEnabled=true + +# maximum topics in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxTopics=1000 + +# maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxSessions=1000 + +# maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxMsgRate=30000 + +# maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered +loadBalancerNamespaceBundleMaxBandwidthMbytes=100 + +# maximum number of bundles in a namespace (for auto-split) +loadBalancerNamespaceMaximumBundles=128 +``` + +### Shed load automatically + +The support for automatic load shedding is available in the load manager of Pulsar. This means that whenever the system recognizes a particular broker is overloaded, the system forces some traffic to be reassigned to less loaded brokers. + +When a broker is identified as overloaded, the broker is forced to "unload" a subset of the bundles, the +ones with higher traffic, that make up for the overload percentage. + +For example, the default threshold is 85% and if a broker is over quota at 95% CPU usage, then the broker unloads the percent difference plus a 5% margin: `(95% - 85%) + 5% = 15%`. + +Given the selection of bundles to offload is based on traffic (as a proxy measure for cpu, network +and memory), broker unloads bundles for at least 15% of traffic. + +The automatic load shedding is enabled by default and you can disable the automatic load shedding with this setting: + +```properties +# Enable/disable automatic bundle unloading for load-shedding +loadBalancerSheddingEnabled=true +``` + +Additional settings that apply to shedding: + +```properties +# Load shedding interval. 
Broker periodically checks whether some traffic should be offloaded from +# some over-loaded broker to other under-loaded brokers +loadBalancerSheddingIntervalMinutes=1 + +# Prevent the same topics to be shed and moved to other brokers more than once within this timeframe +loadBalancerSheddingGracePeriodMinutes=30 +``` + +#### Broker overload thresholds + +The determination of when a broker is overloaded is based on thresholds of CPU, network and memory usage. Whenever any of those metrics reaches the threshold, the system triggers the shedding (if enabled). + +By default, overload threshold is set at 85%: + +```properties +# Usage threshold to determine a broker as over-loaded +loadBalancerBrokerOverloadedThresholdPercentage=85 +``` + +Pulsar gathers the usage stats from the system metrics. + +In case of network utilization, in some cases the network interface speed that Linux reports is +not correct and needs to be manually overridden. This is the case in AWS EC2 instances with 1Gbps +NIC speed for which the OS reports 10Gbps speed. + +Because of the incorrect max speed, the Pulsar load manager might think the broker has not reached the NIC capacity, while in fact the broker already uses all the bandwidth and the traffic is slowed down. + +You can use the following setting to correct the max NIC speed: + +```properties +# Override the auto-detection of the network interfaces max speed. +# This option is useful in some environments (eg: EC2 VMs) where the max speed +# reported by Linux is not reflecting the real bandwidth available to the broker. +# Since the network usage is employed by the load manager to decide when a broker +# is overloaded, it is important to make sure the info is correct or override it +# with the right value here. The configured value can be a double (eg: 0.8) and that +# can be used to trigger load-shedding even before hitting on NIC limits. 
+loadBalancerOverrideBrokerNicSpeedGbps= +``` + +When the value is empty, Pulsar uses the value that the OS reports. + diff --git a/site2/website/versioned_docs/version-2.5.0/administration-proxy.md b/site2/website/versioned_docs/version-2.5.0/administration-proxy.md new file mode 100644 index 0000000000000..6c9d930d93699 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-proxy.md @@ -0,0 +1,105 @@ +--- +id: version-2.5.0-administration-proxy +title: The Pulsar proxy +sidebar_label: Pulsar proxy +original_id: administration-proxy +--- + +The [Pulsar proxy](concepts-architecture-overview.md#pulsar-proxy) is an optional gateway that you can run in front of the brokers in a Pulsar cluster. You can run a Pulsar proxy in cases when direct connections between clients and Pulsar brokers are either infeasible, undesirable, or both, for example when you run Pulsar in a cloud environment or on [Kubernetes](https://kubernetes.io) or an analogous platform. + +## Configure the proxy + +The proxy must have some way to find the addresses of the brokers of the cluster. You can do this by either configuring the proxy to connect directly to service discovery or by specifying a broker URL in the configuration. + +### Option 1: Use service discovery + +Pulsar uses [ZooKeeper](https://zookeeper.apache.org) for service discovery. To connect the proxy to ZooKeeper, specify the following in `conf/proxy.conf`. +```properties +zookeeperServers=zk-0,zk-1,zk-2 +configurationStoreServers=zk-0:2184,zk-remote:2184 +``` + +> If you use service discovery, the network ACL must allow the proxy to talk to the ZooKeeper nodes on the zookeeper client port, which is usually 2181, and on the configuration store client port, which is 2184 by default. Opening the network ACLs means that if someone compromises a proxy, they have full access to ZooKeeper. For this reason, using broker URLs to configure the proxy is more secure. 
+ +### Option 2: Use broker URLs + +The more secure method of configuring the proxy is to specify a URL to connect to the brokers. + +> [Authorization](security-authorization#enable-authorization-and-assign-superusers) at the proxy requires access to ZooKeeper, so if you use these broker URLs to connect to the brokers, you should disable the Proxy level authorization. Brokers still authorize requests after the proxy forwards them. + +You can configure the broker URLs in `conf/proxy.conf` as follows. + +```properties +brokerServiceURL=pulsar://brokers.example.com:6650 +brokerWebServiceURL=http://brokers.example.com:8080 +functionWorkerWebServiceURL=http://function-workers.example.com:8080 +``` + +Or if you use TLS: +```properties +brokerServiceURLTLS=pulsar+ssl://brokers.example.com:6651 +brokerWebServiceURLTLS=https://brokers.example.com:8443 +functionWorkerWebServiceURL=https://function-workers.example.com:8443 +``` + +The hostname in the URLs provided should be a DNS entry which points to multiple brokers or a Virtual IP which is backed by multiple broker IP addresses so that the proxy does not lose connectivity to the pulsar cluster if a single broker becomes unavailable. + +The ports to connect to the brokers (6650 and 8080, or in the case of TLS, 6651 and 8443) should be open in the network ACLs. + +Note that if you do not use functions, then you do not need to configure `functionWorkerWebServiceURL`. + +## Start the proxy + +To start the proxy: + +```bash +$ cd /path/to/pulsar/directory +$ bin/pulsar proxy +``` + +> You can run as many instances of the Pulsar proxy in a cluster as you want. + + +## Stop the proxy + +The Pulsar proxy runs by default in the foreground. To stop the proxy, simply stop the process in which the proxy is running. 
+ +## Proxy frontends + +You can run the Pulsar proxy behind some kind of load-distributing frontend, such as an [HAProxy](https://www.digitalocean.com/community/tutorials/an-introduction-to-haproxy-and-load-balancing-concepts) load balancer. + +## Use Pulsar clients with the proxy + +Once your Pulsar proxy is up and running, preferably behind a load-distributing [frontend](#proxy-frontends), clients can connect to the proxy via whichever address that the frontend uses. If the address is the DNS address `pulsar.cluster.default`, for example, then the connection URL for clients is `pulsar://pulsar.cluster.default:6650`. + +## Proxy configuration + +You can configure the Pulsar proxy using the [`proxy.conf`](reference-configuration.md#proxy) configuration file. The following parameters are available in that file: + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000| +|servicePort| The port to use for server binary Protobuf requests |6650| +|servicePortTls| The port to use to server binary Protobuf TLS requests |6651| +|statusFilePath | Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks || +|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false| +|authenticateMetricsEndpoint| Whether the '/metrics' endpoint requires authentication. Defaults to true. 'authenticationEnabled' must also be set for this to take effect. 
|true| +|authenticationProviders| Authentication provider name list (a comma-separated list of class names) || +|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false| +|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider| +|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers || +|superUserRoles| Role names that are treated as “super-users,” meaning that they are able to perform all admin operations || +|forwardAuthorizationCredentials| Whether client authorization credentials are forwarded to the broker for re-authorization. Authentication must be enabled via authenticationEnabled=true for this to take effect. |false| +|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy rejects requests beyond that. |10000| +|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy errors out requests beyond that. |50000| +|tlsEnabledInProxy| Whether TLS is enabled for the proxy |false| +|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file || +|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false| +|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate is not trusted. 
|false| diff --git a/site2/website/versioned_docs/version-2.5.0/administration-pulsar-manager.md b/site2/website/versioned_docs/version-2.5.0/administration-pulsar-manager.md new file mode 100644 index 0000000000000..3a7bcc21468c1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-pulsar-manager.md @@ -0,0 +1,133 @@ +--- +id: version-2.5.0-administration-pulsar-manager +title: Pulsar Manager +sidebar_label: Pulsar Manager +original_id: administration-pulsar-manager +--- + +Pulsar Manager is a web-based GUI management and monitoring tool that helps administrators and users manage and monitor tenants, namespaces, topics, subscriptions, brokers, clusters, and so on, and supports dynamic configuration of multiple environments. + +## Install + +The easiest way to use the Pulsar Manager is to run it inside a [Docker](https://www.docker.com/products/docker) container. + + +``` +docker pull apachepulsar/pulsar-manager:v0.1.0 +docker run -it -p 9527:9527 -e REDIRECT_HOST=http://192.168.0.104 -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -v $PWD:/data apachepulsar/pulsar-manager:v0.1.0 /bin/sh +``` + +* REDIRECT_HOST: the IP address of the front-end server. + +* REDIRECT_PORT: the port of the front-end server. + +* DRIVER_CLASS_NAME: the driver class name of PostgreSQL. + +* URL: the URL of PostgreSQL JDBC, for example, `jdbc:postgresql://127.0.0.1:5432/pulsar_manager`. + +* USERNAME: the username of PostgreSQL. + +* PASSWORD: the password of PostgreSQL. + +* LOG_LEVEL: level of log. + +You can find the Dockerfile in the [docker](https://github.com/apache/pulsar-manager/tree/master/docker) directory and build an image from scratch as well: + +``` +git clone https://github.com/apache/pulsar-manager +cd pulsar-manager +./gradlew build -x test +cd front-end +npm install --save +npm run build:prod +cd .. 
+docker build -f docker/Dockerfile --build-arg BUILD_DATE=`date -u +"%Y-%m-%dT%H:%M:%SZ"` --build-arg VCS_REF=`latest` --build-arg VERSION=`latest` -t apachepulsar/pulsar-manager . +``` + +### Use custom databases + +If you have a large amount of data, you can use a custom database. The following is an example of PostgreSQL. + +1. Initialize database and table structures using the [file](https://github.com/apache/pulsar-manager/tree/master/src/main/resources/META-INF/sql/postgresql-schema.sql). + +2. Modify the [configuration file](https://github.com/apache/pulsar-manager/blob/master/src/main/resources/application.properties) and add PostgreSQL configuration. + +``` +spring.datasource.driver-class-name=org.postgresql.Driver +spring.datasource.url=jdbc:postgresql://127.0.0.1:5432/pulsar_manager +spring.datasource.username=postgres +spring.datasource.password=postgres +``` + +3. Compile to generate a new executable jar package. + +``` +./gradlew -x build -x test +``` + +### Enable JWT authentication + +If you want to turn on JWT authentication, configure the following parameters: + +* `backend.jwt.token`: token for the superuser. You need to configure this parameter during cluster initialization. +* `jwt.broker.token.mode`: two modes of generating token, SECRET and PRIVATE. +* `jwt.broker.public.key`: configure this option if you are using the PRIVATE mode. +* `jwt.broker.private.key`: configure this option if you are using the PRIVATE mode. +* `jwt.broker.secret.key`: configure this option if you are using the SECRET mode. + +For more information, see [Token Authentication Admin of Pulsar](http://pulsar.apache.org/docs/en/security-token-admin/). + + +If you want to enable JWT authentication, use one of the following methods. 
+ + +* Method 1: use command-line tool + +``` +./build/distributions/pulsar-manager/bin/pulsar-manager --redirect.host=http://localhost --redirect.port=9527 insert.stats.interval=600000 --backend.jwt.token=token --jwt.broker.token.mode=PRIVATE --jwt.broker.private.key=file:///path/broker-private.key --jwt.broker.public.key=file:///path/broker-public.key +``` + +* Method 2: configure the application.properties file + +``` +backend.jwt.token=token + +jwt.broker.token.mode=PRIVATE +jwt.broker.public.key=file:///path/broker-public.key +jwt.broker.private.key=file:///path/broker-private.key + +or +jwt.broker.token.mode=SECRET +jwt.broker.secret.key=file:///path/broker-secret.key +``` + +* Method 3: use Docker and turn on token authentication. + +``` +export JWT_TOKEN="your-token" +docker run -it -p 9527:9527 -e REDIRECT_HOST=http://192.168.55.182 -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -v $PWD:/data apachepulsar/pulsar-manager:v0.1.0 /bin/sh +``` + +* Method 4: use Docker and turn on **token authentication** and **token management** by private key and public key. + +``` +export JWT_TOKEN="your-token" +export PRIVATE_KEY="file:///private-key-path" +export PUBLIC_KEY="file:///public-key-path" +docker run -it -p 9527:9527 -e REDIRECT_HOST=http://192.168.55.182 -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -e PRIVATE_KEY=$PRIVATE_KEY -e PUBLIC_KEY=$PUBLIC_KEY -v $PWD:/data -v $PWD/private-key-path:/pulsar-manager/private-key-path -v $PWD/public-key-path:/pulsar-manager/public-key-path apachepulsar/pulsar-manager:v0.1.0 /bin/sh +``` + +* Method 5: use Docker and turn on **token authentication** and **token management** by secret key. 
+ +``` +export JWT_TOKEN="your-token" +export SECRET_KEY="file:///secret-key-path" +docker run -it -p 9527:9527 -e REDIRECT_HOST=http://192.168.55.182 -e REDIRECT_PORT=9527 -e DRIVER_CLASS_NAME=org.postgresql.Driver -e URL='jdbc:postgresql://127.0.0.1:5432/pulsar_manager' -e USERNAME=pulsar -e PASSWORD=pulsar -e LOG_LEVEL=DEBUG -e JWT_TOKEN=$JWT_TOKEN -e PRIVATE_KEY=$PRIVATE_KEY -e PUBLIC_KEY=$PUBLIC_KEY -v $PWD:/data -v $PWD/secret-key-path:/pulsar-manager/secret-key-path apachepulsar/pulsar-manager:v0.1.0 /bin/sh +``` + +* For more information about backend configurations, see [here](https://github.com/apache/pulsar-manager/blob/8b1f26f7d7c725e6d056c41b98235fbc5deb9f49/src/README.md). +* For more information about frontend configurations, see [here](https://github.com/apache/pulsar-manager/blob/master/front-end/README.md). + +## Log in + +Visit http://localhost:9527 to log in. diff --git a/site2/website/versioned_docs/version-2.5.0/administration-stats.md b/site2/website/versioned_docs/version-2.5.0/administration-stats.md new file mode 100644 index 0000000000000..8081344baeb9b --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-stats.md @@ -0,0 +1,64 @@ +--- +id: version-2.5.0-administration-stats +title: Pulsar stats +sidebar_label: Pulsar statistics +original_id: administration-stats +--- + +## Partitioned topics + +|Stat|Description| +|---|---| +|msgRateIn| The sum of publish rates of all local and replication publishers in messages per second.| +|msgThroughputIn| Same as msgRateIn but in bytes per second instead of messages per second.| +|msgRateOut| The sum of dispatch rates of all local and replication consumers in messages per second.| +|msgThroughputOut| Same as msgRateOut but in bytes per second instead of messages per second.| +|averageMsgSize| Average message size, in bytes, from this publisher within the last interval.| +|storageSize| The sum of storage size of the ledgers for this topic.| +|publishers| The list of all local 
publishers into the topic. Publishers can be anywhere from zero to thousands.| +|producerId| Internal identifier for this producer on this topic.| +|producerName| Internal identifier for this producer, generated by the client library.| +|address| IP address and source port for the connection of this producer.| +|connectedSince| Timestamp this producer is created or last reconnected.| +|subscriptions| The list of all local subscriptions to the topic.| +|my-subscription| The name of this subscription (client defined).| +|msgBacklog| The count of messages in backlog for this subscription.| +|type| This subscription type.| +|msgRateExpired| The rate at which messages are discarded instead of dispatched from this subscription due to TTL.| +|consumers| The list of connected consumers for this subscription.| +|consumerName| Internal identifier for this consumer, generated by the client library.| +|availablePermits| The number of messages this consumer has space for in the listen queue of client library. A value of 0 means the queue of client library is full and receive() is not being called. A nonzero value means this consumer is ready to be dispatched messages.| +|replication| This section gives the stats for cross-colo replication of this topic.| +|replicationBacklog| The outbound replication backlog in messages.| +|connected| Whether the outbound replicator is connected.| +|replicationDelayInSeconds| How long the oldest message has been waiting to be sent through the connection, if connected is true.| +|inboundConnection| The IP and port of the broker in the publisher connection of remote cluster to this broker. | +|inboundConnectedSince| The TCP connection being used to publish messages to the remote cluster. 
If no local publishers are connected, this connection is automatically closed after a minute.| + + +## Topics + +|Stat|Description| +|---|---| +|entriesAddedCounter| Messages published since this broker loads this topic.| +|numberOfEntries| Total number of messages being tracked.| +|totalSize| Total storage size in bytes of all messages.| +|currentLedgerEntries| Count of messages written to the ledger currently open for writing.| +|currentLedgerSize| Size in bytes of messages written to ledger currently open for writing.| +|lastLedgerCreatedTimestamp| Time when last ledger is created.| +|lastLedgerCreationFailureTimestamp| Time when last ledger is failed.| +|waitingCursorsCount| How many cursors are caught up and waiting for a new message to be published.| +|pendingAddEntriesCount| How many messages have (asynchronous) write requests you are waiting on completion.| +|lastConfirmedEntry| The ledgerid:entryid of the last message successfully written. If the entryid is -1, then the ledger is opened or is being currently opened but has no entries written yet.| +|state| The state of the cursor ledger. Open means you have a cursor ledger for saving updates of the markDeletePosition.| +|ledgers| The ordered list of all ledgers for this topic holding its messages.| +|cursors| The list of all cursors on this topic. 
Every subscription you saw in the topic stats has one.| +|markDeletePosition| The ack position: the last message the subscriber acknowledges receiving.| +|readPosition| The latest position of subscriber for reading message.| +|waitingReadOp| This is true when the subscription reads the latest message that is published to the topic and waits on new messages to be published.| +|pendingReadOps| The counter for how many outstanding read requests to the BookKeepers you have in progress.| +|messagesConsumedCounter| Number of messages this cursor acks since this broker loads this topic.| +|cursorLedger| The ledger used to persistently store the current markDeletePosition.| +|cursorLedgerLastEntry| The last entryid used to persistently store the current markDeletePosition.| +|individuallyDeletedMessages| If Acks are done out of order, shows the ranges of messages Acked between the markDeletePosition and the read-position.| +|lastLedgerSwitchTimestamp| The last time the cursor ledger is rolled over.| diff --git a/site2/website/versioned_docs/version-2.5.0/administration-upgrade.md b/site2/website/versioned_docs/version-2.5.0/administration-upgrade.md new file mode 100644 index 0000000000000..43a48b518a0cb --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-upgrade.md @@ -0,0 +1,151 @@ +--- +id: version-2.5.0-administration-upgrade +title: Upgrade Guide +sidebar_label: Upgrade +original_id: administration-upgrade +--- + +## Upgrade guidelines + +Apache Pulsar is comprised of multiple components, ZooKeeper, bookies, and brokers. These components are either stateful or stateless. You do not have to upgrade ZooKeeper nodes unless you have special requirement. While you upgrade, you need to pay attention to bookies (stateful), brokers and proxies (stateless). + +The following are some guidelines on upgrading a Pulsar cluster. Read the guidelines before upgrading. + +- Backup all your configuration files before upgrading. 
- Read the guide entirely, make a plan, and then execute the plan. When you make an upgrade plan, you need to take your specific requirements and environment into consideration.
+ ```shell + bin/bookkeeper shell autorecovery -enable + ``` +3. Upgrade brokers +- Canary test: test an upgraded version in one or a small set of brokers. +- Rolling upgrade: rollout the upgraded version to all brokers in the cluster after you determine that a version is safe after canary. +4. Upgrade proxies +- Canary test: test an upgraded version in one or a small set of proxies. +- Rolling upgrade: rollout the upgraded version to all proxies in the cluster after you determine that a version is safe after canary. + +## Upgrade ZooKeeper (optional) +While you upgrade ZooKeeper servers, you can do canary test first, and then upgrade all ZooKeeper servers in the cluster. + +### Canary test + +You can test an upgraded version in one of ZooKeeper servers before upgrading all ZooKeeper servers in your cluster. + +To upgrade ZooKeeper server to a new version, complete the following steps: + +1. Stop a ZooKeeper server. +2. Upgrade the binary and configuration files. +3. Start the ZooKeeper server with the new binary files. +4. Use `pulsar zookeeper-shell` to connect to the newly upgraded ZooKeeper server and run a few commands to verify if it works as expected. +5. Run the ZooKeeper server for a few days, observe and make sure the ZooKeeper cluster runs well. + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic ZooKeeper node, revert the binary and configuration, and restart the ZooKeeper with the reverted binary. + +### Upgrade all ZooKeeper servers + +After canary test to upgrade one ZooKeeper in your cluster, you can upgrade all ZooKeeper servers in your cluster. + +You can upgrade all ZooKeeper servers one by one by following steps in canary test. + +## Upgrade bookies + +While you upgrade bookies, you can do canary test first, and then upgrade all bookies in the cluster. +For more details, you can read Apache BookKeeper [Upgrade guide](http://bookkeeper.apache.org/docs/latest/admin/upgrade). 
+ +### Canary test + +You can test an upgraded version in one or a small set of bookies before upgrading all bookies in your cluster. + +To upgrade bookie to a new version, complete the following steps: + +1. Stop a bookie. +2. Upgrade the binary and configuration files. +3. Start the bookie in `ReadOnly` mode to verify if the bookie of this new version runs well for read workload. + ```shell + bin/pulsar bookie --readOnly + ``` +4. When the bookie runs successfully in `ReadOnly` mode, stop the bookie and restart it in `Write/Read` mode. + ```shell + bin/pulsar bookie + ``` +5. Observe and make sure the cluster serves both write and read traffic. + +#### Canary rollback + +If issues occur during the canary test, you can shut down the problematic bookie node. Other bookies in the cluster replaces this problematic bookie node with autorecovery. + +### Upgrade all bookies + +After canary test to upgrade some bookies in your cluster, you can upgrade all bookies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, upgrade one bookie at a time. In a downtime upgrade scenario, shut down the entire cluster, upgrade each bookie, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each bookie. + +1. Stop the bookie. +2. Upgrade the software (either new binary or new configuration files). +2. Start the bookie. + +> **Advanced operations** +> When you upgrade a large BookKeeper cluster in a rolling upgrade scenario, upgrading one bookie at a time is slow. If you configure rack-aware or region-aware placement policy, you can upgrade bookies rack by rack or region by region, which speeds up the whole upgrade process. + +## Upgrade brokers and proxies + +The upgrade procedure for brokers and proxies is the same. Brokers and proxies are `stateless`, so upgrading the two services is easy. 
+ +### Canary test + +You can test an upgraded version in one or a small set of nodes before upgrading all nodes in your cluster. + +To upgrade to a new version, complete the following steps: + +1. Stop a broker (or proxy). +2. Upgrade the binary and configuration file. +3. Start a broker (or proxy). + +#### Canary rollback + +If issues occur during canary test, you can shut down the problematic broker (or proxy) node. Revert to the old version and restart the broker (or proxy). + +### Upgrade all brokers or proxies + +After canary test to upgrade some brokers or proxies in your cluster, you can upgrade all brokers or proxies in your cluster. + +Before upgrading, you have to decide whether to upgrade the whole cluster at once, including downtime and rolling upgrade scenarios. + +In a rolling upgrade scenario, you can upgrade one broker or one proxy at a time if the size of the cluster is small. If your cluster is large, you can upgrade brokers or proxies in batches. When you upgrade a batch of brokers or proxies, make sure the remaining brokers and proxies in the cluster have enough capacity to handle the traffic during upgrade. + +In a downtime upgrade scenario, shut down the entire cluster, upgrade each broker or proxy, and then start the cluster. + +While you upgrade in both scenarios, the procedure is the same for each broker or proxy. + +1. Stop the broker or proxy. +2. Upgrade the software (either new binary or new configuration files). +3. Start the broker or proxy. 
diff --git a/site2/website/versioned_docs/version-2.5.0/administration-zk-bk.md b/site2/website/versioned_docs/version-2.5.0/administration-zk-bk.md new file mode 100644 index 0000000000000..c955d4478ae26 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/administration-zk-bk.md @@ -0,0 +1,322 @@ +--- +id: version-2.5.0-administration-zk-bk +title: ZooKeeper and BookKeeper administration +sidebar_label: ZooKeeper and BookKeeper +original_id: administration-zk-bk +--- + +Pulsar relies on two external systems for essential tasks: + +* [ZooKeeper](https://zookeeper.apache.org/) is responsible for a wide variety of configuration-related and coordination-related tasks. +* [BookKeeper](http://bookkeeper.apache.org/) is responsible for [persistent storage](concepts-architecture-overview.md#persistent-storage) of message data. + +ZooKeeper and BookKeeper are both open-source [Apache](https://www.apache.org/) projects. + +> Skip to the [How Pulsar uses ZooKeeper and BookKeeper](#how-pulsar-uses-zookeeper-and-bookkeeper) section below for a more schematic explanation of the role of these two systems in Pulsar. + + +## ZooKeeper + +Each Pulsar instance relies on two separate ZooKeeper quorums. + +* [Local ZooKeeper](#deploy-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster. +* [Configuration Store](#deploy-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +To deploy a Pulsar instance, you need to stand up one local ZooKeeper cluster *per Pulsar cluster*. 
+ +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 +``` + +On each host, you need to specify the node ID in `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you can set the `myid` value like this: + +```shell +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid +``` + +On `zk2.us-west.example.com` the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell +$ bin/pulsar-daemon start zookeeper +``` + +### Deploy configuration store + +The ZooKeeper cluster configured and started up in the section above is a *local* ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. 
+ +If you deploy a [single-cluster](#single-cluster-pulsar-instance) instance, you do not need a separate cluster for the configuration store. If, however, you deploy a [multi-cluster](#multi-cluster-pulsar-instance) instance, you need to stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorom uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method for [local ZooKeeper](#local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can share the same hosts used for the local ZooKeeper quorum. + +For example, you can assume a Pulsar instance with the following clusters `us-west`, `us-east`, `us-central`, `eu-central`, `ap-south`. 
Also you can assume, each cluster has its own local ZK servers named such as + +``` +zk[1-3].${CLUSTER}.example.com +``` + +In this scenario you want to pick the quorum participants from few clusters and let all the others be ZK observers. For example, to form a 7 servers quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This guarantees that writes to configuration store is possible even if one of these regions is unreachable. + +The ZK configuration in all the servers looks like: + +```properties +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer +``` + +Additionally, ZK observers need to have: + +```properties +peerType=observer +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) + +```shell +$ bin/pulsar-daemon start configuration-store +``` + + + +### ZooKeeper configuration + +In Pulsar, ZooKeeper configuration is handled by two separate configuration files in the `conf` directory of your Pulsar installation: `conf/zookeeper.conf` for [local ZooKeeper](#local-zookeeper) and `conf/global-zookeeper.conf` for [configuration store](#configuration-store). 
+ +#### Local ZooKeeper + +The [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file handles the configuration for local ZooKeeper. The table below shows the available parameters: + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. |10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper stores in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server listens for connections. |2181| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3| +|autopurge.purgeInterval| The time interval, in hours, which triggers the ZooKeeper database purge task. Setting to a non-zero number enables auto purge; setting to 0 disables. Read this guide before enabling auto purge. |1| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + +#### Configuration Store + +The [`conf/global-zookeeper.conf`](reference-configuration.md#configuration-store) file handles the configuration for configuration store. The table below shows the available parameters: + + +## BookKeeper + +BookKeeper is responsible for all durable message storage in Pulsar. 
BookKeeper is a distributed [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) (WAL) system that guarantees read consistency of independent message logs called ledgers.
In order for bookies to provide optimal performance, ensuring that the bookies have a suitable hardware configuration is essential. You can choose two key dimensions to bookie hardware capacity: + +* Disk I/O capacity read/write +* Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is designed to use multiple devices: + +* A **journal** to ensure durability. For sequential writes, having fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts is critical. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID)s controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +* A **ledger storage device** is where data is stored until all consumers have acknowledged the message. Writes happen in the background, so write I/O is not a big concern. Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + + + +### Configure BookKeeper + +you can find configurable parameters for BookKeeper bookies in the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) file. 
+ +Minimum configuration changes required in `conf/bookkeeper.conf` are: + +```properties +# Change to point to journal disk mount point +journalDirectory=data/bookkeeper/journal + +# Point to ledger storage disk mount point +ledgerDirectories=data/bookkeeper/ledgers + +# Point to local ZK quorum +zkServers=zk1.example.com:2181,zk2.example.com:2181,zk3.example.com:2181 + +# Change the ledger manager type +ledgerManagerType=hierarchical +``` + +To change the zookeeper root path that Bookkeeper uses, use zkLedgersRootPath=/MY-PREFIX/ledgers instead of zkServers=localhost:2181/MY-PREFIX + +> Consult the official [BookKeeper docs](http://bookkeeper.apache.org) for more information about BookKeeper. + +## BookKeeper persistence policies + +In Pulsar, you can set *persistence policies*, at the namespace level, that determine how BookKeeper handles persistent storage of messages. Policies determine four things: + +* The number of acks (guaranteed copies) to wait for each ledger entry. +* The number of bookies to use for a topic. +* The number of writes to make for each ledger entry. +* The throttling rate for mark-delete operations. + +### Set persistence policies + +You can set persistence policies for BookKeeper at the [namespace](reference-terminology.md#namespace) level. + +#### Pulsar-admin + +Use the [`set-persistence`](reference-pulsar-admin.md#namespaces-set-persistence) subcommand and specify a namespace as well as any policies that you want to apply. 
The available flags are: + +Flag | Description | Default +:----|:------------|:------- +`-a`, `--bookkeeper-ack-quorom` | The number of acks (guaranteed copies) to wait on for each entry | 0 +`-e`, `--bookkeeper-ensemble` | The number of [bookies](reference-terminology.md#bookie) to use for topics in the namespace | 0 +`-w`, `--bookkeeper-write-quorum` | The number of writes to make for each entry | 0 +`-r`, `--ml-mark-delete-max-rate` | Throttling rate for mark-delete operations (0 means no throttle) | 0 + +The following is an example: + +```shell +$ pulsar-admin namespaces set-persistence my-tenant/my-ns \ + --bookkeeper-ack-quorom 3 \ + --bookeeper-ensemble 2 +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/setPersistence} + +#### Java + +```java +int bkEnsemble = 2; +int bkQuorum = 3; +int bkAckQuorum = 2; +double markDeleteRate = 0.7; +PersistencePolicies policies = + new PersistencePolicies(ensemble, quorum, ackQuorum, markDeleteRate); +admin.namespaces().setPersistence(namespace, policies); +``` + +### List persistence policies + +You can see which persistence policy currently applies to a namespace. + +#### Pulsar-admin + +Use the [`get-persistence`](reference-pulsar-admin.md#namespaces-get-persistence) subcommand and specify the namespace. 
+ +The following is an example: + +```shell +$ pulsar-admin namespaces get-persistence my-tenant/my-ns +{ + "bookkeeperEnsemble": 1, + "bookkeeperWriteQuorum": 1, + "bookkeeperAckQuorum": 1, + "managedLedgerMaxMarkDeleteRate": 0 +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/persistence|operation/getPersistence} + +#### Java + +```java +PersistencePolicies policies = admin.namespaces().getPersistence(namespace); +``` + +## How Pulsar uses ZooKeeper and BookKeeper + +This diagram illustrates the role of ZooKeeper and BookKeeper in a Pulsar cluster: + +![ZooKeeper and BookKeeper](assets/pulsar-system-architecture.png) + +Each Pulsar cluster consists of one or more message brokers. Each broker relies on an ensemble of bookies. diff --git a/site2/website/versioned_docs/version-2.5.0/client-libraries-cpp.md b/site2/website/versioned_docs/version-2.5.0/client-libraries-cpp.md new file mode 100644 index 0000000000000..4a6c4ec9cb7f9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/client-libraries-cpp.md @@ -0,0 +1,178 @@ +--- +id: version-2.5.0-client-libraries-cpp +title: Pulsar C++ client +sidebar_label: C++ +original_id: client-libraries-cpp +--- + +## Supported platforms + +Pulsar C++ client is supported on **Linux** and **MacOS** platforms. + +## Linux + +> Since 2.1.0 release, Pulsar ships pre-built RPM and Debian packages. You can download and install those packages directly. + +### Install RPM + +1. Download an RPM package from the links in the table. 
+ +| Link | Crypto files | +|------|--------------| +| [client]({{pulsar:dist_rpm:client}}) | [asc]({{pulsar:dist_rpm:client}}.asc), [sha512]({{pulsar:dist_rpm:client}}.sha512) | +| [client-debuginfo]({{pulsar:dist_rpm:client-debuginfo}}) | [asc]({{pulsar:dist_rpm:client-debuginfo}}.asc), [sha512]({{pulsar:dist_rpm:client-debuginfo}}.sha512) | +| [client-devel]({{pulsar:dist_rpm:client-devel}}) | [asc]({{pulsar:dist_rpm:client-devel}}.asc), [sha512]({{pulsar:dist_rpm:client-devel}}.sha512) | + +2. Install the package using the following command. + +```bash +$ rpm -ivh apache-pulsar-client*.rpm +``` +### Install Debian + +1. Download a Debian package from the links in the table. + +| Link | Crypto files | +|------|--------------| +| [client]({{pulsar:deb:client}}) | [asc]({{pulsar:dist_deb:client}}.asc), [sha512]({{pulsar:dist_deb:client}}.sha512) | +| [client-devel]({{pulsar:deb:client-devel}}) | [asc]({{pulsar:dist_deb:client-devel}}.asc), [sha512]({{pulsar:dist_deb:client-devel}}.sha512) | + +2. Install the package using the following command: + +```bash +$ apt install ./apache-pulsar-client*.deb +``` +### Build + +> If you want to build RPM and Debian packages from the latest master, follow the instructions below. All the instructions are run at the root directory of your cloned Pulsar repository. + +There are recipes that build RPM and Debian packages containing a +statically linked `libpulsar.so` / `libpulsar.a` with all the required +dependencies. + +To build the C++ library packages, build the Java packages first. + +```shell +mvn install -DskipTests +``` + +#### RPM + +```shell +pulsar-client-cpp/pkg/rpm/docker-build-rpm.sh +``` + +This builds the RPM inside a Docker container and it leaves the RPMs in `pulsar-client-cpp/pkg/rpm/RPMS/x86_64/`. 
+ +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` | +| pulsar-client-devel | Static library `libpulsar.a` and C++ and C headers | +| pulsar-client-debuginfo | Debug symbols for `libpulsar.so` | + +#### Debian + +To build Debian packages, enter the following command. + +```shell +pulsar-client-cpp/pkg/deb/docker-build-deb.sh +``` + +Debian packages are created at `pulsar-client-cpp/pkg/deb/BUILD/DEB/`. + +| Package name | Content | +|-----|-----| +| pulsar-client | Shared library `libpulsar.so` | +| pulsar-client-dev | Static library `libpulsar.a` and C++ and C headers | + +## MacOS + +Pulsar releases are available in the [Homebrew](https://brew.sh/) core repository. You can install the C++ client library with the following command. The package is installed with the library and headers. + +```shell +brew install libpulsar +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a Pulsar protocol URL. + +Pulsar protocol URLs are assigned to specific clusters, you can use the Pulsar URI scheme. The default port is `6650`. The following is an example for localhost. + +```http +pulsar://localhost:6650 +``` + +In a Pulsar cluster in production, the URL looks as follows: +```http +pulsar://pulsar.us-west.example.com:6650 +``` + +If you use TLS authentication, you need to add `ssl`, and the default port is `6651`. The following is an example. +```http +pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +## Create a consumer +To connect to Pulsar as a consumer, you need to create a consumer on the C++ client. The following is an example. 
+ +```c++ +Client client("pulsar://localhost:6650"); + +Consumer consumer; +Result result = client.subscribe("my-topic", "my-subscription-name", consumer); +if (result != ResultOk) { + LOG_ERROR("Failed to subscribe: " << result); + return -1; +} + +Message msg; + +while (true) { + consumer.receive(msg); + LOG_INFO("Received: " << msg + << " with payload '" << msg.getDataAsString() << "'"); + + consumer.acknowledge(msg); +} + +client.close(); +``` + +## Create a producer +To connect to Pulsar as a producer, you need to create a producer on the C++ client. The following is an example. + +```c++ +Client client("pulsar://localhost:6650"); + +Producer producer; +Result result = client.createProducer("my-topic", producer); +if (result != ResultOk) { + LOG_ERROR("Error creating producer: " << result); + return -1; +} + +// Publish 10 messages to the topic +for (int i = 0; i < 10; i++){ + Message msg = MessageBuilder().setContent("my-message").build(); + Result res = producer.send(msg); + LOG_INFO("Message sent: " << res); +} +client.close(); +``` + +## Enable authentication in connection URLs +If you use TLS authentication when connecting to Pulsar, you need to add `ssl` in the connection URLs, and the default port is `6651`. The following is an example. + +```cpp +ClientConfiguration config = ClientConfiguration(); +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/cacert.pem"); +config.setTlsAllowInsecureConnection(false); +config.setAuth(pulsar::AuthTls::create( + "/path/to/client-cert.pem", "/path/to/client-key.pem")); + +Client client("pulsar+ssl://my-broker.com:6651", config); +``` + +For complete examples, refer to [C++ client examples](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/examples). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/client-libraries-go.md b/site2/website/versioned_docs/version-2.5.0/client-libraries-go.md new file mode 100644 index 0000000000000..b9366cae94be1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/client-libraries-go.md @@ -0,0 +1,493 @@ +--- +id: version-2.5.0-client-libraries-go +title: The Pulsar Go client +sidebar_label: Go +original_id: client-libraries-go +--- + +The Pulsar Go client can be used to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Go (aka Golang). + +> #### API docs available as well +> For standard API docs, consult the [Godoc](https://godoc.org/github.com/apache/pulsar/pulsar-client-go/pulsar). + + +## Installation + +### Requirements + +Pulsar Go client library is based on the C++ client library. Follow +the instructions for [C++ library](client-libraries-cpp.md) for installing the binaries +through [RPM](client-libraries-cpp.md#install-rpm), [Deb](client-libraries-cpp.md#install-debian) or [Homebrew packages](client-libraries-cpp.md#macos). + +### Installing go package + +> #### Compatibility Warning +> The version number of the Go client **must match** the version number of the Pulsar C++ client library. + +You can install the `pulsar` library locally using `go get`. Note that `go get` doesn't support fetching a specific tag - it will always pull in master's version of the Go client. You'll need a C++ client library that matches master. + +```bash +$ go get -u github.com/apache/pulsar/pulsar-client-go/pulsar +``` + +Or you can use [dep](https://github.com/golang/dep) for managing the dependencies. 
+ +```bash +$ dep ensure -add github.com/apache/pulsar/pulsar-client-go/pulsar@v{{pulsar:version}} +``` + +Once installed locally, you can import it into your project: + +```go +import "github.com/apache/pulsar/pulsar-client-go/pulsar" +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here's an example for `localhost`: + +```http +pulsar://localhost:6650 +``` + +A URL for a production Pulsar cluster may look something like this: + +```http +pulsar://pulsar.us-west.example.com:6650 +``` + +If you're using [TLS](security-tls-authentication.md) authentication, the URL will look like something like this: + +```http +pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +## Creating a client + +In order to interact with Pulsar, you'll first need a `Client` object. You can create a client object using the `NewClient` function, passing in a `ClientOptions` object (more on configuration [below](#client-configuration)). Here's an example: + + +```go +import ( + "log" + "runtime" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + OperationTimeoutSeconds: 5, + MessageListenerThreads: runtime.NumCPU(), + }) + + if err != nil { + log.Fatalf("Could not instantiate Pulsar client: %v", err) + } +} +``` + +The following configurable parameters are available for Pulsar clients: + +Parameter | Description | Default +:---------|:------------|:------- +`URL` | The connection URL for the Pulsar cluster. 
See [above](#connection-urls) for more info | +`IOThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker) | 1 +`OperationTimeoutSeconds` | The timeout for some Go client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries will occur until this threshold is reached, at which point the operation will fail. | 30 +`MessageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)) | 1 +`ConcurrentLookupRequests` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 5000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 5000 +`Logger` | A custom logger implementation for the client (as a function that takes a log level, file path, line number, and message). All info, warn, and error messages will be routed to this function. | `nil` +`TLSTrustCertsFilePath` | The file path for the trusted TLS certificate | +`TLSAllowInsecureConnection` | Whether the client accepts untrusted TLS certificates from the broker | `false` +`Authentication` | Configure the authentication provider. (default: no authentication). Example: `Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem")` | `nil` +`StatsIntervalInSeconds` | The interval (in seconds) at which client stats are published | 60 + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Go producers using a `ProducerOptions` object. 
Here's an example: + +```go +producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", +}) + +if err != nil { + log.Fatalf("Could not instantiate Pulsar producer: %v", err) +} + +defer producer.Close() + +msg := pulsar.ProducerMessage{ + Payload: []byte("Hello, Pulsar"), +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Producer could not send message: %v", err) +} +``` + +> #### Blocking operation +> When you create a new Pulsar producer, the operation will block (waiting on a go channel) until either a producer is successfully created or an error is thrown. + + +### Producer operations + +Pulsar Go producers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Fetches the producer's [topic](reference-terminology.md#topic)| `string` +`Name()` | Fetches the producer's name | `string` +`Send(context.Context, ProducerMessage)` | Publishes a [message](#messages) to the producer's topic. This call will block until the message is successfully acknowledged by the Pulsar broker, or an error will be thrown if the timeout set using the `SendTimeout` in the producer's [configuration](#producer-configuration) is exceeded. | `error` +`SendAndGetMsgID(context.Context, ProducerMessage)`| Send a message, this call will be blocking until is successfully acknowledged by the Pulsar broker. | (MessageID, error) +`SendAsync(context.Context, ProducerMessage, func(ProducerMessage, error))` | Publishes a [message](#messages) to the producer's topic asynchronously. The third argument is a callback function that specifies what happens either when the message is acknowledged or an error is thrown. | +`SendAndGetMsgIDAsync(context.Context, ProducerMessage, func(MessageID, error))`| Send a message in asynchronous mode. 
The callback will report back the message being published and the eventual error in publishing | +`LastSequenceID()` | Get the last sequence id that was published by this producer. This represents either the automatically assigned or custom sequence id (set on the ProducerMessage) that was published and acknowledged by the broker. | int64 +`Flush()`| Flush all the messages buffered in the client and wait until all messages have been successfully persisted. | error +`Close()` | Closes the producer and releases all resources allocated to it. If `Close()` is called then no more messages will be accepted from the publisher. This method will block until all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. | `error` +`Schema()` | | Schema + +Here's a more involved example usage of a producer: + +```go +import ( + "context" + "fmt" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client to instantiate a producer + producer, err := client.CreateProducer(pulsar.ProducerOptions{ + Topic: "my-topic", + }) + + if err != nil { log.Fatal(err) } + + ctx := context.Background() + + // Send 10 messages synchronously and 10 messages asynchronously + for i := 0; i < 10; i++ { + // Create a message + msg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("message-%d", i)), + } + + // Attempt to send the message + if err := producer.Send(ctx, msg); err != nil { + log.Fatal(err) + } + + // Create a different message to send asynchronously + asyncMsg := pulsar.ProducerMessage{ + Payload: []byte(fmt.Sprintf("async-message-%d", i)), + } + + // Attempt to send the message asynchronously and handle the response + producer.SendAsync(ctx, asyncMsg, func(msg pulsar.ProducerMessage, err error) { + if err != nil { 
log.Fatal(err) } + + fmt.Printf("the %s successfully published", string(msg.Payload)) + }) + } +} +``` + +### Producer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer will publish messages | +`Name` | A name for the producer. If you don't explicitly assign a name, Pulsar will automatically generate a globally unique name that you can access later using the `Name()` method. If you choose to explicitly assign a name, it will need to be unique across *all* Pulsar clusters, otherwise the creation operation will throw an error. | +`Properties`| Attach a set of application defined properties to the producer. This properties will be visible in the topic stats | +`SendTimeout` | When publishing a message to a topic, the producer will wait for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error will be thrown. If you set `SendTimeout` to -1, the timeout will be set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30 seconds +`MaxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `Send` and `SendAsync` methods will fail *unless* `BlockIfQueueFull` is set to `true`. | +`MaxPendingMessagesAcrossPartitions` | Set the number of max pending messages across all the partitions. 
This setting will be used to lower the max pending messages for each partition `MaxPendingMessages(int)`, if the total exceeds the configured value.| +`BlockIfQueueFull` | If set to `true`, the producer's `Send` and `SendAsync` methods will block when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `MaxPendingMessages` parameter); if set to `false` (the default), `Send` and `SendAsync` operations will fail and throw a `ProducerQueueIsFullError` when the queue is full. | `false` +`MessageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics)). This logic is applied only when no key is set on messages. The available options are: round robin (`pulsar.RoundRobinDistribution`, the default), publishing all messages to a single partition (`pulsar.UseSinglePartition`), or a custom partitioning scheme (`pulsar.CustomPartition`). | `pulsar.RoundRobinDistribution` +`HashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `pulsar.JavaStringHash` (the equivalent of `String.hashCode()` in Java), `pulsar.Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `pulsar.BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library) | `pulsar.JavaStringHash` +`CompressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), [`ZLIB`](https://zlib.net/), [`ZSTD`](https://facebook.github.io/zstd/) and [`SNAPPY`](https://google.github.io/snappy/). | No compression +`MessageRouter` | By default, Pulsar uses a round-robin routing scheme for [partitioned topics](cookbooks-partitioned.md). 
The `MessageRouter` parameter enables you to specify custom routing logic via a function that takes the Pulsar message and topic metadata as arguments and returns an integer (the index of the partition to which the message is routed), i.e. a function signature of `func(Message, TopicMetadata) int`. | +`Batching` | Control whether automatic batching of messages is enabled for the producer. | false +`BatchingMaxPublishDelay` | Set the time period within which the messages sent will be batched (default: 10ms) if batch messages are enabled. If set to a non zero value, messages will be queued until this time interval has elapsed or until the `BatchingMaxMessages` threshold is reached | 10ms +`BatchingMaxMessages` | Set the maximum number of messages permitted in a batch. (default: 1000) If set to a value greater than 1, messages will be queued until this threshold is reached or batch interval has elapsed | 1000 + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Go consumers using a `ConsumerOptions` object. Here's a basic example that uses channels: + +```go +msgChannel := make(chan pulsar.ConsumerMessage) + +consumerOpts := pulsar.ConsumerOptions{ + Topic: "my-topic", + SubscriptionName: "my-subscription-1", + Type: pulsar.Exclusive, + MessageChannel: msgChannel, +} + +consumer, err := client.Subscribe(consumerOpts) + +if err != nil { + log.Fatalf("Could not establish subscription: %v", err) +} + +defer consumer.Close() + +for cm := range msgChannel { + msg := cm.Message + + fmt.Printf("Message ID: %s", msg.ID()) + fmt.Printf("Message value: %s", string(msg.Payload())) + + consumer.Ack(msg) +} +``` + +> #### Blocking operation +> When you create a new Pulsar consumer, the operation will block (on a go channel) until either a consumer is successfully created or an error is thrown. 
+ + +### Consumer operations + +Pulsar Go consumers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the consumer's [topic](reference-terminology.md#topic) | `string` +`Subscription()` | Returns the consumer's subscription name | `string` +`Unsubscribe()` | Unsubscribes the consumer from the assigned topic. Throws an error if the unsubscribe operation is somehow unsuccessful. | `error` +`Receive(context.Context)` | Receives a single message from the topic. This method blocks until a message is available. | `(Message, error)` +`Ack(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) | `error` +`AckID(MessageID)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID | `error` +`AckCumulative(Message)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `AckCumulative` method will block until the ack has been sent to the broker. After that, the messages will *not* be redelivered to the consumer. Cumulative acking cannot be used with a [shared](concepts-messaging.md#shared) subscription type. | `error` +`AckCumulativeID(MessageID)` |Ack the reception of all the messages in the stream up to (and including) the provided message. This method will block until the acknowledge has been sent to the broker. After that, the messages will not be re-delivered to this consumer. | error +`Nack(Message)` | Acknowledge the failure to process a single message. | `error` +`NackID(MessageID)` | Acknowledge the failure to process a single message. | `error` +`Close()` | Closes the consumer, disabling its ability to receive messages from the broker | `error` +`RedeliverUnackedMessages()` | Redelivers *all* unacknowledged messages on the topic. 
In [failover](concepts-messaging.md#failover) mode, this request is ignored if the consumer isn't active on the specified topic; in [shared](concepts-messaging.md#shared) mode, redelivered messages are distributed across all consumers connected to the topic. **Note**: this is a *non-blocking* operation that doesn't throw an error. | +`Seek(msgID MessageID)` | Reset the subscription associated with this consumer to a specific message id. The message id can either be a specific message or represent the first or last messages in the topic. | error + +#### Receive example + +Here's an example usage of a Go consumer that uses the `Receive()` method to process incoming messages: + +```go +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatal(err) } + + // Use the client object to instantiate a consumer + consumer, err := client.Subscribe(pulsar.ConsumerOptions{ + Topic: "my-golang-topic", + SubscriptionName: "sub-1", + Type: pulsar.Exclusive, + }) + + if err != nil { log.Fatal(err) } + + defer consumer.Close() + + ctx := context.Background() + + // Listen indefinitely on the topic + for { + msg, err := consumer.Receive(ctx) + if err != nil { log.Fatal(err) } + + // Do something with the message + err = processMessage(msg) + + if err == nil { + // Message processed successfully + consumer.Ack(msg) + } else { + // Failed to process messages + consumer.Nack(msg) + } + } +} +``` + +### Consumer configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the consumer will establish a subscription and listen for messages | +`Topics` | Specify a list of topics this consumer will subscribe on. 
Either a topic, a list of topics or a topics pattern are required when subscribing | +`TopicsPattern` | Specify a regular expression to subscribe to multiple topics under the same namespace. Either a topic, a list of topics or a topics pattern are required when subscribing | +`SubscriptionName` | The subscription name for this consumer | +`Properties` | Attach a set of application defined properties to the consumer. This properties will be visible in the topic stats| +`Name` | The name of the consumer | +`AckTimeout` | Set the timeout for unacked messages | 0 +`NackRedeliveryDelay` | The delay after which to redeliver the messages that failed to be processed. Default is 1min. (See `Consumer.Nack()`) | 1 minute +`Type` | Available options are `Exclusive`, `Shared`, and `Failover` | `Exclusive` +`SubscriptionInitPos` | InitialPosition at which the cursor will be set when subscribe | Latest +`MessageChannel` | The Go channel used by the consumer. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `Receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. | 1000 +`MaxTotalReceiverQueueSizeAcrossPartitions` |Set the max total receiver queue size across partitions. This setting will be used to reduce the receiver queue size for individual partitions if the total exceeds this value | 50000 +`ReadCompacted` | If enabled, the consumer will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the consumer will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal. 
| + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recent unacked message). You can [configure](#reader-configuration) Go readers using a `ReaderOptions` object. Here's an example: + +```go +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.LatestMessage, +}) +``` + +> #### Blocking operation +> When you create a new Pulsar reader, the operation will block (on a go channel) until either a reader is successfully created or an error is thrown. + + +### Reader operations + +Pulsar Go readers have the following methods available: + +Method | Description | Return type +:------|:------------|:----------- +`Topic()` | Returns the reader's [topic](reference-terminology.md#topic) | `string` +`Next(context.Context)` | Receives the next message on the topic (analogous to the `Receive` method for [consumers](#consumer-operations)). This method blocks until a message is available. 
| `(Message, error)` +`HasNext()` | Check if there is any message available to read from the current position| (bool, error) +`Close()` | Closes the reader, disabling its ability to receive messages from the broker | `error` + +#### "Next" example + +Here's an example usage of a Go reader that uses the `Next()` method to process incoming messages: + +```go +import ( + "context" + "log" + + "github.com/apache/pulsar/pulsar-client-go/pulsar" +) + +func main() { + // Instantiate a Pulsar client + client, err := pulsar.NewClient(pulsar.ClientOptions{ + URL: "pulsar://localhost:6650", + }) + + if err != nil { log.Fatalf("Could not create client: %v", err) } + + // Use the client to instantiate a reader + reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: pulsar.EarliestMessage, + }) + + if err != nil { log.Fatalf("Could not create reader: %v", err) } + + defer reader.Close() + + ctx := context.Background() + + // Listen on the topic for incoming messages + for { + msg, err := reader.Next(ctx) + if err != nil { log.Fatalf("Error reading from topic: %v", err) } + + // Process the message + } +} +``` + +In the example above, the reader begins reading from the earliest available message (specified by `pulsar.EarliestMessage`). The reader can also begin reading from the latest message (`pulsar.LatestMessage`) or some other message ID specified by bytes using the `DeserializeMessageID` function, which takes a byte array and returns a `MessageID` object. 
Here's an example: + +```go +lastSavedId := // Read last saved message id from external store as byte[] + +reader, err := client.CreateReader(pulsar.ReaderOptions{ + Topic: "my-golang-topic", + StartMessageID: DeserializeMessageID(lastSavedId), +}) +``` + +### Reader configuration + +Parameter | Description | Default +:---------|:------------|:------- +`Topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader will establish a subscription and listen for messages | +`Name` | The name of the reader | +`StartMessageID` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `pulsar.EarliestMessage` (the earliest available message on the topic), `pulsar.LatestMessage` (the latest available message on the topic), or a `MessageID` object for a position that isn't earliest or latest. | +`MessageChannel` | The Go channel used by the reader. Messages that arrive from the Pulsar topic(s) will be passed to this channel. | +`ReceiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `Next`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 +`SubscriptionRolePrefix` | The subscription role prefix. | `reader` +`ReadCompacted` | If enabled, the reader will read messages from the compacted topic rather than reading the full message backlog of the topic. This means that, if the topic has been compacted, the reader will only see the latest value for each key in the topic, up until the point in the topic message backlog that has been compacted. Beyond that point, the messages will be sent as normal.| + +## Messages + +The Pulsar Go client provides a `ProducerMessage` interface that you can use to construct messages to produce on Pulsar topics. 
Here's an example message: + +```go +msg := pulsar.ProducerMessage{ + Payload: []byte("Here is some message data"), + Key: "message-key", + Properties: map[string]string{ + "foo": "bar", + }, + EventTime: time.Now(), + ReplicationClusters: []string{"cluster1", "cluster3"}, +} + +if err := producer.Send(context.Background(), msg); err != nil { + log.Fatalf("Could not publish message due to: %v", err) +} +``` + +The following parameters are available for `ProducerMessage` objects: + +Parameter | Description +:---------|:----------- +`Payload` | The actual data payload of the message +`Value` | Value and payload are mutually exclusive, `Value interface{}` for schema message. +`Key` | The optional key associated with the message (particularly useful for things like topic compaction) +`Properties` | A key-value map (both keys and values must be strings) for any application-specific metadata attached to the message +`EventTime` | The timestamp associated with the message +`ReplicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. 
+`SequenceID` | Set the sequence id to assign to the current message + +## TLS encryption and authentication + +In order to use [TLS encryption](security-tls-transport.md), you'll need to configure your client to do so: + + * Use `pulsar+ssl` URL type + * Set `TLSTrustCertsFilePath` to the path to the TLS certs used by your client and the Pulsar broker + * Configure `Authentication` option + +Here's an example: + +```go +opts := pulsar.ClientOptions{ + URL: "pulsar+ssl://my-cluster.com:6651", + TLSTrustCertsFilePath: "/path/to/certs/my-cert.csr", + Authentication: NewAuthenticationTLS("my-cert.pem", "my-key.pem"), +} +``` diff --git a/site2/website/versioned_docs/version-2.5.0/client-libraries-java.md b/site2/website/versioned_docs/version-2.5.0/client-libraries-java.md new file mode 100644 index 0000000000000..ae21e04cbe257 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/client-libraries-java.md @@ -0,0 +1,825 @@ +--- +id: version-2.5.0-client-libraries-java +title: Pulsar Java client +sidebar_label: Java +original_id: client-libraries-java +--- + +You can use Pulsar Java client to create Java producers, consumers, and [readers](#reader-interface) of messages and to perform [administrative tasks](admin-api-overview.md). The current version of the Java client is **{{pulsar:version}}**. + +Javadoc for the Pulsar client is divided into two domains by package as follows. 
+ +Package | Description | Maven Artifact +:-------|:------------|:-------------- +[`org.apache.pulsar.client.api`](/api/client) | The producer and consumer API | [org.apache.pulsar:pulsar-client:{{pulsar:version}}](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C{{pulsar:version}}%7Cjar) +[`org.apache.pulsar.client.admin`](/api/admin) | The Java [admin API](admin-api-overview.md) | [org.apache.pulsar:pulsar-client-admin:{{pulsar:version}}](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client-admin%7C{{pulsar:version}}%7Cjar) + +This document focuses only on the client API for producing and consuming messages on Pulsar topics. For how to use the Java admin client, see [Pulsar admin interface](admin-api-overview.md). + +## Installation + +The latest version of the Pulsar Java client library is available via [Maven Central](http://search.maven.org/#artifactdetails%7Corg.apache.pulsar%7Cpulsar-client%7C{{pulsar:version}}%7Cjar). To use the latest version, add the `pulsar-client` library to your build configuration. + +### Maven + +If you use Maven, add the following information to the `pom.xml` file. + +```xml + +{{pulsar:version}} + + + + org.apache.pulsar + pulsar-client + ${pulsar.version} + +``` + +### Gradle + +If you use Gradle, add the following information to the `build.gradle` file. + +```groovy +def pulsarVersion = '{{pulsar:version}}' + +dependencies { + compile group: 'org.apache.pulsar', name: 'pulsar-client', version: pulsarVersion +} +``` + +## Connection URLs + +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +You can assign Pulsar protocol URLs to specific clusters and use the `pulsar` scheme. The default port is `6650`. The following is an example of `localhost`. + +```http +pulsar://localhost:6650 +``` + +If you have multiple brokers, the URL is as follows. 
+ +```http +pulsar://localhost:6650,localhost:6651,localhost:6652 +``` + +A URL for a production Pulsar cluster is as follows. + +```http +pulsar://pulsar.us-west.example.com:6650 +``` + +If you use [TLS](security-tls-authentication.md) authentication, the URL is as follows. + +```http +pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +## Client + +You can instantiate a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object using just a URL for the target Pulsar [cluster](reference-terminology.md#cluster) like this: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +``` + +If you have multiple brokers, you can instantiate a PulsarClient like this: +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650,localhost:6651,localhost:6652") + .build(); +``` + +> ### Default broker URLs for standalone clusters +> If you run a cluster in [standalone mode](getting-started-standalone.md), the broker is available at the `pulsar://localhost:6650` URL by default. + +If you create a client, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +| Type | Name |
Description
| Default +|---|---|---|--- +String | `serviceUrl` |Service URL provider for Pulsar service | None +String | `authPluginClassName` | Name of the authentication plugin | None +String | `authParams` | String represents parameters for the authentication plugin

**Example**
key1:val1,key2:val2|None +long|`operationTimeoutMs`|Operation timeout |30000 +long|`statsIntervalSeconds`|Interval between each stats info

Stats are activated when `statsIntervalSeconds` is positive

Set `statsIntervalSeconds` to 1 second at least |60 +int|`numIoThreads`| The number of threads used for handling connections to brokers | 1 +int|`numListenerThreads`|The number of threads used for handling message listeners | 1 +boolean|`useTcpNoDelay`|Whether to use TCP no-delay flag on the connection to disable Nagle algorithm |true +boolean |`useTls` |Whether to use TLS encryption on the connection| false +string | `tlsTrustCertsFilePath` |Path to the trusted TLS certificate file|None +boolean|`tlsAllowInsecureConnection`|Whether the Pulsar client accepts untrusted TLS certificate from broker | false +boolean | `tlsHostnameVerificationEnable` | Whether to enable TLS hostname verification|false +int|`concurrentLookupRequest`|The number of concurrent lookup requests allowed to send on each broker connection to prevent overload on broker|5000 +int|`maxLookupRequest`|The maximum number of lookup requests allowed on each broker connection to prevent overload on broker | 50000 +int|`maxNumberOfRejectedRequestPerConnection`|The maximum number of rejected requests of a broker in a certain time frame (30 seconds) after the current connection is closed and the client creates a new connection to connect to a different broker|50 +int|`keepAliveIntervalSeconds`|Seconds of keeping alive interval for each client broker connection|30 +int|`connectionTimeoutMs`|Duration of waiting for a connection to a broker to be established

If the duration passes without a response from a broker, the connection attempt is dropped|10000 +int|`requestTimeoutMs`|Maximum duration for completing a request |60000 +int|`defaultBackoffIntervalNanos`| Default duration for a backoff interval | TimeUnit.MILLISECONDS.toNanos(100); +long|`maxBackoffIntervalNanos`|Maximum duration for a backoff interval|TimeUnit.SECONDS.toNanos(30) + +Check out the Javadoc for the {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} class for a full list of configurable parameters. + +> In addition to client-level configuration, you can also apply [producer](#configuring-producers) and [consumer](#configuring-consumers) specific configuration as described in sections below. + +## Producer + +In Pulsar, producers write messages to topics. Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object (as in the section [above](#client-configuration)), you can create a {@inject: javadoc:Producer:/client/org/apache/pulsar/client/api/Producer} for a specific Pulsar [topic](reference-terminology.md#topic). + +```java +Producer producer = client.newProducer() + .topic("my-topic") + .create(); + +// You can then send messages to the broker and topic you specified: +producer.send("My message".getBytes()); +``` + +By default, producers produce messages that consist of byte arrays. You can produce different types by specifying a message [schema](#schemas). + +```java +Producer stringProducer = client.newProducer(Schema.STRING) + .topic("my-topic") + .create(); +stringProducer.send("My message"); +``` + +> Make sure that you close your producers, consumers, and clients when you do not need them. 
+> ```java +> producer.close(); +> consumer.close(); +> client.close(); +> ``` +> +> Close operations can also be asynchronous: +> ```java +> producer.closeAsync() +> .thenRun(() -> System.out.println("Producer closed")) +> .exceptionally((ex) -> { +> System.err.println("Failed to close producer: " + ex); +> return null; +> }); +> ``` + +### Configure producer + +If you instantiate a `Producer` object by specifying only a topic name as the example above, use the default configuration for producer. + +If you create a producer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +Type | Name|
Description
| Default +|---|---|---|--- +String| `topicName`| Topic name| null| +String|`producerName`|Producer name| null +long|`sendTimeoutMs`|Message send timeout in ms.

If a message is not acknowledged by a server before the `sendTimeout` expires, an error occurs.|30000 +boolean|`blockIfQueueFull`|If it is set to `true`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer block, rather than failing and throwing errors.

If it is set to `false`, when the outgoing message queue is full, the `Send` and `SendAsync` methods of producer fail and `ProducerQueueIsFullError` exceptions occur.

The `MaxPendingMessages` parameter determines the size of the outgoing message queue.|false +int|`maxPendingMessages`|The maximum size of a queue holding pending messages.

For example, a message waiting to receive an acknowledgment from a [broker](reference-terminology.md#broker).

By default, when the queue is full, all calls to the `Send` and `SendAsync` methods fail **unless** you set `BlockIfQueueFull` to `true`.|1000 +int|`maxPendingMessagesAcrossPartitions`|The maximum number of pending messages across partitions.

Use the setting to lower the max pending messages for each partition ({@link #setMaxPendingMessages(int)}) if the total number exceeds the configured value.|50000 +MessageRoutingMode|`messageRoutingMode`|Message routing logic for producers on [partitioned topics](concepts-architecture-overview.md#partitioned-topics).

Apply the logic only when setting no key on messages.

Available options are as follows:

  • `pulsar.RoundRobinDistribution`: round robin

  • `pulsar.UseSinglePartition`: publish all messages to a single partition

  • `pulsar.CustomPartition`: a custom partitioning scheme|`pulsar.RoundRobinDistribution` +HashingScheme|`hashingScheme`|Hashing function determining the partition where you publish a particular message (**partitioned topics only**).

    Available options are as follows:

  • `pulsar.JavaStringHash`: the equivalent of `String.hashCode()` in Java

  • `pulsar.Murmur3_32Hash`: applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function

  • `pulsar.BoostHash`: applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library |`HashingScheme.JavaStringHash` +ProducerCryptoFailureAction|`cryptoFailureAction`|Producer should take action when encryption fails.

  • **FAIL**: if encryption fails, unencrypted messages fail to send.

  • **SEND**: if encryption fails, unencrypted messages are sent. |`ProducerCryptoFailureAction.FAIL` +long|`batchingMaxPublishDelayMicros`|Batching time period of sending messages.|TimeUnit.MILLISECONDS.toMicros(1) +int|batchingMaxMessages|The maximum number of messages permitted in a batch.|1000 +boolean|`batchingEnabled`|Enable batching of messages. |true +CompressionType|`compressionType`|Message data compression type used by a producer.

    Available options:
  • [`LZ4`](https://github.com/lz4/lz4)
  • [`ZLIB`](https://zlib.net/)
  • [`ZSTD`](https://facebook.github.io/zstd/)
  • [`SNAPPY`](https://google.github.io/snappy/)| No compression + +You can configure parameters if you do not want to use the default configuration. + +For a full list, see the Javadoc for the {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder} class. The following is an example. + +```java +Producer producer = client.newProducer() + .topic("my-topic") + .batchingMaxPublishDelay(10, TimeUnit.MILLISECONDS) + .sendTimeout(10, TimeUnit.SECONDS) + .blockIfQueueFull(true) + .create(); +``` + +### Message routing + +When using partitioned topics, you can specify the routing mode whenever you publish messages using a producer. For more information on specifying a routing mode using the Java client, see the [Partitioned Topics](cookbooks-partitioned.md) cookbook. + +### Async send + +You can publish messages [asynchronously](concepts-messaging.md#send-modes) using the Java client. With async send, the producer puts the message in a blocking queue and returns it immediately. Then the client library sends the message to the broker in the background. If the queue is full (max size configurable), the producer is blocked or fails immediately when calling the API, depending on arguments passed to the producer. + +The following is an example. + +```java +producer.sendAsync("my-async-message".getBytes()).thenAccept(msgId -> { + System.out.printf("Message with ID %s successfully sent", msgId); +}); +``` + +As you can see from the example above, async send operations return a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId} wrapped in a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). 
+ +### Configure messages + +In addition to a value, you can set additional items on a given message: + +```java +producer.newMessage() + .key("my-message-key") + .value("my-async-message".getBytes()) + .property("my-key", "my-value") + .property("my-other-key", "my-other-value") + .send(); +``` + +You can terminate the builder chain with `sendAsync()` and get a future return. + +## Consumer + +In Pulsar, consumers subscribe to topics and handle messages that producers publish to those topics. You can instantiate a new [consumer](reference-terminology.md#consumer) by first instantiating a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object and passing it a URL for a Pulsar broker (as [above](#client-configuration)). + +Once you've instantiated a {@inject: javadoc:PulsarClient:/client/org/apache/pulsar/client/api/PulsarClient} object, you can create a {@inject: javadoc:Consumer:/client/org/apache/pulsar/client/api/Consumer} by specifying a [topic](reference-terminology.md#topic) and a [subscription](concepts-messaging.md#subscription-modes). + +```java +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscribe(); +``` + +The `subscribe` method will auto subscribe the consumer to the specified topic and subscription. One way to make the consumer listen on the topic is to set up a `while` loop. In this example loop, the consumer listens for messages, prints the contents of any received message, and then [acknowledges](reference-terminology.md#acknowledgment-ack) that the message has been processed. If the processing logic fails, you can use [negative acknowledgement](reference-terminology.md#acknowledgment-ack) to redeliver the message later. 
+ +```java +while (true) { + // Wait for a message + Message msg = consumer.receive(); + + try { + // Do something with the message + System.out.printf("Message received: %s", new String(msg.getData())); + + // Acknowledge the message so that it can be deleted by the message broker + consumer.acknowledge(msg); + } catch (Exception e) { + // Message failed to process, redeliver later + consumer.negativeAcknowledge(msg); + } +} +``` + +### Configure consumer + +If you instantiate a `Consumer` object by specifying only a topic and subscription name as in the example above, the consumer uses the default configuration. + +When you create a consumer, you can use the `loadConf` configuration. The following parameters are available in `loadConf`. + +Type | Name|
    Description
    | Default +|---|---|---|--- +Set<String>| `topicNames`| Topic name| Sets.newTreeSet() +Pattern| `topicsPattern`| Topic pattern |None +String| `subscriptionName`| Subscription name| None +SubscriptionType| `subscriptionType`| Subscription type

    Three subscription types are available:
  • Exclusive
  • Failover
  • Shared
  • |SubscriptionType.Exclusive +int | `receiverQueueSize` | Size of a consumer's receiver queue.

    For example, the number of messages accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.| 1000 +long|`acknowledgementsGroupTimeMicros`|Group a consumer acknowledgment for a specified time.

    By default, a consumer uses 100ms grouping time to send out acknowledgments to a broker.

    Setting a group time of 0 sends out acknowledgments immediately.

    A longer ack group time is more efficient at the expense of a slight increase in message re-deliveries after a failure.|TimeUnit.MILLISECONDS.toMicros(100) +long|`negativeAckRedeliveryDelayMicros`|Delay to wait before redelivering messages that failed to be processed.

    When an application uses {@link Consumer#negativeAcknowledge(Message)}, failed messages are redelivered after a fixed timeout. |TimeUnit.MINUTES.toMicros(1) +int |`maxTotalReceiverQueueSizeAcrossPartitions`|The max total receiver queue size across partitions.

    This setting reduces the receiver queue size for individual partitions if the total receiver queue size exceeds this value.|50000 +String|`consumerName`|Consumer name|null +long|`ackTimeoutMillis`|Timeout of unacked messages|0 +long|`tickDurationMillis`|Granularity of the ack-timeout redelivery.

    Using a higher `tickDurationMillis` reduces the memory overhead to track messages when setting ack-timeout to a bigger value (for example, 1 hour).|1000 +int|`priorityLevel`|Priority level for a consumer to which a broker gives more priority while dispatching messages in the shared subscription mode.

    The broker follows descending priorities. For example, 0=max-priority, 1, 2,...

    In shared subscription mode, the broker **first dispatches messages to the max priority level consumers if they have permits**. Otherwise, the broker considers next priority level consumers.

    **Example 1**

    If a subscription has consumerA with `priorityLevel` 0 and consumerB with `priorityLevel` 1, then the broker **only dispatches messages to consumerA until it runs out of permits** and then starts dispatching messages to consumerB.

    **Example 2**

    Consumer, Priority Level, Permits
    C1, 0, 2
    C2, 0, 1
    C3, 0, 1
    C4, 1, 2
    C5, 1, 1

    Order in which a broker dispatches messages to consumers is: C1, C2, C3, C1, C4, C5, C4.|0 +ConsumerCryptoFailureAction|`cryptoFailureAction`|Consumer should take action when it receives a message that can not be decrypted.

  • **FAIL**: this is the default option to fail messages until crypto succeeds.

  • **DISCARD**: silently acknowledge the message and do not deliver it to an application.

  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

    The decompression of message fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    Delivered encrypted message contains {@link EncryptionContext} which contains encryption and compression information in it using which application can decrypt consumed message payload.|ConsumerCryptoFailureAction.FAIL
  • +SortedMap|`properties`|A name or value property of this consumer.

    `properties` is application defined metadata attached to a consumer.

    When getting a topic stats, associate this metadata with the consumer stats for easier identification.|new TreeMap<>() +boolean|`readCompacted`|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than reading a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up until reaching the point in the topic message when compacting backlog. Beyond that point, send messages as normal.

    Only enable `readCompacted` on subscriptions to persistent topics that have a single active consumer (like failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +SubscriptionInitialPosition|`subscriptionInitialPosition`|Initial position at which to set cursor when subscribing to a topic at first time.|SubscriptionInitialPosition.Latest +int|`patternAutoDiscoveryPeriod`|Topic auto discovery period when using a pattern for topic's consumer.

    The default and minimum value is 1 minute.|1 +RegexSubscriptionMode|`regexSubscriptionMode`|When subscribing to a topic using a regular expression, you can pick a certain type of topics.

  • **PersistentOnly**: only subscribe to persistent topics.

  • **NonPersistentOnly**: only subscribe to non-persistent topics.

  • **AllTopics**: subscribe to both persistent and non-persistent topics.
  • |RegexSubscriptionMode.PersistentOnly +DeadLetterPolicy|`deadLetterPolicy`|Dead letter policy for consumers.

    By default, some messages are probably redelivered many times, even to the extent that it never stops.

    By using the dead letter mechanism, messages have the max redelivery count. **When exceeding the maximum number of redeliveries, messages are sent to the Dead Letter Topic and acknowledged automatically**.

    You can enable the dead letter mechanism by setting `deadLetterPolicy`.

    **Example**

    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10).build())
    .subscribe();


    Default dead letter topic name is `{TopicName}-{Subscription}-DLQ`.

    To set a custom dead letter topic name:
    client.newConsumer()
    .deadLetterPolicy(DeadLetterPolicy.builder().maxRedeliverCount(10)
    .deadLetterTopic("your-topic-name").build())
    .subscribe();


    When specifying the dead letter policy while not specifying `ackTimeoutMillis`, you can set the ack timeout to 30000 milliseconds.|None +boolean|`autoUpdatePartitions`|If `autoUpdatePartitions` is enabled, a consumer automatically subscribes to newly added partitions.

    **Note**: this is only for partitioned consumers.|true +boolean|`replicateSubscriptionState`|If `replicateSubscriptionState` is enabled, a subscription state is replicated to geo-replicated clusters.|false + +You can configure parameters if you do not want to use the default configuration. For a full list, see the Javadoc for the {@inject: javadoc:ConsumerBuilder:/client/org/apache/pulsar/client/api/ConsumerBuilder} class. + +The following is an example. + +```java +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .ackTimeout(10, TimeUnit.SECONDS) + .subscriptionType(SubscriptionType.Exclusive) + .subscribe(); +``` + +### Async receive + +The `receive` method receives messages synchronously (the consumer process is blocked until a message is available). You can also use [async receive](concepts-messaging.md#receive-modes), which returns a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) object immediately once a new message is available. + +The following is an example. + +```java +CompletableFuture asyncMessage = consumer.receiveAsync(); +``` + +Async receive operations return a {@inject: javadoc:Message:/client/org/apache/pulsar/client/api/Message} wrapped inside of a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture). + +### Batch receive + +Use `batchReceive` to receive multiple messages for each call. + +The following is an example. + +```java +Messages messages = consumer.batchReceive(); +for (message in messages) { + // do something +} +consumer.acknowledge(messages) +``` + +> Note: +> +> Batch receive policy limits the number and bytes of messages in a single batch. You can specify a timeout to wait for enough messages. +> +> The batch receive is completed if any of the following condition is met: enough number of messages, bytes of messages, wait timeout. 
+> +> ```java +> Consumer consumer = client.newConsumer() +> .topic("my-topic") +> .subscriptionName("my-subscription") +> .batchReceivePolicy(BatchReceivePolicy.builder() +> .maxNumMessages(100) +> .maxNumBytes(1024 * 1024) +> .timeout(200, TimeUnit.MILLISECONDS) +> .build()) +> .subscribe(); +> ``` +> The default batch receive policy is: +> ```java +> BatchReceivePolicy.builder() +> .maxNumMessage(-1) +> .maxNumBytes(10 * 1024 * 1024) +> .timeout(100, TimeUnit.MILLISECONDS) +> .build(); +> ``` + +### Multi-topic subscriptions + +In addition to subscribing a consumer to a single Pulsar topic, you can also subscribe to multiple topics simultaneously using [multi-topic subscriptions](concepts-messaging.md#multi-topic-subscriptions). To use multi-topic subscriptions you can supply either a regular expression (regex) or a `List` of topics. If you select topics via regex, all topics must be within the same Pulsar namespace. + +The followings are some examples. + +```java +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +ConsumerBuilder consumerBuilder = pulsarClient.newConsumer() + .subscriptionName(subscription); + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(allTopicsInNamespace) + .subscribe(); + +// Subscribe to a subsets of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer allTopicsConsumer = consumerBuilder + .topicsPattern(someTopicsInNamespace) + .subscribe(); +``` + +You can also subscribe to an explicit list of topics (across namespaces if you wish): + +```java +List topics = Arrays.asList( + "topic-1", + "topic-2", + "topic-3" +); + +Consumer multiTopicConsumer = consumerBuilder + .topics(topics) + 
.subscribe(); + +// Alternatively: +Consumer multiTopicConsumer = consumerBuilder + .topics( + "topic-1", + "topic-2", + "topic-3" + ) + .subscribe(); +``` + +You can also subscribe to multiple topics asynchronously using the `subscribeAsync` method rather than the synchronous `subscribe` method. The following is an example. + +```java +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default.*"); +consumerBuilder + .topics(topics) + .subscribeAsync() + .thenAccept(this::receiveMessageFromConsumer); + +private void receiveMessageFromConsumer(Consumer consumer) { + consumer.receiveAsync().thenAccept(message -> { + // Do something with the received message + receiveMessageFromConsumer(consumer); + }); +} +``` + +### Subscription modes + +Pulsar has various [subscription modes](concepts-messaging#subscription-modes) to match different scenarios. A topic can have multiple subscriptions with different subscription modes. However, a subscription can only have one subscription mode at a time. + +A subscription is identical with the subscription name which can specify only one subscription mode at a time. You cannot change the subscription mode unless all existing consumers of this subscription are offline. + +Different subscription modes have different message distribution modes. This section describes the differences of subscription modes and how to use them. + +In order to better describe their differences, assuming you have a topic named "my-topic", and the producer has published 10 messages. 
+ +```java +Producer producer = client.newProducer(Schema.STRING) + .topic("my-topic") + .enableBatching(false) + .create(); +// 3 messages with "key-1", 3 messages with "key-2", 2 messages with "key-3" and 2 messages with "key-4" +producer.newMessage().key("key-1").value("message-1-1").send(); +producer.newMessage().key("key-1").value("message-1-2").send(); +producer.newMessage().key("key-1").value("message-1-3").send(); +producer.newMessage().key("key-2").value("message-2-1").send(); +producer.newMessage().key("key-2").value("message-2-2").send(); +producer.newMessage().key("key-2").value("message-2-3").send(); +producer.newMessage().key("key-3").value("message-3-1").send(); +producer.newMessage().key("key-3").value("message-3-2").send(); +producer.newMessage().key("key-4").value("message-4-1").send(); +producer.newMessage().key("key-4").value("message-4-2").send(); +``` + +#### Exclusive + +Create a new consumer and subscribe with the `Exclusive` subscription mode. + +```java +Consumer consumer = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Exclusive) + .subscribe() +``` + +Only the first consumer is allowed to the subscription, other consumers receive an error. The first consumer receives all 10 messages, and the consuming order is the same as the producing order. + +> Note: +> +> If topic is a partitioned topic, the first consumer subscribes to all partitioned topics, other consumers are not assigned with partitions and receive an error. + +#### Failover + +Create new consumers and subscribe with the`Failover` subscription mode. 
+ +```java +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Failover) + .subscribe() +//consumer1 is the active consumer, consumer2 is the standby consumer. +//consumer1 receives 5 messages and then crashes, consumer2 takes over as an active consumer. + + +``` + +Multiple consumers can attach to the same subscription, yet only the first consumer is active, and others are standby. When the active consumer is disconnected, messages will be dispatched to one of the standby consumers, and that standby consumer then becomes the active consumer. + +If the first active consumer is disconnected after receiving 5 messages, the standby consumer becomes the active consumer. Consumer1 will receive: + +``` +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-2", "message-2-1") +("key-2", "message-2-2") +``` + +consumer2 will receive: + +``` +("key-2", "message-2-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +("key-4", "message-4-1") +("key-4", "message-4-2") +``` + +> Note: +> +> If a topic is a partitioned topic, each partition has only one active consumer, messages of one partition are distributed to only one consumer, and messages of multiple partitions are distributed to multiple consumers. + +#### Shared + +Create new consumers and subscribe with `Shared` subscription mode: + +```java +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers.
+``` + +In shared subscription mode, multiple consumers can attach to the same subscription and messages are delivered in a round robin distribution across consumers. + +If a broker dispatches only one message at a time, consumer1 receives the following information. + +``` +("key-1", "message-1-1") +("key-1", "message-1-3") +("key-2", "message-2-2") +("key-3", "message-3-1") +("key-4", "message-4-1") +``` + +consumer2 receives the following information. + +``` +("key-1", "message-1-2") +("key-2", "message-2-1") +("key-2", "message-2-3") +("key-3", "message-3-2") +("key-4", "message-4-2") +``` + +`Shared` subscription is different from `Exclusive` and `Failover` subscription modes. `Shared` subscription has better flexibility, but cannot provide order guarantee. + +#### Key_shared + +This is a new subscription mode since 2.4.0 release, create new consumers and subscribe with `Key_Shared` subscription mode. + +```java +Consumer consumer1 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() + +Consumer consumer2 = client.newConsumer() + .topic("my-topic") + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Key_Shared) + .subscribe() +//Both consumer1 and consumer2 are active consumers. +``` + +`Key_Shared` subscription is like `Shared` subscription, all consumers can attach to the same subscription. But unlike `Shared` subscription, messages with the same key are delivered to only one consumer in order. A possible distribution of messages between different consumers is shown below (by default, we do not know in advance which keys will be assigned to a consumer, but a key is only assigned to one consumer at a time). + +consumer1 receives the following information.
+ +``` +("key-1", "message-1-1") +("key-1", "message-1-2") +("key-1", "message-1-3") +("key-3", "message-3-1") +("key-3", "message-3-2") +``` + +consumer2 receives the following information. + +``` +("key-2", "message-2-1") +("key-2", "message-2-2") +("key-2", "message-2-3") +("key-4", "message-4-1") +("key-4", "message-4-2") +``` + +> Note: +> +> If the message key is not specified, messages without key are dispatched to one consumer in order by default. + +## Reader + +With the [reader interface](concepts-clients.md#reader-interface), Pulsar clients can "manually position" themselves within a topic and read all messages from a specified message onward. The Pulsar API for Java enables you to create {@inject: javadoc:Reader:/client/org/apache/pulsar/client/api/Reader} objects by specifying a topic, a {@inject: javadoc:MessageId:/client/org/apache/pulsar/client/api/MessageId}, and {@inject: javadoc:ReaderConfiguration:/client/org/apache/pulsar/client/api/ReaderConfiguration}. + +The following is an example. + +```java +ReaderConfiguration conf = new ReaderConfiguration(); +byte[] msgIdBytes = // Some message ID byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); + +while (true) { + Message message = reader.readNext(); + // Process message +} +``` + +In the example above, a `Reader` object is instantiated for a specific topic and message (by ID); the reader iterates over each message in the topic after the message identified by `msgIdBytes` (how that value is obtained depends on the application). + +The code sample above shows pointing the `Reader` object to a specific message (by ID), but you can also use `MessageId.earliest` to point to the earliest available message on the topic or `MessageId.latest` to point to the most recent available message. + +When you create a reader, you can use the `loadConf` configuration.
The following parameters are available in `loadConf`. + +| Type | Name |
    Description
    | Default +|---|---|---|--- +String|`topicName`|Topic name. |None +int|`receiverQueueSize`|Size of a consumer's receiver queue.

    For example, the number of messages that can be accumulated by a consumer before an application calls `Receive`.

    A value higher than the default value increases consumer throughput, though at the expense of more memory utilization.|1000 +ReaderListener<T>|`readerListener`|A listener that is called for each message received.|None +String|`readerName`|Reader name.|null +String|`subscriptionRolePrefix`|Prefix of subscription role. |null +CryptoKeyReader|`cryptoKeyReader`|Interface that abstracts the access to a key store.|null +ConsumerCryptoFailureAction|`cryptoFailureAction`|The action the consumer takes when it receives a message that cannot be decrypted.

  • **FAIL**: this is the default option to fail messages until crypto succeeds.

  • **DISCARD**: silently acknowledge and not deliver message to an application.

  • **CONSUME**: deliver encrypted messages to applications. It is the application's responsibility to decrypt the message.

    The message decompression fails.

    If messages contain batch messages, a client is not able to retrieve individual messages in the batch.

    The delivered encrypted message contains {@link EncryptionContext}, which holds the encryption and compression information that the application can use to decrypt the consumed message payload.|ConsumerCryptoFailureAction.FAIL
  • +boolean|`readCompacted`|If enabling `readCompacted`, a consumer reads messages from a compacted topic rather than a full message backlog of a topic.

    A consumer only sees the latest value for each key in the compacted topic, up until the point in the topic message backlog that has been compacted. Beyond that point, messages are sent as normal.

    `readCompacted` can only be enabled on subscriptions to persistent topics, which have a single active consumer (for example, failover or exclusive subscriptions).

    Attempting to enable it on subscriptions to non-persistent topics or on shared subscriptions leads to a subscription call throwing a `PulsarClientException`.|false +boolean|`resetIncludeHead`|If set to true, the first message to be returned is the one specified by `messageId`.

    If set to false, the first message to be returned is the one next to the message specified by `messageId`.|false + +### Sticky key range reader + +In a sticky key range reader, the broker only dispatches messages whose message key hash falls within the specified key hash range. Multiple key hash ranges can be specified on a reader. + +The following is an example to create a sticky key range reader. + +```java +pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.earliest) + .keyHashRange(Range.of(0, 10000), Range.of(20001, 30000)) + .create(); +``` + +Total hash range size is 65536, so the max end of the range should be less than or equal to 65535. + +## Schema + +In Pulsar, all message data consists of byte arrays "under the hood." [Message schemas](schema-get-started.md) enable you to use other types of data when constructing and handling messages (from simple types like strings to more complex, application-specific types). If you construct, say, a [producer](#producers) without specifying a schema, then the producer can only produce messages of type `byte[]`. The following is an example. + +```java +Producer producer = client.newProducer() + .topic(topic) + .create(); +``` + +The producer above is equivalent to a `Producer<byte[]>` (in fact, you should *always* explicitly specify the type). If you'd like to use a producer for a different type of data, you'll need to specify a **schema** that informs Pulsar which data type will be transmitted over the [topic](reference-terminology.md#topic).
+ +### Schema example + +Let's say that you have a `SensorReading` class that you'd like to transmit over a Pulsar topic: + +```java +public class SensorReading { + public float temperature; + + public SensorReading(float temperature) { + this.temperature = temperature; + } + + // A no-arg constructor is required + public SensorReading() { + } + + public float getTemperature() { + return temperature; + } + + public void setTemperature(float temperature) { + this.temperature = temperature; + } +} +``` + +You could then create a `Producer` (or `Consumer`) like this: + +```java +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-readings") + .create(); +``` + +The following schema formats are currently available for Java: + +* No schema or the byte array schema (which can be applied using `Schema.BYTES`): + + ```java + Producer bytesProducer = client.newProducer(Schema.BYTES) + .topic("some-raw-bytes-topic") + .create(); + ``` + + Or, equivalently: + + ```java + Producer bytesProducer = client.newProducer() + .topic("some-raw-bytes-topic") + .create(); + ``` + +* `String` for normal UTF-8-encoded string data. Apply the schema using `Schema.STRING`: + + ```java + Producer stringProducer = client.newProducer(Schema.STRING) + .topic("some-string-topic") + .create(); + ``` + +* Create JSON schemas for POJOs using `Schema.JSON`. The following is an example. + + ```java + Producer pojoProducer = client.newProducer(Schema.JSON(MyPojo.class)) + .topic("some-pojo-topic") + .create(); + ``` + +* Generate Protobuf schemas using `Schema.PROTOBUF`. The following example shows how to create the Protobuf schema and use it to instantiate a new producer: + + ```java + Producer protobufProducer = client.newProducer(Schema.PROTOBUF(MyProtobuf.class)) + .topic("some-protobuf-topic") + .create(); + ``` + +* Define Avro schemas with `Schema.AVRO`. The following code snippet demonstrates how to create and use Avro schema. 
+ + ```java + Producer avroProducer = client.newProducer(Schema.AVRO(MyAvro.class)) + .topic("some-avro-topic") + .create(); + ``` + +## Authentication + +Pulsar currently supports two authentication schemes: [TLS](security-tls-authentication.md) and [Athenz](security-athenz.md). You can use the Pulsar Java client with both. + +### TLS Authentication + +To use [TLS](security-tls-authentication.md), you need to set TLS to `true` using the `setUseTls` method, point your Pulsar client to a TLS cert path, and provide paths to cert and key files. + +The following is an example. + +```java +Map authParams = new HashMap<>(); +authParams.put("tlsCertFile", "/path/to/client-cert.pem"); +authParams.put("tlsKeyFile", "/path/to/client-key.pem"); + +Authentication tlsAuth = AuthenticationFactory + .create(AuthenticationTls.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(tlsAuth) + .build(); +``` + +### Athenz + +To use [Athenz](security-athenz.md) as an authentication provider, you need to [use TLS](#tls-authentication) and provide values for four parameters in a hash: + +* `tenantDomain` +* `tenantService` +* `providerDomain` +* `privateKey` + +You can also set an optional `keyId`. The following is an example. 
+ +```java +Map authParams = new HashMap<>(); +authParams.put("tenantDomain", "shopping"); // Tenant domain name +authParams.put("tenantService", "some_app"); // Tenant service name +authParams.put("providerDomain", "pulsar"); // Provider domain name +authParams.put("privateKey", "file:///path/to/private.pem"); // Tenant private key path +authParams.put("keyId", "v1"); // Key id for the tenant private key (optional, default: "0") + +Authentication athenzAuth = AuthenticationFactory + .create(AuthenticationAthenz.class.getName(), authParams); + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://my-broker.com:6651") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/cacert.pem") + .authentication(athenzAuth) + .build(); +``` + +> #### Supported pattern formats +> The `privateKey` parameter supports the following three pattern formats: +> * `file:///path/to/file` +> * `file:/path/to/file` +> * `data:application/x-pem-file;base64,` diff --git a/site2/website/versioned_docs/version-2.5.0/client-libraries-node.md b/site2/website/versioned_docs/version-2.5.0/client-libraries-node.md new file mode 100644 index 0000000000000..3e23333a9e43b --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/client-libraries-node.md @@ -0,0 +1,402 @@ +--- +id: version-2.5.0-client-libraries-node +title: The Pulsar Node.js client +sidebar_label: Node.js +original_id: client-libraries-node +--- + +The Pulsar Node.js client can be used to create Pulsar [producers](#producers), [consumers](#consumers), and [readers](#readers) in Node.js. + +## Installation + +You can install the [`pusar-client`](https://www.npmjs.com/package/pulsar-client) library via [npm](https://www.npmjs.com/). + +### Requirements +Pulsar Node.js client library is based on the C++ client library. +Follow [these instructions](client-libraries-cpp.md#compilation) and install the Pulsar C++ client library. 
+ +### Compatibility + +Compatibility between each version of the Node.js client and the C++ client is as follows: + +| Node.js client | C++ client | +| :------------- | :------------- | +| 1.0.0 | 2.3.0 or later | + +If an incompatible version of the C++ client is installed, you may fail to build or run this library. + +### Installation using npm + +Install the `pulsar-client` library via [npm](https://www.npmjs.com/): + +```shell +$ npm install pulsar-client +``` + +> #### Note +> +> Also, this library works only in Node.js 10.x or later because it uses the [`node-addon-api`](https://github.com/nodejs/node-addon-api) module to wrap the C++ library. + +## Connection URLs +To connect to Pulsar using client libraries, you need to specify a [Pulsar protocol](developing-binary-protocol.md) URL. + +Pulsar protocol URLs are assigned to specific clusters, use the `pulsar` scheme and have a default port of 6650. Here is an example for `localhost`: + +```http +pulsar://localhost:6650 +``` + +A URL for a production Pulsar cluster may look something like this: + +```http +pulsar://pulsar.us-west.example.com:6650 +``` + +If you are using [TLS encryption](security-tls-transport.md) or [TLS Authentication](security-tls-authentication.md), the URL will look like something like this: + +```http +pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +## Create a client + +In order to interact with Pulsar, you will first need a client object. You can create a client instance using a `new` operator and the `Client` method, passing in a client options object (more on configuration [below](#client-configuration)). 
+ +Here is an example: + +```JavaScript +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + await client.close(); +})(); +``` + +### Client configuration + +The following configurable parameters are available for Pulsar clients: + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `serviceUrl` | The connection URL for the Pulsar cluster. See [above](#connection-urls) for more info. | | +| `authentication` | Configure the authentication provider. (default: no authentication). See [TLS Authentication](security-tls-authentication.md) for more info. | | +| `operationTimeoutSeconds` | The timeout for Node.js client operations (creating producers, subscribing to and unsubscribing from [topics](reference-terminology.md#topic)). Retries will occur until this threshold is reached, at which point the operation will fail. | 30 | +| `ioThreads` | The number of threads to use for handling connections to Pulsar [brokers](reference-terminology.md#broker). | 1 | +| `messageListenerThreads` | The number of threads used by message listeners ([consumers](#consumers) and [readers](#readers)). | 1 | +| `concurrentLookupRequest` | The number of concurrent lookup requests that can be sent on each broker connection. Setting a maximum helps to keep from overloading brokers. You should set values over the default of 50000 only if the client needs to produce and/or subscribe to thousands of Pulsar topics. | 50000 | +| `tlsTrustCertsFilePath` | The file path for the trusted TLS certificate. | | +| `tlsValidateHostname` | The boolean value of setup whether to enable TLS hostname verification. | `false` | +| `tlsAllowInsecureConnection` | The boolean value of setup whether the Pulsar client accepts untrusted TLS certificate from broker. | `false` | +| `statsIntervalInSeconds` | Interval between each stat info. Stats is activated with positive statsInterval. 
The value should be at least 1 second. | 600 | + +## Producers + +Pulsar producers publish messages to Pulsar topics. You can [configure](#producer-configuration) Node.js producers using a producer configuration object. + +Here is an example: + +```JavaScript +const producer = await client.createProducer({ + topic: 'my-topic', +}); + +await producer.send({ + data: Buffer.from("Hello, Pulsar"), +}); + +await producer.close(); +``` + +> #### Promise operation +> When you create a new Pulsar producer, the operation returns a `Promise` object, which yields the producer instance or an error through the executor function. +> This example uses the `await` operator instead of an executor function. + +### Producer operations + +Pulsar Node.js producers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `send(Object)` | Publishes a [message](#messages) to the producer's topic. When the message is successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object runs the executor function. | `Promise` | +| `flush()` | Sends messages from the send queue to the Pulsar broker. When the messages are successfully acknowledged by the Pulsar broker, or an error is thrown, the Promise object runs the executor function. | `Promise` | +| `close()` | Closes the producer and releases all resources allocated to it. If `close()` is called then no more messages will be accepted from the publisher. This method returns a Promise object, which runs the executor function when all pending publish requests have been persisted by Pulsar. If an error is thrown, no pending writes will be retried. | `Promise` | + +### Producer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) to which the producer will publish messages. | | +| `producerName` | A name for the producer.
If you do not explicitly assign a name, Pulsar will automatically generate a globally unique name. If you choose to explicitly assign a name, it will need to be unique across *all* Pulsar clusters, otherwise the creation operation will throw an error. | | +| `sendTimeoutMs` | When publishing a message to a topic, the producer will wait for an acknowledgment from the responsible Pulsar [broker](reference-terminology.md#broker). If a message is not acknowledged within the threshold set by this parameter, an error will be thrown. If you set `sendTimeoutMs` to -1, the timeout will be set to infinity (and thus removed). Removing the send timeout is recommended when using Pulsar's [message de-duplication](cookbooks-deduplication.md) feature. | 30000 | +| `initialSequenceId` | The initial sequence ID of the message. When producer send message, add sequence ID to message. The ID is increased each time to send. | | +| `maxPendingMessages` | The maximum size of the queue holding pending messages (i.e. messages waiting to receive an acknowledgment from the [broker](reference-terminology.md#broker)). By default, when the queue is full all calls to the `send` method will fail *unless* `blockIfQueueFull` is set to `true`. | 1000 | +| `maxPendingMessagesAcrossPartitions` | The maximum size of the sum of partition's pending queue. | 50000 | +| `blockIfQueueFull` | If set to `true`, the producer's `send` method will wait when the outgoing message queue is full rather than failing and throwing an error (the size of that queue is dictated by the `maxPendingMessages` parameter); if set to `false` (the default), `send` operations will fail and throw a error when the queue is full. | `false` | +| `messageRoutingMode` | The message routing logic (for producers on [partitioned topics](concepts-messaging.md#partitioned-topics)). This logic is applied only when no key is set on messages. 
The available options are: round robin (`RoundRobinDistribution`), or publishing all messages to a single partition (`UseSinglePartition`, the default). | `UseSinglePartition` | +| `hashingScheme` | The hashing function that determines the partition on which a particular message is published (partitioned topics only). The available options are: `JavaStringHash` (the equivalent of `String.hashCode()` in Java), `Murmur3_32Hash` (applies the [Murmur3](https://en.wikipedia.org/wiki/MurmurHash) hashing function), or `BoostHash` (applies the hashing function from C++'s [Boost](https://www.boost.org/doc/libs/1_62_0/doc/html/hash.html) library). | `BoostHash` | +| `compressionType` | The message data compression type used by the producer. The available options are [`LZ4`](https://github.com/lz4/lz4), and [`Zlib`](https://zlib.net/). | Compression None | +| `batchingEnabled` | If set to `true`, the producer send message as batch. | `true` | +| `batchingMaxPublishDelayMs` | The maximum time of delay sending message in batching. | 10 | +| `batchingMaxMessages` | The maximum size of sending message in each time of batching. | 1000 | +| `properties` | The metadata of producer. 
| | + +### Producer example + +This example creates a Node.js producer for the `my-topic` topic and sends 10 messages to that topic: + +```JavaScript +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a producer + const producer = await client.createProducer({ + topic: 'my-topic', + }); + + // Send messages + for (let i = 0; i < 10; i += 1) { + const msg = `my-message-${i}`; + producer.send({ + data: Buffer.from(msg), + }); + console.log(`Sent message: ${msg}`); + } + await producer.flush(); + + await producer.close(); + await client.close(); +})(); +``` + +## Consumers + +Pulsar consumers subscribe to one or more Pulsar topics and listen for incoming messages produced on that topic/those topics. You can [configure](#consumer-configuration) Node.js consumers using a consumer configuration object. + +Here is an example: + +```JavaScript +const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', +}); + +const msg = await consumer.receive(); +console.log(msg.getData().toString()); +consumer.acknowledge(msg); + +await consumer.close(); +``` + +> #### Promise operation +> When you create a new Pulsar consumer, the operation will return `Promise` object and get consumer instance or an error through executor function. +> In this example, using await operator instead of executor function. + +### Consumer operations + +Pulsar Node.js consumers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `receive()` | Receives a single message from the topic. When the message is available, the Promise object run executor function and get message object. | `Promise` | +| `receive(Number)` | Receives a single message from the topic with specific timeout in milliseconds. 
| `Promise` | +| `acknowledge(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message object. | `void` | +| `acknowledgeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) a message to the Pulsar [broker](reference-terminology.md#broker) by message ID object. | `void` | +| `acknowledgeCumulative(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message. The `acknowledgeCumulative` method will return void, and send the ack to the broker asynchronously. After that, the messages will *not* be redelivered to the consumer. Cumulative acking can not be used with a [shared](concepts-messaging.md#shared) subscription type. | `void` | +| `acknowledgeCumulativeId(Object)` | [Acknowledges](reference-terminology.md#acknowledgment-ack) *all* the messages in the stream, up to and including the specified message ID. | `void` | +| `close()` | Closes the consumer, disabling its ability to receive messages from the broker. | `Promise` | + +### Consumer configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) on which the consumer will establish a subscription and listen for messages. | | +| `subscription` | The subscription name for this consumer. | | +| `subscriptionType` | Available options are `Exclusive`, `Shared`, and `Failover`. | `Exclusive` | +| `ackTimeoutMs` | Acknowledge timeout in milliseconds. | 0 | +| `receiverQueueSize` | Sets the size of the consumer's receiver queue, i.e. the number of messages that can be accumulated by the consumer before the application calls `receive`. A value higher than the default of 1000 could increase consumer throughput, though at the expense of more memory utilization. 
| 1000 | +| `receiverQueueSizeAcrossPartitions` | Set the max total receiver queue size across partitions. This setting will be used to reduce the receiver queue size for individual partitions if the total exceeds this value. | 50000 | +| `consumerName` | The name of consumer. Currently(v2.4.1), [failover](concepts-messaging.md#failover) mode use consumer name in ordering. | | +| `properties` | The metadata of consumer. | | + +### Consumer example + +This example creates a Node.js consumer with the `my-subscription` subscription on the `my-topic` topic, receives messages, prints the content that arrive, and acknowledges each message to the Pulsar broker for 10 times: + +```JavaScript +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + }); + + // Create a consumer + const consumer = await client.subscribe({ + topic: 'my-topic', + subscription: 'my-subscription', + subscriptionType: 'Exclusive', + }); + + // Receive messages + for (let i = 0; i < 10; i += 1) { + const msg = await consumer.receive(); + console.log(msg.getData().toString()); + consumer.acknowledge(msg); + } + + await consumer.close(); + await client.close(); +})(); +``` + +## Readers + +Pulsar readers process messages from Pulsar topics. Readers are different from consumers because with readers you need to explicitly specify which message in the stream you want to begin with (consumers, on the other hand, automatically begin with the most recently unacked message). You can [configure](#reader-configuration) Node.js readers using a reader configuration object. 
+ +Here is an example: + +```JavaScript +const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), +}); + +const msg = await reader.readNext(); +console.log(msg.getData().toString()); + +await reader.close(); +``` + +### Reader operations + +Pulsar Node.js readers have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `readNext()` | Receives the next message on the topic (analogous to the `receive` method for [consumers](#consumer-operations)). When the message is available, the Promise object runs the executor function and yields the message object. | `Promise` | +| `readNext(Number)` | Receives a single message from the topic with a specific timeout in milliseconds. | `Promise` | +| `hasNext()` | Returns whether the broker has a next message in the target topic. | `Boolean` | +| `close()` | Closes the reader, disabling its ability to receive messages from the broker. | `Promise` | + +### Reader configuration + +| Parameter | Description | Default | +| :-------- | :---------- | :------ | +| `topic` | The Pulsar [topic](reference-terminology.md#topic) on which the reader will establish a subscription and listen for messages. | | +| `startMessageId` | The initial reader position, i.e. the message at which the reader begins processing messages. The options are `Pulsar.MessageId.earliest` (the earliest available message on the topic), `Pulsar.MessageId.latest` (the latest available message on the topic), or a message ID object for a position that is not earliest or latest. | | +| `receiverQueueSize` | Sets the size of the reader's receiver queue, i.e. the number of messages that can be accumulated by the reader before the application calls `readNext`. A value higher than the default of 1000 could increase reader throughput, though at the expense of more memory utilization. | 1000 | +| `readerName` | The name of the reader.
| | +| `subscriptionRolePrefix` | The subscription role prefix. | | + +### Reader example + +This example creates a Node.js reader with the `my-topic` topic, reads messages, and prints the content that arrive for 10 times: + +```JavaScript +const Pulsar = require('pulsar-client'); + +(async () => { + // Create a client + const client = new Pulsar.Client({ + serviceUrl: 'pulsar://localhost:6650', + operationTimeoutSeconds: 30, + }); + + // Create a reader + const reader = await client.createReader({ + topic: 'my-topic', + startMessageId: Pulsar.MessageId.earliest(), + }); + + // read messages + for (let i = 0; i < 10; i += 1) { + const msg = await reader.readNext(); + console.log(msg.getData().toString()); + } + + await reader.close(); + await client.close(); +})(); +``` + +## Messages + +In Pulsar Node.js client, you have to construct producer message object for producer. + +Here is an example message: + +```JavaScript +const msg = { + data: Buffer.from('Hello, Pulsar'), + partitionKey: 'key1', + properties: { + 'foo': 'bar', + }, + eventTimestamp: Date.now(), + replicationClusters: [ + 'cluster1', + 'cluster2', + ], +} + +await producer.send(msg); +``` + +The following keys are available for producer message objects: + +| Parameter | Description | +| :-------- | :---------- | +| `data` | The actual data payload of the message. | +| `properties` | A Object for any application-specific metadata attached to the message. | +| `eventTimestamp` | The timestamp associated with the message. | +| `sequenceId` | The sequence ID of the message. | +| `partitionKey` | The optional key associated with the message (particularly useful for things like topic compaction). | +| `replicationClusters` | The clusters to which this message will be replicated. Pulsar brokers handle message replication automatically; you should only change this setting if you want to override the broker default. 
| + +### Message object operations + +In Pulsar Node.js client, you can receive (or read) message object as consumer (or reader). + +The message object have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `getTopicName()` | Getter method of topic name. | `String` | +| `getProperties()` | Getter method of properties. | `Array` | +| `getData()` | Getter method of message data. | `Buffer` | +| `getMessageId()` | Getter method of [message id object](#message-id-object-operations). | `Object` | +| `getPublishTimestamp()` | Getter method of publish timestamp. | `Number` | +| `getEventTimestamp()` | Getter method of event timestamp. | `Number` | +| `getPartitionKey()` | Getter method of partition key. | `String` | + +### Message ID object operations + +In Pulsar Node.js client, you can get message id object from message object. + +The message id object have the following methods available: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `serialize()` | Serialize the message id into a Buffer for storing. | `Buffer` | +| `toString()` | Get message id as String. | `String` | + +The client has static method of message id object. You can access it as `Pulsar.MessageId.someStaticMethod` too. + +The following static methods are available for the message id object: + +| Method | Description | Return type | +| :----- | :---------- | :---------- | +| `earliest()` | MessageId representing the earliest, or oldest available message stored in the topic. | `Object` | +| `latest()` | MessageId representing the latest, or last published message in the topic. | `Object` | +| `deserialize(Buffer)` | Deserialize a message id object from a Buffer. 
| `Object` | + diff --git a/site2/website/versioned_docs/version-2.5.0/client-libraries-python.md b/site2/website/versioned_docs/version-2.5.0/client-libraries-python.md new file mode 100644 index 0000000000000..c9f39255d972b --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/client-libraries-python.md @@ -0,0 +1,249 @@ +--- +id: version-2.5.0-client-libraries-python +title: The Pulsar Python client +sidebar_label: Python +original_id: client-libraries-python +--- + +The Pulsar Python client library is a wrapper over the existing [C++ client library](client-libraries-cpp.md) and exposes all of the [same features](/api/cpp). You can find the code in the [`python` subdirectory](https://github.com/apache/pulsar/tree/master/pulsar-client-cpp/python) of the C++ client code. + +## Installation + +You can install the [`pulsar-client`](https://pypi.python.org/pypi/pulsar-client) library either via [PyPi](https://pypi.python.org/pypi), using [pip](#installation-using-pip), or by building the library from source. + +### Installation using pip + +To install the `pulsar-client` library as a pre-built package using the [pip](https://pip.pypa.io/en/stable/) package manager: + +```shell +$ pip install pulsar-client=={{pulsar:version_number}} +``` + +Installation via PyPi is available for the following Python versions: + +Platform | Supported Python versions +:--------|:------------------------- +MacOS
    10.13 (High Sierra), 10.14 (Mojave)
    | 2.7, 3.7 +Linux | 2.7, 3.4, 3.5, 3.6, 3.7 + +### Installing from source + +To install the `pulsar-client` library by building from source, follow [these instructions](client-libraries-cpp.md#compilation) and compile the Pulsar C++ client library. That will also build the Python binding for the library. + +To install the built Python bindings: + +```shell +$ git clone https://github.com/apache/pulsar +$ cd pulsar/pulsar-client-cpp/python +$ sudo python setup.py install +``` + +## API Reference + +The complete Python API reference is available at [api/python](/api/python). + +## Examples + +Below you'll find a variety of Python code examples for the `pulsar-client` library. + +### Producer example + +This creates a Python producer for the `my-topic` topic and send 10 messages on that topic: + +```python +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') + +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('Hello-%d' % i).encode('utf-8')) + +client.close() +``` + +### Consumer example + +This creates a consumer with the `my-subscription` subscription on the `my-topic` topic, listen for incoming messages, print the content and ID of messages that arrive, and acknowledge each message to the Pulsar broker: + +```python +consumer = client.subscribe('my-topic', 'my-subscription') + +while True: + msg = consumer.receive() + try: + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) + +client.close() +``` + +### Reader interface example + +You can use the Pulsar Python API to use the Pulsar [reader interface](concepts-clients.md#reader-interface). 
Here's an example: + +```python +# MessageId taken from a previously fetched message +msg_id = msg.message_id() + +reader = client.create_reader('my-topic', msg_id) + +while True: + msg = reader.read_next() + print("Received message '{}' id='{}'".format(msg.data(), msg.message_id())) + # No acknowledgment +``` + + +## Schema + +### Declaring and validating schema + +A schema can be declared by passing a class that inherits +from `pulsar.schema.Record` and defines the fields as +class variables. For example: + +```python +from pulsar.schema import * + +class Example(Record): + a = String() + b = Integer() + c = Boolean() +``` + +With this simple schema definition we can then create producers, +consumers and readers instances that will be referring to that. + +```python +producer = client.create_producer( + topic='my-topic', + schema=AvroSchema(Example) ) + +producer.send(Example(a='Hello', b=1)) +``` + +When the producer is created, the Pulsar broker will validate that +the existing topic schema is indeed of "Avro" type and that the +format is compatible with the schema definition of the `Example` +class. + +If there is a mismatch, the producer creation will raise an +exception. + +Once a producer is created with a certain schema definition, +it will only accept objects that are instances of the declared +schema class. 
+ +Similarly, for a consumer/reader, the consumer will return an +object, instance of the schema record class, rather than the raw +bytes: + +```python +consumer = client.subscribe( + topic='my-topic', + subscription_name='my-subscription', + schema=AvroSchema(Example) ) + +while True: + msg = consumer.receive() + ex = msg.value() + try: + print("Received message a={} b={} c={}".format(ex.a, ex.b, ex.c)) + # Acknowledge successful processing of the message + consumer.acknowledge(msg) + except: + # Message failed to be processed + consumer.negative_acknowledge(msg) +``` + +### Supported schema types + +There are different builtin schema types that can be used in Pulsar. +All the definitions are in the `pulsar.schema` package. + +| Schema | Notes | +| ------ | ----- | +| `BytesSchema` | Get the raw payload as a `bytes` object. No serialization/deserialization are performed. This is the default schema mode | +| `StringSchema` | Encode/decode payload as a UTF-8 string. Uses `str` objects | +| `JsonSchema` | Require record definition. Serializes the record into standard JSON payload | +| `AvroSchema` | Require record definition. Serializes in AVRO format | + +### Schema definition reference + +The schema definition is done through a class that inherits from +`pulsar.schema.Record`. + +This class can have a number of fields which can be of either +`pulsar.schema.Field` type or even another nested `Record`. All the +fields are also specified in the `pulsar.schema` package. The fields +are matching the AVRO fields types. + +| Field Type | Python Type | Notes | +| ---------- | ----------- | ----- | +| `Boolean` | `bool` | | +| `Integer` | `int` | | +| `Long` | `int` | | +| `Float` | `float` | | +| `Double` | `float` | | +| `Bytes` | `bytes` | | +| `String` | `str` | | +| `Array` | `list` | Need to specify record type for items | +| `Map` | `dict` | Key is always `String`. 
Need to specify value type | + +Additionally, any Python `Enum` type can be used as a valid field +type + +#### Fields parameters + +When adding a field these parameters can be used in the constructor: + +| Argument | Default | Notes | +| ---------- | --------| ----- | +| `default` | `None` | Set a default value for the field. Eg: `a = Integer(default=5)` | +| `required` | `False` | Mark the field as "required". This will set it in the schema accordingly. | + +#### Schema definition examples + +##### Simple definition + +```python +class Example(Record): + a = String() + b = Integer() + c = Array(String()) + i = Map(String()) +``` + +##### Using enums + +```python +from enum import Enum + +class Color(Enum): + red = 1 + green = 2 + blue = 3 + +class Example(Record): + name = String() + color = Color +``` + +##### Complex types + +```python +class MySubRecord(Record): + x = Integer() + y = Long() + z = String() + +class Example(Record): + a = String() + sub = MySubRecord() +``` diff --git a/site2/website/versioned_docs/version-2.5.0/concepts-clients.md b/site2/website/versioned_docs/version-2.5.0/concepts-clients.md new file mode 100644 index 0000000000000..c679dc156f757 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/concepts-clients.md @@ -0,0 +1,82 @@ +--- +id: version-2.5.0-concepts-clients +title: Pulsar Clients +sidebar_label: Clients +original_id: concepts-clients +--- + +Pulsar exposes a client API with language bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). The client API optimizes and encapsulates Pulsar's client-broker communication protocol and exposes a simple and intuitive API for use by applications. 
+ +Under the hood, the current official Pulsar client libraries support transparent reconnection and/or connection failover to brokers, queuing of messages until acknowledged by the broker, and heuristics such as connection retries with backoff. + +> #### Custom client libraries +> If you'd like to create your own client library, we recommend consulting the documentation on Pulsar's custom [binary protocol](developing-binary-protocol.md) + + +## Client setup phase + +When an application wants to create a producer/consumer, the Pulsar client library will initiate a setup phase that is composed of two steps: + +1. The client will attempt to determine the owner of the topic by sending an HTTP lookup request to the broker. The request could reach one of the active brokers which, by looking at the (cached) zookeeper metadata will know who is serving the topic or, in case nobody is serving it, will try to assign it to the least loaded broker. +1. Once the client library has the broker address, it will create a TCP connection (or reuse an existing connection from the pool) and authenticate it. Within this connection, client and broker exchange binary commands from a custom protocol. At this point the client will send a command to create producer/consumer to the broker, which will comply after having validated the authorization policy. + +Whenever the TCP connection breaks, the client will immediately re-initiate this setup phase and will keep trying with exponential backoff to re-establish the producer or consumer until the operation succeeds. + +## Reader interface + +In Pulsar, the "standard" [consumer interface](concepts-messaging.md#consumers) involves using consumers to listen on [topics](reference-terminology.md#topic), process incoming messages, and finally acknowledge those messages when they've been processed. 
Whenever a new subscription is created, it is initially positioned at the end of the topic (by default), and consumers associated with that subscription will begin reading with the first message created afterwards. Whenever a consumer connects to a topic using a pre-existing subscription, it begins reading from the earliest message un-acked within that subscription. In summary, with the consumer interface, subscription cursors are automatically managed by Pulsar in response to [message acknowledgements](concepts-messaging.md#acknowledgement). + +The **reader interface** for Pulsar enables applications to manually manage cursors. When you use a reader to connect to a topic---rather than a consumer---you need to specify *which* message the reader begins reading from when it connects to a topic. When connecting to a topic, the reader interface enables you to begin with: + +* The **earliest** available message in the topic +* The **latest** available message in the topic +* Some other message between the earliest and the latest. If you select this option, you'll need to explicitly provide a message ID. Your application will be responsible for "knowing" this message ID in advance, perhaps fetching it from a persistent data store or cache. + +The reader interface is helpful for use cases like using Pulsar to provide [effectively-once](https://streaml.io/blog/exactly-once/) processing semantics for a stream processing system. For this use case, it's essential that the stream processing system be able to "rewind" topics to a specific message and begin reading there. The reader interface provides Pulsar clients with the low-level abstraction necessary to "manually position" themselves within a topic. + +Internally, the reader interface is implemented as a consumer using an exclusive, non-durable subscription to the topic with a randomly-allocated name. 
+ +![The Pulsar consumer and reader interfaces](assets/pulsar-reader-consumer-interfaces.png) + +> ### Non-partitioned topics only +> The reader interface for Pulsar cannot currently be used with [partitioned topics](concepts-messaging.md#partitioned-topics). + +Here's a Java example that begins reading from the earliest available message on a topic: + +```java +import org.apache.pulsar.client.api.Message; +import org.apache.pulsar.client.api.MessageId; +import org.apache.pulsar.client.api.Reader; + +// Create a reader on a topic and for a specific message (and onward) +Reader reader = pulsarClient.newReader() + .topic("reader-api-test") + .startMessageId(MessageId.earliest) + .create(); + +while (true) { + Message message = reader.readNext(); + + // Process the message +} +``` + +To create a reader that will read from the latest available message: + +```java +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(MessageId.latest) + .create(); +``` + +To create a reader that will read from some message between earliest and latest: + +```java +byte[] msgIdBytes = // Some byte array +MessageId id = MessageId.fromByteArray(msgIdBytes); +Reader reader = pulsarClient.newReader() + .topic(topic) + .startMessageId(id) + .create(); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/concepts-messaging.md b/site2/website/versioned_docs/version-2.5.0/concepts-messaging.md new file mode 100644 index 0000000000000..86fae65d1b82f --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/concepts-messaging.md @@ -0,0 +1,445 @@ +--- +id: version-2.5.0-concepts-messaging +title: Messaging Concepts +sidebar_label: Messaging +original_id: concepts-messaging +--- + +Pulsar is built on the [publish-subscribe](https://en.wikipedia.org/wiki/Publish%E2%80%93subscribe_pattern) pattern, aka pub-sub. In this pattern, [producers](#producers) publish messages to [topics](#topics). 
[Consumers](#consumers) can then [subscribe](#subscription-modes) to those topics, process incoming messages, and send an acknowledgement when processing is complete. + +Once a subscription has been created, all messages will be [retained](concepts-architecture-overview.md#persistent-storage) by Pulsar, even if the consumer gets disconnected. Retained messages will be discarded only when a consumer acknowledges that they've been successfully processed. + +## Messages + +Messages are the basic "unit" of Pulsar. They're what producers publish to topics and what consumers then consume from topics (and acknowledge when the message has been processed). Messages are the analogue of letters in a postal service system. + +Component | Purpose +:---------|:------- +Value / data payload | The data carried by the message. All Pulsar messages carry raw bytes, although message data can also conform to data [schemas](schema-get-started.md) +Key | Messages can optionally be tagged with keys, which can be useful for things like [topic compaction](concepts-topic-compaction.md) +Properties | An optional key/value map of user-defined properties +Producer name | The name of the producer that produced the message (producers are automatically given default names, but you can apply your own explicitly as well) +Sequence ID | Each Pulsar message belongs to an ordered sequence on its topic. A message's sequence ID is its ordering in that sequence. +Publish time | The timestamp of when the message was published (automatically applied by the producer) +Event time | An optional timestamp that applications can attach to the message representing when something happened, e.g. when the message was processed. The event time of a message is 0 if none is explicitly set. + + +> For a more in-depth breakdown of Pulsar message contents, see the documentation on Pulsar's [binary protocol](developing-binary-protocol.md). 
+ +## Producers + +A producer is a process that attaches to a topic and publishes messages to a Pulsar [broker](reference-terminology.md#broker) for processing. + +### Send modes + +Producers can send messages to brokers either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:-----------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync send | The producer will wait for acknowledgement from the broker after sending each message. If acknowledgment isn't received then the producer will consider the send operation a failure. | +| Async send | The producer will put the message in a blocking queue and return immediately. The client library will then send the message to the broker in the background. If the queue is full (max size [configurable](reference-configuration.md#broker)), the producer could be blocked or fail immediately when calling the API, depending on arguments passed to the producer. | + +### Compression + +Messages published by producers can be compressed during transportation in order to save bandwidth. Pulsar currently supports the following types of compression: + +* [LZ4](https://github.com/lz4/lz4) +* [ZLIB](https://zlib.net/) +* [ZSTD](https://facebook.github.io/zstd/) +* [SNAPPY](https://google.github.io/snappy/) + +### Batching + +If batching is enabled, the producer will accumulate and send a batch of messages in a single request. Batching size is defined by the maximum number of messages and maximum publish latency. + +## Consumers + +A consumer is a process that attaches to a topic via a subscription and then receives messages. 
+ +### Receive modes + +Messages can be received from [brokers](reference-terminology.md#broker) either synchronously (sync) or asynchronously (async). + +| Mode | Description | +|:--------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Sync receive | A sync receive will be blocked until a message is available. | +| Async receive | An async receive will return immediately with a future value---a [`CompletableFuture`](http://www.baeldung.com/java-completablefuture) in Java, for example---that completes once a new message is available. | + +### Listeners + +Client libraries provide listener implementations for consumers. For example, the [Java client](client-libraries-java.md) provides a {@inject: javadoc:MessageListener:/client/org/apache/pulsar/client/api/MessageListener} interface. In this interface, the `received` method is called whenever a new message is received. + +### Acknowledgement + +When a consumer has consumed a message successfully, the consumer sends an acknowledgement request to the broker, so that the broker will discard the message. Otherwise, it [stores](concepts-architecture-overview.md#persistent-storage) the message. + +Messages can be acknowledged either one by one or cumulatively. With cumulative acknowledgement, the consumer only needs to acknowledge the last message it received. All messages in the stream up to (and including) the provided message will not be re-delivered to that consumer. + + +> Cumulative acknowledgement cannot be used with [shared subscription mode](#subscription-modes), because shared mode involves multiple consumers having access to the same subscription. + +In the shared subscription mode, messages can be acknowledged individually. 
+ +### Negative acknowledgement + +When a consumer fails to consume a message and wants to consume it again, the consumer can send a negative acknowledgement to the broker, and then the broker will redeliver the message. + +Messages can be negatively acknowledged one by one or cumulatively, depending on the subscription mode. + +In the exclusive and failover subscription modes, consumers only negatively acknowledge the last message they have received. + +In the shared and Key_Shared subscription modes, you can negatively acknowledge messages individually. + +### Acknowledgement timeout + +When a message is not consumed successfully, and you want to trigger the broker to redeliver the message automatically, you can adopt the unacknowledged message automatic re-delivery mechanism. If an acknowledgement timeout is specified, the client will track the unacknowledged messages within the entire `acktimeout` time range, and automatically send a `redeliver unacknowledged messages` request to the broker. + +> Note +> Prefer negative acknowledgement over acknowledgement timeout. Negative acknowledgement controls re-delivery of individual messages more precisely, and avoids invalid redeliveries when the message processing time exceeds the acknowledgement timeout. + +### Dead letter topic + +Dead letter topic enables you to consume new messages when some messages cannot be consumed successfully by a consumer. In this mechanism, messages that fail to be consumed are stored in a separate topic, which is called the dead letter topic. You can decide how to handle messages in the dead letter topic. 
+ +The following example shows how to enable dead letter topic in a Java client using the default dead letter topic: + +```java +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .build()) + .subscribe(); + +``` +The default dead letter topic uses this format: +``` +<topicname>-<subscriptionname>-DLQ +``` + +If you want to specify the name of the dead letter topic, use this Java client example: + +```java +Consumer consumer = pulsarClient.newConsumer(Schema.BYTES) + .topic(topic) + .subscriptionName("my-subscription") + .subscriptionType(SubscriptionType.Shared) + .deadLetterPolicy(DeadLetterPolicy.builder() + .maxRedeliverCount(maxRedeliveryCount) + .deadLetterTopic("your-topic-name") + .build()) + .subscribe(); + +``` + + +Dead letter topic depends on message re-delivery. Messages are redelivered either due to [acknowledgement timeout](#acknowledgement-timeout) or [negative acknowledgement](#negative-acknowledgement). If you are going to use negative acknowledgement on a message, make sure it is negatively acknowledged before the acknowledgement timeout. + +> Note +> Currently, dead letter topic is enabled only in the shared subscription mode. + +## Topics + +As in other pub-sub systems, topics in Pulsar are named channels for transmitting messages from [producers](reference-terminology.md#producer) to [consumers](reference-terminology.md#consumer). Topic names are URLs that have a well-defined structure: + +```http +{persistent|non-persistent}://tenant/namespace/topic +``` + +Topic name component | Description +:--------------------|:----------- +`persistent` / `non-persistent` | This identifies the type of topic. 
Pulsar supports two kinds of topics: [persistent](concepts-architecture-overview.md#persistent-storage) and [non-persistent](#non-persistent-topics) (persistent is the default, so if you don't specify a type the topic will be persistent). With persistent topics, all messages are durably [persisted](concepts-architecture-overview.md#persistent-storage) on disk (that means on multiple disks unless the broker is standalone), whereas data for [non-persistent](#non-persistent-topics) topics isn't persisted to storage disks. +`tenant` | The topic's tenant within the instance. Tenants are essential to multi-tenancy in Pulsar and can be spread across clusters. +`namespace` | The administrative unit of the topic, which acts as a grouping mechanism for related topics. Most topic configuration is performed at the [namespace](#namespaces) level. Each tenant can have multiple namespaces. +`topic` | The final part of the name. Topic names are freeform and have no special meaning in a Pulsar instance. + + +> #### No need to explicitly create new topics +> You don't need to explicitly create topics in Pulsar. If a client attempts to write or receive messages to/from a topic that does not yet exist, Pulsar will automatically create that topic under the [namespace](#namespaces) provided in the [topic name](#topics). + + +## Namespaces + +A namespace is a logical nomenclature within a tenant. A tenant can create multiple namespaces via the [admin API](admin-api-namespaces.md#create). For instance, a tenant with different applications can create a separate namespace for each application. A namespace allows the application to create and manage a hierarchy of topics. The name `my-tenant/app1` identifies a namespace for the application `app1` under the tenant `my-tenant`. You can create any number of [topics](#topics) under the namespace. + +## Subscription modes + +A subscription is a named configuration rule that determines how messages are delivered to consumers. 
There are four available subscription modes in Pulsar: [exclusive](#exclusive), [shared](#shared), [failover](#failover), and [key_shared](#key_shared). These modes are illustrated in the figure below. + +![Subscription modes](assets/pulsar-subscription-modes.png) + +### Exclusive + +In *exclusive* mode, only a single consumer is allowed to attach to the subscription. If another consumer attempts to subscribe to the topic using the same subscription, that consumer receives an error. + +In the diagram below, only **Consumer A-0** is allowed to consume messages. + +> Exclusive mode is the default subscription mode. + +![Exclusive subscriptions](assets/pulsar-exclusive-subscriptions.png) + +### Failover + +In *failover* mode, multiple consumers can attach to the same subscription. The consumers will be lexically sorted by the consumer's name and the first consumer will initially be the only one receiving messages. This consumer is called the *master consumer*. + +When the master consumer disconnects, all (non-acked and subsequent) messages will be delivered to the next consumer in line. + +In the diagram below, **Consumer-B-0** is the master consumer while **Consumer-B-1** would be the next in line to receive messages if **Consumer-B-0** disconnected. + +![Failover subscriptions](assets/pulsar-failover-subscriptions.png) + +### Shared + +In *shared* or *round robin* mode, multiple consumers can attach to the same subscription. Messages are delivered in a round robin distribution across consumers, and any given message is delivered to only one consumer. When a consumer disconnects, all the messages that were sent to it and not acknowledged will be rescheduled for sending to the remaining consumers. + +In the diagram below, **Consumer-C-1** and **Consumer-C-2** are able to subscribe to the topic, but **Consumer-C-3** and others could as well. + +> #### Limitations of shared mode +> There are two important things to be aware of when using shared mode: +> * Message ordering is not guaranteed. 
+> * You cannot use cumulative acknowledgment with shared mode. + +![Shared subscriptions](assets/pulsar-shared-subscriptions.png) + +### Key_Shared + +In *Key_Shared* mode, multiple consumers can attach to the same subscription. Messages are distributed across consumers, and messages with the same key or the same ordering key are delivered to only one consumer. No matter how many times the message is re-delivered, it is delivered to the same consumer. When a consumer connects or disconnects, the consumer that serves some message keys may change. + +> #### Limitations of Key_Shared mode +> There are two important things to be aware of when using Key_Shared mode: +> * You need to specify a key or orderingKey for messages. +> * You cannot use cumulative acknowledgment with Key_Shared mode. + +![Key_Shared subscriptions](assets/pulsar-key-shared-subscriptions.png) + +**Key_Shared subscription is a beta feature. You can disable it in `broker.conf`.** + +## Multi-topic subscriptions + +When a consumer subscribes to a Pulsar topic, by default it subscribes to one specific topic, such as `persistent://public/default/my-topic`. As of Pulsar version 1.23.0-incubating, however, Pulsar consumers can simultaneously subscribe to multiple topics. You can define a list of topics in two ways: + +* On the basis of a [**reg**ular **ex**pression](https://en.wikipedia.org/wiki/Regular_expression) (regex), for example `persistent://public/default/finance-.*` +* By explicitly defining a list of topics + +> When subscribing to multiple topics by regex, all topics must be in the same [namespace](#namespaces). + +When subscribing to multiple topics, the Pulsar client will automatically make a call to the Pulsar API to discover the topics that match the regex pattern/list and then subscribe to all of them. If any of the topics don't currently exist, the consumer will auto-subscribe to them once the topics are created. 
+ +> #### No ordering guarantees +> When a consumer subscribes to multiple topics, all ordering guarantees normally provided by Pulsar on single topics do not hold. If your use case for Pulsar involves any strict ordering requirements, we would strongly recommend against using this feature. + +Here are some multi-topic subscription examples for Java: + +```java +import java.util.regex.Pattern; + +import org.apache.pulsar.client.api.Consumer; +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient pulsarClient = // Instantiate Pulsar client object + +// Subscribe to all topics in a namespace +Pattern allTopicsInNamespace = Pattern.compile("persistent://public/default/.*"); +Consumer allTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(allTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); + +// Subscribe to a subset of topics in a namespace, based on regex +Pattern someTopicsInNamespace = Pattern.compile("persistent://public/default/foo.*"); +Consumer someTopicsConsumer = pulsarClient.newConsumer() + .topicsPattern(someTopicsInNamespace) + .subscriptionName("subscription-1") + .subscribe(); +``` + +For code examples, see: + +* [Java](client-libraries-java.md#multi-topic-subscriptions) + +## Partitioned topics + +Normal topics can be served only by a single broker, which limits the topic's maximum throughput. *Partitioned topics* are a special type of topic that are handled by multiple brokers, which allows for much higher throughput. + +Behind the scenes, a partitioned topic is actually implemented as N internal topics, where N is the number of partitions. When publishing messages to a partitioned topic, each message is routed to one of several brokers. The distribution of partitions across brokers is handled automatically by Pulsar. + +The diagram below illustrates this: + +![](assets/partitioning.png) + +Here, the topic **Topic1** has five partitions (**P0** through **P4**) split across three brokers. 
Because there are more partitions than brokers, two brokers handle two partitions apiece, while the third handles only one (again, Pulsar handles this distribution of partitions automatically). + +Messages for this topic are broadcast to two consumers. The [routing mode](#routing-modes) determines which partition each message is published to, while the [subscription mode](#subscription-modes) determines which messages go to which consumers. + +Decisions about routing and subscription modes can be made separately in most cases. In general, throughput concerns should guide partitioning/routing decisions while subscription decisions should be guided by application semantics. + +There is no difference between partitioned topics and normal topics in terms of how subscription modes work, as partitioning only determines what happens between when a message is published by a producer and processed and acknowledged by a consumer. + +Partitioned topics need to be explicitly created via the [admin API](admin-api-overview.md). The number of partitions can be specified when creating the topic. + +### Routing modes + +When publishing to partitioned topics, you must specify a *routing mode*. The routing mode determines which partition---that is, which internal topic---each message should be published to. + +There are three {@inject: javadoc:MessageRoutingMode:/client/org/apache/pulsar/client/api/MessageRoutingMode} modes available: + +Mode | Description +:--------|:------------ +`RoundRobinPartition` | If no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. If a key is specified on the message, however, the partitioned producer will hash the key and assign the message to a particular partition. This is the default mode. 
+`SinglePartition` | If no key is provided, the producer will randomly pick one single partition and publish all the messages into that partition. If a key is specified on the message, however, the partitioned producer will hash the key and assign the message to a particular partition. +`CustomPartition` | Use a custom message router implementation that will be called to determine the partition for a particular message. You can create a custom routing mode by using the [Java client](client-libraries-java.md) and implementing the {@inject: javadoc:MessageRouter:/client/org/apache/pulsar/client/api/MessageRouter} interface. + +### Ordering guarantee + +The ordering of messages is related to the MessageRoutingMode and the message key. Usually, users want a per-key-partition ordering guarantee. + +If there is a key attached to a message, the message will be routed to the corresponding partition based on the hashing scheme specified by {@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} in {@inject: javadoc:ProducerBuilder:/client/org/apache/pulsar/client/api/ProducerBuilder}, when using either `SinglePartition` or `RoundRobinPartition` mode. + +Ordering guarantee | Description | Routing Mode and Key +:------------------|:------------|:------------ +Per-key-partition | All the messages with the same key will be in order and be placed in same partition. | Use either `SinglePartition` or `RoundRobinPartition` mode, and Key is provided by each message. +Per-producer | All the messages from the same producer will be in order. | Use `SinglePartition` mode, and no Key is provided for each message. + +### Hashing scheme + +{@inject: javadoc:HashingScheme:/client/org/apache/pulsar/client/api/HashingScheme} is an enum that represents the set of standard hashing functions available when choosing the partition to use for a particular message. + +There are 2 types of standard hashing functions available: `JavaStringHash` and `Murmur3_32Hash`. 
+The default hashing function for producer is `JavaStringHash`. +Please pay attention that `JavaStringHash` is not useful when producers can be from different multiple language clients, under this use case, it is recommended to use `Murmur3_32Hash`. + + + +## Non-persistent topics + + +By default, Pulsar persistently stores *all* unacknowledged messages on multiple [BookKeeper](concepts-architecture-overview.md#persistent-storage) bookies (storage nodes). Data for messages on persistent topics can thus survive broker restarts and subscriber failover. + +Pulsar also, however, supports **non-persistent topics**, which are topics on which messages are *never* persisted to disk and live only in memory. When using non-persistent delivery, killing a Pulsar broker or disconnecting a subscriber to a topic means that all in-transit messages are lost on that (non-persistent) topic, meaning that clients may see message loss. + +Non-persistent topics have names of this form (note the `non-persistent` in the name): + +```http +non-persistent://tenant/namespace/topic +``` + +> For more info on using non-persistent topics, see the [Non-persistent messaging cookbook](cookbooks-non-persistent.md). + +In non-persistent topics, brokers immediately deliver messages to all connected subscribers *without persisting them* in [BookKeeper](concepts-architecture-overview.md#persistent-storage). If a subscriber is disconnected, the broker will not be able to deliver those in-transit messages, and subscribers will never be able to receive those messages again. Eliminating the persistent storage step makes messaging on non-persistent topics slightly faster than on persistent topics in some cases, but with the caveat that some of the core benefits of Pulsar are lost. + +> With non-persistent topics, message data lives only in memory. If a message broker fails or message data can otherwise not be retrieved from memory, your message data may be lost. 
Use non-persistent topics only if you're *certain* that your use case requires it and can sustain it. + +By default, non-persistent topics are enabled on Pulsar brokers. You can disable them in the broker's [configuration](reference-configuration.md#broker-enableNonPersistentTopics). You can manage non-persistent topics using the [`pulsar-admin topics`](reference-pulsar-admin.md#topics-1) interface. + +### Performance + +Non-persistent messaging is usually faster than persistent messaging because brokers don't persist messages and immediately send acks back to the producer as soon as that message is delivered to connected brokers. Producers thus see comparatively low publish latency with non-persistent topics. + +### Client API + +Producers and consumers can connect to non-persistent topics in the same way as persistent topics, with the crucial difference that the topic name must start with `non-persistent`. All three subscription modes---[exclusive](#exclusive), [shared](#shared), and [failover](#failover)---are supported for non-persistent topics. 
+ +Here's an example [Java consumer](client-libraries-java.md#consumers) for a non-persistent topic: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +String npTopic = "non-persistent://public/default/my-topic"; +String subscriptionName = "my-subscription-name"; + +Consumer consumer = client.newConsumer() + .topic(npTopic) + .subscriptionName(subscriptionName) + .subscribe(); +``` + +Here's an example [Java producer](client-libraries-java.md#producer) for the same non-persistent topic: + +```java +Producer producer = client.newProducer() + .topic(npTopic) + .create(); +``` + +## Message retention and expiry + +By default, Pulsar message brokers: + +* immediately delete *all* messages that have been acknowledged by a consumer, and +* [persistently store](concepts-architecture-overview.md#persistent-storage) all unacknowledged messages in a message backlog. + +Pulsar has two features, however, that enable you to override this default behavior: + +* Message **retention** enables you to store messages that have been acknowledged by a consumer +* Message **expiry** enables you to set a time to live (TTL) for messages that have not yet been acknowledged + +> All message retention and expiry is managed at the [namespace](#namespaces) level. For a how-to, see the [Message retention and expiry](cookbooks-retention-expiry.md) cookbook. + +The diagram below illustrates both concepts: + +![Message retention and expiry](assets/retention-expiry.png) + +With message retention, shown at the top, a retention policy applied to all topics in a namespace dictates that some messages are durably stored in Pulsar even though they've already been acknowledged. Acknowledged messages that are not covered by the retention policy are deleted. Without a retention policy, *all* of the acknowledged messages would be deleted. 
+ +With message expiry, shown at the bottom, some messages are deleted, even though they haven't been acknowledged, because they've expired according to the TTL applied to the namespace (for example because a TTL of 5 minutes has been applied and the messages haven't been acknowledged but are 10 minutes old). + +## Message deduplication + +Message **duplication** occurs when a message is [persisted](concepts-architecture-overview.md#persistent-storage) by Pulsar more than once. Message ***de*duplication** is an optional Pulsar feature that prevents unnecessary message duplication by processing each message only once, *even if the message is received more than once*. + +The following diagram illustrates what happens when message deduplication is disabled vs. enabled: + +![Pulsar message deduplication](assets/message-deduplication.png) + + +Message deduplication is disabled in the scenario shown at the top. Here, a producer publishes message 1 on a topic; the message reaches a Pulsar broker and is [persisted](concepts-architecture-overview.md#persistent-storage) to BookKeeper. The producer then sends message 1 again (in this case due to some retry logic), and the message is received by the broker and stored in BookKeeper again, which means that duplication has occurred. + +In the second scenario at the bottom, the producer publishes message 1, which is received by the broker and persisted, as in the first scenario. When the producer attempts to publish the message again, however, the broker knows that it has already seen message 1 and thus does not persist the message. + +> Message deduplication is handled at the namespace level. For more instructions, see the [message deduplication cookbook](cookbooks-deduplication.md). + + +### Producer idempotency + +The other available approach to message deduplication is to ensure that each message is *only produced once*. This approach is typically called **producer idempotency**. 
The drawback of this approach is that it defers the work of message deduplication to the application. In Pulsar, this is handled at the [broker](reference-terminology.md#broker) level, which means that you don't need to modify your Pulsar client code. Instead, you only need to make administrative changes (see the [Managing message deduplication](cookbooks-deduplication.md) cookbook for a guide). + +### Deduplication and effectively-once semantics + +Message deduplication makes Pulsar an ideal messaging system to be used in conjunction with stream processing engines (SPEs) and other systems seeking to provide [effectively-once](https://streaml.io/blog/exactly-once) processing semantics. Messaging systems that don't offer automatic message deduplication require the SPE or other system to guarantee deduplication, which means that strict message ordering comes at the cost of burdening the application with the responsibility of deduplication. With Pulsar, strict ordering guarantees come at no application-level cost. + +> More in-depth information can be found in [this post](https://streaml.io/blog/pulsar-effectively-once/) on the [Streamlio blog](https://streaml.io/blog) + +## Delayed message delivery +Delayed message delivery enables you to consume a message later rather than immediately. In this mechanism, a message is stored in BookKeeper, `DelayedDeliveryTracker` maintains the time index(time -> messageId) in memory after published to a broker, and it is delivered to a consumer once the specific delayed time is passed. + +Delayed message delivery only works well in Shared subscription mode. In Exclusive and Failover subscription mode, the delayed message is dispatched immediately. + +The diagram below illustrates the concept of delayed message delivery: + +![Delayed Message Delivery](assets/message_delay.png) + +A broker saves a message without any check. 
When a consumer consumes a message, if the message is set to delay, then the message is added to `DelayedDeliveryTracker`. A subscription checks and gets timeout messages from `DelayedDeliveryTracker`. + +### Broker +Delayed message delivery is enabled by default. You can change it in the broker configuration file as below: + +``` +# Whether to enable the delayed delivery for messages. +# If disabled, messages are immediately delivered and there is no tracking overhead. +delayedDeliveryEnabled=true + +# Control the ticking time for the retry of delayed message delivery, +# affecting the accuracy of the delivery time compared to the scheduled time. +# Default is 1 second. +delayedDeliveryTickTimeMillis=1000 +``` + +### Producer +The following is an example of delayed message delivery for a producer in Java: +```java +// message to be delivered at the configured delay interval +producer.newMessage().deliverAfter(3L, TimeUnit.MINUTES).value("Hello Pulsar!").send(); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/concepts-overview.md b/site2/website/versioned_docs/version-2.5.0/concepts-overview.md new file mode 100644 index 0000000000000..bcfccc463b1e2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/concepts-overview.md @@ -0,0 +1,31 @@ +--- +id: version-2.5.0-concepts-overview +title: Pulsar Overview +sidebar_label: Overview +original_id: concepts-overview +--- + +Pulsar is a multi-tenant, high-performance solution for server-to-server messaging. Pulsar was originally developed by Yahoo and is now under the stewardship of the [Apache Software Foundation](https://www.apache.org/). + +Key features of Pulsar are listed below: + +* Native support for multiple clusters in a Pulsar instance, with seamless [geo-replication](administration-geo.md) of messages across clusters. +* Very low publish and end-to-end latency. +* Seamless scalability to over a million topics. 
+* A simple [client API](concepts-clients.md) with bindings for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) and [C++](client-libraries-cpp.md). +* Multiple [subscription modes](concepts-messaging.md#subscription-modes) ([exclusive](concepts-messaging.md#exclusive), [shared](concepts-messaging.md#shared), and [failover](concepts-messaging.md#failover)) for topics. +* Guaranteed message delivery with [persistent message storage](concepts-architecture-overview.md#persistent-storage) provided by [Apache BookKeeper](http://bookkeeper.apache.org/). +* A serverless light-weight computing framework [Pulsar Functions](functions-overview.md) offers the capability for stream-native data processing. +* A serverless connector framework [Pulsar IO](io-overview.md), which is built on Pulsar Functions, makes it easier to move data in and out of Apache Pulsar. +* [Tiered Storage](concepts-tiered-storage.md) offloads data from hot/warm storage to cold/longterm storage (such as S3 and GCS) when the data is aging out. 
+ +## Contents + +- [Messaging Concepts](concepts-messaging.md) +- [Architecture Overview](concepts-architecture-overview.md) +- [Pulsar Clients](concepts-clients.md) +- [Geo Replication](concepts-replication.md) +- [Multi Tenancy](concepts-multi-tenancy.md) +- [Authentication and Authorization](concepts-authentication.md) +- [Topic Compaction](concepts-topic-compaction.md) +- [Tiered Storage](concepts-tiered-storage.md) diff --git a/site2/website/versioned_docs/version-2.5.0/concepts-tiered-storage.md b/site2/website/versioned_docs/version-2.5.0/concepts-tiered-storage.md new file mode 100644 index 0000000000000..2b7be1775ad73 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/concepts-tiered-storage.md @@ -0,0 +1,18 @@ +--- +id: version-2.5.0-concepts-tiered-storage +title: Tiered Storage +sidebar_label: Tiered Storage +original_id: concepts-tiered-storage +--- + +Pulsar's segment oriented architecture allows for topic backlogs to grow very large, effectively without limit. However, this can become expensive over time. + +One way to alleviate this cost is to use Tiered Storage. With tiered storage, older messages in the backlog can be moved from BookKeeper to a cheaper storage mechanism, while still allowing clients to access the backlog as if nothing had changed. + +![Tiered Storage](assets/pulsar-tiered-storage.png) + +> Data written to BookKeeper is replicated to 3 physical machines by default. However, once a segment is sealed in BookKeeper it becomes immutable and can be copied to long term storage. Long term storage can achieve cost savings by using mechanisms such as [Reed-Solomon error correction](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction) to require fewer physical copies of data. + +Pulsar currently supports S3, Google Cloud Storage (GCS), and filesystem for [long term store](https://pulsar.apache.org/docs/en/cookbooks-tiered-storage/). 
Offloading to long term storage is triggered via a REST API or command line interface. The user passes in the amount of topic data they wish to retain on BookKeeper, and the broker will copy the backlog data to long term storage. The original data will then be deleted from BookKeeper after a configured delay (4 hours by default). + +> For a guide to setting up tiered storage, see the [Tiered storage cookbook](cookbooks-tiered-storage.md). diff --git a/site2/website/versioned_docs/version-2.5.0/cookbooks-deduplication.md b/site2/website/versioned_docs/version-2.5.0/cookbooks-deduplication.md new file mode 100644 index 0000000000000..f232117b85afb --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/cookbooks-deduplication.md @@ -0,0 +1,121 @@ +--- +id: version-2.5.0-cookbooks-deduplication +title: Message deduplication +sidebar_label: Message deduplication +original_id: cookbooks-deduplication +--- + +When **Message deduplication** is enabled, it ensures that each message produced on Pulsar topics is persisted to disk *only once*, even if the message is produced more than once. Message deduplication is handled automatically on the server side. + +To use message deduplication in Pulsar, you have to [configure](#configure-message-deduplication) your Pulsar brokers and [clients](#pulsar-clients). + +> For more details on message deduplication, refer to [Concepts and Architecture](concepts-messaging.md#message-deduplication). + +## How it works + +You can enable or disable message deduplication on a per-namespace basis. By default, it is *disabled* on all namespaces. You can enable it in the following ways: + +* Enable for all namespaces at the broker-level +* Enable for specific namespaces with the `pulsar-admin namespaces` interface + +## Configure message deduplication + +You can configure message deduplication in Pulsar using the [`broker.conf`](reference-configuration.md#broker) configuration file. 
The following deduplication-related parameters are available. + +Parameter | Description | Default +:---------|:------------|:------- +`brokerDeduplicationEnabled` | Sets the default behavior for message deduplication in the Pulsar [broker](reference-terminology.md#broker). If it is set to `true`, message deduplication is enabled by default on all namespaces; if it is set to `false` (the default), you have to enable or disable deduplication on a per-namespace basis. | `false` +`brokerDeduplicationMaxNumberOfProducers` | The maximum number of producers for which information is stored for deduplication purposes. | `10000` +`brokerDeduplicationEntriesInterval` | The number of entries after which a deduplication informational snapshot is taken. A larger interval leads to fewer snapshots being taken, though this lengthens the topic recovery time (the time required for entries published after the snapshot to be replayed). | `1000` +`brokerDeduplicationProducerInactivityTimeoutMinutes` | The time of inactivity (in minutes) after which the broker discards deduplication information related to a disconnected producer. | `360` (6 hours) + +### Set default value at the broker-level + +By default, message deduplication is *disabled* on all Pulsar namespaces. To enable it by default on all namespaces, set the `brokerDeduplicationEnabled` parameter to `true` and re-start the broker. + +Even if you set the value for `brokerDeduplicationEnabled`, enabling or disabling via Pulsar admin CLI will override the default settings at the broker-level. + +### Enable message deduplication + +Though message deduplication is disabled by default at broker-level, you can enable message deduplication for specific namespaces using the [`pulsar-admin namespace set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) command. You can use the `--enable`/`-e` flag and specify the namespace. 
The following is an example with `<tenant>/<namespace>`: + +```bash +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --enable # or just -e +``` + +### Disable message deduplication + +Even if you enable message deduplication at broker-level, you can disable message deduplication for a specific namespace using the [`pulsar-admin namespaces set-deduplication`](reference-pulsar-admin.md#namespace-set-deduplication) command. Use the `--disable`/`-d` flag and specify the namespace. The following is an example with `<tenant>/<namespace>`: + +```bash +$ bin/pulsar-admin namespaces set-deduplication \ + public/default \ + --disable # or just -d +``` + +## Pulsar clients + +If you enable message deduplication in Pulsar brokers, you need to complete the following tasks for your client producers: + +1. Specify a name for the producer. +1. Set the message timeout to `0` (namely, no timeout). + +The instructions for Java, Python, and C++ clients are different. + + + + +To enable message deduplication on a [Java producer](client-libraries-java.md#producers), set the producer name using the `producerName` setter, and set the timeout to `0` using the `sendTimeout` setter. + +```java +import org.apache.pulsar.client.api.Producer; +import org.apache.pulsar.client.api.PulsarClient; +import java.util.concurrent.TimeUnit; + +PulsarClient pulsarClient = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); +Producer producer = pulsarClient.newProducer() + .producerName("producer-1") + .topic("persistent://public/default/topic-1") + .sendTimeout(0, TimeUnit.SECONDS) + .create(); +``` + + + +To enable message deduplication on a [Python producer](client-libraries-python.md#producers), set the producer name using `producer_name`, and set the timeout to `0` using `send_timeout_millis`. 
+ +```python +import pulsar + +client = pulsar.Client("pulsar://localhost:6650") +producer = client.create_producer( + "persistent://public/default/topic-1", + producer_name="producer-1", + send_timeout_millis=0) +``` + + +To enable message deduplication on a [C++ producer](client-libraries-cpp.md#producer), set the producer name using `setProducerName`, and set the timeout to `0` using `setSendTimeout`. + +```cpp +#include <pulsar/Client.h> + +std::string serviceUrl = "pulsar://localhost:6650"; +std::string topic = "persistent://some-tenant/ns1/topic-1"; +std::string producerName = "producer-1"; + +Client client(serviceUrl); + +ProducerConfiguration producerConfig; +producerConfig.setSendTimeout(0); +producerConfig.setProducerName(producerName); + +Producer producer; + +Result result = client.createProducer(topic, producerConfig, producer); +``` + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/cookbooks-retention-expiry.md b/site2/website/versioned_docs/version-2.5.0/cookbooks-retention-expiry.md new file mode 100644 index 0000000000000..f0ddd53d756e4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/cookbooks-retention-expiry.md @@ -0,0 +1,291 @@ +--- +id: version-2.5.0-cookbooks-retention-expiry +title: Message retention and expiry +sidebar_label: Message retention and expiry +original_id: cookbooks-retention-expiry +--- + +Pulsar brokers are responsible for handling messages that pass through Pulsar, including [persistent storage](concepts-architecture-overview.md#persistent-storage) of messages. By default, for each topic, brokers only retain messages that are in at least one backlog. A backlog is the set of unacknowledged messages for a particular subscription. As a topic can have multiple subscriptions, a topic can have multiple backlogs. + +As a consequence, no messages are retained (by default) on a topic that has not had any subscriptions created for it. 
+ +(Note that messages that are no longer being stored are not necessarily immediately deleted, and may in fact still be accessible until the next ledger rollover. Because clients cannot predict when rollovers may happen, it is not wise to rely on a rollover not happening at an inconvenient point in time.) + +In Pulsar, you can modify this behavior, with namespace granularity, in two ways: + +* You can persistently store messages that are not within a backlog (because they've been acknowledged on every existing subscription, or because there are no subscriptions) by setting [retention policies](#retention-policies). +* Messages that are not acknowledged within a specified timeframe can be automatically acknowledged, by specifying the [time to live](#time-to-live-ttl) (TTL). + +Pulsar's [admin interface](admin-api-overview.md) enables you to manage both retention policies and TTL with namespace granularity (and thus within a specific tenant and either on a specific cluster or in the [`global`](concepts-architecture-overview.md#global-cluster) cluster). + + +> #### Retention and TTL solve two different problems +> * Message retention: Keep the data for at least X hours (even if acknowledged) +> * Time-to-live: Discard data after some time (by automatically acknowledging) +> +> Most applications will want to use at most one of these. + + +## Retention policies + +By default, when a Pulsar message arrives at a broker it will be stored until it has been acknowledged on all subscriptions, at which point it will be marked for deletion. You can override this behavior and retain even messages that have already been acknowledged on all subscriptions by setting a *retention policy* for all topics in a given namespace. Retention policies are either a *size limit* or a *time limit*. + +Retention policies are particularly useful if you intend to exclusively use the Reader interface. 
Because the Reader interface does not use acknowledgements, messages will never exist within backlogs. Most realistic Reader-only use cases require that retention be configured. + +When you set a size limit of, say, 10 gigabytes, then acknowledged messages in all topics in the namespace will be retained until the size limit for the topic is reached; if you set a time limit of, say, 1 day, then acknowledged messages for all topics in the namespace will be retained for 24 hours. The retention settings apply to all messages on topics that do not have any subscriptions, or if there are subscriptions, to messages that have been acked by all subscriptions. The retention policy settings do not affect unacknowledged messages on topics with subscriptions -- these are instead controlled by the backlog quota (see below). + +When a retention limit is exceeded, the oldest message is marked for deletion until the set of retained messages falls within the specified limits again. + +It is also possible to set *unlimited* retention time or size by setting `-1` for either time or size retention. + +### Defaults + +There are two configuration parameters that you can use to set [instance](reference-terminology.md#instance)-wide defaults for message retention: [`defaultRetentionTimeInMinutes=0`](reference-configuration.md#broker-defaultRetentionTimeInMinutes) and [`defaultRetentionSizeInMB=0`](reference-configuration.md#broker-defaultRetentionSizeInMB). + +Both of these parameters are in the [`broker.conf`](reference-configuration.md#broker) configuration file. + +### Set retention policy + +You can set a retention policy for a namespace by specifying the namespace as well as both a size limit *and* a time limit. + +#### pulsar-admin + +Use the [`set-retention`](reference-pulsar-admin.md#namespaces-set-retention) subcommand and specify a namespace, a size limit using the `-s`/`--size` flag, and a time limit using the `-t`/`--time` flag. 
+ +##### Examples + +To set a size limit of 10 gigabytes and a time limit of 3 hours for the `my-tenant/my-ns` namespace: + +```shell +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 10G \ + --time 3h +``` + +To set retention with a size limit but without a time limit: + +```shell +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size 1T \ + --time -1 +``` + +Retention can be configured to be unlimited both in size and time: + +```shell +$ pulsar-admin namespaces set-retention my-tenant/my-ns \ + --size -1 \ + --time -1 +``` + + + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/retention|operation/setRetention} + +#### Java + +```java +int retentionTime = 10; // 10 minutes +int retentionSize = 500; // 500 megabytes +RetentionPolicies policies = new RetentionPolicies(retentionTime, retentionSize); +admin.namespaces().setRetention(namespace, policies); +``` + +### Get retention policy + +You can fetch the retention policy for a namespace by specifying the namespace. The output will be a JSON object with two keys: `retentionTimeInMinutes` and `retentionSizeInMB`. + +#### pulsar-admin + +Use the [`get-retention`](reference-pulsar-admin.md#namespaces) subcommand and specify the namespace. + +##### Example + +```shell +$ pulsar-admin namespaces get-retention my-tenant/my-ns +{ + "retentionTimeInMinutes": 10, + "retentionSizeInMB": 0 +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/retention|operation/getRetention} + +#### Java + +```java +admin.namespaces().getRetention(namespace); +``` + +## Backlog quotas + +*Backlogs* are sets of unacknowledged messages for a topic that have been stored by bookies. Pulsar stores all unacknowledged messages in backlogs until they are processed and acknowledged. + +You can control the allowable size of backlogs, at the namespace level, using *backlog quotas*. 
Setting a backlog quota involves setting: + +TODO: Expand on is this per backlog or per topic? + +* an allowable *size threshold* for each topic in the namespace +* a *retention policy* that determines which action the [broker](reference-terminology.md#broker) takes if the threshold is exceeded. + +The following retention policies are available: + +Policy | Action +:------|:------ +`producer_request_hold` | The broker will hold and not persist produce request payload +`producer_exception` | The broker will disconnect from the client by throwing an exception +`consumer_backlog_eviction` | The broker will begin discarding backlog messages + + +> #### Beware the distinction between retention policy types +> As you may have noticed, there are two definitions of the term "retention policy" in Pulsar, one that applies to persistent storage of messages not in backlogs, and one that applies to messages within backlogs. + + +Backlog quotas are handled at the namespace level. They can be managed via: + +### Set size thresholds and backlog retention policies + +You can set a size threshold and backlog retention policy for all of the topics in a [namespace](reference-terminology.md#namespace) by specifying the namespace, a size limit, and a policy by name. + +#### pulsar-admin + +Use the [`set-backlog-quota`](reference-pulsar-admin.md#namespaces) subcommand and specify a namespace, a size limit using the `-l`/`--limit` flag, and a retention policy using the `-p`/`--policy` flag. 
+ +##### Example + +```shell +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ + --limit 2G \ + --policy producer_request_hold +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/getBacklogQuotaMap} + +#### Java + +```java +long sizeLimit = 2147483648L; +BacklogQuota.RetentionPolicy policy = BacklogQuota.RetentionPolicy.producer_request_hold; +BacklogQuota quota = new BacklogQuota(sizeLimit, policy); +admin.namespaces().setBacklogQuota(namespace, quota); +``` + +### Get backlog threshold and backlog retention policy + +You can see which size threshold and backlog retention policy has been applied to a namespace. + +#### pulsar-admin + +Use the [`get-backlog-quotas`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-backlog-quotas) subcommand and specify a namespace. Here's an example: + +```shell +$ pulsar-admin namespaces get-backlog-quotas my-tenant/my-ns +{ + "destination_storage": { + "limit" : 2147483648, + "policy" : "producer_request_hold" + } +} +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/backlogQuotaMap|operation/getBacklogQuotaMap} + +#### Java + +```java +Map quotas = + admin.namespaces().getBacklogQuotas(namespace); +``` + +### Remove backlog quotas + +#### pulsar-admin + +Use the [`remove-backlog-quota`](reference-pulsar-admin.md#pulsar-admin-namespaces-remove-backlog-quota) subcommand and specify a namespace. Here's an example: + +```shell +$ pulsar-admin namespaces remove-backlog-quota my-tenant/my-ns +``` + +#### REST API + +{@inject: endpoint|DELETE|/admin/v2/namespaces/:tenant/:namespace/backlogQuota|operation/removeBacklogQuota} + +#### Java + +```java +admin.namespaces().removeBacklogQuota(namespace); +``` + +### Clear backlog + +#### pulsar-admin + +Use the [`clear-backlog`](reference-pulsar-admin.md#pulsar-admin-namespaces-clear-backlog) subcommand. 
+ +##### Example + +```shell +$ pulsar-admin namespaces clear-backlog my-tenant/my-ns +``` + +By default, you will be prompted to ensure that you really want to clear the backlog for the namespace. You can override the prompt using the `-f`/`--force` flag. + +## Time to live (TTL) + +By default, Pulsar stores all unacknowledged messages forever. This can lead to heavy disk space usage in cases where a lot of messages are going unacknowledged. If disk space is a concern, you can set a time to live (TTL) that determines how long unacknowledged messages will be retained. + +### Set the TTL for a namespace + +#### pulsar-admin + +Use the [`set-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-set-message-ttl) subcommand and specify a namespace and a TTL (in seconds) using the `-ttl`/`--messageTTL` flag. + +##### Example + +```shell +$ pulsar-admin namespaces set-message-ttl my-tenant/my-ns \ + --messageTTL 120 # TTL of 2 minutes +``` + +#### REST API + +{@inject: endpoint|POST|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/setNamespaceMessageTTL} + +#### Java + +```java +admin.namespaces().setNamespaceMessageTTL(namespace, ttlInSeconds); +``` + +### Get the TTL configuration for a namespace + +#### pulsar-admin + +Use the [`get-message-ttl`](reference-pulsar-admin.md#pulsar-admin-namespaces-get-message-ttl) subcommand and specify a namespace. 
+ +##### Example + +```shell +$ pulsar-admin namespaces get-message-ttl my-tenant/my-ns +60 +``` + +#### REST API + +{@inject: endpoint|GET|/admin/v2/namespaces/:tenant/:namespace/messageTTL|operation/getNamespaceMessageTTL} + +#### Java + +```java +admin.namespaces().getNamespaceMessageTTL(namespace) +``` + diff --git a/site2/website/versioned_docs/version-2.5.0/cookbooks-tiered-storage.md b/site2/website/versioned_docs/version-2.5.0/cookbooks-tiered-storage.md new file mode 100644 index 0000000000000..1115797687210 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/cookbooks-tiered-storage.md @@ -0,0 +1,296 @@ +--- +id: version-2.5.0-cookbooks-tiered-storage +title: Tiered Storage +sidebar_label: Tiered Storage +original_id: cookbooks-tiered-storage +--- + +Pulsar's **Tiered Storage** feature allows older backlog data to be offloaded to long term storage, thereby freeing up space in BookKeeper and reducing storage costs. This cookbook walks you through using tiered storage in your Pulsar cluster. + +* Tiered storage uses [Apache jclouds](https://jclouds.apache.org) to support +[Amazon S3](https://aws.amazon.com/s3/) and [Google Cloud Storage](https://cloud.google.com/storage/)(GCS for short) +for long term storage. With Jclouds, it is easy to add support for more +[cloud storage providers](https://jclouds.apache.org/reference/providers/#blobstore-providers) in the future. + +* Tiered storage uses [Apache Hadoop](http://hadoop.apache.org/) to support filesystem for long term storage. +With Hadoop, it is easy to add support for more filesystem in the future. + +## When should I use Tiered Storage? + +Tiered storage should be used when you have a topic for which you want to keep a very long backlog for a long time. 
For example, if you have a topic containing user actions which you use to train your recommendation systems, you may want to keep that data for a long time, so that if you change your recommendation algorithm you can rerun it against your full user history. + +## The offloading mechanism + +A topic in Pulsar is backed by a log, known as a managed ledger. This log is composed of an ordered list of segments. Pulsar only ever writes to the final segment of the log. All previous segments are sealed. The data within the segment is immutable. This is known as a segment oriented architecture. + +![Tiered storage](assets/pulsar-tiered-storage.png "Tiered Storage") + +The Tiered Storage offloading mechanism takes advantage of this segment oriented architecture. When offloading is requested, the segments of the log are copied, one-by-one, to tiered storage. All segments of the log, apart from the segment currently being written to, can be offloaded. + +On the broker, the administrator must configure the bucket and credentials for the cloud storage service. +The configured bucket must exist before attempting to offload. If it does not exist, the offload operation will fail. + +Pulsar uses multi-part objects to upload the segment data. It is possible that a broker could crash while uploading the data. +We recommend you add a life cycle rule to your bucket to expire incomplete multi-part uploads after a day or two to avoid +getting charged for incomplete uploads. + +## Configuring the offload driver + +Offloading is configured in ```broker.conf```. + +At a minimum, the administrator must configure the driver, the bucket and the authenticating credentials. +There are also some other knobs to configure, like the bucket region, the max block size in backed storage, etc. 
+ +Currently we support the following driver types: + +- `aws-s3`: [Simple Cloud Storage Service](https://aws.amazon.com/s3/) +- `google-cloud-storage`: [Google Cloud Storage](https://cloud.google.com/storage/) +- `filesystem`: [Filesystem Storage](http://hadoop.apache.org/) + +> Driver names are case-insensitive. There is an additional driver type, `s3`, which is identical to `aws-s3`, +> though it requires that you specify an endpoint url using `s3ManagedLedgerOffloadServiceEndpoint`. This is useful if +> using an S3-compatible data store other than AWS. + +```conf +managedLedgerOffloadDriver=aws-s3 +``` + +### "aws-s3" Driver configuration + +#### Bucket and Region + +Buckets are the basic containers that hold your data. +Everything that you store in Cloud Storage must be contained in a bucket. +You can use buckets to organize your data and control access to your data, +but unlike directories and folders, you cannot nest buckets. + +```conf +s3ManagedLedgerOffloadBucket=pulsar-topic-offload +``` + +Bucket Region is the region where the bucket is located. Bucket Region is not a required +but a recommended configuration. If it is not configured, it will use the default region. + +With AWS S3, the default region is `US East (N. Virginia)`. Page +[AWS Regions and Endpoints](https://docs.aws.amazon.com/general/latest/gr/rande.html) contains more information. + +```conf +s3ManagedLedgerOffloadRegion=eu-west-3 +``` + +#### Authentication with AWS + +To be able to access AWS S3, you need to authenticate with AWS S3. +Pulsar does not provide any direct means of configuring authentication for AWS S3, +but relies on the mechanisms supported by the +[DefaultAWSCredentialsProviderChain](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/auth/DefaultAWSCredentialsProviderChain.html). + +Once you have created a set of credentials in the AWS IAM console, they can be configured in a number of ways. + +1. 
Using ec2 instance metadata credentials + +If you are on AWS instance with an instance profile that provides credentials, Pulsar will use these credentials +if no other mechanism is provided + +2. Set the environment variables **AWS_ACCESS_KEY_ID** and **AWS_SECRET_ACCESS_KEY** in ```conf/pulsar_env.sh```. + +```bash +export AWS_ACCESS_KEY_ID=ABC123456789 +export AWS_SECRET_ACCESS_KEY=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c +``` + +> \"export\" is important so that the variables are made available in the environment of spawned processes. + + +3. Add the Java system properties *aws.accessKeyId* and *aws.secretKey* to **PULSAR_EXTRA_OPTS** in `conf/pulsar_env.sh`. + +```bash +PULSAR_EXTRA_OPTS="${PULSAR_EXTRA_OPTS} ${PULSAR_MEM} ${PULSAR_GC} -Daws.accessKeyId=ABC123456789 -Daws.secretKey=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c -Dio.netty.leakDetectionLevel=disabled -Dio.netty.recycler.maxCapacity.default=1000 -Dio.netty.recycler.linkCapacity=1024" +``` + +4. Set the access credentials in ```~/.aws/credentials```. + +```conf +[default] +aws_access_key_id=ABC123456789 +aws_secret_access_key=ded7db27a4558e2ea8bbf0bf37ae0e8521618f366c +``` + +5. Assuming an IAM role + +If you want to assume an IAM role, this can be done via specifying the following: + +```conf +s3ManagedLedgerOffloadRole= +s3ManagedLedgerOffloadRoleSessionName=pulsar-s3-offload +``` + +This will use the `DefaultAWSCredentialsProviderChain` for assuming this role. + +> The broker must be rebooted for credentials specified in pulsar_env to take effect. + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to AWS S3. + +- ```s3ManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of + a "part" sent during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```s3ManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for + each individual read when reading back data from AWS S3. 
Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "google-cloud-storage" Driver configuration + +Buckets are the basic containers that hold your data. Everything that you store in +Cloud Storage must be contained in a bucket. You can use buckets to organize your data and +control access to your data, but unlike directories and folders, you cannot nest buckets. + +```conf +gcsManagedLedgerOffloadBucket=pulsar-topic-offload +``` + +Bucket Region is the region where bucket located. Bucket Region is not a required but +a recommended configuration. If it is not configured, It will use the default region. + +Regarding GCS, buckets are default created in the `us multi-regional location`, +page [Bucket Locations](https://cloud.google.com/storage/docs/bucket-locations) contains more information. + +```conf +gcsManagedLedgerOffloadRegion=europe-west3 +``` + +#### Authentication with GCS + +The administrator needs to configure `gcsManagedLedgerOffloadServiceAccountKeyFile` in `broker.conf` +for the broker to be able to access the GCS service. `gcsManagedLedgerOffloadServiceAccountKeyFile` is +a Json file, containing the GCS credentials of a service account. +[Service Accounts section of this page](https://support.google.com/googleapi/answer/6158849) contains +more information of how to create this key file for authentication. More information about google cloud IAM +is available [here](https://cloud.google.com/storage/docs/access-control/iam). + +Usually these are the steps to create the authentication file: +1. Open the API Console Credentials page. +2. If it's not already selected, select the project that you're creating credentials for. +3. To set up a new service account, click New credentials and then select Service account key. +4. Choose the service account to use for the key. +5. Download the service account's public/private key as a JSON file that can be loaded by a Google API client library. 
+ +```conf +gcsManagedLedgerOffloadServiceAccountKeyFile="/Users/hello/Downloads/project-804d5e6a6f33.json" +``` + +#### Configuring the size of block read/write + +Pulsar also provides some knobs to configure the size of requests sent to GCS. + +- ```gcsManagedLedgerOffloadMaxBlockSizeInBytes``` configures the maximum size of a "part" sent + during a multipart upload. This cannot be smaller than 5MB. Default is 64MB. +- ```gcsManagedLedgerOffloadReadBufferSizeInBytes``` configures the block size for each individual + read when reading back data from GCS. Default is 1MB. + +In both cases, these should not be touched unless you know what you are doing. + +### "filesystem" Driver configuration + + +#### Configure connection address + +You can configure the connection address in the `broker.conf` file. + +```conf +fileSystemURI="hdfs://127.0.0.1:9000" +``` +#### Configure Hadoop profile path + +The configuration file is stored in the Hadoop profile path. It contains various settings, such as base path, authentication, and so on. + +```conf +fileSystemProfilePath="../conf/filesystem_offload_core_site.xml" +``` + +The model for storing topic data uses `org.apache.hadoop.io.MapFile`. You can use all of the configurations in `org.apache.hadoop.io.MapFile` for Hadoop. + +**Example** + +```conf + + + fs.defaultFS + + + + + hadoop.tmp.dir + pulsar + + + + io.file.buffer.size + 4096 + + + + io.seqfile.compress.blocksize + 1000000 + + + + io.seqfile.compression.type + BLOCK + + + + io.map.index.interval + 128 + + +``` + +For more information about the configurations in `org.apache.hadoop.io.MapFile`, see [Filesystem Storage](http://hadoop.apache.org/). +## Configuring offload to run automatically + +Namespace policies can be configured to offload data automatically once a threshold is reached. The threshold is based on the size of data that the topic has stored on the pulsar cluster. Once the topic reaches the threshold, an offload operation will be triggered. 
Setting a negative value to the threshold will disable automatic offloading. Setting the threshold to 0 will cause the broker to offload data as soon as it possibly can. + +```bash +$ bin/pulsar-admin namespaces set-offload-threshold --size 10M my-tenant/my-namespace +``` + +> Automatic offload runs when a new segment is added to a topic log. If you set the threshold on a namespace, but few messages are being produced to the topic, offload will not run until the current segment is full. + + +## Triggering offload manually + +Offloading can be manually triggered through a REST endpoint on the Pulsar broker. We provide a CLI which will call this REST endpoint for you. + +When triggering offload, you must specify the maximum size, in bytes, of backlog which will be retained locally on the bookkeeper. The offload mechanism will offload segments from the start of the topic backlog until this condition is met. + +```bash +$ bin/pulsar-admin topics offload --size-threshold 10M my-tenant/my-namespace/topic1 +Offload triggered for persistent://my-tenant/my-namespace/topic1 for messages before 2:0:-1 +``` + +The command that triggers an offload will not wait until the offload operation has completed. To check the status of the offload, use offload-status. + +```bash +$ bin/pulsar-admin topics offload-status my-tenant/my-namespace/topic1 +Offload is currently running +``` + +To wait for offload to complete, add the -w flag. + +```bash +$ bin/pulsar-admin topics offload-status -w my-tenant/my-namespace/topic1 +Offload was a success +``` + +If there is an error offloading, the error will be propagated to the offload-status command. + +```bash +$ bin/pulsar-admin topics offload-status persistent://public/default/topic1 +Error in offload +null + +Reason: Error offloading: org.apache.bookkeeper.mledger.ManagedLedgerException: java.util.concurrent.CompletionException: com.amazonaws.services.s3.model.AmazonS3Exception: Anonymous users cannot initiate multipart uploads. Please authenticate. 
(Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 798758DE3F1776DF; S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g=), S3 Extended Request ID: dhBFz/lZm1oiG/oBEepeNlhrtsDlzoOhocuYMpKihQGXe6EG8puRGOkK6UwqzVrMXTWBxxHcS+g= +```` + diff --git a/site2/website/versioned_docs/version-2.5.0/deploy-aws.md b/site2/website/versioned_docs/version-2.5.0/deploy-aws.md new file mode 100644 index 0000000000000..7b00cee724700 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/deploy-aws.md @@ -0,0 +1,224 @@ +--- +id: version-2.5.0-deploy-aws +title: Deploying a Pulsar cluster on AWS using Terraform and Ansible +sidebar_label: Amazon Web Services +original_id: deploy-aws +--- + +> For instructions on deploying a single Pulsar cluster manually rather than using Terraform and Ansible, see [Deploying a Pulsar cluster on bare metal](deploy-bare-metal.md). For instructions on manually deploying a multi-cluster Pulsar instance, see [Deploying a Pulsar instance on bare metal](deploy-bare-metal-multi-cluster.md). + +One of the easiest ways to get a Pulsar [cluster](reference-terminology.md#cluster) running on [Amazon Web Services](https://aws.amazon.com/) (AWS) is to use the [Terraform](https://terraform.io) infrastructure provisioning tool and the [Ansible](https://www.ansible.com) server automation tool. Terraform can create the resources necessary for running the Pulsar cluster---[EC2](https://aws.amazon.com/ec2/) instances, networking and security infrastructure, etc.---While Ansible can install and run Pulsar on the provisioned resources. 
+ +## Requirements and setup + +In order to install a Pulsar cluster on AWS using Terraform and Ansible, you need to prepare the following things: + +* An [AWS account](https://aws.amazon.com/account/) and the [`aws`](https://aws.amazon.com/cli/) command-line tool +* Python and [pip](https://pip.pypa.io/en/stable/) +* The [`terraform-inventory`](https://github.com/adammck/terraform-inventory) tool, which enables Ansible to use Terraform artifacts + +You also need to make sure that you are currently logged into your AWS account via the `aws` tool: + +```bash +$ aws configure +``` + +## Installation + +You can install Ansible on Linux or macOS using pip. + +```bash +$ pip install ansible +``` + +You can install Terraform using the instructions [here](https://www.terraform.io/intro/getting-started/install.html). + +You also need to have the Terraform and Ansible configuration for Pulsar locally on your machine. You can find them in the [GitHub repository](https://github.com/apache/pulsar) of Pulsar, which you can fetch using Git commands: + +```bash +$ git clone https://github.com/apache/pulsar +$ cd pulsar/deployment/terraform-ansible/aws +``` + +## SSH setup + +> If you already have an SSH key and want to use it, you can skip the step of generating an SSH key and update `private_key_file` setting +> in `ansible.cfg` file and `public_key_path` setting in `terraform.tfvars` file. +> +> For example, if you already have a private SSH key in `~/.ssh/pulsar_aws` and a public key in `~/.ssh/pulsar_aws.pub`, +> follow the steps below: +> +> 1. update `ansible.cfg` with following values: +> +> ```shell +> private_key_file=~/.ssh/pulsar_aws +> ``` +> +> 2. update `terraform.tfvars` with following values: +> +> ```shell +> public_key_path=~/.ssh/pulsar_aws.pub +> ``` + +In order to create the necessary AWS resources using Terraform, you need to create an SSH key. 
Enter the following commands to create a private SSH key in `~/.ssh/id_rsa` and a public key in `~/.ssh/id_rsa.pub`: + +```bash +$ ssh-keygen -t rsa +``` + +Do *not* enter a passphrase (hit **Enter** instead when the prompt comes out). Enter the following command to verify that a key has been created: + +```bash +$ ls ~/.ssh +id_rsa id_rsa.pub +``` + +## Create AWS resources using Terraform + +To start building AWS resources with Terraform, you need to install all Terraform dependencies. Enter the following command: + +```bash +$ terraform init +# This will create a .terraform folder +``` + +After that, you can apply the default Terraform configuration by entering this command: + +```bash +$ terraform apply +``` + +Then you see the prompt below: + +```bash +Do you want to perform these actions? + Terraform will perform the actions described above. + Only 'yes' will be accepted to approve. + + Enter a value: +``` + +Type `yes` and hit **Enter**. Applying the configuration could take several minutes. When the configuration has been applied, you can see `Apply complete!` along with some other information, including the number of resources created. + +### Apply a non-default configuration + +You can apply a non-default Terraform configuration by changing the values in the `terraform.tfvars` file. The following variables are available: + +Variable name | Description | Default +:-------------|:------------|:------- +`public_key_path` | The path of the public key that you have generated. 
| `~/.ssh/id_rsa.pub` +`region` | The AWS region in which the Pulsar cluster runs | `us-west-2` +`availability_zone` | The AWS availability zone in which the Pulsar cluster runs | `us-west-2a` +`aws_ami` | The [Amazon Machine Image](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html) (AMI) that the cluster uses | `ami-9fa343e7` +`num_zookeeper_nodes` | The number of [ZooKeeper](https://zookeeper.apache.org) nodes in the ZooKeeper cluster | 3 +`num_bookie_nodes` | The number of bookies that run in the cluster | 3 +`num_broker_nodes` | The number of Pulsar brokers that run in the cluster | 2 +`num_proxy_nodes` | The number of Pulsar proxies that run in the cluster | 1 +`base_cidr_block` | The root [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing) that network assets use for the cluster | `10.0.0.0/16` +`instance_types` | The EC2 instance types to be used. This variable is a map with four keys: `zookeeper` for the ZooKeeper instances, `bookie` for the BookKeeper bookies, and `broker` and `proxy` for Pulsar brokers and proxies | `t2.small` (ZooKeeper), `i3.xlarge` (BookKeeper) and `c5.2xlarge` (Brokers/Proxies) + +### What is installed + +When you run the Ansible playbook, the following AWS resources are used: + +* 9 total [Elastic Compute Cloud](https://aws.amazon.com/ec2) (EC2) instances running the [ami-9fa343e7](https://access.redhat.com/articles/3135091) Amazon Machine Image (AMI), which runs [Red Hat Enterprise Linux (RHEL) 7.4](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/7.4_release_notes/index). 
By default, that includes: + * 3 small VMs for ZooKeeper ([t2.small](https://www.ec2instances.info/?selected=t2.small) instances) + * 3 larger VMs for BookKeeper [bookies](reference-terminology.md#bookie) ([i3.xlarge](https://www.ec2instances.info/?selected=i3.xlarge) instances) + * 2 larger VMs for Pulsar [brokers](reference-terminology.md#broker) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instances) + * 1 larger VMs for Pulsar [proxy](reference-terminology.md#proxy) ([c5.2xlarge](https://www.ec2instances.info/?selected=c5.2xlarge) instances) +* An EC2 [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) +* A [virtual private cloud](https://aws.amazon.com/vpc/) (VPC) for security +* An [API Gateway](https://aws.amazon.com/api-gateway/) for connections from the outside world +* A [route table](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Route_Tables.html) for the Pulsar cluster's VPC +* A [subnet](http://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Subnets.html) for the VPC + +All EC2 instances for the cluster run in the [us-west-2](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html) region. + +### Fetch your Pulsar connection URL + +When you apply the Terraform configuration by entering the command `terraform apply`, Terraform outputs a value for the `pulsar_service_url`. 
The value should look something like this: + +``` +pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650 +``` + +You can fetch that value at any time by entering the command `terraform output pulsar_service_url` or parsing the `terraform.tfstate` file (which is JSON, even though the filename does not reflect that): + +```bash +$ cat terraform.tfstate | jq .modules[0].outputs.pulsar_service_url.value +``` + +### Destroy your cluster + +At any point, you can destroy all AWS resources associated with your cluster using Terraform's `destroy` command: + +```bash +$ terraform destroy +``` + +## Setup Disks + +Before you run the Pulsar playbook, you need to mount the disks to the correct directories on those bookie nodes. Since different types of machines have different disk layouts, you need to update the task defined in the `setup-disk.yaml` file after changing the `instance_types` in your terraform config. + +To set up disks on bookie nodes, enter this command: + +```bash +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + setup-disk.yaml +``` + +After that, the disks are mounted under `/mnt/journal` as journal disk, and `/mnt/storage` as ledger disk. +Remember to enter this command only once. If you attempt to enter this command again after you have run the Pulsar playbook, your disks might potentially be erased again, causing the bookies to fail to start up. + +## Run the Pulsar playbook + +Once you have created the necessary AWS resources using Terraform, you can install and run Pulsar on the Terraform-created EC2 instances using Ansible. 
To do so, enter this command: + +```bash +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + ../deploy-pulsar.yaml +``` + +If you have created a private SSH key at a location different from `~/.ssh/id_rsa`, you can specify the different location using the `--private-key` flag in the following command: + +```bash +$ ansible-playbook \ + --user='ec2-user' \ + --inventory=`which terraform-inventory` \ + --private-key="~/.ssh/some-non-default-key" \ + ../deploy-pulsar.yaml +``` + +## Access the cluster + +You can now access your running Pulsar cluster using the unique Pulsar connection URL for your cluster, which you can obtain following the instructions [above](#fetch-your-pulsar-connection-url). + +For a quick demonstration of accessing the cluster, we can use the Python client for Pulsar and the Python shell. First, install the Pulsar Python module using pip: + +```bash +$ pip install pulsar-client +``` + +Now, open up the Python shell using the `python` command: + +```bash +$ python +``` + +Once you are in the shell, enter the following commands: + +```python +>>> import pulsar +>>> client = pulsar.Client('pulsar://pulsar-elb-1800761694.us-west-2.elb.amazonaws.com:6650') +# Make sure to use your connection URL +>>> producer = client.create_producer('persistent://public/default/test-topic') +>>> producer.send('Hello world') +>>> client.close() +``` + +If all of these commands are successful, Pulsar clients can now use your cluster! 
+ diff --git a/site2/website/versioned_docs/version-2.5.0/deploy-bare-metal-multi-cluster.md b/site2/website/versioned_docs/version-2.5.0/deploy-bare-metal-multi-cluster.md new file mode 100644 index 0000000000000..bbbb94a92b384 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/deploy-bare-metal-multi-cluster.md @@ -0,0 +1,426 @@ +--- +id: version-2.5.0-deploy-bare-metal-multi-cluster +title: Deploying a multi-cluster on bare metal +sidebar_label: Bare metal multi-cluster +original_id: deploy-bare-metal-multi-cluster +--- + +> ### Tips +> +> 1. Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you are interested in experimenting with +> Pulsar or using it in a startup or on a single team, you had better opt for a single cluster. For instructions on deploying a single cluster, +> see the guide [here](deploy-bare-metal.md). +> +> 2. If you want to use all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you need to download `apache-pulsar-io-connectors` +> package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you +> run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +> +> 3. If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders` +> package and install `apache-pulsar-offloaders` under `offloaders` directory in the pulsar directory on every broker node. For more details of how to configure +> this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +A Pulsar *instance* consists of multiple Pulsar clusters working in unison. You can distribute clusters across data centers or geographical regions and replicate the clusters amongst themselves using [geo-replication](administration-geo.md). 
Deploying a multi-cluster Pulsar instance involves the following basic steps: + +* Deploying two separate [ZooKeeper](#deploy-zookeeper) quorums: a [local](#deploy-local-zookeeper) quorum for each cluster in the instance and a [configuration store](#configuration-store) quorum for instance-wide tasks +* Initializing [cluster metadata](#cluster-metadata-initialization) for each cluster +* Deploying a [BookKeeper cluster](#deploy-bookkeeper) of bookies in each Pulsar cluster +* Deploying [brokers](#deploy-brokers) in each Pulsar cluster + +If you want to deploy a single Pulsar cluster, see [Clusters and Brokers](getting-started-standalone.md#start-the-cluster). + +> #### Run Pulsar locally or on Kubernetes? +> This guide shows you how to deploy Pulsar in production in a non-Kubernetes environment. If you want to run a standalone Pulsar cluster on a single machine for development purposes, see the [Setting up a local cluster](getting-started-standalone.md) guide. If you want to run Pulsar on [Kubernetes](https://kubernetes.io), see the [Pulsar on Kubernetes](deploy-kubernetes.md) guide, which includes sections on running Pulsar on Kubernetes on [Google Kubernetes Engine](deploy-kubernetes#pulsar-on-google-kubernetes-engine) and on [Amazon Web Services](deploy-kubernetes#pulsar-on-amazon-web-services). + +## System requirement +Pulsar is currently available for **MacOS** and **Linux**. In order to use Pulsar, you need to install Java 8 from [Oracle download center](http://www.oracle.com/). 
+ +## Install Pulsar + +To get started running Pulsar, download a binary tarball release in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar {{pulsar:version}} binary release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget 'https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=pulsar/pulsar-{{pulsar:version}}/apache-pulsar-{{pulsar:version}}-bin.tar.gz' -O apache-pulsar-{{pulsar:version}}-bin.tar.gz + ``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash +$ tar xvfz apache-pulsar-{{pulsar:version}}-bin.tar.gz +$ cd apache-pulsar-{{pulsar:version}} +``` + +## What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | [Command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`examples` | A Java JAR file containing example [Pulsar Functions](functions-overview.md) +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`licenses` | License files, in `.txt` form, for various components of the Pulsar codebase + +The following directories are created once you begin running Pulsar: + +Directory | Contains +:---------|:-------- +`data` | The data storage directory that ZooKeeper and BookKeeper use +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md) +`logs` | Logs that the installation creates + + +## Deploy ZooKeeper + +Each 
Pulsar instance relies on two separate ZooKeeper quorums. + +* [Local ZooKeeper](#deploy-local-zookeeper) operates at the cluster level and provides cluster-specific configuration management and coordination. Each Pulsar cluster needs to have a dedicated ZooKeeper cluster. +* [Configuration Store](#deploy-the-configuration-store) operates at the instance level and provides configuration management for the entire system (and thus across clusters). An independent cluster of machines or the same machines that local ZooKeeper uses can provide the configuration store quorum. + +The configuration store quorum can be provided by an independent cluster of machines or by the same machines used by local ZooKeeper. + + +### Deploy local ZooKeeper + +ZooKeeper manages a variety of essential coordination-related and configuration-related tasks for Pulsar. + +You need to stand up one local ZooKeeper cluster *per Pulsar cluster* for deploying a Pulsar instance. + +To begin, add all ZooKeeper servers to the quorum configuration specified in the [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) file. Add a `server.N` line for each node in the cluster to the configuration, where `N` is the number of the ZooKeeper node. The following is an example for a three-node cluster: + +```properties +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 +``` + +On each host, you need to specify the ID of the node in the `myid` file of each node, which is in `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. 
+ +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you could set the `myid` value like this: + +```shell +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid +``` + +On `zk2.us-west.example.com` the command looks like `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and each server has the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell +$ bin/pulsar-daemon start zookeeper +``` + +### Deploy the configuration store + +The ZooKeeper cluster that is configured and started up in the section above is a *local* ZooKeeper cluster that you can use to manage a single Pulsar cluster. In addition to a local cluster, however, a full Pulsar instance also requires a configuration store for handling some instance-level configuration and coordination tasks. + +If you deploy a [single-cluster](#single-cluster-pulsar-instance) instance, you do not need a separate cluster for the configuration store. If, however, you deploy a [multi-cluster](#multi-cluster-pulsar-instance) instance, you should stand up a separate ZooKeeper cluster for configuration tasks. + +#### Single-cluster Pulsar instance + +If your Pulsar instance consists of just one cluster, then you can deploy a configuration store on the same machines as the local ZooKeeper quorum but run on different TCP ports. + +To deploy a ZooKeeper configuration store in a single-cluster instance, add the same ZooKeeper servers that the local quorum uses to the configuration file in [`conf/global_zookeeper.conf`](reference-configuration.md#configuration-store) using the same method for [local ZooKeeper](#deploy-local-zookeeper), but make sure to use a different port (2181 is the default for ZooKeeper). 
The following is an example that uses port 2184 for a three-node ZooKeeper cluster: + +```properties +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +``` + +As before, create the `myid` files for each server on `data/global-zookeeper/myid`. + +#### Multi-cluster Pulsar instance + +When you deploy a global Pulsar instance, with clusters distributed across different geographical regions, the configuration store serves as a highly available and strongly consistent metadata store that can tolerate failures and partitions spanning whole regions. + +The key here is to make sure the ZK quorum members are spread across at least 3 regions and that other regions run as observers. + +Again, given the very low expected load on the configuration store servers, you can +share the same hosts used for the local ZooKeeper quorum. + +For example, assume a Pulsar instance with the following clusters `us-west`, +`us-east`, `us-central`, `eu-central`, `ap-south`. Also assume that each cluster has its own local ZK servers named as follows: + +``` +zk[1-3].${CLUSTER}.example.com +``` + +In this scenario, you want to pick the quorum participants from a few clusters and +let all the others be ZK observers. For example, to form a 7-server quorum, you can pick 3 servers from `us-west`, 2 from `us-central` and 2 from `us-east`. + +This method guarantees that writes to the configuration store are possible even if one of these regions is unreachable.
+ +The ZK configuration in all the servers looks like: + +```properties +clientPort=2184 +server.1=zk1.us-west.example.com:2185:2186 +server.2=zk2.us-west.example.com:2185:2186 +server.3=zk3.us-west.example.com:2185:2186 +server.4=zk1.us-central.example.com:2185:2186 +server.5=zk2.us-central.example.com:2185:2186 +server.6=zk3.us-central.example.com:2185:2186:observer +server.7=zk1.us-east.example.com:2185:2186 +server.8=zk2.us-east.example.com:2185:2186 +server.9=zk3.us-east.example.com:2185:2186:observer +server.10=zk1.eu-central.example.com:2185:2186:observer +server.11=zk2.eu-central.example.com:2185:2186:observer +server.12=zk3.eu-central.example.com:2185:2186:observer +server.13=zk1.ap-south.example.com:2185:2186:observer +server.14=zk2.ap-south.example.com:2185:2186:observer +server.15=zk3.ap-south.example.com:2185:2186:observer +``` + +Additionally, ZK observers need to have the following parameters: + +```properties +peerType=observer +``` + +##### Start the service + +Once your configuration store configuration is in place, you can start up the service using [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon): + +```shell +$ bin/pulsar-daemon start configuration-store +``` + +## Cluster metadata initialization + +Once you set up the cluster-specific ZooKeeper and configuration store quorums for your instance, you need to write some metadata to ZooKeeper for each cluster in your instance. **You only need to write this metadata once**. + +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool.
The following is an example: + +```shell +$ bin/pulsar initialize-cluster-metadata \ + --cluster us-west \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2184 \ + --web-service-url http://pulsar.us-west.example.com:8080/ \ + --web-service-url-tls https://pulsar.us-west.example.com:8443/ \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650/ \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651/ +``` + +As you can see from the example above, you need to specify the following: + +* The name of the cluster +* The local ZooKeeper connection string for the cluster +* The configuration store connection string for the entire instance +* The web service URL for the cluster +* A broker service URL enabling interaction with the [brokers](reference-terminology.md#broker) in the cluster + +If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. + +Make sure to run `initialize-cluster-metadata` for each cluster in your instance. + +## Deploy BookKeeper + +BookKeeper provides [persistent message storage](concepts-architecture-overview.md#persistent-storage) for Pulsar. + +Each Pulsar broker needs to have its own cluster of bookies. The BookKeeper cluster shares a local ZooKeeper quorum with the Pulsar cluster. + +### Configure bookies + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important aspect of configuring each bookie is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) parameter is set to the connection string for the local ZooKeeper of Pulsar cluster. + +### Start bookies + +You can start a bookie in two ways: in the foreground or as a background daemon. 
+ +To start a bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start bookie +``` + +You can verify that the bookie works properly using the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#bookkeeper-shell): + +```shell +$ bin/bookkeeper shell bookiesanity +``` + +This command creates a new ledger on the local bookie, writes a few entries, reads them back and finally deletes the ledger. + +After you have started all bookies, you can use the `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to verify that all bookies in the cluster are running. + +```bash +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> +``` + +Bookie hosts are responsible for storing message data on disk. In order for bookies to provide optimal performance, having a suitable hardware configuration is essential for the bookies. The following are key dimensions for bookie hardware capacity. + +* Disk I/O capacity read/write +* Storage capacity + +Message entries written to bookies are always synced to disk before returning an acknowledgement to the Pulsar broker. To ensure low write latency, BookKeeper is +designed to use multiple devices: + +* A **journal** to ensure durability. For sequential writes, having fast [fsync](https://linux.die.net/man/2/fsync) operations on bookie hosts is critical. Typically, small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) should suffice, or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache. Both solutions can reach fsync latency of ~0.4 ms. +* A **ledger storage device** is where data is stored until all consumers acknowledge the message. Writes happen in the background, so write I/O is not a big concern.
Reads happen sequentially most of the time and the backlog is drained only in case of consumer drain. To store large amounts of data, a typical configuration involves multiple HDDs with a RAID controller. + + + +## Deploy brokers + +Once you set up ZooKeeper, initialize cluster metadata, and spin up BookKeeper bookies, you can deploy brokers. + +### Broker configuration + +You can configure brokers using the [`conf/broker.conf`](reference-configuration.md#broker) configuration file. + +The most important element of broker configuration is ensuring that each broker is aware of its local ZooKeeper quorum as well as the configuration store quorum. Make sure that you set the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) parameter to reflect the local quorum and the [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameter to reflect the configuration store quorum (although you need to specify only those ZooKeeper servers located in the same cluster). + +You also need to specify the name of the [cluster](reference-terminology.md#cluster) to which the broker belongs using the [`clusterName`](reference-configuration.md#broker-clusterName) parameter. In addition, you need to match the broker and web service ports provided when you initialize the metadata (especially when you use a different port from default) of the cluster. + +The following is an example configuration: + +```properties +# Local ZooKeeper servers +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +# Configuration store quorum connection string. 
+configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184 + +clusterName=us-west + +# Broker data port +brokerServicePort=6650 + +# Broker data port for TLS +brokerServicePortTls=6651 + +# Port to use to server HTTP request +webServicePort=8080 + +# Port to use to server HTTPS request +webServicePortTls=8443 +``` + +### Broker hardware + +Pulsar brokers do not require any special hardware since they do not use the local disk. You had better choose fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) so that the software can take full advantage of that. + +### Start the broker service + +You can start a broker in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```shell +$ bin/pulsar-daemon start broker +``` + +You can also start brokers in the foreground by using [`pulsar broker`](reference-cli-tools.md#broker): + +```shell +$ bin/pulsar broker +``` + +## Service discovery + +[Clients](getting-started-clients.md) connecting to Pulsar brokers need to be able to communicate with an entire Pulsar instance using a single URL. Pulsar provides a built-in service discovery mechanism that you can set up using the instructions [immediately below](#service-discovery-setup). + +You can also use your own service discovery system if you want. If you use your own system, you only need to satisfy just one requirement: when a client performs an HTTP request to an [endpoint](reference-configuration.md) for a Pulsar cluster, such as `http://pulsar.us-west.example.com:8080`, the client needs to be redirected to *some* active broker in the desired cluster, whether via DNS, an HTTP or IP redirect, or some other means. 
+ +> #### Service discovery already provided by many scheduling systems +> Many large-scale deployment systems, such as [Kubernetes](deploy-kubernetes), have service discovery systems built in. If you run Pulsar on such a system, you may not need to provide your own service discovery mechanism. + + +### Service discovery setup + +The service discovery mechanism that is included with Pulsar maintains a list of active brokers, which is stored in ZooKeeper, and supports lookup using HTTP and also the [binary protocol](developing-binary-protocol.md) of Pulsar. + +To get started setting up the built-in service discovery of Pulsar, you need to change a few parameters in the [`conf/discovery.conf`](reference-configuration.md#service-discovery) configuration file. Set the [`zookeeperServers`](reference-configuration.md#service-discovery-zookeeperServers) parameter to the ZooKeeper quorum connection string of the cluster and the [`configurationStoreServers`](reference-configuration.md#service-discovery-configurationStoreServers) setting to the [configuration +store](reference-terminology.md#configuration-store) quorum connection string. + +```properties +# Zookeeper quorum connection string +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 + +# Global configuration store connection string +configurationStoreServers=zk1.us-west.example.com:2184,zk2.us-west.example.com:2184,zk3.us-west.example.com:2184 +``` + +To start the discovery service: + +```shell +$ bin/pulsar-daemon start discovery +``` + +## Admin client and verification + +At this point your Pulsar instance should be ready to use. You can now configure client machines that can serve as [administrative clients](admin-api-overview.md) for each cluster. You can use the [`conf/client.conf`](reference-configuration.md#client) configuration file to configure admin clients.
+ +The most important thing is that you point the [`serviceUrl`](reference-configuration.md#client-serviceUrl) parameter to the correct service URL for the cluster: + +```properties +serviceUrl=http://pulsar.us-west.example.com:8080/ +``` + +## Provision new tenants + +Pulsar is built as a fundamentally multi-tenant system. + + +If a new tenant wants to use the system, you need to create a new one. You can create a new tenant by using the [`pulsar-admin`](reference-pulsar-admin.md#tenants) CLI tool: + + +```shell +$ bin/pulsar-admin tenants create test-tenant \ + --allowed-clusters us-west \ + --admin-roles test-admin-role +``` + +In this command, users who identify with `test-admin-role` role can administer the configuration for the `test-tenant` tenant. The `test-tenant` tenant can only use the `us-west` cluster. From now on, this tenant can manage its resources. + +Once you create a tenant, you need to create [namespaces](reference-terminology.md#namespace) for topics within that tenant. + + +The first step is to create a namespace. A namespace is an administrative unit that can contain many topics. A common practice is to create a namespace for each different use case from a single tenant. + +```shell +$ bin/pulsar-admin namespaces create test-tenant/ns1 +``` + +##### Test producer and consumer + + +Everything is now ready to send and receive messages. The quickest way to test the system is through the [`pulsar-perf`](reference-cli-tools.md#pulsar-perf) client tool. + + +You can use a topic in the namespace that you have just created. Topics are automatically created the first time when a producer or a consumer tries to use them. 
+ +The topic name in this case could be: + +```http +persistent://test-tenant/ns1/my-topic +``` + +Start a consumer that creates a subscription on the topic and waits for messages: + +```shell +$ bin/pulsar-perf consume persistent://test-tenant/ns1/my-topic +``` + +Start a producer that publishes messages at a fixed rate and reports stats every 10 seconds: + +```shell +$ bin/pulsar-perf produce persistent://test-tenant/ns1/my-topic +``` + +To report the topic stats: + +```shell +$ bin/pulsar-admin topics stats persistent://test-tenant/ns1/my-topic +``` diff --git a/site2/website/versioned_docs/version-2.5.0/deploy-bare-metal.md b/site2/website/versioned_docs/version-2.5.0/deploy-bare-metal.md new file mode 100644 index 0000000000000..5023bdf51669a --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/deploy-bare-metal.md @@ -0,0 +1,461 @@ +--- +id: version-2.5.0-deploy-bare-metal +title: Deploy a cluster on bare metal +sidebar_label: Bare metal +original_id: deploy-bare-metal +--- + + +> ### Tips +> +> 1. Single-cluster Pulsar installations should be sufficient for all but the most ambitious use cases. If you are interested in experimenting with +> Pulsar or using Pulsar in a startup or on a single team, you had better opt for a single cluster. If you do need to run a multi-cluster Pulsar instance, +> see the guide [here](deploy-bare-metal-multi-cluster.md). +> +> 2. If you want to use all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you need to download `apache-pulsar-io-connectors` +> package and install `apache-pulsar-io-connectors` under `connectors` directory in the pulsar directory on every broker node or on every function-worker node if you +> have run a separate cluster of function workers for [Pulsar Functions](functions-overview.md). +> +> 3. 
If you want to use [Tiered Storage](concepts-tiered-storage.md) feature in your Pulsar deployment, you need to download `apache-pulsar-offloaders` +> package and install `apache-pulsar-offloaders` under `offloaders` directory in the pulsar directory on every broker node. For more details of how to configure +> this feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + +Deploying a Pulsar cluster involves doing the following (in order): + +* Deploy a [ZooKeeper](#deploy-a-zookeeper-cluster) cluster (optional) +* Initialize [cluster metadata](#initialize-cluster-metadata) +* Deploy a [BookKeeper](#deploy-a-bookkeeper-cluster) cluster +* Deploy one or more Pulsar [brokers](#deploy-pulsar-brokers) + +## Preparation + +### Requirements + +> If you already have an existing zookeeper cluster and want to reuse it, you do not need to prepare the machines +> for running ZooKeeper. + +To run Pulsar on bare metal, you had better have the following: + +* At least 6 Linux machines or VMs + * 3 for running [ZooKeeper](https://zookeeper.apache.org) + * 3 for running a Pulsar broker, and a [BookKeeper](https://bookkeeper.apache.org) bookie +* A single [DNS](https://en.wikipedia.org/wiki/Domain_Name_System) name covering all of the Pulsar broker hosts + +> If you do not have enough machines, or try out Pulsar in cluster mode (and expand the cluster later), +> you can even deploy Pulsar in one node, where Zookeeper, bookie and broker are run in the same machine. + +> If you do not have a DNS server, you can use multi-host in service URL instead. + +Each machine in your cluster needs to have [Java 8](http://www.oracle.com/technetwork/java/javase/downloads/index.html) or higher version of Java installed. 
+ +The following is a diagram showing the basic setup: + +![alt-text](assets/pulsar-basic-setup.png) + +In this diagram, connecting clients need to be able to communicate with the Pulsar cluster using a single URL, in this case `pulsar-cluster.acme.com` abstracts over all of the message-handling brokers. Pulsar message brokers run on machines alongside BookKeeper bookies; brokers and bookies, in turn, rely on ZooKeeper. + +### Hardware considerations + +When you deploy a Pulsar cluster, keep in mind the following basic better choices when you do the capacity planning. + +#### ZooKeeper + +For machines running ZooKeeper, you had better use lighter-weight machines or VMs. Pulsar uses ZooKeeper only for periodic coordination-related and configuration-related tasks, *not* for basic operations. If you run Pulsar on [Amazon Web Services](https://aws.amazon.com/) (AWS), for example, a [t2.small](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/t2-instances.html) instance might likely suffice. + +#### Bookies and Brokers + +For machines running a bookie and a Pulsar broker, you had better use more powerful machines. For an AWS deployment, for example, [i3.4xlarge](https://aws.amazon.com/blogs/aws/now-available-i3-instances-for-demanding-io-intensive-applications/) instances may be appropriate. On those machines you can use the following: + +* Fast CPUs and 10Gbps [NIC](https://en.wikipedia.org/wiki/Network_interface_controller) (for Pulsar brokers) +* Small and fast [solid-state drives](https://en.wikipedia.org/wiki/Solid-state_drive) (SSDs) or [hard disk drives](https://en.wikipedia.org/wiki/Hard_disk_drive) (HDDs) with a [RAID](https://en.wikipedia.org/wiki/RAID) controller and a battery-backed write cache (for BookKeeper bookies) + +## Install the Pulsar binary package + +> You need to install the Pulsar binary package on *each machine in the cluster*, including machines running [ZooKeeper](#deploy-a-zookeeper-cluster) and [BookKeeper](#deploy-a-bookkeeper-cluster). 
+ +To get started deploying a Pulsar cluster on bare metal, you need to download a binary tarball release in one of the following ways: + +* By clicking on the link below directly, which automatically triggers a download: + * Pulsar {{pulsar:version}} binary release +* From the Pulsar [downloads page](pulsar:download_page_url) +* From the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) on [GitHub](https://github.com) +* Using [wget](https://www.gnu.org/software/wget): + +```bash +$ wget pulsar:binary_release_url +``` + +Once you download the tarball, untar it and `cd` into the resulting directory: + +```bash +$ tar xvzf apache-pulsar-{{pulsar:version}}-bin.tar.gz +$ cd apache-pulsar-{{pulsar:version}} +``` + +The untarred directory contains the following subdirectories: + +Directory | Contains +:---------|:-------- +`bin` |[command-line tools](reference-cli-tools.md) of Pulsar, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md) +`conf` | Configuration files for Pulsar, including for [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more +`data` | The data storage directory that ZooKeeper and BookKeeper use +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files that Pulsar uses +`logs` | Logs that the installation creates + +## [Install Builtin Connectors (optional)]( https://pulsar.apache.org/docs/en/next/standalone/#install-builtin-connectors-optional) + +> Since Pulsar releases `2.1.0-incubating`, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +> If you want to enable those `builtin` connectors, you can follow the instructions as below; otherwise you can +> skip this section for now. 
+ +To get started using builtin connectors, you need to download the connectors tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar IO Connectors {{pulsar:version}} release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:connector_release_url/{connector}-{{pulsar:version}}.nar + ``` + +Once you download the nar file, copy the file to directory `connectors` in the pulsar directory, +for example, if you download the connector file `pulsar-io-aerospike-{{pulsar:version}}.nar`: + +```bash +$ mkdir connectors +$ mv pulsar-io-aerospike-{{pulsar:version}}.nar connectors + +$ ls connectors +pulsar-io-aerospike-{{pulsar:version}}.nar +... +``` + +## [Install Tiered Storage Offloaders (optional)](https://pulsar.apache.org/docs/en/next/standalone/#install-tiered-storage-offloaders-optional) + +> Since Pulsar release `2.2.0`, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> If you want to enable tiered storage feature, you can follow the instructions as below; otherwise you can +> skip this section for now. 
+ +To get started using tiered storage offloaders, you need to download the offloaders tarball release on every broker node in one of the following ways: + +* by clicking the link below and downloading the release from an Apache mirror: + + * Pulsar Tiered Storage Offloaders {{pulsar:version}} release + +* from the Pulsar [downloads page](pulsar:download_page_url) +* from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) +* using [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:offloader_release_url + ``` + +Once you download the tarball, in the pulsar directory, untar the offloaders package and copy the offloaders as `offloaders` in the pulsar directory: + +```bash +$ tar xvfz apache-pulsar-offloaders-{{pulsar:version}}-bin.tar.gz + +// you can find a directory named `apache-pulsar-offloaders-{{pulsar:version}}` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-{{pulsar:version}}/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-{{pulsar:version}}.nar +``` + +For more details of how to configure tiered storage feature, you can refer to the [Tiered storage cookbook](cookbooks-tiered-storage.md). + + +## Deploy a ZooKeeper cluster + +> If you already have an existing zookeeper cluster and want to use it, you can skip this section. + +[ZooKeeper](https://zookeeper.apache.org) manages a variety of essential coordination- and configuration-related tasks for Pulsar. To deploy a Pulsar cluster you need to deploy ZooKeeper first (before all other components). You had better deploy a 3-node ZooKeeper cluster. Pulsar does not make heavy use of ZooKeeper, so more lightweight machines or VMs should suffice for running ZooKeeper. + +To begin, add all ZooKeeper servers to the configuration specified in [`conf/zookeeper.conf`](reference-configuration.md#zookeeper) (in the Pulsar directory that you create [above](#install-the-pulsar-binary-package)).
The following is an example: + +```properties +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 +``` + +> If you have only one machine to deploy Pulsar, you just need to add one server entry in the configuration file. + +On each host, you need to specify the ID of the node in the `myid` file of each node, which is in the `data/zookeeper` folder of each server by default (you can change the file location via the [`dataDir`](reference-configuration.md#zookeeper-dataDir) parameter). + +> See the [Multi-server setup guide](https://zookeeper.apache.org/doc/r3.4.10/zookeeperAdmin.html#sc_zkMulitServerSetup) in the ZooKeeper documentation for detailed information on `myid` and more. + +On a ZooKeeper server at `zk1.us-west.example.com`, for example, you can set the `myid` value like this: + +```bash +$ mkdir -p data/zookeeper +$ echo 1 > data/zookeeper/myid +``` + +On `zk2.us-west.example.com` the command is `echo 2 > data/zookeeper/myid` and so on. + +Once you add each server to the `zookeeper.conf` configuration and have the appropriate `myid` entry, you can start ZooKeeper on all hosts (in the background, using nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start zookeeper +``` + +> If you plan to deploy zookeeper with bookie on the same node, you +> need to start zookeeper by using different stats port. + +Start zookeeper with [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool like: + +```bash +$ PULSAR_EXTRA_OPTS="-Dstats_server_port=8001" bin/pulsar-daemon start zookeeper +``` + +## Initialize cluster metadata + +Once you deploy ZooKeeper for your cluster, you need to write some metadata to ZooKeeper for each cluster in your instance. You only need to write **once**.
+ +You can initialize this metadata using the [`initialize-cluster-metadata`](reference-cli-tools.md#pulsar-initialize-cluster-metadata) command of the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool. This command can be run on any machine in your ZooKeeper cluster. The following is an example: + +```shell +$ bin/pulsar initialize-cluster-metadata \ + --cluster pulsar-cluster-1 \ + --zookeeper zk1.us-west.example.com:2181 \ + --configuration-store zk1.us-west.example.com:2181 \ + --web-service-url http://pulsar.us-west.example.com:8080 \ + --web-service-url-tls https://pulsar.us-west.example.com:8443 \ + --broker-service-url pulsar://pulsar.us-west.example.com:6650 \ + --broker-service-url-tls pulsar+ssl://pulsar.us-west.example.com:6651 +``` + +As you can see from the example above, you +need to specify the following: + +Flag | Description +:----|:----------- +`--cluster` | A name for the cluster +`--zookeeper` | A "local" ZooKeeper connection string for the cluster. This connection string only needs to include *one* machine in the ZooKeeper cluster. +`--configuration-store` | The configuration store connection string for the entire instance. As with the `--zookeeper` flag, this connection string only needs to include *one* machine in the ZooKeeper cluster. +`--web-service-url` | The web service URL for the cluster, plus a port. This URL should be a standard DNS name. The default port is 8080 (you had better not use a different port). +`--web-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster. The default port is 8443 (you had better not use a different port). +`--broker-service-url` | A broker service URL enabling interaction with the brokers in the cluster. This URL should use the same DNS name as the web service URL but should use the `pulsar` scheme instead. The default port is 6650 (you had better not use a different port).
+`--broker-service-url-tls` | If you use [TLS](security-tls-transport.md), you also need to specify a TLS web service URL for the cluster as well as a TLS broker service URL for the brokers in the cluster. The default port is 6651 (you had better not use a different port). + + +> If you don't have a DNS server, you can use multi-host in service URL with the following settings: +> +> ```properties +> --web-service-url http://host1:8080,host2:8080,host3:8080 \ +> --web-service-url-tls https://host1:8443,host2:8443,host3:8443 \ +> --broker-service-url pulsar://host1:6650,host2:6650,host3:6650 \ +> --broker-service-url-tls pulsar+ssl://host1:6651,host2:6651,host3:6651 +> ``` + +## Deploy a BookKeeper cluster + +[BookKeeper](https://bookkeeper.apache.org) handles all persistent data storage in Pulsar. You need to deploy a cluster of BookKeeper bookies to use Pulsar. You can choose to run a **3-bookie BookKeeper cluster**. + +You can configure BookKeeper bookies using the [`conf/bookkeeper.conf`](reference-configuration.md#bookkeeper) configuration file. The most important step in configuring bookies for our purposes here is ensuring that the [`zkServers`](reference-configuration.md#bookkeeper-zkServers) is set to the connection string for the ZooKeeper cluster. The following is an example: + +```properties +zkServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +``` + +Once you appropriately modify the `zkServers` parameter, you can provide any other configuration modifications you need. You can find a full listing of the available BookKeeper configuration parameters [here](reference-configuration.md#bookkeeper), although consulting the [BookKeeper documentation](http://bookkeeper.apache.org/docs/latest/reference/config/) for a more in-depth guide might be a better choice. + +> ##### NOTES +> +> Since Pulsar 2.1.0 releases, Pulsar introduces [stateful function](functions-develop.md#state-storage) for Pulsar Functions. 
If you want to enable that feature, +> you need to enable table service on BookKeeper by doing the following setting in `conf/bookkeeper.conf` file. +> +> ```conf +> extraServerComponents=org.apache.bookkeeper.stream.server.StreamStorageLifecycleComponent +> ``` + +Once you apply the desired configuration in `conf/bookkeeper.conf`, you can start up a bookie on each of your BookKeeper hosts. You can start up each bookie either in the background, using [nohup](https://en.wikipedia.org/wiki/Nohup), or in the foreground. + +To start the bookie in the background, use the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start bookie +``` + +To start the bookie in the foreground: + +```bash +$ bin/bookkeeper bookie +``` + +You can verify that a bookie works properly by running the `bookiesanity` command for the [BookKeeper shell](reference-cli-tools.md#shell) on it: + +```bash +$ bin/bookkeeper shell bookiesanity +``` + +This command creates an ephemeral BookKeeper ledger on the local bookie, writes a few entries, reads them back, and finally deletes the ledger. + +After you start all the bookies, you can use the `simpletest` command for [BookKeeper shell](reference-cli-tools.md#shell) on any bookie node, to +verify that all the bookies in the cluster are up and running. + +```bash +$ bin/bookkeeper shell simpletest --ensemble <num-bookies> --writeQuorum <num-bookies> --ackQuorum <num-bookies> --numEntries <num-entries> +``` + +This command creates a `num-bookies` sized ledger on the cluster, writes a few entries, and finally deletes the ledger. + + +## Deploy Pulsar brokers + +Pulsar brokers are the last thing you need to deploy in your Pulsar cluster. Brokers handle Pulsar messages and provide the administrative interface of Pulsar. A good choice is to run **3 brokers**, one for each machine that already runs a BookKeeper bookie.
+ +### Configure Brokers + +The most important element of broker configuration is ensuring that each broker is aware of the ZooKeeper cluster that you have deployed. Make sure that the [`zookeeperServers`](reference-configuration.md#broker-zookeeperServers) and [`configurationStoreServers`](reference-configuration.md#broker-configurationStoreServers) parameters are set correctly. In this case, since you only have 1 cluster and no configuration store setup, the `configurationStoreServers` parameter points to the same servers as `zookeeperServers`. + +```properties +zookeeperServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +configurationStoreServers=zk1.us-west.example.com:2181,zk2.us-west.example.com:2181,zk3.us-west.example.com:2181 +``` + +You also need to specify the cluster name (matching the name that you provide when you [initialize the metadata of the cluster](#initialize-cluster-metadata)): + +```properties +clusterName=pulsar-cluster-1 +``` + +In addition, you need to match the broker and web service ports provided when you initialize the metadata of the cluster (especially when you use a different port from default): + +```properties +brokerServicePort=6650 +brokerServicePortTls=6651 +webServicePort=8080 +webServicePortTls=8443 +``` + +> If you deploy Pulsar in a one-node cluster, you should update the replication settings in `conf/broker.conf` to `1` +> +> ```properties +> # Number of bookies to use when creating a ledger +> managedLedgerDefaultEnsembleSize=1 +> +> # Number of copies to store for each message +> managedLedgerDefaultWriteQuorum=1 +> +> # Number of guaranteed copies (acks to wait before write is complete) +> managedLedgerDefaultAckQuorum=1 +> ``` + +### Enable Pulsar Functions (optional) + +If you want to enable [Pulsar Functions](functions-overview.md), you can follow the instructions as below: + +1. Edit `conf/broker.conf` to enable functions worker, by setting `functionsWorkerEnabled` to `true`.
+ + ```conf + functionsWorkerEnabled=true + ``` + +2. Edit `conf/functions_worker.yml` and set `pulsarFunctionsCluster` to the cluster name that you provide when you [initialize the metadata of the cluster](#initialize-cluster-metadata). + + ```conf + pulsarFunctionsCluster: pulsar-cluster-1 + ``` + +If you want to learn more options about deploying functions worker, check out [Deploy and manage functions worker](functions-worker.md). + +### Start Brokers + +You can then provide any other configuration changes that you want in the [`conf/broker.conf`](reference-configuration.md#broker) file. Once you decide on a configuration, you can start up the brokers for your Pulsar cluster. Like ZooKeeper and BookKeeper, you can start brokers either in the foreground or in the background, using nohup. + +You can start a broker in the foreground using the [`pulsar broker`](reference-cli-tools.md#pulsar-broker) command: + +```bash +$ bin/pulsar broker +``` + +You can start a broker in the background using the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + +```bash +$ bin/pulsar-daemon start broker +``` + +Once you successfully start up all the brokers that you intend to use, your Pulsar cluster should be ready to go! + +## Connect to the running cluster + +Once your Pulsar cluster is up and running, you should be able to connect with it using Pulsar clients. One such client is the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool, which is included with the Pulsar binary package. The `pulsar-client` tool can publish messages to and consume messages from Pulsar topics and thus provide a simple way to make sure that your cluster runs properly. + +To use the `pulsar-client` tool, first modify the client configuration file in [`conf/client.conf`](reference-configuration.md#client) in your binary package.
You need to change the values for `webServiceUrl` and `brokerServiceUrl`, substituting `localhost` (which is the default), with the DNS name that you assign to your broker/bookie hosts. The following is an example: + +```properties +webServiceUrl=http://us-west.example.com:8080 +brokerServiceUrl=pulsar://us-west.example.com:6650 +``` + +> If you don't have a DNS server, you can specify multi-host in service URL like below: +> +> ```properties +> webServiceUrl=http://host1:8080,host2:8080,host3:8080 +> brokerServiceUrl=pulsar://host1:6650,host2:6650,host3:6650 +> ``` + +Once you do that, you can publish a message to Pulsar topic: + +```bash +$ bin/pulsar-client produce \ + persistent://public/default/test \ + -n 1 \ + -m "Hello Pulsar" +``` + +> You may need to use a different cluster name in the topic if you specify a cluster name different from `pulsar-cluster-1`. + +This command publishes a single message to the Pulsar topic. In addition, you can subscribe to the Pulsar topic in a different terminal before publishing messages as below: + +```bash +$ bin/pulsar-client consume \ + persistent://public/default/test \ + -n 100 \ + -s "consumer-test" \ + -t "Exclusive" +``` + +Once you successfully publish the message above to the topic, you should see it in the standard output: + +```bash +----- got message ----- +Hello Pulsar +``` + +## Run Functions + +> If you have [enabled](#enable-pulsar-functions-optional) Pulsar Functions, you can also try out Pulsar Functions now. + +Create an ExclamationFunction `exclamation`.
+ +```bash +bin/pulsar-admin functions create \ + --jar examples/api-examples.jar \ + --classname org.apache.pulsar.functions.api.examples.ExclamationFunction \ + --inputs persistent://public/default/exclamation-input \ + --output persistent://public/default/exclamation-output \ + --tenant public \ + --namespace default \ + --name exclamation +``` + +Check if the function runs as expected by [triggering](functions-deploying.md#triggering-pulsar-functions) the function. + +```bash +bin/pulsar-admin functions trigger --name exclamation --trigger-value "hello world" +``` + +You can see the output as below: + +```shell +hello world! +``` diff --git a/site2/website/versioned_docs/version-2.5.0/deploy-dcos.md b/site2/website/versioned_docs/version-2.5.0/deploy-dcos.md new file mode 100644 index 0000000000000..04e57976fab87 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/deploy-dcos.md @@ -0,0 +1,183 @@ +--- +id: version-2.5.0-deploy-dcos +title: Deploy Pulsar on DC/OS +sidebar_label: DC/OS +original_id: deploy-dcos +--- + +> ### Tips +> +> If you want to enable all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you can choose to use `apachepulsar/pulsar-all` image instead of +> `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +[DC/OS](https://dcos.io/) (the DataCenter Operating System) is a distributed operating system used for deploying and managing applications and systems on [Apache Mesos](http://mesos.apache.org/). DC/OS is an open-source tool that [Mesosphere](https://mesosphere.com/) creates and maintains . + +Apache Pulsar is available as a [Marathon Application Group](https://mesosphere.github.io/marathon/docs/application-groups.html), which runs multiple applications as manageable sets. 
+ +## Prerequisites + +In order to run Pulsar on DC/OS, you need the following: + +* DC/OS version [1.9](https://docs.mesosphere.com/1.9/) or higher +* A [DC/OS cluster](https://docs.mesosphere.com/1.9/installing/) with at least three agent nodes +* The [DC/OS CLI tool](https://docs.mesosphere.com/1.9/cli/install/) installed +* The [`PulsarGroups.json`](https://github.com/apache/pulsar/blob/master/deployment/dcos/PulsarGroups.json) configuration file from the Pulsar GitHub repo. + + ```bash + $ curl -O https://raw.githubusercontent.com/apache/pulsar/master/deployment/dcos/PulsarGroups.json + ``` + +Each node in the DC/OS-managed Mesos cluster must have at least: + +* 4 CPU +* 4 GB of memory +* 60 GB of total persistent disk + +Alternatively, you can change the configuration in `PulsarGroups.json` to match the resources of your DC/OS cluster. + +## Deploy Pulsar using the DC/OS command interface + +You can deploy Pulsar on DC/OS using this command: + +```bash +$ dcos marathon group add PulsarGroups.json +``` + +This command deploys Docker container instances in three groups, which together comprise a Pulsar cluster: + +* 3 bookies (1 [bookie](reference-terminology.md#bookie) on each agent node and 1 [bookie recovery](http://bookkeeper.apache.org/docs/latest/admin/autorecovery/) instance) +* 3 Pulsar [brokers](reference-terminology.md#broker) (1 broker on each node and 1 admin instance) +* 1 [Prometheus](http://prometheus.io/) instance and 1 [Grafana](https://grafana.com/) instance + + +> When you run DC/OS, a ZooKeeper cluster already runs at `master.mesos:2181`, thus you do not have to install or start up ZooKeeper separately. + +After executing the `dcos` command above, click on the **Services** tab in the DC/OS [GUI interface](https://docs.mesosphere.com/latest/gui/), which you can access at [http://m1.dcos](http://m1.dcos) in this example. You should see several applications in the process of deploying.
+ +![DC/OS command executed](assets/dcos_command_execute.png) + +![DC/OS command executed2](assets/dcos_command_execute2.png) + +## The BookKeeper group + +To monitor the status of the BookKeeper cluster deployment, click on the **bookkeeper** group in the parent **pulsar** group. + +![DC/OS bookkeeper status](assets/dcos_bookkeeper_status.png) + +At this point, 3 [bookies](reference-terminology.md#bookie) should be shown as green, which means that the bookies have been deployed successfully and are now running. + +![DC/OS bookkeeper running](assets/dcos_bookkeeper_run.png) + +You can also click into each bookie instance to get more detailed information, such as the bookie running log. + +![DC/OS bookie log](assets/dcos_bookie_log.png) + +To display information about the BookKeeper in ZooKeeper, you can visit [http://m1.dcos/exhibitor](http://m1.dcos/exhibitor). In this example, 3 bookies are under the `available` directory. + +![DC/OS bookkeeper in zk](assets/dcos_bookkeeper_in_zookeeper.png) + +## The Pulsar broker Group + +Similar to the BookKeeper group above, click into the **brokers** group to check the status of the Pulsar brokers. + +![DC/OS broker status](assets/dcos_broker_status.png) + +![DC/OS broker running](assets/dcos_broker_run.png) + +You can also click into each broker instance to get more detailed information, such as the broker running log. + +![DC/OS broker log](assets/dcos_broker_log.png) + +Broker cluster information in ZooKeeper is also available through the web UI. In this example, you can see that the `loadbalance` and `managed-ledgers` directories have been created. + +![DC/OS broker in zk](assets/dcos_broker_in_zookeeper.png) + +## Monitor Group + +The **monitor** group consists of Prometheus and Grafana. + +![DC/OS monitor status](assets/dcos_monitor_status.png) + +### Prometheus + +Click into the instance of `prom` to get the endpoint of Prometheus, which is `192.168.65.121:9090` in this example.
+ +![DC/OS prom endpoint](assets/dcos_prom_endpoint.png) + +If you click that endpoint, you can see the Prometheus dashboard. The [http://192.168.65.121:9090/targets](http://192.168.65.121:9090/targets) URL displays all the bookies and brokers. + +![DC/OS prom targets](assets/dcos_prom_targets.png) + +### Grafana + +Click into `grafana` to get the endpoint for Grafana, which is `192.168.65.121:3000` in this example. + +![DC/OS grafana endpoint](assets/dcos_grafana_endpoint.png) + +If you click that endpoint, you can access the Grafana dashboard. + +![DC/OS grafana targets](assets/dcos_grafana_dashboard.png) + +## Run a simple Pulsar consumer and producer on DC/OS + +Now that you have a fully deployed Pulsar cluster, you can run a simple consumer and producer to show Pulsar on DC/OS in action. + +### Download and prepare the Pulsar Java tutorial + +You can clone a [Pulsar Java tutorial](https://github.com/streamlio/pulsar-java-tutorial) repo. This repo contains a simple Pulsar consumer and producer (you can find more information in the `README` file of the repo). + +```bash +$ git clone https://github.com/streamlio/pulsar-java-tutorial +``` + +Change the `SERVICE_URL` from `pulsar://localhost:6650` to `pulsar://a1.dcos:6650` in both [`ConsumerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ConsumerTutorial.java) and [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java). +The `pulsar://a1.dcos:6650` endpoint is for the broker service. You can fetch the endpoint details for each broker instance from the DC/OS GUI. `a1.dcos` is a DC/OS client agent, which runs a broker. You can also use the IP address of the client agent in place of this host name.
+ +Now, change the message number from 10 to 10000000 in main method of [`ProducerTutorial.java`](https://github.com/streamlio/pulsar-java-tutorial/blob/master/src/main/java/tutorial/ProducerTutorial.java) so that it can produce more messages. + +Now compile the project code using the command below: + +```bash +$ mvn clean package +``` + +### Run the consumer and producer + +Execute this command to run the consumer: + +```bash +$ mvn exec:java -Dexec.mainClass="tutorial.ConsumerTutorial" +``` + +Execute this command to run the producer: + +```bash +$ mvn exec:java -Dexec.mainClass="tutorial.ProducerTutorial" +``` + +You can see the producer producing messages and the consumer consuming messages through the DC/OS GUI. + +![DC/OS pulsar producer](assets/dcos_producer.png) + +![DC/OS pulsar consumer](assets/dcos_consumer.png) + +### View Grafana metric output + +While the producer and consumer run, you can access running metrics information from Grafana. + +![DC/OS pulsar dashboard](assets/dcos_metrics.png) + + +## Uninstall Pulsar + +You can shut down and uninstall the `pulsar` application from DC/OS at any time in the following two ways: + +1. Using the DC/OS GUI, you can choose **Delete** at the right end of Pulsar group. + + ![DC/OS pulsar uninstall](assets/dcos_uninstall.png) + +2. 
You can use the following command: + + ```bash + $ dcos marathon group remove /pulsar + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/deploy-kubernetes.md b/site2/website/versioned_docs/version-2.5.0/deploy-kubernetes.md new file mode 100644 index 0000000000000..f9c0ed5e703af --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/deploy-kubernetes.md @@ -0,0 +1,394 @@ +--- +id: version-2.5.0-deploy-kubernetes +title: Deploying Pulsar on Kubernetes +sidebar_label: Kubernetes +original_id: deploy-kubernetes +--- + +> ### Tips +> +> If you want to enable all builtin [Pulsar IO](io-overview.md) connectors in your Pulsar deployment, you can choose to use `apachepulsar/pulsar-all` image instead of +> `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +You can easily deploy Pulsar in [Kubernetes](https://kubernetes.io/) clusters, either in managed clusters on [Google Kubernetes Engine](#pulsar-on-google-kubernetes-engine) or [Amazon Web Services](https://aws.amazon.com/) or in [custom clusters](#pulsar-on-a-custom-kubernetes-cluster). + +The deployment method shown in this guide relies on [YAML](http://yaml.org/) definitions for Kubernetes [resources](https://kubernetes.io/docs/reference/). 
The {@inject: github:`deployment/kubernetes`:/deployment/kubernetes} subdirectory of the [Pulsar package](pulsar:download_page_url) holds resource definitions for: + +* A two-bookie BookKeeper cluster +* A three-node ZooKeeper cluster +* A three-broker Pulsar cluster +* A [monitoring stack]() consisting of [Prometheus](https://prometheus.io/), [Grafana](https://grafana.com), and the [Pulsar dashboard](administration-dashboard.md) +* A [pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/) from which you can run administrative commands using the [`pulsar-admin`](reference-pulsar-admin.md) CLI tool + +## Setup + +To get started, install a source package from the [downloads page](pulsar:download_page_url). + +> Note that the Pulsar binary package does *not* contain the necessary YAML resources to deploy Pulsar on Kubernetes. + +If you want to change the number of bookies, brokers, or ZooKeeper nodes in your Pulsar cluster, modify the `replicas` parameter in the `spec` section of the appropriate [`Deployment`](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) or [`StatefulSet`](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) resource. + +## Pulsar on Google Kubernetes Engine + +[Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine) (GKE) automates the creation and management of Kubernetes clusters in [Google Compute Engine](https://cloud.google.com/compute/) (GCE). + +### Prerequisites + +To get started, you need: + +* A Google Cloud Platform account, which you can sign up for at [cloud.google.com](https://cloud.google.com) +* An existing Cloud Platform project +* The [Google Cloud SDK](https://cloud.google.com/sdk/downloads) (in particular the [`gcloud`](https://cloud.google.com/sdk/gcloud/) and [`kubectl`](https://kubernetes.io/docs/tasks/tools/install-kubectl/#download-as-part-of-the-google-cloud-sdk) tools). 
+ +### Create a new Kubernetes cluster + +You can create a new GKE cluster by entering the [`container clusters create`](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create) command for `gcloud`. This command enables you to specify the number of nodes in the cluster, the machine types of those nodes, and so on. + +The following example creates a new GKE cluster for Kubernetes version [1.6.4](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md#v164) in the [us-central1-a](https://cloud.google.com/compute/docs/regions-zones/regions-zones#available) zone. The cluster is named `pulsar-gke-cluster` and consists of three VMs, each using two locally attached SSDs and running on [n1-standard-8](https://cloud.google.com/compute/docs/machine-types) machines. [Bookie](reference-terminology.md#bookie) instances use these SSDs, one for the BookKeeper [journal](concepts-architecture-overview.md#journal-storage) and the other for storing the actual message data. + +```bash +$ gcloud container clusters create pulsar-gke-cluster \ + --zone=us-central1-a \ + --machine-type=n1-standard-8 \ + --num-nodes=3 \ + --local-ssd-count=2 +``` + +By default, bookies run on all the machines that have locally attached SSD disks. In this example, all of those machines have two SSDs, but you can add different types of machines to the cluster later. You can control which machines host bookie servers using [labels](https://kubernetes.io/docs/concepts/overview/working-with-objects/labels). + +### Dashboard + +You can observe your cluster in the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) by downloading the credentials for your Kubernetes cluster and opening up a proxy to the cluster: + +```bash +$ gcloud container clusters get-credentials pulsar-gke-cluster \ + --zone=us-central1-a \ + --project=your-project-name +$ kubectl proxy +``` + +By default, the proxy is opened on port 8001.
Now you can navigate to [localhost:8001/ui](http://localhost:8001/ui) in your browser to access the dashboard. At first your GKE cluster is empty, but that changes as you begin deploying Pulsar components using `kubectl` [component by component](#deploying-pulsar-components), +or using [`helm`](#deploying-pulsar-components-helm). + +## Pulsar on Amazon Web Services + +You can run Kubernetes on [Amazon Web Services](https://aws.amazon.com/) (AWS) in a variety of ways. A very simple way that is [recently introduced](https://aws.amazon.com/blogs/compute/kubernetes-clusters-aws-kops/) involves using the [Kubernetes Operations](https://github.com/kubernetes/kops) (kops) tool. + +You can find detailed instructions for setting up a Kubernetes cluster on AWS from [here](https://github.com/kubernetes/kops/blob/master/docs/aws.md). + +When you create a cluster using those instructions, your `kubectl` config in `~/.kube/config` (on MacOS and Linux) is updated for you, so you probably do not need to change your configuration. Nonetheless, you can ensure that `kubectl` can interact with your cluster by listing the nodes in the cluster: + +```bash +$ kubectl get nodes +``` + +If `kubectl` works with your cluster, you can proceed to deploy Pulsar components using `kubectl` [component by component](#deploying-pulsar-components), +or using [`helm`](#deploying-pulsar-components-helm). + +## Pulsar on a custom Kubernetes cluster + +You can deploy Pulsar on a custom, non-GKE Kubernetes cluster as well. You can find detailed documentation on how to choose a Kubernetes installation method that suits your needs in the [Picking the Right Solution](https://kubernetes.io/docs/setup/pick-right-solution) guide in the Kubernetes docs. + +The easiest way to run a Kubernetes cluster is to do so locally. To install a mini local cluster for testing purposes and running in local VMs, you can either: + +1. 
Use [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/) to run a single-node Kubernetes cluster. +1. Create a local cluster running on multiple VMs on the same machine. + +### Minikube + +1. [Install and configure minikube](https://github.com/kubernetes/minikube#installation) with + a [VM driver](https://github.com/kubernetes/minikube#requirements), for example, `kvm2` on Linux or `hyperkit` or `VirtualBox` on macOS. +1. Create a Kubernetes cluster on Minikube. + ```shell + minikube start --memory=8192 --cpus=4 \ + --kubernetes-version=<version> + ``` + `<version>` can be any [Kubernetes version supported by your minikube installation](https://minikube.sigs.k8s.io/docs/reference/configuration/kubernetes/). Example: `v1.16.1` +1. Set `kubectl` to use Minikube. + ```shell + kubectl config use-context minikube + ``` + +In order to use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) +with local Kubernetes cluster on Minikube, enter the command below: + +```bash +$ minikube dashboard +``` + +The command automatically triggers opening a webpage in your browser. At first your local cluster is empty, but that changes as you begin deploying Pulsar components using `kubectl` [component by component](#deploy-pulsar-components), +or using [`helm`](#deploy-pulsar-components-helm). + +### Multiple VMs + +For the second option, follow the [instructions](https://github.com/pires/kubernetes-vagrant-coreos-cluster) for running Kubernetes using [CoreOS](https://coreos.com/) on [Vagrant](https://www.vagrantup.com/). You can follow an abridged version of those instructions from here. + + +First, make sure you have [Vagrant](https://www.vagrantup.com/downloads.html) and [VirtualBox](https://www.virtualbox.org/wiki/Downloads) installed.
Then clone the repo and start up the cluster: + +```bash +$ git clone https://github.com/pires/kubernetes-vagrant-coreos-cluster +$ cd kubernetes-vagrant-coreos-cluster + +# Start a three-VM cluster +$ NODES=3 USE_KUBE_UI=true vagrant up +``` + +Create SSD disk mount points on the VMs using this script: + +```bash +$ for vm in node-01 node-02 node-03; do + NODES=3 vagrant ssh $vm -c "sudo mkdir -p /mnt/disks/ssd0" + NODES=3 vagrant ssh $vm -c "sudo mkdir -p /mnt/disks/ssd1" + done +``` + +Bookies expect two logical devices to mount for [journal](concepts-architecture-overview.md#journal-storage) and persistent message storage to be available. In this VM exercise, you can create two directories on each VM. + +Once the cluster is up, you can verify that `kubectl` can access it: + +```bash +$ kubectl get nodes +NAME STATUS AGE VERSION +172.17.8.101 Ready,SchedulingDisabled 10m v1.6.4 +172.17.8.102 Ready 8m v1.6.4 +172.17.8.103 Ready 6m v1.6.4 +172.17.8.104 Ready 4m v1.6.4 +``` + +In order to use the [Kubernetes Dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) with your local Kubernetes cluster, first, you need to use `kubectl` to create a proxy to the cluster: + +```bash +$ kubectl proxy +``` + +Now you can access the web interface at [localhost:8001/ui](http://localhost:8001/ui). At first your local cluster is empty, but that changes as you begin deploying Pulsar components using `kubectl` [component by component](#deploying-pulsar-components), or using [`helm`](#deploying-pulsar-components-helm). + +## Deploy Pulsar components + +Now that you have set up a Kubernetes cluster, either on [Google Kubernetes Engine](#pulsar-on-google-kubernetes-engine) or on a [custom cluster](#pulsar-on-a-custom-kubernetes-cluster), you can begin deploying the components that make up Pulsar. You can find the YAML resource definitions for Pulsar components in the `kubernetes` folder of the [Pulsar source package](pulsar:download_page_url). 
+ +In that package, you can find different sets of resource definitions for different environments. + +- `deployment/kubernetes/google-kubernetes-engine`: for Google Kubernetes Engine (GKE) +- `deployment/kubernetes/aws`: for AWS +- `deployment/kubernetes/generic`: for a custom Kubernetes cluster + +To begin, `cd` into the appropriate folder. + +### Deploy ZooKeeper + +You *must* deploy ZooKeeper as the first Pulsar component, as ZooKeeper is a dependency for the others. + +```bash +$ kubectl apply -f zookeeper.yaml +``` + +Wait until all three ZooKeeper server pods are up and have the status `Running`. You can check on the status of the ZooKeeper pods at any time: + +```bash +$ kubectl get pods -l component=zookeeper +NAME READY STATUS RESTARTS AGE +zk-0 1/1 Running 0 18m +zk-1 1/1 Running 0 17m +zk-2 0/1 Running 6 15m +``` + +This step may take several minutes, as Kubernetes needs to download the Docker image on the VMs. + +### Initialize cluster metadata + +Once ZooKeeper runs, you need to [initialize the metadata](#cluster-metadata-initialization) for the Pulsar cluster in ZooKeeper. This includes system metadata for [BookKeeper](reference-terminology.md#bookkeeper) and Pulsar more broadly. 
You only need to run the Kubernetes [job](https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/) in the `cluster-metadata.yaml` file once: + +```bash +$ kubectl apply -f cluster-metadata.yaml +``` + +For the sake of reference, that job runs the following command on an ephemeral pod: + +```bash +$ bin/pulsar initialize-cluster-metadata \ + --cluster local \ + --zookeeper zookeeper \ + --configuration-store zookeeper \ + --web-service-url http://broker.default.svc.cluster.local:8080/ \ + --broker-service-url pulsar://broker.default.svc.cluster.local:6650/ +``` + +### Deploy the rest of the components + +Once you have successfully initialized cluster metadata, you can then deploy the bookies, brokers, monitoring stack ([Prometheus](https://prometheus.io), [Grafana](https://grafana.com), and the [Pulsar dashboard](administration-dashboard.md)), and Pulsar cluster proxy: + +```bash +$ kubectl apply -f bookie.yaml +$ kubectl apply -f broker.yaml +$ kubectl apply -f proxy.yaml +$ kubectl apply -f monitoring.yaml +$ kubectl apply -f admin.yaml +``` + +You can check on the status of the pods for these components either in the Kubernetes Dashboard or using `kubectl`: + +```bash +$ kubectl get pods -w -l app=pulsar +``` + +### Set up properties and namespaces + +Once all of the components are up and running, you need to create at least one Pulsar tenant and at least one namespace. + +>If Pulsar [authentication and authorization](security-overview.md) is turned on,you do not have to strictly perform this step though you are allowed to change [policies](admin-api-namespaces.md) for each of the namespaces later. + +You can create properties and namespaces (and perform any other administrative tasks) using the `pulsar-admin` pod that is already configured to act as an admin client for your newly created Pulsar cluster. 
One easy way to perform administrative tasks is to create an alias for the [`pulsar-admin`](reference-pulsar-admin.md) tool installed on the admin pod. + +```bash +$ alias pulsar-admin='kubectl exec pulsar-admin -it -- bin/pulsar-admin' +``` + +Now, any time you run `pulsar-admin`, you can run commands from that pod. This command creates a tenant called `ten`: + +```bash +$ pulsar-admin tenants create ten \ + --admin-roles admin \ + --allowed-clusters local +``` + +This command creates a `ns` namespace under the `ten` tenant: + +```bash +$ pulsar-admin namespaces create ten/ns +``` + +To verify that everything has gone as planned: + +```bash +$ pulsar-admin tenants list +public +ten + +$ pulsar-admin namespaces list ten +ten/ns +``` + +Now that you have a namespace and tenant set up, you can move on to [experimenting with your Pulsar cluster](#experimenting-with-your-cluster) from within the cluster or [connecting to the cluster](#client-connections) using a Pulsar client. + +### Experiment with your cluster + +Now that you have successfully created a tenant and namespace, you can begin experimenting with your running Pulsar cluster. Using the same `pulsar-admin` pod via an alias, as in the section above, you can use [`pulsar-perf`](reference-cli-tools.md#pulsar-perf) to create a test [producer](reference-terminology.md#producer) to publish 10,000 messages a second on a topic in the [tenant](reference-terminology.md#tenant) and [namespace](reference-terminology.md#namespace) you have created. 
+ +First, create an alias to use the `pulsar-perf` tool via the admin pod: + +```bash +$ alias pulsar-perf='kubectl exec pulsar-admin -it -- bin/pulsar-perf' +``` + +Now, produce messages: + +```bash +$ pulsar-perf produce persistent://public/default/my-topic \ + --rate 10000 +``` + +Similarly, you can start a [consumer](reference-terminology.md#consumer) to subscribe to and receive all the messages on that topic: + +```bash +$ pulsar-perf consume persistent://public/default/my-topic \ + --subscriber-name my-subscription-name +``` + +You can also view [stats](administration-stats.md) for the topic using the [`pulsar-admin`](reference-pulsar-admin.md#persistent-stats) tool: + +```bash +$ pulsar-admin persistent stats persistent://public/default/my-topic +``` + +### Monitor + +The default monitoring stack for Pulsar on Kubernetes consists of [Prometheus](#prometheus), [Grafana](#grafana), and the [Pulsar dashboard](administration-dashboard.md). + +> If you deploy the cluster to Minikube, the following monitoring ports are mapped at the minikube VM: +> +> - Prometheus port: 30003 +> - Grafana port: 30004 +> - Dashboard port: 30005 +> +> You can use `minikube ip` to find the IP address of the minikube VM, and then use their mapped ports +> to access corresponding services. For example, you can access Pulsar dashboard at `http://$(minikube ip):30005`. + +#### Prometheus + +A [Prometheus](https://prometheus.io) instance running inside the cluster can collect all Pulsar metrics in Kubernetes. Typically, you do not have to access Prometheus directly. Instead, you can use the [Grafana interface](#grafana) that displays the data stored in Prometheus. + +#### Grafana + +In your Kubernetes cluster, you can use [Grafana](https://grafana.com) to view dashboards for Pulsar [namespaces](reference-terminology.md#namespace) (message rates, latency, and storage), JVM stats, [ZooKeeper](https://zookeeper.apache.org), and [BookKeeper](reference-terminology.md#bookkeeper).
You can get access to the pod serving Grafana using the [`port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster) command of `kubectl`: + +```bash +$ kubectl port-forward \ + $(kubectl get pods -l component=grafana -o jsonpath='{.items[*].metadata.name}') 3000 +``` + +You can then access the dashboard in your web browser at [localhost:3000](http://localhost:3000). + +#### Pulsar dashboard + +While Grafana and Prometheus are used to provide graphs with historical data, [Pulsar dashboard](administration-dashboard.md) reports more detailed current data for individual [topics](reference-terminology.md#topic). + +For example, you can have sortable tables showing all namespaces, topics, and broker stats, with details on the IP address for consumers, how long they have been connected, and much more. + +You can access the pod serving the Pulsar dashboard using the [`port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster) command of `kubectl`: + +```bash +$ kubectl port-forward \ + $(kubectl get pods -l component=dashboard -o jsonpath='{.items[*].metadata.name}') 8080:80 +``` + +You can then access the dashboard in your web browser at [localhost:8080](http://localhost:8080). + +### Client connections + +> If you deploy the cluster to Minikube, the proxy ports are mapped at the minikube VM: +> +> - HTTP port: 30001 +> - Pulsar binary protocol port: 30002 +> +> You can use `minikube ip` to find the IP address of the minikube VM, and then use their mapped ports +> to access corresponding services. For example, the Pulsar web service URL is `http://$(minikube ip):30001`. + +Once your Pulsar cluster is running on Kubernetes, you can connect to it using a Pulsar client.
You can fetch the IP address for the Pulsar proxy running in your Kubernetes cluster using `kubectl`: + +```bash +$ kubectl get service broker-proxy \ + --output=jsonpath='{.status.loadBalancer.ingress[*].ip}' +``` + +If the IP address for the proxy is, for example, 35.12.13.198, you can connect to Pulsar using `pulsar://35.12.13.198:6650`. + +You can find client documentation for: + +* [Java](client-libraries-java.md) +* [Python](client-libraries-python.md) +* [C++](client-libraries-cpp.md) + + +## Deploy Pulsar components (helm) + +Pulsar also provides a [Helm](https://docs.helm.sh/) chart for deploying a Pulsar cluster to Kubernetes. Before you start, make sure you follow [Helm documentation](https://docs.helm.sh/using_helm) to install helm. + +> Assume you clone a pulsar repo under a `PULSAR_HOME` directory. + +### Minikube + +1. Go to Pulsar helm chart directory + ```shell + cd ${PULSAR_HOME}/deployment/kubernetes/helm + ``` +1. Install helm chart to a K8S cluster on Minikube. + ```shell + helm install --values pulsar/values-mini.yaml ./pulsar + ``` + +Once the helm chart is completed on installation, you can access the cluster via: + +- Web service url: `http://$(minikube ip):30001/` +- Pulsar service url: `pulsar://$(minikube ip):30002/` diff --git a/site2/website/versioned_docs/version-2.5.0/deploy-monitoring.md b/site2/website/versioned_docs/version-2.5.0/deploy-monitoring.md new file mode 100644 index 0000000000000..96ea306f4e7fe --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/deploy-monitoring.md @@ -0,0 +1,90 @@ +--- +id: version-2.5.0-deploy-monitoring +title: Monitoring +sidebar_label: Monitoring +original_id: deploy-monitoring +--- + +You can use different ways to monitor a Pulsar cluster, exposing both metrics that relate to the usage of topics and the overall health of the individual components of the cluster. + +## Collect metrics + +You can collect broker stats, ZooKeeper stats, and BookKeeper stats. 
+ +### Broker stats + +You can collect Pulsar broker metrics from brokers and export the metrics in JSON format. The Pulsar broker metrics mainly have two types: + +* *Destination dumps*, which contain stats for each individual topic. You can fetch the destination dumps using the command below: + + ```shell + bin/pulsar-admin broker-stats destinations + ``` + +* Broker metrics, which contain the broker information and topics stats aggregated at namespace level. You can fetch the broker metrics using the command below: + + ```shell + bin/pulsar-admin broker-stats monitoring-metrics + ``` + +All the message rates are updated every minute. + +The aggregated broker metrics are also exposed in the [Prometheus](https://prometheus.io) format at: + +```shell +http://$BROKER_ADDRESS:8080/metrics +``` + +### ZooKeeper stats + +The local ZooKeeper and configuration store server and clients that are shipped with Pulsar have been instrumented to expose detailed stats through Prometheus as well. + +```shell +http://$LOCAL_ZK_SERVER:8000/metrics +http://$GLOBAL_ZK_SERVER:8001/metrics +``` + +The default port of local ZooKeeper is `8000` and the default port of configuration store is `8001`. You can change the default port of local ZooKeeper and configuration store by specifying system property `stats_server_port`. + +### BookKeeper stats + +For BookKeeper you can configure the stats frameworks by changing the `statsProviderClass` in +`conf/bookkeeper.conf`. + +The default BookKeeper configuration, which is included with Pulsar distribution, enables the Prometheus exporter. + +```shell +http://$BOOKIE_ADDRESS:8000/metrics +``` + +The default port for bookie is `8000` (instead of `8080`). You can change the port by configuring `prometheusStatsHttpPort` in `conf/bookkeeper.conf`. + +## Configure Prometheus + +You can use Prometheus to collect and store the metrics data. For details, refer to [Prometheus guide](https://prometheus.io/docs/introduction/getting_started/).
+ +When you run Pulsar on bare metal, you can provide the list of nodes that need to be probed. When you deploy Pulsar in a Kubernetes cluster, the monitoring is automatically set up with the [provided](deploy-kubernetes.md) instructions. + +## Dashboards + +When you collect time series statistics, the major problem is to make sure the number of dimensions attached to the data does not explode. + +For that reason you only need to collect time series of metrics aggregated at the namespace level. + +### Pulsar per-topic dashboard + +The per-topic dashboard instructions are available at [Dashboard](administration-dashboard.md). + +### Grafana + +You can use Grafana to easily create dashboards driven by the data that is stored in Prometheus. + +When you deploy Pulsar on Kubernetes, a `pulsar-grafana` Docker image is enabled by default. You can use the docker image with the principal dashboards. + +Enter the command below to use the dashboard manually: + +```shell +docker run -p3000:3000 \ + -e PROMETHEUS_URL=http://$PROMETHEUS_HOST:9090/ \ + apachepulsar/pulsar-grafana:latest +``` diff --git a/site2/website/versioned_docs/version-2.5.0/functions-cli.md b/site2/website/versioned_docs/version-2.5.0/functions-cli.md new file mode 100644 index 0000000000000..45de48699f951 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-cli.md @@ -0,0 +1,198 @@ +--- +id: version-2.5.0-functions-cli +title: Pulsar Functions command line tool +sidebar_label: Reference: CLI +original_id: functions-cli +--- + +The following tables list Pulsar Functions command-line tools. You can learn Pulsar Functions modes, commands, and parameters. + +## localrun + +Run Pulsar Functions locally, rather than deploying them to the Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | false | +broker-service-url | The URL for the Pulsar broker.
| | +classname | The class name of a Pulsar Function.| | +client-auth-params | Client authentication parameter. | | +client-auth-plugin | Client authentication plugin using which function-process can connect to broker. | | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). | | +hostname-verification-enabled | Enable hostname verification. | false +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package. | | +instance-id-offset | Start the instanceIds from this offset. | 0 +log-topic | The topic to which the logs a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. 
| | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +tls-allow-insecure | Allow insecure tls connection. | false +tls-trust-cert-path | tls trust cert file path. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +use-tls | Use tls connection. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + + +## create + +Create and deploy a Pulsar Function in cluster mode. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | false | +classname | The class name of a Pulsar Function. 
| | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime).| | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. 
Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## delete + +Delete a Pulsar Function that is running on a Pulsar cluster. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## update + +Update a Pulsar Function that has been deployed to a Pulsar cluster. + +Name | Description | Default +---|---|--- +auto-ack | Whether or not the framework acknowledges messages automatically. | false +classname | The class name of a Pulsar Function. 
| | +CPU | The CPU in cores that need to be allocated per function instance (applicable only to docker runtime). | | +custom-runtime-options | A string that encodes options to customize the runtime, see docs for configured runtime for details | | +custom-schema-inputs | The map of input topics to Schema class names (as a JSON string). | | +custom-serde-inputs | The map of input topics to SerDe class names (as a JSON string). | | +dead-letter-topic | The topic where all messages that were not processed successfully are sent. | | +disk | The disk in bytes that need to be allocated per function instance (applicable only to docker runtime). | | +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +function-config-file | The path to a YAML config file specifying the configuration of a Pulsar Function. | | +go | Path to the main Go executable binary for the function (if the function is written in Go). | | +inputs | The input topic or topics of a Pulsar Function (multiple topics can be specified as a comma-separated list). | | +jar | Path to the jar file for the function (if the function is written in Java). It also supports URL-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package. | | +log-topic | The topic to which the logs of a Pulsar Function are produced. | | +max-message-retries | How many times should we try to process a message before giving up. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +output | The output topic of a Pulsar Function (If none is specified, no output is written). | | +output-serde-classname | The SerDe class to be used for messages output by the function. | | +parallelism | The parallelism factor of a Pulsar Function (i.e. the number of function instances to run). | | +processing-guarantees | The processing guarantees (delivery semantics) applied to the function. 
Available values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]. | ATLEAST_ONCE +py | Path to the main Python file/Python Wheel file for the function (if the function is written in Python). | | +ram | The ram in bytes that need to be allocated per function instance (applicable only to process/docker runtime). | | +retain-ordering | Function consumes and processes messages in order. | | +schema-type | The builtin schema type or custom schema class name to be used for messages output by the function. | +sliding-interval-count | The number of messages after which the window slides. | | +sliding-interval-duration-ms | The time duration after which the window slides. | | +subs-name | Pulsar source subscription name if user wants a specific subscription-name for the input-topic consumer. | | +tenant | The tenant of a Pulsar Function. | | +timeout-ms | The message timeout in milliseconds. | | +topics-pattern | The topic pattern to consume from list of topics under a namespace that match the pattern. [--input] and [--topic-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (only supported in Java Function). | | +update-auth-data | Whether or not to update the auth data. | false +user-config | User-defined config key/values. | | +window-length-count | The number of messages per window. | | +window-length-duration-ms | The time duration of the window in milliseconds. | | + +## get + +Fetch information about a Pulsar Function. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## restart + +Restart function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (restart all instances if instance-id is not provided. 
| | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## stop + +Stops function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (stop all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | + +## start + +Starts a stopped function instance. + +Name | Description | Default +---|---|--- +fqfn | The Fully Qualified Function Name (FQFN) for the function. | | +instance-id | The function instanceId (start all instances if instance-id is not provided). | | +name | The name of a Pulsar Function. | | +namespace | The namespace of a Pulsar Function. | | +tenant | The tenant of a Pulsar Function. | | diff --git a/site2/website/versioned_docs/version-2.5.0/functions-debug.md b/site2/website/versioned_docs/version-2.5.0/functions-debug.md new file mode 100644 index 0000000000000..42a55670d14a8 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-debug.md @@ -0,0 +1,455 @@ +--- +id: version-2.5.0-functions-debug +title: Debug Pulsar Functions +sidebar_label: How-to: Debug +original_id: functions-debug +--- + +You can use the following methods to debug Pulsar Functions: + +* [Captured stderr](functions-debug.md#captured-stderr) +* [Use unit test](functions-debug.md#use-unit-test) +* [Debug with localrun mode](functions-debug.md#debug-with-localrun-mode) +* [Use log topic](functions-debug.md#use-log-topic) +* [Use Functions CLI](functions-debug.md#use-functions-cli) + +## Captured stderr + +Function startup information and captured stderr output is written to `logs/functions/<tenant>/<namespace>/<function>/<function>-<instance>.log`. + +This is useful for debugging why a function fails to start.
+ +## Use unit test + +A Pulsar Function is a function with inputs and outputs, you can test a Pulsar Function in a similar way as you test any function. + +For example, if you have the following Pulsar Function: + +```java +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} +``` + +You can write a simple unit test to test Pulsar Function. + +```java +@Test +public void testJavaNativeExclamationFunction() { + JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); + String output = exclamation.apply("foo"); + Assert.assertEquals(output, "foo!"); +} +``` + +The following Pulsar Function implements the `org.apache.pulsar.functions.api.Function` interface. + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} +``` + +In this situation, you can write a unit test for this function as well. Remember to mock the `Context` parameter. The following is an example. + +```java +@Test +public void testExclamationFunction() { + ExclamationFunction exclamation = new ExclamationFunction(); + String output = exclamation.process("foo", mock(Context.class)); + Assert.assertEquals(output, "foo!"); +} +``` + +## Debug with localrun mode +When you run a Pulsar Function in localrun mode, it launches an instance of the Function on your local machine as a thread. + +In this mode, a Pulsar Function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. + +> Note +> Currently, debugging with localrun mode is only supported by Pulsar Functions written in Java. You need Pulsar version 2.4.0 or later to do the following. 
Even though localrun is available in versions earlier than Pulsar 2.4.0, you cannot debug with localrun mode programmatically or run Functions as threads. + +You can launch your function in the following manner. + +```java +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setName(functionName); +functionConfig.setInputs(Collections.singleton(sourceTopic)); +functionConfig.setClassName(ExclamationFunction.class.getName()); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setOutput(sinkTopic); + +LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); +localRunner.start(true); +``` + +So you can debug functions using an IDE easily. Set breakpoints and manually step through a function to debug with real data. + +The following example illustrates how to programmatically launch a function in localrun mode. + +```java +public class ExclamationFunction implements Function { + + @Override + public String process(String s, Context context) throws Exception { + return s + "!"; + } + +public static void main(String[] args) throws Exception { + FunctionConfig functionConfig = new FunctionConfig(); + functionConfig.setName("exclamation"); + functionConfig.setInputs(Collections.singleton("input")); + functionConfig.setClassName(ExclamationFunction.class.getName()); + functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); + functionConfig.setOutput("output"); + + LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); + localRunner.start(false); +} +``` + +To use localrun mode programmatically, add the following dependency. + +```xml + + org.apache.pulsar + pulsar-functions-local-runner + ${pulsar.version} + + +``` + +For complete code samples, see [here](https://github.com/jerrypeng/pulsar-functions-demos/tree/master/debugging). + +> Note +> Debugging with localrun mode for Pulsar Functions written in other languages will be supported soon. 
+ +## Use log topic + +In Pulsar Functions, you can generate log information defined in functions to a specified log topic. You can configure consumers to consume messages from a specified log topic to check the log information. + +![Pulsar Functions core programming model](assets/pulsar-functions-overview.png) + +**Example** + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} +``` + +As shown in the example above, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired log information in a function using the `LOG` variable. Meanwhile, you need to specify the topic to which the log information is produced. + +**Example** + +```bash +$ bin/pulsar-admin functions create \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs +``` + +## Use Functions CLI + +With [Pulsar Functions CLI](reference-pulsar-admin.md#functions), you can debug Pulsar Functions with the following subcommands: + +* `get` +* `status` +* `stats` +* `list` +* `trigger` + +> **Tip** +> +> For complete commands of **Pulsar Functions CLI**, see [here](reference-pulsar-admin.md#functions). + +### `get` + +Get information about a Pulsar Function. + +**Usage** + +```bash +$ pulsar-admin functions get options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function.
+|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +> **Tip** +> +> `--fqfn` consists of `--name`, `--namespace` and `--tenant`, so you can specify either `--fqfn` or `--name`, `--namespace` and `--tenant`. + +**Example** + +You can specify `--fqfn` to get information about a Pulsar Function. + +```bash +$ ./bin/pulsar-admin functions get public/default/ExclamationFunctio6 +``` +Optionally, you can specify `--name`, `--namespace` and `--tenant` to get information about a Pulsar Function. + +```bash +$ ./bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 +``` + +As shown below, the `get` command shows input, output, runtime, and other information about the _ExclamationFunctio6_ function. + +```json +{ + "tenant": "public", + "namespace": "default", + "name": "ExclamationFunctio6", + "className": "org.example.test.ExclamationFunction", + "inputSpecs": { + "persistent://public/default/my-topic-1": { + "isRegexPattern": false + } + }, + "output": "persistent://public/default/test-1", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": {}, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1 +} +``` + +### `status` + +Check the current status of a Pulsar Function. + +**Usage** + +```bash +$ pulsar-admin functions status options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function
    If the `--instance-id` is not specified, it gets the status of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash +$ ./bin/pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ +``` + +As shown below, the `status` command shows the number of instances, running instances, the instance running under the _ExclamationFunctio6_ function, received messages, successfully processed messages, system exceptions, the average latency and so on. + +```json +{ + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 1, + "numSuccessfullyProcessed" : 1, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.8385, + "lastInvocationTime" : 1557734137987, + "workerId" : "c-standalone-fw-23ccc88ef29b-8080" + } + } ] +} +``` + +### `stats` + +Get the current stats of a Pulsar Function. + +**Usage** + +```bash +$ pulsar-admin functions stats options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--instance-id`|The instance ID of a Pulsar Function.
    If the `--instance-id` is not specified, it gets the stats of all instances.
    +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash +$ ./bin/pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ +``` + +The output is shown as follows: + +```json +{ + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "instances" : [ { + "instanceId" : 0, + "metrics" : { + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "userMetrics" : { } + } + } ] +} +``` + +### `list` + +List all Pulsar Functions running under a specific tenant and namespace. + +**Usage** + +```bash +$ pulsar-admin functions list options +``` + +**Options** + +|Flag|Description +|---|--- +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. + +**Example** + +```bash +$ ./bin/pulsar-admin functions list \ + --tenant public \ + --namespace default +``` +As shown below, the `list` command returns three functions running under the _public_ tenant and the _default_ namespace. + +```text +ExclamationFunctio1 +ExclamationFunctio2 +ExclamationFunctio3 +``` + +### `trigger` + +Trigger a specified Pulsar Function with a supplied value. This command simulates the execution process of a Pulsar Function and verifies it. 
+ +**Usage** + +```bash +$ pulsar-admin functions trigger options +``` + +**Options** + +|Flag|Description +|---|--- +|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. +|`--name`|The name of a Pulsar Function. +|`--namespace`|The namespace of a Pulsar Function. +|`--tenant`|The tenant of a Pulsar Function. +|`--topic`|The topic name that a Pulsar Function consumes from. +|`--trigger-file`|The path to a file that contains the data to trigger a Pulsar Function. +|`--trigger-value`|The value to trigger a Pulsar Function. + +**Example** + +```bash +$ ./bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + --topic persistent://public/default/my-topic-1 \ + --trigger-value "hello pulsar functions" +``` + +As shown below, the `trigger` command returns the following result: + +```text +This is my function! +``` + +> #### **Note** +> You must specify the [entire topic name](getting-started-pulsar.md#topic-names) when using the `--topic` option. Otherwise, the following error occurs. +> +>```text +>Function in trigger function has unidentified topic +> +>Reason: Function in trigger function has unidentified topic +>``` diff --git a/site2/website/versioned_docs/version-2.5.0/functions-develop.md b/site2/website/versioned_docs/version-2.5.0/functions-develop.md new file mode 100644 index 0000000000000..adb42922e7b7b --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-develop.md @@ -0,0 +1,983 @@ +--- +id: version-2.5.0-functions-develop +title: Develop Pulsar Functions +sidebar_label: How-to: Develop +original_id: functions-develop +--- + +This tutorial walks you through how to develop Pulsar Functions. + +## Available APIs +In Java and Python, you have two options to write Pulsar Functions. In Go, you can use Pulsar Functions SDK for Go. 
+ +Interface | Description | Use cases +:---------|:------------|:--------- +Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python). | Functions that do not require access to the function [context](#context). +Pulsar Function SDK for Java/Python/Go | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces. | Functions that require access to the function [context](#context). + +The language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, has no external dependencies. The following example is language-native function. + + + +```Java +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). + + +```python +def process(input): + return "{}!".format(input) +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). + +> Note +> You can write Pulsar Functions in python2 or python3. However, Pulsar only looks for `python` as the interpreter. +> +> If you're running Pulsar Functions on an Ubuntu system that only supports python3, you might fail to +> start the functions. In this case, you can create a symlink. Your system will fail if +> you subsequently install any other package that depends on Python 2.x. A solution is under development in [Issue 5518](https://github.com/apache/pulsar/issues/5518). +> +> ```bash +> sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 +> ``` + + + +The following example uses Pulsar Functions SDK. 
+ + +```Java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). + + +```python +from pulsar import Function + +class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). + + +```Go +package main + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func HandleRequest(ctx context.Context, in []byte) error{ + fmt.Println(string(in) + "!") + return nil +} + +func main() { + pf.Start(HandleRequest) +} +``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-function-go/examples/inputFunc.go#L20-L36). + + + +## Schema registry +Pulsar has a built in schema registry and comes bundled with a variety of popular schema types(avro, json and protobuf). Pulsar Functions can leverage existing schema information from input topics and derive the input type. The schema registry applies for output topic as well. + +## SerDe +SerDe stands for **Ser**ialization and **De**serialization. Pulsar Functions uses SerDe when publishing data to and consuming data from Pulsar topics. How SerDe works by default depends on the language you use for a particular function. 
+ + + +When you write Pulsar Functions in Java, the following basic Java types are built in and supported by default: + +* `String` +* `Double` +* `Integer` +* `Float` +* `Long` +* `Short` +* `Byte` + +To customize Java types, you need to implement the following interface. + +```java +public interface SerDe { + T deserialize(byte[] input); + byte[] serialize(T input); +} +``` + + +In Python, the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns. + +You can specify the SerDe when [creating](functions-deploy.md#cluster-mode) or [running](functions-deploy.md#local-run-mode) functions. + +```bash +$ bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name my_function \ + --py my_function.py \ + --classname my_function.MyFunction \ + --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ + --output-serde-classname Serde3 \ + --output output-topic-1 +``` + +This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. At the moment, all Pulsar Functions logic, include processing function and SerDe classes, must be contained within a single Python file. + +When using Pulsar Functions for Python, you have three SerDe options: + +1. You can use the [`IdentitySerde`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe means that this option is used. +2. You can use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. +3. 
You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. + +The table below shows when you should use each SerDe. + +SerDe option | When to use +:------------|:----------- +`IdentitySerde` | When you work with simple types like strings, Booleans, integers. +`PickleSerDe` | When you work with complex, application-specific types and are comfortable with the "best effort" approach of `pickle`. +Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes. + + +Currently, the feature is not available in Go. + + + +### Example +Imagine that you're writing Pulsar Functions that are processing tweet objects, you can refer to the following example of `Tweet` class. + + + + +```java +public class Tweet { + private String username; + private String tweetContent; + + public Tweet(String username, String tweetContent) { + this.username = username; + this.tweetContent = tweetContent; + } + + // Standard setters and getters +} +``` + +To pass `Tweet` objects directly between Pulsar Functions, you need to provide a custom SerDe class. In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`. 
+ +```java +package com.example.serde; + +import org.apache.pulsar.functions.api.SerDe; + +import java.util.regex.Pattern; + +public class TweetSerde implements SerDe<Tweet> { + public Tweet deserialize(byte[] input) { + String s = new String(input); + String[] fields = s.split(Pattern.quote("|")); + return new Tweet(fields[0], fields[1]); + } + + public byte[] serialize(Tweet input) { + return String.format("%s|%s", input.getUsername(), input.getTweetContent()).getBytes(); + } +} +``` + +To apply this customized SerDe to a particular Pulsar Function, you need to: + +* Package the `Tweet` and `TweetSerde` classes into a JAR. +* Specify a path to the JAR and SerDe class name when deploying the function. + +The following is an example of the [`create`](reference-pulsar-admin.md#create-1) operation. + +```bash +$ bin/pulsar-admin functions create \ + --jar /path/to/your.jar \ + --output-serde-classname com.example.serde.TweetSerde \ + # Other function attributes +``` + +> #### Custom SerDe classes must be packaged with your function JARs +> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. So you need to include your SerDe classes in your function JARs. If not, Pulsar returns an error. + + + +```python +class Tweet(object): + def __init__(self, username, tweet_content): + self.username = username + self.tweet_content = tweet_content +``` + +In order to use this class in Pulsar Functions, you have two options: + +1. You can specify `PickleSerDe`, which applies the [`pickle`](https://docs.python.org/3/library/pickle.html) library SerDe. +2. You can create your own SerDe class. The following is an example. 
+ + ```python +from pulsar import SerDe + +class TweetSerDe(SerDe): + + def serialize(self, input): + return bytes("{0}|{1}".format(input.username, input.tweet_content)) + + def deserialize(self, input_bytes): + tweet_components = str(input_bytes).split('|') + return Tweet(tweet_components[0], tweet_components[1]) + ``` +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). + + + +In both languages, however, you can write custom SerDe logic for more complex, application-specific types. + +## Context +Java, Python and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function. + +* The name and ID of a Pulsar Function. +* The message ID of each message. Each Pulsar message is automatically assigned an ID. +* The key, event time, properties and partition key of each message. +* The name of the topic to which the message is sent. +* The names of all input topics as well as the output topic associated with the function. +* The name of the class used for [SerDe](#serde). +* The [tenant](reference-terminology.md#tenant) and namespace associated with the function. +* The ID of the Pulsar Functions instance running the function. +* The version of the function. +* The [logger object](functions-develop.md#logger) used by the function, which can be used to create function log messages. +* Access to arbitrary [user configuration](#user-config) values supplied via the CLI. +* An interface for recording [metrics](#metrics). +* An interface for storing and retrieving state in [state storage](#state-storage). +* A function to publish new messages onto arbitrary topics. +* A function to ack the message being processed (if auto-ack is disabled). 
+ + + +The [Context](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Context.java) interface provides a number of methods that you can use to access the function [context](#context). The various method signatures for the `Context` interface are listed as follows. + +```java +public interface Context { + Record getCurrentRecord(); + Collection getInputTopics(); + String getOutputTopic(); + String getOutputSchemaType(); + String getTenant(); + String getNamespace(); + String getFunctionName(); + String getFunctionId(); + String getInstanceId(); + String getFunctionVersion(); + Logger getLogger(); + void incrCounter(String key, long amount); + long getCounter(String key); + void putState(String key, ByteBuffer value); + void deleteState(String key); + ByteBuffer getState(String key); + Map getUserConfigMap(); + Optional getUserConfigValue(String key); + Object getUserConfigValueOrDefault(String key, Object defaultValue); + void recordMetric(String metricName, double value); + CompletableFuture publish(String topicName, O object, String schemaOrSerdeClassName); + CompletableFuture publish(String topicName, O object); +} +``` + +The following example uses several methods available via the `Context` object. 
+ +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.stream.Collectors; + +public class ContextFunction implements Function { + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String inputTopics = context.getInputTopics().stream().collect(Collectors.joining(", ")); + String functionName = context.getFunctionName(); + + String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n", + input, + inputTopics); + + LOG.info(logMessage); + + String metricName = String.format("function-%s-messages-received", functionName); + context.recordMetric(metricName, 1); + + return null; + } +} +``` + + +``` +class ContextImpl(pulsar.Context): + def get_message_id(self): + ... + def get_message_key(self): + ... + def get_message_eventtime(self): + ... + def get_message_properties(self): + ... + def get_current_message_topic_name(self): + ... + def get_partition_key(self): + ... + def get_function_name(self): + ... + def get_function_tenant(self): + ... + def get_function_namespace(self): + ... + def get_function_id(self): + ... + def get_instance_id(self): + ... + def get_function_version(self): + ... + def get_logger(self): + ... + def get_user_config_value(self, key): + ... + def get_user_config_map(self): + ... + def record_metric(self, metric_name, metric_value): + ... + def get_input_topics(self): + ... + def get_output_topic(self): + ... + def get_output_serde_class_name(self): + ... + def publish(self, topic_name, message, serde_class_name="serde.IdentitySerDe", + properties=None, compression_type=None, callback=None, message_conf=None): + ... + def ack(self, msgid, topic): + ... + def get_and_reset_metrics(self): + ... + def reset_metrics(self): + ... + def get_metrics(self): + ... + def incr_counter(self, key, amount): + ... + def get_counter(self, key): + ... 
+ def del_counter(self, key): + ... + def put_state(self, key, value): + ... + def get_state(self, key): + ... +``` + + +``` +func (c *FunctionContext) GetInstanceID() int { + return c.instanceConf.instanceID +} + +func (c *FunctionContext) GetInputTopics() []string { + return c.inputTopics +} + +func (c *FunctionContext) GetOutputTopic() string { + return c.instanceConf.funcDetails.GetSink().Topic +} + +func (c *FunctionContext) GetFuncTenant() string { + return c.instanceConf.funcDetails.Tenant +} + +func (c *FunctionContext) GetFuncName() string { + return c.instanceConf.funcDetails.Name +} + +func (c *FunctionContext) GetFuncNamespace() string { + return c.instanceConf.funcDetails.Namespace +} + +func (c *FunctionContext) GetFuncID() string { + return c.instanceConf.funcID +} + +func (c *FunctionContext) GetFuncVersion() string { + return c.instanceConf.funcVersion +} + +func (c *FunctionContext) GetUserConfValue(key string) interface{} { + return c.userConfigs[key] +} + +func (c *FunctionContext) GetUserConfMap() map[string]interface{} { + return c.userConfigs +} +``` + +The following example uses several methods available via the `Context` object. + +``` +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func contextFunc(ctx context.Context) { + if fc, ok := pf.FromContext(ctx); ok { + fmt.Printf("function ID is:%s, ", fc.GetFuncID()) + fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) + } +} +``` + +For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-function-go/examples/contextFunc.go#L29-L34). + + + +### User config +When you run or update Pulsar Functions created using the SDK, you can pass arbitrary key/value pairs to them on the command line with the `--user-config` flag. Key/value pairs must be specified as JSON. The following function creation command passes a user-configured key/value pair to a function. 
+ +```bash +$ bin/pulsar-admin functions create \ + --name word-filter \ + # Other function configs + --user-config '{"forbidden-word":"rosebud"}' +``` + + + +The Java SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash +$ bin/pulsar-admin functions create \ + # Other function configs + --user-config '{"word-of-the-day":"verdure"}' +``` + +To access that value in a Java function: + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + Optional<Object> wotd = context.getUserConfigValue("word-of-the-day"); + if (wotd.isPresent()) { + LOG.info("The word of the day is {}", wotd.get()); + } else { + LOG.warn("No word of the day provided"); + } + return null; + } +} +``` + +The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line. + +You can also access the entire user config map or set a default value in case no value is present: + +```java +// Get the whole config map +Map<String, Object> allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = (String) context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); +``` + +> For all key/value pairs passed to Java functions, both the key *and* the value are `String`. To set the value to be a different type, you need to deserialize from the `String` type. + + +In a Python function, you can access the configuration value like this. 
+ +```python +from pulsar import Function + +class WordFilter(Function): + def process(self, context, input): + forbidden_word = context.user_config()["forbidden-word"] + + # Don't publish the message if it contains the user-supplied + # forbidden word + if forbidden_word in input: + pass + # Otherwise publish the message + else: + return input +``` + +The Python SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. + +```bash +$ bin/pulsar-admin functions create \ + # Other function configs \ + --user-config '{"word-of-the-day":"verdure"}' +``` + +To access that value in a Python function: + +```python +from pulsar import Function + +class UserConfigFunction(Function): + def process(self, input, context): + logger = context.get_logger() + wotd = context.get_user_config_value('word-of-the-day') + if wotd is None: + logger.warn('No word of the day provided') + else: + logger.info("The word of the day is {0}".format(wotd)) +``` + +Currently, the feature is not available in Go. + + + +### Logger + + + +Pulsar Functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object that can be used to produce logs at the chosen log level. The following example logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. 
+ +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} +``` + +If you want your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. + +```bash +$ bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs +``` + +All logs produced by `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic. + + +Pulsar Functions that use the Python SDK have access to a logging object that can be used to produce logs at the chosen log level. The following example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```python +from pulsar import Function + +class LoggingFunction(Function): + def process(self, input, context): + logger = context.get_logger() + msg_id = context.get_message_id() + if 'danger' in input: + logger.warn("A warning was received in message {0}".format(context.get_message_id())) + else: + logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) +``` + +If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. The following is an example. 
+ +```bash +$ bin/pulsar-admin functions create \ + --py logging_function.py \ + --classname logging_function.LoggingFunction \ + --log-topic logging-function-logs \ + # Other function configs +``` + +All logs produced by `LoggingFunction` above can be accessed via the `logging-function-logs` topic. + + +The following Go Function example shows different log levels based on the function input. + +``` +import ( + "context" + + "github.com/apache/pulsar/pulsar-function-go/log" + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func loggerFunc(ctx context.Context, input []byte) { + if len(input) <= 100 { + log.Infof("This input has a length of: %d", len(input)) + } else { + log.Warnf("This input is getting too long! It has {%d} characters", len(input)) + } +} + +func main() { + pf.Start(loggerFunc) +} +``` + +When you use `logTopic` related functionalities in Go Function, import `github.com/apache/pulsar/pulsar-function-go/log`, and you do not have to use the `getLogger()` context object. + + + +## Metrics +Pulsar Functions can publish arbitrary metrics to the metrics interface which can be queried. + +> If a Pulsar Function uses the language-native interface for Java or Python, that function is not able to publish metrics and stats to Pulsar. + + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. 
+ +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class MetricRecorderFunction implements Function { + @Override + public void apply(Integer input, Context context) { + // Records the metric 1 every time a message arrives + context.recordMetric("hit-count", 1); + + // Records the metric only if the arriving number equals 11 + if (input == 11) { + context.recordMetric("elevens-count", 1); + } + + return null; + } +} +``` + +> For instructions on reading and using metrics, see the [Monitoring](deploy-monitoring.md) guide. + + +You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. The following is an example. + +```python +from pulsar import Function + +class MetricRecorderFunction(Function): + def process(self, input, context): + context.record_metric('hit-count', 1) + + if input == 11: + context.record_metric('elevens-count', 1) +``` + +Currently, the feature is not available in Go. + + + +### Access metrics +To access metrics created by Pulsar Functions, refer to [Monitoring](deploy-monitoring.md) in Pulsar. + +## Security + +If you want to enable security on Pulsar Functions, first you should enable security on [Functions Workers](functions-worker.md). For more details, refer to [Security settings](functions-worker.md#security-settings). + +Pulsar Functions can support the following providers: + +- ClearTextSecretsProvider +- EnvironmentBasedSecretsProvider + +> Pulsar Function supports ClearTextSecretsProvider by default. + +At the same time, Pulsar Functions provides two interfaces, **SecretsProvider** and **SecretsProviderConfigurator**, allowing users to customize secret provider. + + + +You can get secret provider using the [`Context`](#context) object. 
The following is an example: + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class GetSecretProviderFunction implements Function { + + @Override + public Void process(String input, Context context) throws Exception { + Logger LOG = context.getLogger(); + String secretProvider = context.getSecret(input); + + if (!secretProvider.isEmpty()) { + LOG.info("The secret provider is {}", secretProvider); + } else { + LOG.warn("No secret provider"); + } + + return null; + } +} +``` + + +You can get secret provider using the [`Context`](#context) object. The following is an example: + +```python +from pulsar import Function + +class GetSecretProviderFunction(Function): + def process(self, input, context): + logger = context.get_logger() + secret_provider = context.get_secret(input) + if secret_provider is None: + logger.warn('No secret provider') + else: + logger.info("The secret provider is {0}".format(secret_provider)) +``` + + + +Currently, the feature is not available in Go. + + + +## State storage +Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Pulsar installation, including the local standalone installation, includes deployment of BookKeeper bookies. + +Since Pulsar 2.1.0 release, Pulsar integrates with Apache BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store the `State` for functions. For example, a `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions State API. + +States are key-value pairs, where the key is a string and the value is arbitrary binary data - counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function, and shared between instances of that function. 
+ +You can access states within Pulsar Functions using the `putState`, `getState`, `incrCounter`, `getCounter` and `deleteState` calls on the context object. You can also manage states using the [querystate](#query-state) and [putstate](#putstate) options to `pulsar-admin functions`. + +> Note +> State storage is not available in Go. + +### API + + + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](functions-develop.md#context) object when you are using Java SDK functions. + +#### incrCounter + +```java + /** + * Increment the builtin distributed counter referred to by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); +``` + +Application can use `incrCounter` to change the counter of a given `key` by the given `amount`. + +#### getCounter + +```java + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); +``` + +Application can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`. + +Besides the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### putState + +```java + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); +``` + +#### getState + +```java + /** + * Retrieve the state value for the key. + * + * @param key name of the key + * @return the state value for the key. + */ + ByteBuffer getState(String key); +``` + +#### deleteState + +```java + /** + * Delete the state value for the key. + * + * @param key name of the key + */ + void deleteState(String key); +``` + +Counters and binary values share the same keyspace, so this deletes either type. 
+ + +Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](#context) object when you are using Python SDK functions. + +#### incr_counter + +```python + def incr_counter(self, key, amount): + """incr the counter of a given key in the managed state""" +``` + +Application can use `incr_counter` to change the counter of a given `key` by the given `amount`. +If the `key` does not exist, a new key is created. + +#### get_counter + +```python + def get_counter(self, key): + """get the counter of a given key in the managed state""" +``` + +Application can use `get_counter` to retrieve the counter of a given `key` mutated by `incr_counter`. + +Besides the `counter` API, Pulsar also exposes a general key/value API for functions to store +general key/value state. + +#### put_state + +```python + def put_state(self, key, value): + """update the value of a given key in the managed state""" +``` + +The key is a string, and the value is arbitrary binary data. + +#### get_state + +```python + def get_state(self, key): + """get the value of a given key in the managed state""" +``` + +#### del_counter + +```python + def del_counter(self, key): + """delete the counter of a given key in the managed state""" +``` + +Counters and binary values share the same keyspace, so this deletes either type. + + + +### Query State + +A Pulsar Function can use the [State API](#api) for storing state into Pulsar's state storage +and retrieving state back from Pulsar's state storage. Additionally, Pulsar provides +CLI commands for querying its state. + +```shell +$ bin/pulsar-admin functions querystate \ + --tenant <tenant> \ + --namespace <namespace> \ + --name <function-name> \ + --state-storage-url <bookkeeper-service-url> \ + --key <state-key> \ + [--watch] +``` + +If `--watch` is specified, the CLI will watch the value of the provided `state-key`. 
+ +### Example + + + + +{@inject: github:`WordCountFunction`:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is a very good example +demonstrating how an application can easily store `state` in Pulsar Functions. + +```java +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); + return null; + } +} +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received `String` into multiple words using regex `\\.`. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incrCounter(key, amount)`). + + + +```python +from pulsar import Function + +class WordCount(Function): + def process(self, item, context): + for word in item.split(): + context.incr_counter(word, 1) +``` + +The logic of this `WordCount` function is pretty simple and straightforward: + +1. The function first splits the received string into multiple words on whitespace. +2. For each `word`, the function increments the corresponding `counter` by 1 (via `incr_counter(key, amount)`). 
+ + diff --git a/site2/website/versioned_docs/version-2.5.0/functions-metrics.md b/site2/website/versioned_docs/version-2.5.0/functions-metrics.md new file mode 100644 index 0000000000000..532190b2bffba --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-metrics.md @@ -0,0 +1,7 @@ +--- +id: version-2.5.0-functions-metrics +title: Metrics for Pulsar Functions +sidebar_label: Metrics +original_id: functions-metrics +--- + diff --git a/site2/website/versioned_docs/version-2.5.0/functions-overview.md b/site2/website/versioned_docs/version-2.5.0/functions-overview.md new file mode 100644 index 0000000000000..606857999b262 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-overview.md @@ -0,0 +1,200 @@ +--- +id: version-2.5.0-functions-overview +title: Pulsar Functions overview +sidebar_label: Overview +original_id: functions-overview +--- + +**Pulsar Functions** are lightweight compute processes that + +* consume messages from one or more Pulsar topics, +* apply a user-supplied processing logic to each message, +* publish the results of the computation to another topic. + + +## Goals +With Pulsar Functions, you can create complex processing logic without deploying a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), [Apache Flink](https://flink.apache.org/)). Pulsar Functions are computing infrastructure of Pulsar messaging system. 
The core goal is tied to a series of other goals: + +* Developer productivity (language-native vs Pulsar Functions SDK functions) +* Easy troubleshooting +* Operational simplicity (no need for an external processing system) + +## Inspirations +Pulsar Functions are inspired by (and take cues from) several systems and paradigms: + +* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) +* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) + +Pulsar Functions can be described as + +* [Lambda](https://aws.amazon.com/lambda/)-style functions that are +* specifically designed to use Pulsar as a message bus. + +## Programming model +Pulsar Functions provide a wide range of functionality, and the core programming model is simple. Functions receive messages from one or more **input [topics](reference-terminology.md#topic)**. Each time a message is received, the function will complete the following tasks. + + * Apply some processing logic to the input and write output to: + * An **output topic** in Pulsar + * [Apache BookKeeper](#state-storage) + * Write logs to a **log topic** (potentially for debugging purposes) + * Increment a [counter](#word-count-example) + +![Pulsar Functions core programming model](assets/pulsar-functions-overview.png) + +You can use Pulsar Functions to set up the following processing chain: + +* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous whitespace and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. 
+* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic. +* Finally, a Python function listens for the `results` topic and writes the results to a MySQL table. + + +### Word count example + +If you implement the classic word count example using Pulsar Functions, it looks something like this: + +![Pulsar Functions word count example](assets/pulsar-functions-word-count.png) + +To implement it in Java with the [Pulsar Functions SDK for Java](functions-develop.md#available-apis), write the function as follows. + +```java +package org.example.functions; + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +import java.util.Arrays; + +public class WordCountFunction implements Function<String, Void> { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } +} +``` + +Bundle and build the JAR file to be deployed, and then deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash +$ bin/pulsar-admin functions create \ + --jar target/my-jar-with-dependencies.jar \ + --classname org.example.functions.WordCountFunction \ + --tenant public \ + --namespace default \ + --name word-count \ + --inputs persistent://public/default/sentences \ + --output persistent://public/default/count +``` + +### Content-based routing example + +Pulsar Functions are used in many cases. The following is a more sophisticated example that involves content-based routing. 
+ +For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. Or, if an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop.md#logger). The following is a visual representation. + +![Pulsar Functions routing example](assets/pulsar-functions-routing-example.png) + +If you implement this routing functionality in Python, it looks something like this: + +```python +from pulsar import Function + +class RoutingFunction(Function): + def __init__(self): + self.fruits_topic = "persistent://public/default/fruits" + self.vegetables_topic = "persistent://public/default/vegetables" + + def is_fruit(self, item): + return item in [b"apple", b"orange", b"pear", b"other fruits..."] + + def is_vegetable(self, item): + return item in [b"carrot", b"lettuce", b"radish", b"other vegetables..."] + + def process(self, item, context): + if self.is_fruit(item): + context.publish(self.fruits_topic, item) + elif self.is_vegetable(item): + context.publish(self.vegetables_topic, item) + else: + warning = "The item {0} is neither a fruit nor a vegetable".format(item) + context.get_logger().warn(warning) +``` + +If this code is stored in `~/router.py`, then you can deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. + +```bash +$ bin/pulsar-admin functions create \ + --py ~/router.py \ + --classname router.RoutingFunction \ + --tenant public \ + --namespace default \ + --name route-fruit-veg \ + --inputs persistent://public/default/basket-items +``` + +### Functions, messages and message types +Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. However, in languages that support typed interfaces (Java), you can write typed Functions, and bind messages to types in the following ways. 
+* [Schema Registry](functions-develop.md#schema-registry) +* [SerDe](functions-develop.md#serde) + + +## Fully Qualified Function Name (FQFN) +Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function tenant, namespace, and function name. FQFN looks like this: + +```http +tenant/namespace/name +``` + +FQFNs enable you to create multiple functions with the same name provided that they are in different namespaces. + +## Supported languages +Currently, you can write Pulsar Functions in Java, Python, and Go. For details, refer to [Develop Pulsar Functions](functions-develop.md). + +## Processing guarantees +Pulsar Functions provide three different messaging semantics that you can apply to any function. + +Delivery semantics | Description +:------------------|:------- +**At-most-once** delivery | Each message sent to the function is processed at most once: it is either processed once or not processed at all (hence "at most"). +**At-least-once** delivery | Each message sent to the function is processed at least once, so it can be processed more than once (hence "at least"). +**Effectively-once** delivery | Each message sent to the function will have one output associated with it. + + +### Apply processing guarantees to a function +You can set the processing guarantees for a Pulsar Function when you create the Function. The following [`pulsar-admin functions create`](reference-pulsar-admin.md#create-1) command applies effectively-once guarantees to the Function. + +```bash +$ bin/pulsar-admin functions create \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs +``` + +The available options are: + +* `ATMOST_ONCE` +* `ATLEAST_ONCE` +* `EFFECTIVELY_ONCE` + +The following command runs a function in the cluster mode with effectively-once guarantees applied. 
+ +```bash +$ bin/pulsar-admin functions create \ + --name my-effectively-once-function \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs +``` + +> By default, Pulsar Functions provide at-least-once delivery guarantees. So if you create a function without supplying a value for the `--processing-guarantees` flag, the function provides at-least-once guarantees. + +### Update the processing guarantees of a function +You can change the processing guarantees applied to a function using the [`update`](reference-pulsar-admin.md#update-1) command. The following is an example. + +```bash +$ bin/pulsar-admin functions update \ + --processing-guarantees ATMOST_ONCE \ + # Other function configs +``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/functions-runtime.md b/site2/website/versioned_docs/version-2.5.0/functions-runtime.md new file mode 100644 index 0000000000000..bf83fef5926ae --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-runtime.md @@ -0,0 +1,173 @@ +--- +id: version-2.5.0-functions-runtime +title: Configure Functions runtime +sidebar_label: Setup: Configure Functions runtime +original_id: functions-runtime +--- + +Pulsar Functions support the following methods to run functions. + +- *Thread*: Invoke functions in threads in Functions Worker. +- *Process*: Invoke functions in processes forked by Functions Worker. +- *Kubernetes*: Submit functions as Kubernetes StatefulSets by Functions Worker. + +The differences between the thread and process modes are: +- Thread mode: when a function runs in thread mode, it runs in the same Java virtual machine (JVM) as the Functions worker. +- Process mode: when a function runs in process mode, it runs on the same machine that the Functions worker runs on. + +## Configure thread runtime +It is easy to configure *Thread* runtime. In most cases, you do not need to configure anything. 
You can customize the thread group name with the following settings: + +```yaml +threadContainerFactory: + threadGroupName: "Your Function Container Group" +``` + +*Thread* runtime is supported only for Java functions. + +## Configure process runtime +When you enable *Process* runtime, you usually do not need to configure anything. You can customize the following settings if needed. + +```yaml +processContainerFactory: + # the directory for storing the function logs + logDirectory: + # change the jar location only when you put the java instance jar in a different location + javaInstanceJarLocation: + # change the python instance location only when you put the python instance in a different location + pythonInstanceLocation: + # change the extra dependencies location: + extraFunctionDependenciesDir: +``` + +*Process* runtime is supported in Java, Python, and Go functions. + +## Configure Kubernetes runtime + +It is easy to configure Kubernetes runtime. You can just uncomment the settings of `kubernetesContainerFactory` in the `functions_worker.yml` file. The following is an example. + +```yaml +kubernetesContainerFactory: + # uri to kubernetes cluster, leave it empty and it will use the kubernetes settings in function worker + k8Uri: + # the kubernetes namespace to run the function instances. it is `default`, if this setting is left empty + jobNamespace: + # the docker image to run function instance. by default it is `apachepulsar/pulsar` + pulsarDockerImageName: + # the root directory of pulsar home directory in `pulsarDockerImageName`. by default it is `/pulsar`. + # if you are using your own built image in `pulsarDockerImageName`, you need to set this setting accordingly + pulsarRootDir: + # this setting only takes effect if `k8Uri` is set to null. if your function worker is running as a k8 pod, + # setting this to true lets the function worker submit functions to the same k8s cluster as function worker + # is running. set this to false if your function worker is not running as a k8 pod. 
+ submittingInsidePod: false + # setting the pulsar service url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar service url configured in worker service + pulsarServiceUrl: + # setting the pulsar admin url that pulsar function should use to connect to pulsar + # if it is not set, it will use the pulsar admin url configured in worker service + pulsarAdminUrl: + # the custom labels that function worker uses to select the nodes for pods + customLabels: + # the directory for dropping extra function dependencies + # if it is not an absolute path, it is relative to `pulsarRootDir` + extraFunctionDependenciesDir: + # Additional memory padding added on top of the memory requested by the function on a per instance basis + percentMemoryPadding: 10 +``` + +If you already run a Pulsar cluster on Kubernetes, you can keep the settings unchanged most of the time. + +However, if you enable RBAC when deploying your Pulsar cluster, make sure the service account you use for +running Functions Workers (or brokers, if Functions Workers run along with brokers) has permissions on the following +kubernetes APIs. + +- services +- configmaps +- pods +- apps.statefulsets + +Otherwise, you will not be able to create any functions. The following is an example error message. + +```bash +22:04:27.696 [Timer-0] ERROR org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory - Error while trying to fetch configmap example-pulsar-4qvmb5gur3c6fc9dih0x1xn8b-function-worker-config at namespace pulsar +io.kubernetes.client.ApiException: Forbidden + at io.kubernetes.client.ApiClient.handleResponse(ApiClient.java:882) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.ApiClient.execute(ApiClient.java:798) ~[io.kubernetes-client-java-2.0.0.jar:?] + at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMapWithHttpInfo(CoreV1Api.java:23673) ~[io.kubernetes-client-java-api-2.0.0.jar:?] 
+ at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMap(CoreV1Api.java:23655) ~[io.kubernetes-client-java-api-2.0.0.jar:?] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory.fetchConfigMap(KubernetesRuntimeFactory.java:284) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory$1.run(KubernetesRuntimeFactory.java:275) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] + at java.util.TimerThread.mainLoop(Timer.java:555) [?:1.8.0_212] + at java.util.TimerThread.run(Timer.java:505) [?:1.8.0_212] +``` +If this happens, you need to grant the required permissions to the service account used for running Functions Workers. The following example grants the service account `functions-worker` permissions to access the Kubernetes resources `services`, `configmaps`, `pods` and `apps.statefulsets`. + +```yaml +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: functions-worker +rules: +- apiGroups: [""] + resources: + - services + - configmaps + - pods + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: functions-worker +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: functions-worker +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: functions-worker +subjects: +- kind: ServiceAccount + name: functions-worker +``` + +### Kubernetes CustomRuntimeOptions + +The functions (and sinks/sources) API provides a flag, `customRuntimeOptions`, which can be used to pass options to the runtime to customize how the runtime operates. + +In the case of Kubernetes, this is passed to an instance of the `org.apache.pulsar.functions.runtime.kubernetes.KubernetesManifestCustomizer`. 
This interface can be overridden +and allows for a high degree of customization over how the K8S manifests are generated. The interface is injected by passing the class name to the `runtimeCustomizerClassName` in the `functions_worker.yml` file. + +To use the basic implementation, set `org.apache.pulsar.functions.runtime.kubernetes.BasicKubernetesManifestCustomizer` +for the `runtimeCustomizerClassName` property. This implementation takes the following `customRuntimeOptions`: +```Json +{ + "jobNamespace": "namespace", // the k8s namespace to run this function in + "extraLabels": { // extra labels to attach to the statefulSet, service, and pods + "extraLabel": "value" + }, + "extraAnnotations": { // extra annotations to attach to the statefulSet, service, and pods + "extraAnnotation": "value" + }, + "nodeSelectorLabels": { // node selector labels to add on to the pod spec + "customLabel": "value" + }, + "tolerations": [ // tolerations to add to the pod spec + { + "key": "custom-key", + "value": "value", + "effect": "NoSchedule" + } + ] +} +``` diff --git a/site2/website/versioned_docs/version-2.5.0/functions-worker.md b/site2/website/versioned_docs/version-2.5.0/functions-worker.md new file mode 100644 index 0000000000000..ff7ab9643648f --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/functions-worker.md @@ -0,0 +1,242 @@ +--- +id: version-2.5.0-functions-worker +title: Deploy and manage functions worker +sidebar_label: Setup: Pulsar Functions Worker +original_id: functions-worker +--- +Before using Pulsar Functions, you need to learn how to set up Pulsar Functions worker and how to [configure Functions runtime](functions-runtime.md). + +Pulsar `functions-worker` is a logical component to run Pulsar Functions in cluster mode. Two options are available, and you can select either of the two options based on your requirements. 
+- [run with brokers](#run-functions-worker-with-brokers) +- [run it separately](#run-functions-worker-separately) from brokers + +> Note +> The `--- Service Urls---` lines in the following diagrams represent Pulsar service URLs that Pulsar client and admin use to connect to a Pulsar cluster. + +## Run Functions-worker with brokers + +The following diagram illustrates the deployment of functions-workers running along with brokers. + +![assets/functions-worker-corun.png](assets/functions-worker-corun.png) + +To enable functions-worker running as part of a broker, you need to set `functionsWorkerEnabled` to `true` in the `broker.conf` file. + +```conf +functionsWorkerEnabled=true +``` + +When you set `functionsWorkerEnabled` to `true`, it means that you start functions-worker as part of a broker. You need to configure the `conf/functions_worker.yml` file to customize your functions-worker. + +Before you run Functions-worker with broker, you have to configure Functions-worker, and then start it with brokers. + +### Configure Functions-Worker to run with brokers +In this mode, since `functions-worker` is running as part of broker, most of the settings are already inherited from your broker configuration (for example, configurationStore settings, authentication settings, and so on). + +Pay attention to the following required settings when configuring functions-worker in this mode. + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`, which is good for standalone deployment. For production deployment, to ensure high availability, set it to be more than `2`. +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled on the BookKeeper cluster, configure the following BookKeeper authentication settings. + +- `bookkeeperClientAuthenticationPlugin`: the BookKeeper client authentication plugin name. 
+- `bookkeeperClientAuthenticationParametersName`: the BookKeeper client authentication plugin parameters name. +- `bookkeeperClientAuthenticationParameters`: the BookKeeper client authentication plugin parameters. + +### Start Functions-worker with broker + +Once you have configured the `functions_worker.yml` file, you can start or restart your broker. + +Then, you can use the following command to verify if `functions-worker` is running well. + +```bash +curl <broker-ip>:8080/admin/v2/worker/cluster +``` + +After entering the command above, a list of active function workers in the cluster is returned. The output is similar to the following. + +```json +[{"workerId":"<worker-id>","workerHostname":"<worker-hostname>","port":8080}] +``` + +## Run Functions-worker separately + +This section illustrates how to run `functions-worker` as a separate process on separate machines. + +![assets/functions-worker-separated.png](assets/functions-worker-separated.png) + +> Note +In this mode, make sure `functionsWorkerEnabled` is set to `false`, so you won't start `functions-worker` with brokers by mistake. + +### Configure Functions-worker to run separately + +To run functions-worker separately, you have to configure the following parameters. + +#### Worker parameters + +- `workerId`: The type is string. It is unique across clusters, used to identify a worker machine. +- `workerHostname`: The hostname of the worker machine. +- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. +- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. + +#### Function package parameter + +- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`. + +#### Function metadata parameter + +- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. +- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. 
+- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). + +If authentication is enabled for your broker cluster, you *should* configure the authentication plugin and parameters for the functions worker to communicate with the brokers. + +- `clientAuthenticationPlugin` +- `clientAuthenticationParameters` + +#### Security settings + +If you want to enable security on functions workers, you *should*: +- [Enable TLS transport encryption](#enable-tls-transport-encryption) +- [Enable Authentication Provider](#enable-authentication-provider) +- [Enable Authorization Provider](#enable-authorization-provider) + +##### Enable TLS transport encryption + +To enable TLS transport encryption, configure the following settings. + +``` +tlsEnabled: true +tlsCertificateFilePath: /path/to/functions-worker.cert.pem +tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem +tlsTrustCertsFilePath: /path/to/ca.cert.pem +``` + +For details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport.md). + +##### Enable Authentication Provider + +To enable authentication on Functions Worker, configure the following settings. +> Note +Substitute the *providers list* with the providers you want to enable. + +``` +authenticationEnabled: true +authenticationProviders: [ provider1, provider2 ] +``` + +For the *SASL Authentication* provider, add `saslJaasClientAllowedIds` and `saslJaasBrokerSectionName` +under `properties` if needed. + +``` +properties: + saslJaasClientAllowedIds: .*pulsar.* + saslJaasBrokerSectionName: Broker +``` + +For the *Token Authentication* provider, add the necessary settings under `properties` if needed. +See [Token Authentication](security-jwt.md) for more details. 
+``` +properties: + tokenSecretKey: file:///my/secret.key + # If using public/private keys + # tokenPublicKey: file:///path/to/public.key +``` + +##### Enable Authorization Provider + +To enable authorization on Functions Worker, you need to configure `authorizationEnabled` and `configurationStoreServers`. The authorization provider connects to `configurationStoreServers` to receive namespace policies. + +```yaml +authorizationEnabled: true +configurationStoreServers: <configuration-store-servers> +``` + +You should also configure a list of superuser roles. The superuser roles are able to access any admin API. The following is a configuration example. + +```yaml +superUserRoles: + - role1 + - role2 + - role3 +``` + +#### BookKeeper Authentication + +If authentication is enabled on the BookKeeper cluster, you should configure the BookKeeper authentication settings as follows: + +- `bookkeeperClientAuthenticationPlugin`: the plugin name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParametersName`: the plugin parameters name of BookKeeper client authentication. +- `bookkeeperClientAuthenticationParameters`: the plugin parameters of BookKeeper client authentication. + +### Start Functions-worker + +Once you have finished configuring the `functions_worker.yml` configuration file, you can use the following command to start a `functions-worker`: + +```bash +bin/pulsar functions-worker +``` + +### Configure Proxies for Functions-workers + +When you are running `functions-worker` in a separate cluster, the admin REST endpoints are split into two clusters. `functions`, `function-worker`, `source` and `sink` endpoints are now served +by the `functions-worker` cluster, while all the other remaining endpoints are served by the broker cluster. +Hence you need to configure your `pulsar-admin` to use the right service URL accordingly. + +In order to address this inconvenience, you can start a proxy cluster for routing the admin REST requests accordingly. 
Hence you will have one central entry point for your admin service. + +If you already have a proxy cluster, continue reading. If you haven't set up a proxy cluster before, you can follow the [instructions](http://pulsar.apache.org/docs/en/administration-proxy/) to +start proxies. + +![assets/functions-worker-separated-proxy.png](assets/functions-worker-separated-proxy.png) + +To enable routing functions-related admin requests to `functions-worker` in a proxy, you can edit the `proxy.conf` file to modify the following settings: + +```conf +functionWorkerWebServiceURL=<pulsar-functions-worker-web-service-url> +functionWorkerWebServiceURLTLS=<pulsar-functions-worker-web-service-url-tls> +``` + +## Compare the Run-with-Broker and Run-separately modes + +As described above, you can run functions-worker with brokers, or run it separately. It is more convenient to run functions-workers along with brokers. However, running functions-workers in a separate cluster provides better resource isolation for running functions in `Process` or `Thread` mode. + +To determine which mode to use for your case, refer to the following guidelines. + +Use the `Run-with-Broker` mode in the following cases: +- a) if resource isolation is not required when running functions in `Process` or `Thread` mode; +- b) if you configure the functions-worker to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). + +Use the `Run-separately` mode in the following cases: +- a) if you don't have a Kubernetes cluster; +- b) if you want to run functions and brokers separately. 
+ +## Troubleshooting + +**Error message: Namespace missing local cluster name in clusters list** + +``` +Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] +``` + +This error message appears when either of the following cases occurs: +- a) a broker is started with `functionsWorkerEnabled=true`, but the `pulsarFunctionsCluster` is not set to the correct cluster in the `conf/functions_worker.yml` file; +- b) setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. + +**Workaround** + +If any of these cases happens, follow the instructions below to fix the problem: + +1. Get the current clusters list of `public/functions` namespace. + +```bash +bin/pulsar-admin namespaces get-clusters public/functions +``` + +2. Check if the cluster is in the clusters list. If the cluster is not in the list, add it to the list and update the clusters list. + +```bash +bin/pulsar-admin namespaces set-clusters --clusters <existing-clusters>,<new-cluster> public/functions +``` + +3. Set the correct cluster name in `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file. 
diff --git a/site2/website/versioned_docs/version-2.5.0/getting-started-clients.md b/site2/website/versioned_docs/version-2.5.0/getting-started-clients.md new file mode 100644 index 0000000000000..ca1bf67ba2516 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/getting-started-clients.md @@ -0,0 +1,59 @@ +--- +id: version-2.5.0-client-libraries +title: Pulsar client libraries +sidebar_label: Use Pulsar with client libraries +original_id: client-libraries +--- + +Pulsar supports the following client libraries: + +- [Java client](#java-client) +- [Go client](#go-client) +- [Python client](#python-client) +- [C++ client](#c-client) + +## Java client + +For instructions on how to use the Pulsar Java client to produce and consume messages, see [Pulsar Java client](client-libraries-java.md). + +Two independent sets of Javadoc API docs are available. + +Library | Purpose +:-------|:------- +[`org.apache.pulsar.client.api`](/api/client) | The [Pulsar Java client](client-libraries-java.md) is used to produce and consume messages on Pulsar topics. +[`org.apache.pulsar.client.admin`](/api/admin) | The Java client for the [Pulsar admin interface](admin-api-overview.md). + + +## Go client + +For a tutorial on using the Pulsar Go client, see [Pulsar Go client](client-libraries-go.md). + + +## Python client + +For a tutorial on using the Pulsar Python client, see [Pulsar Python client](client-libraries-python.md). + +There are also [pdoc](https://github.com/BurntSushi/pdoc)-generated API docs for the Python client [here](/api/python). + +## C++ client + +For a tutorial on using the Pulsar C++ client, see [Pulsar C++ client](client-libraries-cpp.md). + +There are also [Doxygen](http://www.doxygen.nl/)-generated API docs for the C++ client [here](/api/cpp). + +## Feature Matrix +The Pulsar client feature matrix for different languages is listed on the [Client Features Matrix](https://github.com/apache/pulsar/wiki/Client-Features-Matrix) page. 
+ +## Thirdparty Clients + +Besides the officially released clients, there are also multiple projects developing Pulsar clients in different languages. + +> If you have developed a new Pulsar client, feel free to submit a pull request and add your client to the list below. + +| Language | Project | Maintainer | License | Description | +|----------|---------|------------|---------|-------------| +| Go | [pulsar-client-go](https://github.com/Comcast/pulsar-client-go) | [Comcast](https://github.com/Comcast) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | A native golang client | +| Go | [go-pulsar](https://github.com/t2y/go-pulsar) | [t2y](https://github.com/t2y) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | | +| Scala | [pulsar4s](https://github.com/sksamuel/pulsar4s) | [sksamuel](https://github.com/sksamuel) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Idiomatic, typesafe, and reactive Scala client for Apache Pulsar | +| Rust | [pulsar-rs](https://github.com/wyyerd/pulsar-rs) | [Wyyerd Group](https://github.com/wyyerd) | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | Future-based Rust bindings for Apache Pulsar | +| .NET | [pulsar-client-dotnet](https://github.com/fsharplang-ru/pulsar-client-dotnet) | [Lanayx](https://github.com/Lanayx) | [![GitHub](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/licenses/MIT) | Native .NET client for C#/F#/VB | diff --git a/site2/website/versioned_docs/version-2.5.0/getting-started-docker.md b/site2/website/versioned_docs/version-2.5.0/getting-started-docker.md new file mode 100644 index 0000000000000..36707efcee335 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/getting-started-docker.md @@ -0,0 
+1,161 @@ +--- +id: version-2.5.0-standalone-docker +title: Set up a standalone Pulsar in Docker +sidebar_label: Run Pulsar in Docker +original_id: standalone-docker +--- + +For local development and testing, you can run Pulsar in standalone +mode on your own machine within a Docker container. + +If you have not installed Docker, download the [Community edition](https://www.docker.com/community-edition) +and follow the instructions for your OS. + +## Start Pulsar in Docker + +* For MacOS, Linux, and Windows: + + ```shell + $ docker run -it \ + -p 6650:6650 \ + -p 8080:8080 \ + --mount source=pulsardata,target=/pulsar/data \ + --mount source=pulsarconf,target=/pulsar/conf \ + apachepulsar/pulsar:{{pulsar:version}} \ + bin/pulsar standalone + ``` + +A few things to note about this command: + * The data, metadata, and configuration are persisted on Docker volumes in order to not start "fresh" every +time the container is restarted. For details on the volumes, you can use `docker volume inspect <sourcename>`. + * For Docker on Windows, make sure to configure it to use Linux containers. + +If you start Pulsar successfully, you will see `INFO`-level log messages like this: + +``` +2017-08-09 22:34:04,030 - INFO - [main:WebService@213] - Web Service started at http://127.0.0.1:8080 +2017-08-09 22:34:04,038 - INFO - [main:PulsarService@335] - messaging service is ready, bootstrap service on port=8080, broker url=pulsar://127.0.0.1:6650, cluster=standalone, configs=org.apache.pulsar.broker.ServiceConfiguration@4db60246 +... +``` + +> #### Tip +> +> When you start a local standalone cluster, a `public/default` +namespace is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. +For more information, see [Topics](concepts-messaging.md#topics). 
+ +## Use Pulsar in Docker + +Pulsar offers client libraries for [Java](client-libraries-java.md), [Go](client-libraries-go.md), [Python](client-libraries-python.md) +and [C++](client-libraries-cpp.md). If you're running a local standalone cluster, you can +use one of these root URLs to interact with your cluster: + +* `pulsar://localhost:6650` +* `http://localhost:8080` + +The following example helps you get started with Pulsar quickly by using the [Python](client-libraries-python.md) +client API. + +Install the Pulsar Python client library directly from [PyPI](https://pypi.org/project/pulsar-client/): + +```shell +$ pip install pulsar-client +``` + +### Consume a message + +Create a consumer and subscribe to the topic: + +```python +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +consumer = client.subscribe('my-topic', + subscription_name='my-sub') + +while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + +client.close() +``` + +### Produce a message + +Now start a producer to send some test messages: + +```python +import pulsar + +client = pulsar.Client('pulsar://localhost:6650') +producer = client.create_producer('my-topic') + +for i in range(10): + producer.send(('hello-pulsar-%d' % i).encode('utf-8')) + +client.close() +``` + +## Get the topic statistics + +In Pulsar, you can use REST, Java, or command-line tools to control every aspect of the system. +For details on APIs, refer to [Admin API Overview](admin-api-overview.md). 
+ +In the simplest example, you can use curl to probe the stats for a particular topic: + +```shell +$ curl http://localhost:8080/admin/v2/persistent/public/default/my-topic/stats | python -m json.tool +``` + +The output is something like this: + +```json +{ + "averageMsgSize": 0.0, + "msgRateIn": 0.0, + "msgRateOut": 0.0, + "msgThroughputIn": 0.0, + "msgThroughputOut": 0.0, + "publishers": [ + { + "address": "/172.17.0.1:35048", + "averageMsgSize": 0.0, + "clientVersion": "1.19.0-incubating", + "connectedSince": "2017-08-09 20:59:34.621+0000", + "msgRateIn": 0.0, + "msgThroughputIn": 0.0, + "producerId": 0, + "producerName": "standalone-0-1" + } + ], + "replication": {}, + "storageSize": 16, + "subscriptions": { + "my-sub": { + "blockedSubscriptionOnUnackedMsgs": false, + "consumers": [ + { + "address": "/172.17.0.1:35064", + "availablePermits": 996, + "blockedConsumerOnUnackedMsgs": false, + "clientVersion": "1.19.0-incubating", + "connectedSince": "2017-08-09 21:05:39.222+0000", + "consumerName": "166111", + "msgRateOut": 0.0, + "msgRateRedeliver": 0.0, + "msgThroughputOut": 0.0, + "unackedMessages": 0 + } + ], + "msgBacklog": 0, + "msgRateExpired": 0.0, + "msgRateOut": 0.0, + "msgRateRedeliver": 0.0, + "msgThroughputOut": 0.0, + "type": "Exclusive", + "unackedMessages": 0 + } + } +} +``` diff --git a/site2/website/versioned_docs/version-2.5.0/getting-started-standalone.md b/site2/website/versioned_docs/version-2.5.0/getting-started-standalone.md new file mode 100644 index 0000000000000..4fa12bef4b700 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/getting-started-standalone.md @@ -0,0 +1,226 @@ +--- +id: version-2.5.0-standalone +title: Set up a standalone Pulsar locally +sidebar_label: Run Pulsar locally +original_id: standalone +--- + +For local development and testing, you can run Pulsar in standalone mode on your machine. 
The standalone mode includes a Pulsar broker, the necessary ZooKeeper and BookKeeper components running inside of a single Java Virtual Machine (JVM) process. + +> #### Pulsar in production? +> If you're looking to run a full production Pulsar installation, see the [Deploying a Pulsar instance](deploy-bare-metal.md) guide. + +## Install Pulsar standalone + +This tutorial guides you through every step of the installation process. + +### System requirements + +Pulsar is currently available for **MacOS** and **Linux**. To use Pulsar, you need to install Java 8 from [Oracle download center](http://www.oracle.com/). + +> #### Tip +> By default, Pulsar allocates 2G JVM heap memory to start. It can be changed in `conf/pulsar_env.sh` file under `PULSAR_MEM`. This is extra options passed into JVM. + +### Install Pulsar using binary release + +To get started with Pulsar, download a binary tarball release in one of the following ways: + +* download from the Apache mirror (Pulsar {{pulsar:version}} binary release) + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:binary_release_url + ``` + +After you download the tarball, untar it and use the `cd` command to navigate to the resulting directory: + +```bash +$ tar xvfz apache-pulsar-{{pulsar:version}}-bin.tar.gz +$ cd apache-pulsar-{{pulsar:version}} +``` + +#### What your package contains + +The Pulsar binary package initially contains the following directories: + +Directory | Contains +:---------|:-------- +`bin` | Pulsar's command-line tools, such as [`pulsar`](reference-cli-tools.md#pulsar) and [`pulsar-admin`](reference-pulsar-admin.md). +`conf` | Configuration files for Pulsar, including [broker configuration](reference-configuration.md#broker), [ZooKeeper configuration](reference-configuration.md#zookeeper), and more. 
+`examples` | A Java JAR file containing [Pulsar Functions](functions-overview.md) examples. +`lib` | The [JAR](https://en.wikipedia.org/wiki/JAR_(file_format)) files used by Pulsar. +`licenses` | License files, in the `.txt` form, for various components of the Pulsar [codebase](https://github.com/apache/pulsar). + +These directories are created once you begin running Pulsar. + +Directory | Contains +:---------|:-------- +`data` | The data storage directory used by ZooKeeper and BookKeeper. +`instances` | Artifacts created for [Pulsar Functions](functions-overview.md). +`logs` | Logs created by the installation. + +> #### Tip +> If you want to use builtin connectors and tiered storage offloaders, you can install them according to the following instructions: +> +> * [Install builtin connectors (optional)](#install-builtin-connectors-optional) +> * [Install tiered storage offloaders (optional)](#install-tiered-storage-offloaders-optional) +> +> Otherwise, skip this step and perform the next step [Start Pulsar standalone](#start-pulsar-standalone). Pulsar can be successfully installed without installing builtin connectors and tiered storage offloaders. + +### Install builtin connectors (optional) + +Since the `2.1.0-incubating` release, Pulsar releases a separate binary distribution, containing all the `builtin` connectors. +To enable those `builtin` connectors, you can download the connectors tarball release in one of the following ways: + +* download from the Apache mirror Pulsar IO Connectors {{pulsar:version}} release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:connector_release_url/{connector}-{{pulsar:version}}.nar + ``` + +After you download the nar file, copy the file to the `connectors` directory in the pulsar directory. 
+For example, if you download the `pulsar-io-aerospike-{{pulsar:version}}.nar` connector file, enter the following commands: + +```bash +$ mkdir connectors +$ mv pulsar-io-aerospike-{{pulsar:version}}.nar connectors + +$ ls connectors +pulsar-io-aerospike-{{pulsar:version}}.nar +... +``` + +> #### Note +> +> * If you are running Pulsar in a bare metal cluster, make sure `connectors` tarball is unzipped in every pulsar directory of the broker +> (or in every pulsar directory of function-worker if you are running a separate worker cluster for Pulsar Functions). +> +> * If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or [DCOS](deploy-dcos.md)), +> you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled [all builtin connectors](io-overview.md#working-with-connectors). + +### Install tiered storage offloaders (optional) + +> #### Tip +> +> Since `2.2.0` release, Pulsar releases a separate binary distribution, containing the tiered storage offloaders. +> To enable tiered storage feature, follow the instructions below; otherwise skip this section. 
+ +To get started with [tiered storage offloaders](concepts-tiered-storage.md), you need to download the offloaders tarball release on every broker node in one of the following ways: + +* download from the Apache mirror Pulsar Tiered Storage Offloaders {{pulsar:version}} release + +* download from the Pulsar [downloads page](pulsar:download_page_url) + +* download from the Pulsar [releases page](https://github.com/apache/pulsar/releases/latest) + +* use [wget](https://www.gnu.org/software/wget): + + ```shell + $ wget pulsar:offloader_release_url + ``` + +After you download the tarball, untar the offloaders package and copy the offloaders as `offloaders` +in the pulsar directory: + +```bash +$ tar xvfz apache-pulsar-offloaders-{{pulsar:version}}-bin.tar.gz + +// you will find a directory named `apache-pulsar-offloaders-{{pulsar:version}}` in the pulsar directory +// then copy the offloaders + +$ mv apache-pulsar-offloaders-{{pulsar:version}}/offloaders offloaders + +$ ls offloaders +tiered-storage-jcloud-{{pulsar:version}}.nar +``` + +For more information on how to configure tiered storage, see [Tiered storage cookbook](cookbooks-tiered-storage.md). + +> #### Note +> +> * If you are running Pulsar in a bare metal cluster, make sure that `offloaders` tarball is unzipped in every broker's pulsar directory. +> +> * If you are [running Pulsar in Docker](getting-started-docker.md) or deploying Pulsar using a docker image (e.g. [K8S](deploy-kubernetes.md) or [DCOS](deploy-dcos.md)), +> you can use the `apachepulsar/pulsar-all` image instead of the `apachepulsar/pulsar` image. `apachepulsar/pulsar-all` image has already bundled tiered storage offloaders. + +## Start Pulsar standalone + +Once you have an up-to-date local copy of the release, you can start a local cluster using the [`pulsar`](reference-cli-tools.md#pulsar) command, which is stored in the `bin` directory, and specifying that you want to start Pulsar in standalone mode. 
+ +```bash +$ bin/pulsar standalone +``` + +If you have started Pulsar successfully, you will see `INFO`-level log messages like this: + +```bash +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@95] - Configuration Store cache started +2017-06-01 14:46:29,192 - INFO - [main:AuthenticationService@61] - Authentication is disabled +2017-06-01 14:46:29,192 - INFO - [main:WebSocketService@108] - Pulsar WebSocket Service started +``` + +> #### Tip +> +> * The service is running on your terminal, which is under your direct control. If you need to run other commands, open a new terminal window. +You can also run the service as a background process using the `pulsar-daemon start standalone` command. For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). +> +> * By default, there is no encryption, authentication, or authorization configured. Apache Pulsar can be accessed from remote server without any authorization. Please do check [Security Overview](security-overview.md) document to secure your deployment. +> +> * When you start a local standalone cluster, a `public/default` [namespace](concepts-messaging.md#namespaces) is created automatically. The namespace is used for development purposes. All Pulsar topics are managed within namespaces. For more information, see [Topics](concepts-messaging.md#topics). + +## Use Pulsar standalone + +Pulsar provides a CLI tool called [`pulsar-client`](reference-cli-tools.md#pulsar-client). The pulsar-client tool enables you to consume and produce messages to a Pulsar topic in a running cluster. 
+ +### Consume a message + +The following command consumes a message from the `my-topic` topic with the subscription name `first-subscription`: + +```bash +$ bin/pulsar-client consume my-topic -s "first-subscription" +``` + +If the message has been successfully consumed, you will see a confirmation like the following in the `pulsar-client` logs: + +``` +09:56:55.566 [pulsar-client-io-1-1] INFO org.apache.pulsar.client.impl.MultiTopicsConsumerImpl - [TopicsConsumerFakeTopicNamee2df9] [first-subscription] Success subscribe new topic my-topic in topics consumer, partitions: 4, allTopicPartitionsNumber: 4 +``` + +> #### Tip +> +> As you may have noticed, we did not explicitly create the `my-topic` topic from which we consume the message. When you consume a message from a topic that does not yet exist, Pulsar creates that topic for you automatically. Producing a message to a topic that does not exist will automatically create that topic for you as well. + +### Produce a message + +The following command produces a message saying `hello-pulsar` to the `my-topic` topic: + +```bash +$ bin/pulsar-client produce my-topic --messages "hello-pulsar" +``` + +If the message has been successfully published to the topic, you will see a confirmation like the following in the `pulsar-client` logs: + +``` +13:09:39.356 [main] INFO org.apache.pulsar.client.cli.PulsarClientTool - 1 messages successfully produced +``` + +## Stop Pulsar standalone + +Press `Ctrl+C` to stop a local standalone Pulsar. + +> #### Tip +> +> If the service runs as a background process using the `pulsar-daemon start standalone` command, then use the `pulsar-daemon stop standalone` command to stop the service. +> +> For more information, see [pulsar-daemon](https://pulsar.apache.org/docs/en/reference-cli-tools/#pulsar-daemon). 
diff --git a/site2/website/versioned_docs/version-2.5.0/io-aerospike-sink.md b/site2/website/versioned_docs/version-2.5.0/io-aerospike-sink.md new file mode 100644 index 0000000000000..74aab5ded87a7 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-aerospike-sink.md @@ -0,0 +1,26 @@ +--- +id: version-2.5.0-io-aerospike-sink +title: Aerospike sink connector +sidebar_label: Aerospike sink connector +original_id: io-aerospike-sink +--- + +The Aerospike sink connector pulls messages from Pulsar topics to Aerospike clusters. + +## Configuration + +The configuration of the Aerospike sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `seedHosts` |String| true | No default value| The comma-separated list of one or more Aerospike cluster hosts.

    Each host can be specified as a valid IP address or hostname followed by an optional port number. | +| `keyspace` | String| true |No default value |The Aerospike namespace. | +| `columnName` | String | true| No default value|The Aerospike column name. | +|`userName`|String|false|NULL|The Aerospike username.| +|`password`|String|false|NULL|The Aerospike password.| +| `keySet` | String|false |NULL | The Aerospike set name. | +| `maxConcurrentRequests` |int| false | 100 | The maximum number of concurrent Aerospike transactions that a sink can open. | +| `timeoutMs` | int|false | 100 | This property controls `socketTimeout` and `totalTimeout` for Aerospike transactions. | +| `retries` | int|false | 1 |The maximum number of retries before aborting a write transaction to Aerospike. | diff --git a/site2/website/versioned_docs/version-2.5.0/io-canal-source.md b/site2/website/versioned_docs/version-2.5.0/io-canal-source.md new file mode 100644 index 0000000000000..16ca414ace824 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-canal-source.md @@ -0,0 +1,203 @@ +--- +id: version-2.5.0-io-canal-source +title: Canal source connector +sidebar_label: Canal source connector +original_id: io-canal-source +--- + +The Canal source connector pulls messages from MySQL to Pulsar topics. + +## Configuration + +The configuration of the Canal source connector has the following properties. + +### Property + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `username` | true | None | Canal server account (not MySQL).| +| `password` | true | None | Canal server password (not MySQL). | +|`destination`|true|None|Source destination that Canal source connector connects to. | +| `singleHostname` | false | None | Canal server address.| +| `singlePort` | false | None | Canal server port.| +| `cluster` | true | false | Whether to enable cluster mode based on Canal server configuration or not.

  • true: **cluster** mode.
    If set to true, it talks to `zkServers` to figure out the actual database host.

  • false: **standalone** mode.
    If set to false, it connects to the database specified by `singleHostname` and `singlePort`. | +| `zkServers` | true | None | Address and port of the ZooKeeper that Canal source connector talks to in order to figure out the actual database host.| +| `batchSize` | false | 1000 | Batch size to fetch from Canal. | + +### Example + +Before using the Canal connector, you can create a configuration file through one of the following methods. + +* JSON + + ```json + { + "zkServers": "127.0.0.1:2181", + "batchSize": "5120", + "destination": "example", + "username": "", + "password": "", + "cluster": false, + "singleHostname": "127.0.0.1", + "singlePort": "11111" + } + ``` + +* YAML + + You can create a YAML file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/resources/canal-mysql-source-config.yaml) below to your YAML file. + + ```yaml + configs: + zkServers: "127.0.0.1:2181" + batchSize: 5120 + destination: "example" + username: "" + password: "" + cluster: false + singleHostname: "127.0.0.1" + singlePort: 11111 + ``` + +## Usage + +Here is an example of storing MySQL data using the configuration file as above. + +1. Start a MySQL server. + + ```bash + $ docker pull mysql:5.7 + $ docker run -d -it --rm --name pulsar-mysql -p 3306:3306 -e MYSQL_ROOT_PASSWORD=canal -e MYSQL_USER=mysqluser -e MYSQL_PASSWORD=mysqlpw mysql:5.7 + ``` + +2. Create a configuration file `mysqld.cnf`. + + ```bash + [mysqld] + pid-file = /var/run/mysqld/mysqld.pid + socket = /var/run/mysqld/mysqld.sock + datadir = /var/lib/mysql + #log-error = /var/log/mysql/error.log + # By default we only accept connections from localhost + #bind-address = 127.0.0.1 + # Disabling symbolic-links is recommended to prevent assorted security risks + symbolic-links=0 + log-bin=mysql-bin + binlog-format=ROW + server_id=1 + ``` + +3. Copy the configuration file `mysqld.cnf` to MySQL server. + + ```bash + $ docker cp mysqld.cnf pulsar-mysql:/etc/mysql/mysql.conf.d/ + ``` + +4. 
Restart the MySQL server. + + ```bash + $ docker restart pulsar-mysql + ``` + +5. Create a test database in MySQL server. + + ```bash + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal -e 'create database test;' + ``` + +6. Start a Canal server and connect to MySQL server. + + ``` + $ docker pull canal/canal-server:v1.1.2 + $ docker run -d -it --link pulsar-mysql -e canal.auto.scan=false -e canal.destinations=test -e canal.instance.master.address=pulsar-mysql:3306 -e canal.instance.dbUsername=root -e canal.instance.dbPassword=canal -e canal.instance.connectionCharset=UTF-8 -e canal.instance.tsdb.enable=true -e canal.instance.gtidon=false --name=pulsar-canal-server -p 8000:8000 -p 2222:2222 -p 11111:11111 -p 11112:11112 -m 4096m canal/canal-server:v1.1.2 + ``` + +7. Start Pulsar standalone. + + ```bash + $ docker pull apachepulsar/pulsar:2.3.0 + $ docker run -d -it --link pulsar-canal-server -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:2.3.0 bin/pulsar standalone + ``` + +8. Modify the configuration file `canal-mysql-source-config.yaml`. + + ```yaml + configs: + zkServers: "" + batchSize: "5120" + destination: "test" + username: "" + password: "" + cluster: false + singleHostname: "pulsar-canal-server" + singlePort: "11111" + ``` + +9. Create a consumer file `pulsar-client.py`. + + ```python + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', + subscription_name='my-sub') + + while True: + msg = consumer.receive() + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + ``` + +10. Copy the configuration file `canal-mysql-source-config.yaml` and the consumer file `pulsar-client.py` to Pulsar server. + + ```bash + $ docker cp canal-mysql-source-config.yaml pulsar-standalone:/pulsar/conf/ + $ docker cp pulsar-client.py pulsar-standalone:/pulsar/ + ``` + +11. 
Download a Canal connector and start it. + + ```bash + $ docker exec -it pulsar-standalone /bin/bash + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.3.0/connectors/pulsar-io-canal-2.3.0.nar -P connectors + $ ./bin/pulsar-admin source localrun \ + --archive ./connectors/pulsar-io-canal-2.3.0.nar \ + --classname org.apache.pulsar.io.canal.CanalStringSource \ + --tenant public \ + --namespace default \ + --name canal \ + --destination-topic-name my-topic \ + --source-config-file /pulsar/conf/canal-mysql-source-config.yaml \ + --parallelism 1 + ``` + +12. Consume data from MySQL. + + ```bash + $ docker exec -it pulsar-standalone /bin/bash + $ python pulsar-client.py + ``` + +13. Open another window to log in MySQL server. + + ```bash + $ docker exec -it pulsar-mysql /bin/bash + $ mysql -h 127.0.0.1 -uroot -pcanal + ``` + +14. Create a table, and insert, delete, and update data in MySQL server. + + ```bash + mysql> use test; + mysql> show tables; + mysql> CREATE TABLE IF NOT EXISTS `test_table`(`test_id` INT UNSIGNED AUTO_INCREMENT,`test_title` VARCHAR(100) NOT NULL, + `test_author` VARCHAR(40) NOT NULL, + `test_date` DATE,PRIMARY KEY ( `test_id` ))ENGINE=InnoDB DEFAULT CHARSET=utf8; + mysql> INSERT INTO test_table (test_title, test_author, test_date) VALUES("a", "b", NOW()); + mysql> UPDATE test_table SET test_title='c' WHERE test_title='a'; + mysql> DELETE FROM test_table WHERE test_title='c'; + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-cassandra-sink.md b/site2/website/versioned_docs/version-2.5.0/io-cassandra-sink.md new file mode 100644 index 0000000000000..bd6de75c7ce04 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-cassandra-sink.md @@ -0,0 +1,54 @@ +--- +id: version-2.5.0-io-cassandra-sink +title: Cassandra sink connector +sidebar_label: Cassandra sink connector +original_id: io-cassandra-sink +--- + +The Cassandra sink connector pulls messages from Pulsar topics to Cassandra clusters. 
+ +## Configuration + +The configuration of the Cassandra sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description | +|------|----------|----------|---------|-------------| +| `roots` | String|true | " " (empty string) | A comma-separated list of Cassandra hosts to connect to.| +| `keyspace` | String|true| " " (empty string)| The keyspace used for writing Pulsar messages.

    **Note: `keyspace` should be created prior to a Cassandra sink.**| +| `keyname` | String|true| " " (empty string)| The key name of the Cassandra column family.

    The column is used for storing Pulsar message keys.

    If a Pulsar message doesn't have any key associated, the message value is used as the key. | +| `columnFamily` | String|true| " " (empty string)| The Cassandra column family name.

    **Note: `columnFamily` should be created prior to a Cassandra sink.**| +| `columnName` | String|true| " " (empty string) | The column name of the Cassandra column family.

    The column is used for storing Pulsar message values. | + +### Example + +Before using the Cassandra sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + ``` + +* YAML + + ``` + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + ``` + + +## Usage + +For more information about **how to connect Pulsar with Cassandra**, see [here](io-quickstart.md#connect-pulsar-to-apache-cassandra). diff --git a/site2/website/versioned_docs/version-2.5.0/io-cdc-debezium.md b/site2/website/versioned_docs/version-2.5.0/io-cdc-debezium.md new file mode 100644 index 0000000000000..25effc718357e --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-cdc-debezium.md @@ -0,0 +1,475 @@ +--- +id: version-2.5.0-io-cdc-debezium +title: Debezium source connector +sidebar_label: Debezium source connector +original_id: io-cdc-debezium +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL +and persists the messages to Pulsar topics. + +## Configuration + +The configuration of Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. 
| +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. | +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server that are monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `pulsar.service.url` | true | null | Pulsar cluster service URL. | +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | +| `mongodb.hosts` | true | null | The comma-separated list of hostname and port pairs (in the form 'host' or 'host:port') of the MongoDB servers in the replica set. The list can contain a single hostname and port pair. If `mongodb.members.auto.discover` is set to false, the host and port pair should be prefixed with the replica set name (e.g., rs0/localhost:27017). | +| `mongodb.name` | true | null | A unique name that identifies the connector and/or MongoDB replica set or sharded cluster that this connector monitors. Each server should be monitored by at most one Debezium connector, since this server name prefixes all persisted Kafka topics emanating from the MongoDB replica set or cluster. | +| `mongodb.user` | true | null | Name of the database user to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.password` | true | null | Password to be used when connecting to MongoDB. This is required only when MongoDB is configured to use authentication. | +| `mongodb.task.id` | true | null | The taskId of the MongoDB connector that attempts to use a separate task for each replica set. | + + + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. 
+ +* JSON + + ```json + { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" + } + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. + + ```yaml + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: 
"offset-topic" + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + $ bin/pulsar standalone + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar`. + + ```bash + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. 
+ + ```bash + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + +5. Start a MySQL client in docker. + + ```bash + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. + +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "postgres", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "schema.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. 
+ + ```yaml + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar" + parallelism: 1 + + configs: + + ## config for pg, docker image: debezium/example-postgress:0.8 + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "postgres" + database.dbname: "postgres" + database.server.name: "dbserver1" + schema.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + ``` + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. + + ```bash + $ docker pull debezium/example-postgres:0.8 + $ docker run -d -it --rm --name pulsar-postgresql -p 5432:5432 debezium/example-postgres:0.8 + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + $ bin/pulsar standalone + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar`. + + ```bash + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "postgres","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + ``` + + * Use the **YAML** configuration file as shown previously. 
+ + ```bash + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + $ docker exec -it pulsar-postgresql /bin/bash + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + psql -U postgres postgres + postgres=# \c postgres; + You are now connected to database "postgres" as user "postgres". + postgres=# SET search_path TO inventory; + SET + postgres=# select * from products; + id | name | description | weight + -----+--------------------+---------------------------------------------------------+-------- + 102 | car battery | 12V car battery | 8.1 + 103 | 12-pack drill bits | 12-pack of drill bits with sizes ranging from #40 to #3 | 0.8 + 104 | hammer | 12oz carpenter's hammer | 0.75 + 105 | hammer | 14oz carpenter's hammer | 0.875 + 106 | hammer | 16oz carpenter's hammer | 1 + 107 | rocks | box of assorted rocks | 5.3 + 108 | jacket | water resistent black wind breaker | 0.1 + 109 | spare tire | 24 inch spare tire | 22.2 + 101 | 1111111111 | Small 2-wheel scooter | 3.14 + (9 rows) + + postgres=# UPDATE products SET name='1111111111' WHERE id=107; + UPDATE 1 + ``` + + In the terminal window of subscribing topic, you can receive the following messages. 
+ + ```bash + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":107}}�{"schema":{"type":"struct","fields":[{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"before"},{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"after"},{"type":"struct","fields":[{"type":"string","optional":true,"field":"version"},{"type":"string","optional":true,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":false,"field":"db"},{"type":"int64","optional":true,"field":"ts_usec"},{"type":"int64","optional":true,"field":"txId"},{"type":"int64","optional":true,"field":"lsn"},{"type":"string","optional":true,"field":"schema"},{"type":"string","optional":true,"field":"table"},{"type":"boolean","optional":true,"default":false,"field":"snapshot"},{"type":"boolean","optional":true,"field":"last_snapshot_record"}],"optional":false,"name":"io.debezium.connector.postgresql.Source","field":"source"},{"type":"string","optional":false,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"before":{"id":107,"name":"rocks","description":"box of assorted rocks","weight":5.3},"after":{"id":107,"name":"1111111111","description":"box of assorted 
rocks","weight":5.3},"source":{"version":"0.9.2.Final","connector":"postgresql","name":"dbserver1","db":"postgres","ts_usec":1559208957661080,"txId":577,"lsn":23862872,"schema":"inventory","table":"products","snapshot":false,"last_snapshot_record":null},"op":"u","ts_ms":1559208957692}} + ``` +## Example of MongoDB + +You need to create a configuration file before using the Pulsar Debezium connector. + +* JSON + + ```json + { + "mongodb.hosts": "rs0/mongodb:27017", + "mongodb.name": "dbserver1", + "mongodb.user": "debezium", + "mongodb.password": "dbz", + "mongodb.task.id": "1", + "database.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + ``` + +* YAML + + You can create a `debezium-mongodb-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mongodb/src/main/resources/debezium-mongodb-source-config.yaml) below to the `debezium-mongodb-source-config.yaml` file. + + ```yaml + tenant: "public" + namespace: "default" + name: "debezium-mongodb-source" + topicName: "debezium-mongodb-topic" + archive: "connectors/pulsar-io-debezium-mongodb-{{pulsar:version}}.nar" + parallelism: 1 + + configs: + + ## config for mongodb, docker image: debezium/example-mongodb:0.10 + mongodb.hosts: "rs0/mongodb:27017" + mongodb.name: "dbserver1" + mongodb.user: "debezium" + mongodb.password: "dbz" + mongodb.task.id: "1" + database.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + ``` + +### Usage + +This example shows how to change the data of a MongoDB collection using the Pulsar Debezium connector. + + +1. Start a MongoDB server with a database from which Debezium can capture changes. + + ```bash + $ docker pull debezium/example-mongodb:0.10 + $ docker run -d -it --rm --name pulsar-mongodb -e MONGODB_USER=mongodb -e MONGODB_PASSWORD=mongodb -p 27017:27017 debezium/example-mongodb:0.10 + ``` + Use the following commands to initialize the data.
+ + ``` bash + ./usr/local/bin/init-inventory.sh + ``` + If the local host cannot access the container network, you can update the file ```/etc/hosts``` and add a rule that maps ```127.0.0.1``` to the container ID, for example ```127.0.0.1 6f114527a95f```. Here ```6f114527a95f``` is the container ID, which you can get by running ```docker ps -a```. + + +2. Start a Pulsar service locally in standalone mode. + + ```bash + $ bin/pulsar standalone + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mongodb-{{pulsar:version}}.nar`. + + ```bash + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mongodb-{{pulsar:version}}.nar \ + --name debezium-mongodb-source \ + --destination-topic-name debezium-mongodb-topic \ + --tenant public \ + --namespace default \ + --source-config '{"mongodb.hosts": "rs0/mongodb:27017","mongodb.name": "dbserver1","mongodb.user": "debezium","mongodb.password": "dbz","mongodb.task.id": "1","database.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mongodb-source-config.yaml + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + +5. Start a MongoDB client in docker. + + ```bash + $ docker exec -it pulsar-mongodb /bin/bash + ``` + +6. A MongoDB client pops out. + + ```bash + mongo -u debezium -p dbz --authenticationDatabase admin localhost:27017/inventory + db.products.update({"_id":NumberLong(104)},{$set:{weight:1.25}}) + ``` + + In the terminal window of subscribing topic, you can receive the following messages.
+ + ```bash + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":"104"}}, value = {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"int64","optional":true,"field":"h"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"after":"{\"_id\": {\"$numberLong\": \"104\"},\"name\": \"hammer\",\"description\": \"12oz carpenter's hammer\",\"weight\": 1.25,\"quantity\": 4}","patch":null,"source":{"version":"0.10.0.Final","connector":"mongodb","name":"dbserver1","ts_ms":1573541905000,"snapshot":"true","db":"inventory","rs":"rs0","collection":"products","ord":1,"h":4983083486544392763},"op":"r","ts_ms":1573541909761}}. 
+ ``` + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$$Lambda$203/385424085.accept(Unknown Source) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at 
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) +``` + +If you encounter the above problems in synchronizing data, please refer to [this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + +```$xslt +max.queue.size= +``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/io-cdc.md b/site2/website/versioned_docs/version-2.5.0/io-cdc.md new file mode 100644 index 0000000000000..05250e5fa7ef4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-cdc.md @@ -0,0 +1,26 @@ +--- +id: version-2.5.0-io-cdc +title: CDC connector +sidebar_label: CDC connector +original_id: io-cdc +--- + +CDC source connectors capture log changes of databases (such as MySQL, MongoDB, and PostgreSQL) into Pulsar. + +> CDC source connectors are built on top of [Canal](https://github.com/alibaba/canal) and [Debezium](https://debezium.io/) and store all data into Pulsar cluster in a persistent, replicated, and partitioned way. + +Currently, Pulsar has the following CDC connectors. 
+ +Name|Java Class +|---|--- +[Canal source connector](io-canal-source.md)|[org.apache.pulsar.io.canal.CanalStringSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) +[Debezium source connector](io-cdc-debezium.md)|
  • [org.apache.pulsar.io.debezium.DebeziumSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/core/src/main/java/org/apache/pulsar/io/debezium/DebeziumSource.java)
  • [org.apache.pulsar.io.debezium.mysql.DebeziumMysqlSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java)
  • [org.apache.pulsar.io.debezium.postgres.DebeziumPostgresSource.java](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java) + +For more information about Canal and Debezium, see the information below. + +Subject | Reference +|---|--- +How to use Canal source connector with MySQL|[Canal guide](https://github.com/alibaba/canal/wiki) +How does Canal work | [Canal tutorial](https://github.com/alibaba/canal/wiki) +How to use Debezium source connector with MySQL | [Debezium guide](https://debezium.io/docs/connectors/mysql/) +How does Debezium work | [Debezium tutorial](https://debezium.io/docs/tutorial/) diff --git a/site2/website/versioned_docs/version-2.5.0/io-cli.md b/site2/website/versioned_docs/version-2.5.0/io-cli.md new file mode 100644 index 0000000000000..11be9ea8c630f --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-cli.md @@ -0,0 +1,601 @@ +--- +id: version-2.5.0-io-cli +title: Connector Admin CLI +sidebar_label: CLI +original_id: io-cli +--- + +The `pulsar-admin` tool helps you manage Pulsar connectors. + +## `sources` + +An interface for managing Pulsar IO sources (ingress data into Pulsar). + +```bash +$ pulsar-admin sources subcommands +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sources` + +* `reload` + + +### `create` + +Submit a Pulsar IO source connector to run in a Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sources create options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the source.
    Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. + +### `update` + +Update an already submitted Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources update options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--classname` | The source's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per source instance (applicable only to Docker runtime). +| `--deserialization-classname` | The SerDe classname for the source. +| `--destination-topic-name` | The Pulsar topic to which data is sent. +| `--disk` | The disk (in bytes) that needs to be allocated per source instance (applicable only to Docker runtime). +|`--name` | The source's name. +| `--namespace` | The source's namespace. +| ` --parallelism` | The source's parallelism factor, that is, the number of source instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the source.
    Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per source instance (applicable only to the process and Docker runtimes). +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +| `--source-config` | Source config key/values. +| `--source-config-file` | The path to a YAML config file specifying the source's configuration. +| `-t`, `--source-type` | The source's connector provider. +| `--tenant` | The source's tenant. +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + + +### `delete` + +Delete a Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources delete options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `get` + +Get the information about a Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources get options +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `status` + +Check the current status of a Pulsar IO source connector. + +#### Usage + +```bash +$ pulsar-admin sources status options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `list` + +List all running Pulsar IO source connectors. + +#### Usage + +```bash +$ pulsar-admin sources list options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `stop` + +Stop a source instance. + +#### Usage + +```bash +$ pulsar-admin sources stop options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + +### `start` + +Start a source instance. + +#### Usage + +```bash +$ pulsar-admin sources start options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `restart` + +Restart a source instance. + +#### Usage + +```bash +$ pulsar-admin sources restart options +``` + +#### Options +|Flag|Description| +|---|---| +|`--instance-id`|The source instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The source's name.| +|`--namespace`|The source's namespace.| +|`--tenant`|The source's tenant.| + + +### `localrun` + +Run a Pulsar IO source connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sources localrun options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the NAR archive for the Source.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The source's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--deserialization-classname`|The SerDe classname for the source. +|`--destination-topic-name`|The Pulsar topic to which data is sent. +|`--disk`|The disk (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +|`--name`|The source’s name.| +|`--namespace`|The source’s namespace.| +|`--parallelism`|The source’s parallelism factor, that is, the number of source instances to run.| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the source.
    Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per source instance (applicable only to the Docker runtime).| +| `-st`, `--schema-type` | The schema type.
    Either a builtin schema (for example, AVRO and JSON) or custom schema class name to be used to encode messages emitted from source. +|`--source-config`|Source config key/values. +|`--source-config-file`|The path to a YAML config file specifying the source’s configuration. +|`--source-type`|The source's connector provider. +|`--tenant`|The source’s tenant. +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +|`--use-tls`|Use tls connection.
    **Default value: false**. + +### `available-sources` + +Get the list of Pulsar IO connector sources supported by Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sources available-sources +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash +$ pulsar-admin sources reload +``` + +## `sinks` + +An interface for managing Pulsar IO sinks (egress data from Pulsar). + +```bash +$ pulsar-admin sinks subcommands +``` + +Subcommands are: + +* `create` + +* `update` + +* `delete` + +* `get` + +* `status` + +* `list` + +* `stop` + +* `start` + +* `restart` + +* `localrun` + +* `available-sinks` + +* `reload` + + +### `create` + +Submit a Pulsar IO sink connector to run in a Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sinks create options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the sink.
    Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). + +### `update` + +Update a Pulsar IO sink connector. + +#### Usage + +```bash +$ pulsar-admin sinks update options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--classname` | The sink's class name if `archive` is file-url-path (file://). +| `--cpu` | The CPU (in cores) that needs to be allocated per sink instance (applicable only to Docker runtime). +| `--custom-schema-inputs` | The map of input topics to schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +| `--disk` | The disk (in bytes) that needs to be allocated per sink instance (applicable only to Docker runtime). +|`-i, --inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name` | The sink's name. +| `--namespace` | The sink's namespace. +| ` --parallelism` | The sink's parallelism factor, that is, the number of sink instances to run. +| `--processing-guarantees` | The processing guarantees (aka delivery semantics) applied to the sink.
    Possible Values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +| `--ram` | The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the process and Docker runtimes). +| `--retain-ordering` | Sink consumes and sinks messages in order. +| `--sink-config` | sink config key/values. +| `--sink-config-file` | The path to a YAML config file specifying the sink's configuration. +| `-t`, `--sink-type` | The sink's connector provider. +| `--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +| `--tenant` | The sink's tenant. +| `--timeout-ms` | The message timeout in milliseconds. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +| `--update-auth-data` | Whether or not to update the auth data.
    **Default value: false.** + +### `delete` + +Delete a Pulsar IO sink connector. + +#### Usage + +```bash +$ pulsar-admin sinks delete options +``` + +#### Option + +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `get` + +Get the information about a Pulsar IO sink connector. + +#### Usage + +```bash +$ pulsar-admin sinks get options +``` + +#### Options +|Flag|Description| +|---|---| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `status` + +Check the current status of a Pulsar sink. + +#### Usage + +```bash +$ pulsar-admin sinks status options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink ID.
    If `instance-id` is not provided, Pulsar gets status of all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `list` + +List all running Pulsar IO sink connectors. + +#### Usage + +```bash +$ pulsar-admin sinks list options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `stop` + +Stop a sink instance. + +#### Usage + +```bash +$ pulsar-admin sinks stop options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar stops all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + +### `start` + +Start a sink instance. + +#### Usage + +```bash +$ pulsar-admin sinks start options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar starts all instances.| +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `restart` + +Restart a sink instance. + +#### Usage + +```bash +$ pulsar-admin sinks restart options +``` + +#### Options + +|Flag|Description| +|---|---| +|`--instance-id`|The sink instanceID.
    If `instance-id` is not provided, Pulsar restarts all instances. +|`--name`|The sink's name.| +|`--namespace`|The sink's namespace.| +|`--tenant`|The sink's tenant.| + + +### `localrun` + +Run a Pulsar IO sink connector locally rather than deploying it to the Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sinks localrun options +``` + +#### Options + +|Flag|Description| +|----|---| +| `-a`, `--archive` | The path to the archive file for the sink.
    It also supports url-path (http/https/file [file protocol assumes that file already exists on worker host]) from which worker can download the package. +| `--auto-ack` | Whether or not the framework will automatically acknowledge messages. +| `--broker-service-url` | The URL for the Pulsar broker. +|`--classname`|The sink's class name if `archive` is file-url-path (file://). +| `--client-auth-params` | Client authentication parameter. +| `--client-auth-plugin` | Client authentication plugin using which function-process can connect to broker. +|`--cpu`|The CPU (in cores) that needs to be allocated per sink instance (applicable only to the Docker runtime). +| `--custom-schema-inputs` | The map of input topics to Schema types or class names (as a JSON string). +| `--custom-serde-inputs` | The map of input topics to SerDe class names (as a JSON string). +|`--disk`|The disk (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--hostname-verification-enabled`|Enable hostname verification.
    **Default value: false**. +| `-i`, `--inputs` | The sink's input topic or topics (multiple topics can be specified as a comma-separated list). +|`--name`|The sink’s name.| +|`--namespace`|The sink’s namespace.| +|`--parallelism`|The sink’s parallelism factor, that is, the number of sink instances to run.| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the sink.
    Available values: ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE. +|`--ram`|The RAM (in bytes) that needs to be allocated per sink instance (applicable only to the Docker runtime).| +|`--retain-ordering` | Sink consumes and sinks messages in order. +|`--sink-config`|sink config key/values. +|`--sink-config-file`|The path to a YAML config file specifying the sink’s configuration. +|`--sink-type`|The sink's connector provider. +|`--subs-name` | Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer. +|`--tenant`|The sink’s tenant. +| `--timeout-ms` | The message timeout in milliseconds. +|`--tls-allow-insecure`|Allow insecure tls connection.
    **Default value: false**. +|`--tls-trust-cert-path`|The tls trust cert file path. +| `--topics-pattern` | TopicsPattern to consume from list of topics under a namespace that match the pattern.
    `--inputs` and `--topics-pattern` are mutually exclusive.
    Add SerDe class name for a pattern in `--custom-serde-inputs` (supported for Java functions only). +|`--use-tls`|Use tls connection.
    **Default value: false**. + +### `available-sinks` + +Get the list of Pulsar IO connector sinks supported by Pulsar cluster. + +#### Usage + +```bash +$ pulsar-admin sinks available-sinks +``` + +### `reload` + +Reload the available built-in connectors. + +#### Usage + +```bash +$ pulsar-admin sinks reload +``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-connectors.md b/site2/website/versioned_docs/version-2.5.0/io-connectors.md new file mode 100644 index 0000000000000..289a55becf7b9 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-connectors.md @@ -0,0 +1,189 @@ +--- +id: version-2.5.0-io-connectors +title: Built-in connector +sidebar_label: Built-in connector +original_id: io-connectors +--- + +Pulsar distribution includes a set of common connectors that have been packaged and tested with the rest of Apache Pulsar. These connectors import and export data from some of the most commonly used data systems. + +Using any of these connectors is as easy as writing a simple connector and running the connector locally or submitting the connector to a Pulsar Functions cluster. + +## Source connector + +Pulsar has various source connectors, which are sorted alphabetically as below. 
+ +### Canal + +* [Configuration](io-canal-source.md#configuration) + +* [Example](io-canal-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/canal/src/main/java/org/apache/pulsar/io/canal/CanalStringSource.java) + + +### Debezium MySQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-mysql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/java/org/apache/pulsar/io/debezium/mysql/DebeziumMysqlSource.java) + +### Debezium PostgreSQL + +* [Configuration](io-debezium-source.md#configuration) + +* [Example](io-debezium-source.md#example-of-postgresql) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/java/org/apache/pulsar/io/debezium/postgres/DebeziumPostgresSource.java) + + +### File + +* [Configuration](io-file-source.md#configuration) + +* [Example](io-file-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/file/src/main/java/org/apache/pulsar/io/file/FileSource.java) + +### Flume + +* [Configuration](io-flume-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/FlumeConnector.java) + +### Twitter firehose + +* [Configuration](io-twitter-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/twitter/src/main/java/org/apache/pulsar/io/twitter/TwitterFireHose.java) + +### Kafka + +* [Configuration](io-kafka-source.md#configuration) + +* [Example](io-kafka-source.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java) + +### Kinesis + +* [Configuration](io-kinesis-source.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSource.java) + +### Netty + +* [Configuration](io-netty-source.md#configuration) + +* [Example of TCP](io-netty-source.md#tcp) + +* [Example of HTTP](io-netty-source.md#http) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/netty/src/main/java/org/apache/pulsar/io/netty/NettySource.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-source.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSource.java) + +## Sink connector + +Pulsar has various sink connectors, which are sorted alphabetically as below. + +### Aerospike + +* [Configuration](io-aerospike-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/aerospike/src/main/java/org/apache/pulsar/io/aerospike/AerospikeStringSink.java) + +### Cassandra + +* [Configuration](io-cassandra-sink.md#configuration) + +* [Example](io-cassandra-sink.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/cassandra/src/main/java/org/apache/pulsar/io/cassandra/CassandraStringSink.java) + +### ElasticSearch + +* [Configuration](io-elasticsearch-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/elastic-search/src/main/java/org/apache/pulsar/io/elasticsearch/ElasticSearchSink.java) + +### Flume + +* [Configuration](io-flume-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/java/org/apache/pulsar/io/flume/sink/StringSink.java) + +### HBase + +* [Configuration](io-hbase-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hbase/src/main/java/org/apache/pulsar/io/hbase/HbaseAbstractConfig.java) + +### HDFS2 + +* [Configuration](io-hdfs2-sink.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs2/src/main/java/org/apache/pulsar/io/hdfs2/AbstractHdfsConnector.java) + +### HDFS3 + +* [Configuration](io-hdfs3-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/hdfs3/src/main/java/org/apache/pulsar/io/hdfs3/AbstractHdfsConnector.java) + +### InfluxDB + +* [Configuration](io-influxdb-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/influxdb/src/main/java/org/apache/pulsar/io/influxdb/InfluxDBAbstractSink.java) + +### JDBC + +* [Configuration](io-jdbc-sink.md#configuration) + +* [Example](io-jdbc-sink.md#usage) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/jdbc/src/main/java/org/apache/pulsar/io/jdbc/JdbcAbstractSink.java) + +### Kafka + +* [Configuration](io-kafka-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java) + +### Kinesis + +* [Configuration](io-kinesis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/KinesisSink.java) + +### MongoDB + +* [Configuration](io-mongo-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/mongo/src/main/java/org/apache/pulsar/io/mongodb/MongoSink.java) + +### RabbitMQ + +* [Configuration](io-rabbitmq-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/rabbitmq/src/main/java/org/apache/pulsar/io/rabbitmq/RabbitMQSink.java) + +### Redis + +* [Configuration](io-redis-sink.md#configuration) + +* [Java class](https://github.com/apache/pulsar/blob/master/pulsar-io/redis/src/main/java/org/apache/pulsar/io/redis/RedisAbstractConfig.java) + +### Solr + +* [Configuration](io-solr-sink.md#configuration) + +* [Java 
class](https://github.com/apache/pulsar/blob/master/pulsar-io/solr/src/main/java/org/apache/pulsar/io/solr/SolrSinkConfig.java) + diff --git a/site2/website/versioned_docs/version-2.5.0/io-debezium-source.md b/site2/website/versioned_docs/version-2.5.0/io-debezium-source.md new file mode 100644 index 0000000000000..e27f765cbdf7a --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-debezium-source.md @@ -0,0 +1,350 @@ +--- +id: version-2.5.0-io-debezium-source +title: Debezium source connector +sidebar_label: Debezium source connector +original_id: io-debezium-source +--- + +The Debezium source connector pulls messages from MySQL or PostgreSQL to Pulsar topics. + +## Configuration + +The configuration of the Debezium source connector has the following properties. + +| Name | Required | Default | Description | +|------|----------|---------|-------------| +| `task.class` | true | null | A source task class that is implemented in Debezium. | +| `database.hostname` | true | null | The address of a database server. | +| `database.port` | true | null | The port number of a database server.| +| `database.user` | true | null | The name of a database user that has the required privileges. | +| `database.password` | true | null | The password for a database user that has the required privileges. | +| `database.server.id` | true | null | The connector’s identifier that must be unique within a database cluster and similar to the database’s server-id configuration property. | +| `database.server.name` | true | null | The logical name of a database server/cluster, which forms a namespace and it is used in all the names of Kafka topics to which the connector writes, the Kafka Connect schema names, and the namespaces of the corresponding Avro schema when the Avro Connector is used. | +| `database.whitelist` | false | null | A list of all databases hosted by this server which are monitored by the connector.

    This is optional, and there are other properties for listing databases and tables to include or exclude from monitoring. | +| `key.converter` | true | null | The converter provided by Kafka Connect to convert record key. | +| `value.converter` | true | null | The converter provided by Kafka Connect to convert record value. | +| `database.history` | true | null | The name of the database history class. | +| `database.history.pulsar.topic` | true | null | The name of the database history topic where the connector writes and recovers DDL statements.

    **Note: this topic is for internal use only and should not be used by consumers.** | +| `database.history.pulsar.service.url` | true | null | Pulsar cluster service URL for history topic. | +| `pulsar.service.url` | true | null | Pulsar cluster service URL. | +| `offset.storage.topic` | true | null | Record the last committed offsets that the connector successfully completes. | + +## Example of MySQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + { + "database.hostname": "localhost", + "database.port": "3306", + "database.user": "debezium", + "database.password": "dbz", + "database.server.id": "184054", + "database.server.name": "dbserver1", + "database.whitelist": "inventory", + "database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory", + "database.history.pulsar.topic": "history-topic", + "database.history.pulsar.service.url": "pulsar://127.0.0.1:6650", + "key.converter": "org.apache.kafka.connect.json.JsonConverter", + "value.converter": "org.apache.kafka.connect.json.JsonConverter", + "pulsar.service.url": "pulsar://127.0.0.1:6650", + "offset.storage.topic": "offset-topic" + } + ``` + +* YAML + + You can create a `debezium-mysql-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/mysql/src/main/resources/debezium-mysql-source-config.yaml) below to the `debezium-mysql-source-config.yaml` file. 
+ + ```yaml + tenant: "public" + namespace: "default" + name: "debezium-mysql-source" + topicName: "debezium-mysql-topic" + archive: "connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar" + parallelism: 1 + + configs: + + ## config for mysql, docker image: debezium/example-mysql:0.8 + database.hostname: "localhost" + database.port: "3306" + database.user: "debezium" + database.password: "dbz" + database.server.id: "184054" + database.server.name: "dbserver1" + database.whitelist: "inventory" + database.history: "org.apache.pulsar.io.debezium.PulsarDatabaseHistory" + database.history.pulsar.topic: "history-topic" + database.history.pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## KEY_CONVERTER_CLASS_CONFIG, VALUE_CONVERTER_CLASS_CONFIG + key.converter: "org.apache.kafka.connect.json.JsonConverter" + value.converter: "org.apache.kafka.connect.json.JsonConverter" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + + ## OFFSET_STORAGE_TOPIC_CONFIG + offset.storage.topic: "offset-topic" + ``` + +### Usage + +This example shows how to change the data of a MySQL table using the Pulsar Debezium connector. + +1. Start a MySQL server with a database from which Debezium can capture changes. + + ```bash + $ docker run -it --rm \ + --name mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=debezium \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw debezium/example-mysql:0.8 + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + $ bin/pulsar standalone + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. + + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar`. 
+ + ```bash + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-mysql-{{pulsar:version}}.nar \ + --name debezium-mysql-source --destination-topic-name debezium-mysql-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "3306","database.user": "debezium","database.password": "dbz","database.server.id": "184054","database.server.name": "dbserver1","database.whitelist": "inventory","database.history": "org.apache.pulsar.io.debezium.PulsarDatabaseHistory","database.history.pulsar.topic": "history-topic","database.history.pulsar.service.url": "pulsar://127.0.0.1:6650","key.converter": "org.apache.kafka.connect.json.JsonConverter","value.converter": "org.apache.kafka.connect.json.JsonConverter","pulsar.service.url": "pulsar://127.0.0.1:6650","offset.storage.topic": "offset-topic"}' + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-mysql-source-config.yaml + ``` + +4. Subscribe the topic _sub-products_ for the table _inventory.products_. + + ```bash + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + +5. Start a MySQL client in docker. + + ```bash + $ docker run -it --rm \ + --name mysqlterm \ + --link mysql \ + --rm mysql:5.7 sh \ + -c 'exec mysql -h"$MYSQL_PORT_3306_TCP_ADDR" -P"$MYSQL_PORT_3306_TCP_PORT" -uroot -p"$MYSQL_ENV_MYSQL_ROOT_PASSWORD"' + ``` + +6. A MySQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + mysql> use inventory; + mysql> show tables; + mysql> SELECT * FROM products; + mysql> UPDATE products SET name='1111111111' WHERE id=101; + mysql> UPDATE products SET name='1111111111' WHERE id=107; + ``` + + In the terminal window of subscribing topic, you can find the data changes have been kept in the _sub-products_ topic. 
+ +## Example of PostgreSQL + +You need to create a configuration file before using the Pulsar Debezium connector. + +### Configuration + +You can use one of the following methods to create a configuration file. + +* JSON + + ```json + { + "database.hostname": "localhost", + "database.port": "5432", + "database.user": "postgres", + "database.password": "postgres", + "database.dbname": "postgres", + "database.server.name": "dbserver1", + "schema.whitelist": "inventory", + "pulsar.service.url": "pulsar://127.0.0.1:6650" + } + ``` + +* YAML + + You can create a `debezium-postgres-source-config.yaml` file and copy the [contents](https://github.com/apache/pulsar/blob/master/pulsar-io/debezium/postgres/src/main/resources/debezium-postgres-source-config.yaml) below to the `debezium-postgres-source-config.yaml` file. + + ```yaml + tenant: "public" + namespace: "default" + name: "debezium-postgres-source" + topicName: "debezium-postgres-topic" + archive: "connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar" + parallelism: 1 + + configs: + + ## config for pg, docker image: debezium/example-postgress:0.8 + database.hostname: "localhost" + database.port: "5432" + database.user: "postgres" + database.password: "postgres" + database.dbname: "postgres" + database.server.name: "dbserver1" + schema.whitelist: "inventory" + + ## PULSAR_SERVICE_URL_CONFIG + pulsar.service.url: "pulsar://127.0.0.1:6650" + ``` + +### Usage + +This example shows how to change the data of a PostgreSQL table using the Pulsar Debezium connector. + + +1. Start a PostgreSQL server with a database from which Debezium can capture changes. + + ```bash + $ docker pull debezium/example-postgres:0.8 + $ docker run -d -it --rm --name pulsar-postgresql -p 5432:5432 debezium/example-postgres:0.8 + ``` + +2. Start a Pulsar service locally in standalone mode. + + ```bash + $ bin/pulsar standalone + ``` + +3. Start the Pulsar Debezium connector in local run mode using one of the following methods. 
+ + * Use the **JSON** configuration file as shown previously. + + Make sure the nar file is available at `connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar`. + + ```bash + $ bin/pulsar-admin source localrun \ + --archive connectors/pulsar-io-debezium-postgres-{{pulsar:version}}.nar \ + --name debezium-postgres-source \ + --destination-topic-name debezium-postgres-topic \ + --tenant public \ + --namespace default \ + --source-config '{"database.hostname": "localhost","database.port": "5432","database.user": "postgres","database.password": "postgres","database.dbname": "postgres","database.server.name": "dbserver1","schema.whitelist": "inventory","pulsar.service.url": "pulsar://127.0.0.1:6650"}' + ``` + + * Use the **YAML** configuration file as shown previously. + + ```bash + $ bin/pulsar-admin source localrun \ + --source-config-file debezium-postgres-source-config.yaml + ``` + +4. Subscribe the topic _sub-products_ for the _inventory.products_ table. + + ``` + $ bin/pulsar-client consume -s "sub-products" public/default/dbserver1.inventory.products -n 0 + ``` + +5. Start a PostgreSQL client in docker. + + ```bash + $ docker exec -it pulsar-postgresql /bin/bash + ``` + +6. A PostgreSQL client pops out. + + Use the following commands to change the data of the table _products_. + + ``` + psql -U postgres postgres + postgres=# \c postgres; + You are now connected to database "postgres" as user "postgres". 
+ postgres=# SET search_path TO inventory; + SET + postgres=# select * from products; + id | name | description | weight + -----+--------------------+---------------------------------------------------------+-------- + 102 | car battery | 12V car battery | 8.1 + 103 | 12-pack drill bits | 12-pack of drill bits with sizes ranging from #40 to #3 | 0.8 + 104 | hammer | 12oz carpenter's hammer | 0.75 + 105 | hammer | 14oz carpenter's hammer | 0.875 + 106 | hammer | 16oz carpenter's hammer | 1 + 107 | rocks | box of assorted rocks | 5.3 + 108 | jacket | water resistent black wind breaker | 0.1 + 109 | spare tire | 24 inch spare tire | 22.2 + 101 | 1111111111 | Small 2-wheel scooter | 3.14 + (9 rows) + + postgres=# UPDATE products SET name='1111111111' WHERE id=107; + UPDATE 1 + ``` + + In the terminal window of subscribing topic, you can receive the following messages. + + ```bash + ----- got message ----- + {"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.products.Key"},"payload":{"id":107}}�{"schema":{"type":"struct","fields":[{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"before"},{"type":"struct","fields":[{"type":"int32","optional":false,"field":"id"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":true,"field":"description"},{"type":"double","optional":true,"field":"weight"}],"optional":true,"name":"dbserver1.inventory.products.Value","field":"after"},{"type":"struct","fields":[{"type":"string","optional":true,"field":"version"},{"type":"string","optional":true,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"string","optional":false,"field":"db"},{"type":"int64","optional":tr
ue,"field":"ts_usec"},{"type":"int64","optional":true,"field":"txId"},{"type":"int64","optional":true,"field":"lsn"},{"type":"string","optional":true,"field":"schema"},{"type":"string","optional":true,"field":"table"},{"type":"boolean","optional":true,"default":false,"field":"snapshot"},{"type":"boolean","optional":true,"field":"last_snapshot_record"}],"optional":false,"name":"io.debezium.connector.postgresql.Source","field":"source"},{"type":"string","optional":false,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"}],"optional":false,"name":"dbserver1.inventory.products.Envelope"},"payload":{"before":{"id":107,"name":"rocks","description":"box of assorted rocks","weight":5.3},"after":{"id":107,"name":"1111111111","description":"box of assorted rocks","weight":5.3},"source":{"version":"0.9.2.Final","connector":"postgresql","name":"dbserver1","db":"postgres","ts_usec":1559208957661080,"txId":577,"lsn":23862872,"schema":"inventory","table":"products","snapshot":false,"last_snapshot_record":null},"op":"u","ts_ms":1559208957692}} + ``` + +## FAQ + +### Debezium postgres connector will hang when create snap + +```$xslt +#18 prio=5 os_prio=31 tid=0x00007fd83096f800 nid=0xa403 waiting on condition [0x000070000f534000] + java.lang.Thread.State: WAITING (parking) + at sun.misc.Unsafe.park(Native Method) + - parking to wait for <0x00000007ab025a58> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) + at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) + at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) + at java.util.concurrent.LinkedBlockingDeque.putLast(LinkedBlockingDeque.java:396) + at java.util.concurrent.LinkedBlockingDeque.put(LinkedBlockingDeque.java:649) + at io.debezium.connector.base.ChangeEventQueue.enqueue(ChangeEventQueue.java:132) + at io.debezium.connector.postgresql.PostgresConnectorTask$$Lambda$203/385424085.accept(Unknown Source) + at 
io.debezium.connector.postgresql.RecordsSnapshotProducer.sendCurrentRecord(RecordsSnapshotProducer.java:402) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.readTable(RecordsSnapshotProducer.java:321) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$takeSnapshot$6(RecordsSnapshotProducer.java:226) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$$Lambda$240/1347039967.accept(Unknown Source) + at io.debezium.jdbc.JdbcConnection.queryWithBlockingConsumer(JdbcConnection.java:535) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.takeSnapshot(RecordsSnapshotProducer.java:224) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.lambda$start$0(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.RecordsSnapshotProducer$$Lambda$206/589332928.run(Unknown Source) + at java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705) + at java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717) + at java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010) + at io.debezium.connector.postgresql.RecordsSnapshotProducer.start(RecordsSnapshotProducer.java:87) + at io.debezium.connector.postgresql.PostgresConnectorTask.start(PostgresConnectorTask.java:126) + at io.debezium.connector.common.BaseSourceTask.start(BaseSourceTask.java:47) + at org.apache.pulsar.io.kafka.connect.KafkaConnectSource.open(KafkaConnectSource.java:127) + at org.apache.pulsar.io.debezium.DebeziumSource.open(DebeziumSource.java:100) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupInput(JavaInstanceRunnable.java:690) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.setupJavaInstance(JavaInstanceRunnable.java:200) + at org.apache.pulsar.functions.instance.JavaInstanceRunnable.run(JavaInstanceRunnable.java:230) + at java.lang.Thread.run(Thread.java:748) +``` + +If you encounter the above problems in synchronizing data, please refer to 
[this](https://github.com/apache/pulsar/issues/4075) and add the following configuration to the configuration file: + +```$xslt +max.queue.size= +``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/io-develop.md b/site2/website/versioned_docs/version-2.5.0/io-develop.md new file mode 100644 index 0000000000000..8396f969d27af --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-develop.md @@ -0,0 +1,230 @@ +--- +id: version-2.5.0-io-develop +title: How to develop Pulsar connectors +sidebar_label: Develop +original_id: io-develop +--- + +This guide describes how to develop Pulsar connectors to move data +between Pulsar and other systems. + +Pulsar connectors are special [Pulsar Functions](functions-overview.md), so creating +a Pulsar connector is similar to creating a Pulsar function. + +Pulsar connectors come in two types: + +| Type | Description | Example +|---|---|--- +{@inject: github:`Source`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java}|Import data from another system to Pulsar.|[RabbitMQ source connector](io-rabbitmq-source.md) imports the messages of a RabbitMQ queue to a Pulsar topic. +{@inject: github:`Sink`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java}|Export data from Pulsar to another system.|[Kinesis sink connector](io-kinesis-sink.md) exports the messages of a Pulsar topic to a Kinesis stream. + +## Develop + +You can develop Pulsar source connectors and sink connectors. + +### Source + +Developing a source connector is to implement the {@inject: github:`Source`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java} +interface, which means you need to implement the {@inject: github:`open`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java#L33} method and the {@inject: github:`read`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java#L41} method. + +1. 
Implement the {@inject: github:`open`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java#L33} method. + + ```java + /** + * Open connector with configuration + * + * @param config initialization config + * @param sourceContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SourceContext sourceContext) throws Exception; + ``` + + This method is called when the source connector is initialized. + + In this method, you can retrieve all connector specific settings through the passed-in `config` parameter and initialize all necessary resources. + + For example, a Kafka connector can create a Kafka client in this `open` method. + + Besides, Pulsar runtime also provides a `SourceContext` for the + connector to access runtime resources for tasks like collecting metrics. The implementation can save the `SourceContext` for future use. + +2. Implement the {@inject: github:`read`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Source.java#L41} method. + + ```java + /** + * Reads the next message from source. + * If source does not have any new messages, this call should block. + * @return next message from source. The return result should never be null + * @throws Exception + */ + Record read() throws Exception; + ``` + + If nothing to return, the implementation should be blocking rather than returning `null`. + + The returned {@inject: github:`Record`:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java#L28} should encapsulate the following information, which is needed by Pulsar IO runtime. + + * {@inject: github:`Record`:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java#L28} should provide the following variables: + + |Variable|Required|Description + |---|---|--- + `TopicName`|No|Pulsar topic name from which the record is originated from. + `Key`|No| Messages can optionally be tagged with keys.

    For more information, see [Routing modes](concepts-messaging.md#routing-modes).| + `Value`|Yes|Actual data of the record. + `EventTime`|No|Event time of the record from the source. + `PartitionId`|No| If the record is originated from a partitioned source, it returns its `PartitionId`.

    `PartitionId` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `RecordSequence`|No|If the record is originated from a sequential source, it returns its `RecordSequence`.

    `RecordSequence` is used as a part of the unique identifier by Pulsar IO runtime to deduplicate messages and achieve exactly-once processing guarantee. + `Properties` |No| If the record carries user-defined properties, it returns those properties. + `DestinationTopic`|No|Topic to which message should be written. + `Message`|No|A class which carries data sent by users.

    For more information, see [Message.java](https://github.com/apache/pulsar/blob/master/pulsar-client-api/src/main/java/org/apache/pulsar/client/api/Message.java).| + + * {@inject: github:`Record`:/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Record.java#L28} should provide the following methods: + + Method|Description + |---|--- + `ack` |Acknowledge that the record is fully processed. + `fail`|Indicate that the record fails to be processed. + +> #### Tip +> +> For more information about **how to create a source connector**, see {@inject: github:`KafkaSource`:/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java}. + +### Sink + +Developing a sink connector **is similar to** developing a source connector, that is, you need to implement the {@inject: github:`Sink`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java} interface, which means implementing the {@inject: github:`open`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L36} method and the {@inject: github:`write`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L44} method. + +1. Implement the {@inject: github:`open`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L36} method. + + ```java + /** + * Open connector with configuration + * + * @param config initialization config + * @param sinkContext + * @throws Exception IO type exceptions when opening a connector + */ + void open(final Map config, SinkContext sinkContext) throws Exception; + ``` + +2. Implement the {@inject: github:`write`:/pulsar-io/core/src/main/java/org/apache/pulsar/io/core/Sink.java#L44} method. 
+ + ```java + /** + * Write a message to Sink + * @param inputRecordContext Context of input record from the source + * @param record record to write to sink + * @throws Exception + */ + void write(Record record) throws Exception; + ``` + + During the implementation, you can decide how to write the `Value` and + the `Key` to the actual sink, and leverage all the provided information such as + `PartitionId` and `RecordSequence` to achieve different processing guarantees. + + You also need to ack records (if messages are sent successfully) or fail records (if messages fail to send). + +## Test + +Testing connectors can be challenging because Pulsar IO connectors interact with two systems +that may be difficult to mock—Pulsar and the system to which the connector is connecting. + +It is +recommended to write special tests to test the connector functionalities as below +while mocking the external service. + +### Unit test + +You can create unit tests for your connector. + +### Integration test + +Once you have written sufficient unit tests, you can add +separate integration tests to verify end-to-end functionality. + +Pulsar uses +[testcontainers](https://www.testcontainers.org/) **for all integration tests**. + +> #### Tip +> +>For more information about **how to create integration tests for Pulsar connectors**, see {@inject: github:`IntegrationTests`:/tests/integration/src/test/java/org/apache/pulsar/tests/integration/io}. + +## Package + +Once you've developed and tested your connector, you need to package it so that it can be submitted +to a [Pulsar Functions](functions-overview.md) cluster. + +There are two methods to +work with Pulsar Functions' runtime, that is, [NAR](#nar) and [uber JAR](#uber-jar). + +> #### Note +> +> If you plan to package and distribute your connector for others to use, you are obligated to +license and copyright your own code properly. Remember to add the license and copyright to +all libraries your code uses and to your distribution. 
+> +> If you use the [NAR](#nar) method, the NAR plugin +automatically creates a `DEPENDENCIES` file in the generated NAR package, including the proper +licensing and copyrights of all libraries of your connector. + +### NAR + +**NAR** stands for NiFi Archive, which is a custom packaging mechanism used by Apache NiFi, to provide +a bit of Java ClassLoader isolation. + +> #### Tip +> +> For more information about **how NAR works**, see +> [here](https://medium.com/hashmapinc/nifi-nar-files-explained-14113f7796fd). + +Pulsar uses the same mechanism for packaging **all** [built-in connectors](io-connectors.md). + +The easiest approach to package a Pulsar connector is to create a NAR package using +[nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin). + +All you need to do is to include this [nifi-nar-maven-plugin](https://mvnrepository.com/artifact/org.apache.nifi/nifi-nar-maven-plugin) in your maven project for your connector as below. + +```xml +<plugins> + <plugin> + <groupId>org.apache.nifi</groupId> + <artifactId>nifi-nar-maven-plugin</artifactId> + <version>1.2.0</version> + </plugin> +</plugins> +``` + +> #### Tip +> +> For more information about **how to use NAR for Pulsar connectors**, see {@inject: github:`TwitterFirehose`:/pulsar-io/twitter/pom.xml#L79}. + +### Uber JAR + +An alternative approach is to create an **uber JAR** that contains all of the connector's JAR files +and other resource files. No internal directory structure is necessary. 
+ +You can use [maven-shade-plugin](https://maven.apache.org/plugins/maven-shade-plugin/examples/includes-excludes.html) to create a uber JAR as below: + +```xml + + org.apache.maven.plugins + maven-shade-plugin + 3.1.1 + + + package + + shade + + + + + *:* + + + + + + +``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-elasticsearch-sink.md b/site2/website/versioned_docs/version-2.5.0/io-elasticsearch-sink.md new file mode 100644 index 0000000000000..c736a6c6d86eb --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-elasticsearch-sink.md @@ -0,0 +1,50 @@ +--- +id: version-2.5.0-io-elasticsearch-sink +title: ElasticSearch sink connector +sidebar_label: ElasticSearch sink connector +original_id: io-elasticsearch-sink +--- + +The ElasticSearch sink connector pulls messages from Pulsar topics and persists the messages to indexes. + +## Configuration + +The configuration of the ElasticSearch sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `elasticSearchUrl` | String| true |" " (empty string)| The URL of elastic search cluster to which the connector connects. | +| `indexName` | String| true |" " (empty string)| The index name to which the connector writes messages. | +| `indexNumberOfShards` | int| false |1| The number of shards of the index. | +| `indexNumberOfReplicas` | int| false |1 | The number of replicas of the index. | +| `username` | String| false |" " (empty string)| The username used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | +| `password` | String| false | " " (empty string)|The password used by the connector to connect to the elastic search cluster.

    If `username` is set, then `password` should also be provided. | + +### Example + +Before using the ElasticSearch sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "elasticSearchUrl": "http://localhost:9200", + "indexName": "myIndex", + "username": "scooby", + "password": "doobie" + } + ``` + +* YAML + + ```yaml + configs: + elasticSearchUrl: "http://localhost:9200" + indexName: "myIndex" + username: "scooby" + password: "doobie" + ``` + + diff --git a/site2/website/versioned_docs/version-2.5.0/io-file-source.md b/site2/website/versioned_docs/version-2.5.0/io-file-source.md new file mode 100644 index 0000000000000..13824865ca8e3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-file-source.md @@ -0,0 +1,138 @@ +--- +id: version-2.5.0-io-file-source +title: File source connector +sidebar_label: File source connector +original_id: io-file-source +--- + +The File source connector pulls messages from files in directories and persists the messages to Pulsar topics. + +## Configuration + +The configuration of the File source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `inputDirectory` | String|true | No default value|The input directory to pull files. | +| `recurse` | Boolean|false | true | Whether to pull files from subdirectory or not.| +| `keepFile` |Boolean|false | false | If set to true, the file is not deleted after it is processed, which means the file can be picked up continually. | +| `fileFilter` | String|false| [^\\.].* | The file whose name matches the given regular expression is picked up. | +| `pathFilter` | String |false | NULL | If `recurse` is set to true, the subdirectory whose path matches the given regular expression is scanned. | +| `minimumFileAge` | Integer|false | 0 | The minimum age that a file can be processed.

    Any file younger than `minimumFileAge` (according to the last modification date) is ignored. | +| `maximumFileAge` | Long|false |Long.MAX_VALUE | The maximum age that a file can be processed.

    Any file older than `maximumFileAge` (according to last modification date) is ignored. | +| `minimumSize` |Integer| false |1 | The minimum size (in bytes) that a file can be processed. | +| `maximumSize` | Double|false |Double.MAX_VALUE| The maximum size (in bytes) that a file can be processed. | +| `ignoreHiddenFiles` |Boolean| false | true| Whether the hidden files should be ignored or not. | +| `pollingInterval`|Long | false | 10000L | Indicates how long to wait before performing a directory listing. | +| `numWorkers` | Integer | false | 1 | The number of worker threads that process files.

    This allows you to process a larger number of files concurrently.

    However, setting this to a value greater than 1 mixes the data from multiple files in the target topic. | + +### Example + +Before using the File source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "inputDirectory": "/Users/david", + "recurse": true, + "keepFile": true, + "fileFilter": "[^\\.].*", + "pathFilter": "*", + "minimumFileAge": 0, + "maximumFileAge": 9999999999, + "minimumSize": 1, + "maximumSize": 5000000, + "ignoreHiddenFiles": true, + "pollingInterval": 5000, + "numWorkers": 1 + } + ``` + +* YAML + + ```yaml + configs: + inputDirectory: "/Users/david" + recurse: true + keepFile: true + fileFilter: "[^\\.].*" + pathFilter: "*" + minimumFileAge: 0 + maximumFileAge: 9999999999 + minimumSize: 1 + maximumSize: 5000000 + ignoreHiddenFiles: true + pollingInterval: 5000 + numWorkers: 1 + ``` + +## Usage + +Here is an example of using the File source connector. + +1. Pull a Pulsar image. + + ```bash + $ docker pull apachepulsar/pulsar:{version} + ``` + +2. Start Pulsar standalone. + + ```bash + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + ``` + +3. Create a configuration file _file-connector.yaml_. + + ```yaml + configs: + inputDirectory: "/opt" + ``` + +4. Copy the configuration file _file-connector.yaml_ to the container. + + ```bash + $ docker cp connectors/file-connector.yaml pulsar-standalone:/pulsar/ + ``` + +5. Download the File source connector. + + ```bash + $ curl -O https://mirrors.tuna.tsinghua.edu.cn/apache/pulsar/pulsar-{version}/connectors/pulsar-io-file-{version}.nar + ``` + +6. Start the File source connector. 
+ + ```bash + $ docker exec -it pulsar-standalone /bin/bash + + $ ./bin/pulsar-admin sources localrun \ + --archive /pulsar/pulsar-io-file-{version}.nar \ + --name file-test \ + --destination-topic-name pulsar-file-test \ + --source-config-file /pulsar/file-connector.yaml + ``` + +7. Start a consumer. + + ```bash + ./bin/pulsar-client consume -s file-test -n 0 pulsar-file-test + ``` + +8. Write the message to the file _test.txt_. + + ```bash + echo "hello world!" > /opt/test.txt + ``` + + The following information appears on the consumer terminal window. + + ```bash + ----- got message ----- + hello world! + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/io-flume-sink.md b/site2/website/versioned_docs/version-2.5.0/io-flume-sink.md new file mode 100644 index 0000000000000..936521a3d7cb3 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-flume-sink.md @@ -0,0 +1,52 @@ +--- +id: version-2.5.0-io-flume-sink +title: Flume sink connector +sidebar_label: Flume sink connector +original_id: io-flume-sink +--- + +The Flume sink connector pulls messages from Pulsar topics to logs. + +## Configuration + +The configuration of the Flume sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume sink connector, you need to create a configuration file through one of the following methods. 
+ +> For more information about the `sink.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/sink.conf). + +* JSON + + ```json + { + "name": "a1", + "confFile": "sink.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + ``` + +* YAML + + ```yaml + configs: + name: a1 + confFile: sink.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-flume-source.md b/site2/website/versioned_docs/version-2.5.0/io-flume-source.md new file mode 100644 index 0000000000000..98cd83ed55a78 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-flume-source.md @@ -0,0 +1,52 @@ +--- +id: version-2.5.0-io-flume-source +title: Flume source connector +sidebar_label: Flume source connector +original_id: io-flume-source +--- + +The Flume source connector pulls messages from logs to Pulsar topics. + +## Configuration + +The configuration of the Flume source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`name`|String|true|"" (empty string)|The name of the agent. +`confFile`|String|true|"" (empty string)|The configuration file. +`noReloadConf`|Boolean|false|false|Whether to reload configuration file if changed. +`zkConnString`|String|true|"" (empty string)|The ZooKeeper connection. +`zkBasePath`|String|true|"" (empty string)|The base path in ZooKeeper for agent configuration. + +### Example + +Before using the Flume source connector, you need to create a configuration file through one of the following methods. + +> For more information about the `source.conf` in the example below, see [here](https://github.com/apache/pulsar/blob/master/pulsar-io/flume/src/main/resources/flume/source.conf). 
+ +* JSON + + ```json + { + "name": "a1", + "confFile": "source.conf", + "noReloadConf": "false", + "zkConnString": "", + "zkBasePath": "" + } + ``` + +* YAML + + ```yaml + configs: + name: a1 + confFile: source.conf + noReloadConf: false + zkConnString: "" + zkBasePath: "" + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-hbase-sink.md b/site2/website/versioned_docs/version-2.5.0/io-hbase-sink.md new file mode 100644 index 0000000000000..c5cb4978b98dc --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-hbase-sink.md @@ -0,0 +1,64 @@ +--- +id: version-2.5.0-io-hbase-sink +title: HBase sink connector +sidebar_label: HBase sink connector +original_id: io-hbase-sink +--- + +The HBase sink connector pulls the messages from Pulsar topics +and persists the messages to HBase tables. + +## Configuration + +The configuration of the HBase sink connector has the following properties. + +### Property + +| Name | Type|Default | Required | Description | +|------|---------|----------|----------|-------------| +| `hbaseConfigResources` | String|None | false | HBase system configuration `hbase-site.xml` file. | +| `zookeeperQuorum` | String|None | true | HBase system configuration about `hbase.zookeeper.quorum` value. | +| `zookeeperClientPort` | String|2181 | false | HBase system configuration about `hbase.zookeeper.property.clientPort` value. | +| `zookeeperZnodeParent` | String|/hbase | false | HBase system configuration about `zookeeper.znode.parent` value. | +| `tableName` | String|None | true | HBase table, the value is `namespace:tableName`. | +| `rowKeyName` | String|None | true | HBase table rowkey name. | +| `familyName` | String|None | true | HBase table column family name. | +| `qualifierNames` |String| None | true | HBase table column qualifier names. | +| `batchTimeMs` | Long|1000l| false | HBase table operation timeout in milliseconds. | +| `batchSize` | int|200| false | Batch size of updates made to the HBase table. 
| + +### Example + +Before using the HBase sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "hbaseConfigResources": "hbase-site.xml", + "zookeeperQuorum": "localhost", + "zookeeperClientPort": "2181", + "zookeeperZnodeParent": "/hbase", + "tableName": "pulsar_hbase", + "rowKeyName": "rowKey", + "familyName": "info", + "qualifierNames": ["name", "address", "age"] + } + ``` + + +* YAML + + ```yaml + configs: + hbaseConfigResources: "hbase-site.xml" + zookeeperQuorum: "localhost" + zookeeperClientPort: "2181" + zookeeperZnodeParent: "/hbase" + tableName: "pulsar_hbase" + rowKeyName: "rowKey" + familyName: "info" + qualifierNames: [ 'name', 'address', 'age'] + ``` + + \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/io-hdfs2-sink.md b/site2/website/versioned_docs/version-2.5.0/io-hdfs2-sink.md new file mode 100644 index 0000000000000..56222efc55c28 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-hdfs2-sink.md @@ -0,0 +1,54 @@ +--- +id: version-2.5.0-io-hdfs2-sink +title: HDFS2 sink connector +sidebar_label: HDFS2 sink connector +original_id: io-hdfs2-sink +--- + +The HDFS2 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS2 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files are read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.

    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY| +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| false |None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| false | None| The extension added to the files written to HDFS.

    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.

    Setting this property to 1 causes every record to be sent to disk before the record is acked.

    Setting this property to a higher value allows buffering records before flushing them to disk. + +### Example + +Before using the HDFS2 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "compression": "SNAPPY" + } + ``` + +* YAML + + ```yaml + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + compression: "SNAPPY" + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-hdfs3-sink.md b/site2/website/versioned_docs/version-2.5.0/io-hdfs3-sink.md new file mode 100644 index 0000000000000..4e116eccfd666 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-hdfs3-sink.md @@ -0,0 +1,54 @@ +--- +id: version-2.5.0-io-hdfs3-sink +title: HDFS3 sink connector +sidebar_label: HDFS3 sink connector +original_id: io-hdfs3-sink +--- + +The HDFS3 sink connector pulls the messages from Pulsar topics +and persists the messages to HDFS files. + +## Configuration + +The configuration of the HDFS3 sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `hdfsConfigResources` | String|true| None | A file or a comma-separated list containing the Hadoop file system configuration.

    **Example**
    'core-site.xml'
    'hdfs-site.xml' | +| `directory` | String | true | None|The HDFS directory where files are read from or written to. | +| `encoding` | String |false |None |The character encoding for the files.

    **Example**
    UTF-8
    ASCII | +| `compression` | Compression |false |None |The compression code used to compress or de-compress the files on HDFS.

    Below are the available options:
  • BZIP2
  • DEFLATE
  • GZIP
  • LZ4
  • SNAPPY| +| `kerberosUserPrincipal` |String| false| None|The principal account of Kerberos user used for authentication. | +| `keytab` | String|false|None| The full pathname of the Kerberos keytab file used for authentication. | +| `filenamePrefix` |String| false |None |The prefix of the files created inside the HDFS directory.

    **Example**
    The value of topicA results in files named topicA-. | +| `fileExtension` | String| false | None| The extension added to the files written to HDFS.

    **Example**
    '.txt'
    '.seq' | +| `separator` | char|false |None |The character used to separate records in a text file.

    If no value is provided, the contents from all records are concatenated together in one continuous byte array. | +| `syncInterval` | long| false |0| The interval between calls to flush data to HDFS disk in milliseconds. | +| `maxPendingRecords` |int| false|Integer.MAX_VALUE | The maximum number of records that are held in memory before acking.

    Setting this property to 1 causes every record to be sent to disk before the record is acked.

    Setting this property to a higher value allows buffering records before flushing them to disk. + +### Example + +Before using the HDFS3 sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "hdfsConfigResources": "core-site.xml", + "directory": "/foo/bar", + "filenamePrefix": "prefix", + "compression": "SNAPPY" + } + ``` + +* YAML + + ```yaml + configs: + hdfsConfigResources: "core-site.xml" + directory: "/foo/bar" + filenamePrefix: "prefix" + compression: "SNAPPY" + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-influxdb-sink.md b/site2/website/versioned_docs/version-2.5.0/io-influxdb-sink.md new file mode 100644 index 0000000000000..c3501100bbbc0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-influxdb-sink.md @@ -0,0 +1,62 @@ +--- +id: version-2.5.0-io-influxdb-sink +title: InfluxDB sink connector +sidebar_label: InfluxDB sink connector +original_id: io-influxdb-sink +--- + +The InfluxDB sink connector pulls messages from Pulsar topics +and persists the messages to InfluxDB. + +## Configuration + +The configuration of the InfluxDB sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `influxdbUrl` |String| true|" " (empty string) | The URL of the InfluxDB instance. | +| `username` | String|false| " " (empty string) |The username used to authenticate to InfluxDB. | +| `password` | String| false|" " (empty string) | The password used to authenticate to InfluxDB. | +| `database` |String| true | " " (empty string)| The InfluxDB to which write messages. | +| `consistencyLevel` | String|false|ONE | The consistency level for writing data to InfluxDB.

    Below are the available options:
  • ALL
  • ANY
  • ONE
  • QUORUM | +| `logLevel` | String|false| NONE|The log level for InfluxDB request and response.

    Below are the available options:
  • NONE
  • BASIC
  • HEADERS
  • FULL| +| `retentionPolicy` | String|false| autogen| The retention policy for InfluxDB. | +| `gzipEnable` | boolean|false | false | Whether to enable gzip or not. | +| `batchTimeMs` |long|false| 1000L | The InfluxDB operation time in milliseconds. | +| `batchSize` | int|false|200| The batch size of writing to InfluxDB. | + +### Example + +Before using the InfluxDB sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "influxdbUrl": "http://localhost:8086", + "database": "test_db", + "consistencyLevel": "ONE", + "logLevel": "NONE", + "retentionPolicy": "autogen", + "gzipEnable": false, + "batchTimeMs": 1000, + "batchSize": 100 + } + ``` + +* YAML + + ```yaml + configs: + influxdbUrl: "http://localhost:8086" + database: "test_db" + consistencyLevel: "ONE" + logLevel: "NONE" + retentionPolicy: "autogen" + gzipEnable: false + batchTimeMs: 1000 + batchSize: 100 + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-jdbc-sink.md b/site2/website/versioned_docs/version-2.5.0/io-jdbc-sink.md new file mode 100644 index 0000000000000..287eb18219465 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-jdbc-sink.md @@ -0,0 +1,57 @@ +--- +id: version-2.5.0-io-jdbc-sink +title: JDBC sink connector +sidebar_label: JDBC sink connector +original_id: io-jdbc-sink +--- + +The JDBC sink connector pulls messages from Pulsar topics +and persists the messages to MySQL or SQLite. + +> Currently, INSERT, DELETE and UPDATE operations are supported. + +## Configuration + +The configuration of the JDBC sink connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `userName` | String|false | " " (empty string) | The username used to connect to the database specified by `jdbcUrl`.

    **Note: `userName` is case-sensitive.**| +| `password` | String|false | " " (empty string)| The password used to connect to the database specified by `jdbcUrl`.

    **Note: `password` is case-sensitive.**| +| `jdbcUrl` | String|true | " " (empty string) | The JDBC URL of the database to which the connector connects. | +| `tableName` | String|true | " " (empty string) | The name of the table to which the connector writes. | +| `nonKey` | String|false | " " (empty string) | A comma-separated list contains the fields used in updating events. | +| `key` | String|false | " " (empty string) | A comma-separated list contains the fields used in `where` condition of updating and deleting events. | +| `timeoutMs` | int| false|500 | The JDBC operation timeout in milliseconds. | +| `batchSize` | int|false | 200 | The batch size of updates made to the database. | + +### Example + +Before using the JDBC sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "userName": "root", + "password": "jdbc", + "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/pulsar_mysql_jdbc_sink", + "tableName": "pulsar_mysql_jdbc_sink" + } + ``` + +* YAML + + ```yaml + configs: + userName: "root" + password: "jdbc" + jdbcUrl: "jdbc:mysql://127.0.0.1:3306/pulsar_mysql_jdbc_sink" + tableName: "pulsar_mysql_jdbc_sink" + ``` + +## Usage + +For more information about **how to use a JDBC sink connector**, see [connect Pulsar to MySQL](io-quickstart.md#connect-pulsar-to-mysql). \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/io-kafka-sink.md b/site2/website/versioned_docs/version-2.5.0/io-kafka-sink.md new file mode 100644 index 0000000000000..5988b6aa65d7c --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-kafka-sink.md @@ -0,0 +1,69 @@ +--- +id: version-2.5.0-io-kafka-sink +title: Kafka sink connector +sidebar_label: Kafka sink connector +original_id: io-kafka-sink +--- + +The Kafka sink connector pulls messages from Pulsar topics and persists the messages +to Kafka topics. + +This guide explains how to configure and use the Kafka sink connector. 
+ +## Configuration + +The configuration of the Kafka sink connector has the following parameters. + +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. | +|`acks`|String|true|" " (empty string) |The number of acknowledgments that the producer requires the leader to receive before a request completes.
    This controls the durability of the sent records. +|`batchSize`|long|false|16384L|The size of a batch of records that a Kafka producer attempts to group together before sending them to brokers. +|`maxRequestSize`|long|false|1048576L|The maximum size of a Kafka request in bytes. +|`topic`|String|true|" " (empty string) |The Kafka topic which receives messages from Pulsar. +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringSerializer | The serializer class for Kafka producers to serialize keys. +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArraySerializer | The serializer class for Kafka producers to serialize values.

    The serializer is set by a specific implementation of [`KafkaAbstractSink`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSink.java). +|`producerConfigProperties`|Map|false|" " (empty string)|The producer configuration properties to be passed to producers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. + + +### Example + +Before using the Kafka sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "bootstrapServers": "localhost:6667", + "topic": "test", + "acks": "1", + "batchSize": "16384", + "maxRequestSize": "1048576", + "producerConfigProperties": + { + "client.id": "test-pulsar-producer", + "security.protocol": "SASL_PLAINTEXT", + "sasl.mechanism": "GSSAPI", + "sasl.kerberos.service.name": "kafka", + "acks": "all" + } + } + +* YAML + + ```yaml + configs: + bootstrapServers: "localhost:6667" + topic: "test" + acks: "1" + batchSize: "16384" + maxRequestSize: "1048576" + producerConfigProperties: + client.id: "test-pulsar-producer" + security.protocol: "SASL_PLAINTEXT" + sasl.mechanism: "GSSAPI" + sasl.kerberos.service.name: "kafka" + acks: "all" + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-kafka-source.md b/site2/website/versioned_docs/version-2.5.0/io-kafka-source.md new file mode 100644 index 0000000000000..885f109d0d6a4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-kafka-source.md @@ -0,0 +1,171 @@ +--- +id: version-2.5.0-io-kafka-source +title: Kafka source connector +sidebar_label: Kafka source connector +original_id: io-kafka-source +--- + +The Kafka source connector pulls messages from Kafka topics and persists the messages +to Pulsar topics. + +This guide explains how to configure and use the Kafka source connector. + +## Configuration + +The configuration of the Kafka source connector has the following properties. + +### Property + +| Name | Type| Required | Default | Description +|------|----------|---------|-------------|-------------| +| `bootstrapServers` |String| true | " " (empty string) | A comma-separated list of host and port pairs for establishing the initial connection to the Kafka cluster. 
| +| `groupId` |String| true | " " (empty string) | A unique string that identifies the group of consumer processes to which this consumer belongs. | +| `fetchMinBytes` | long|false | 1 | The minimum byte expected for each fetch response. | +| `autoCommitEnabled` | boolean |false | true | If set to true, the consumer's offset is periodically committed in the background.

    This committed offset is used when the process fails as the position from which a new consumer begins. | +| `autoCommitIntervalMs` | long|false | 5000 | The frequency in milliseconds that the consumer offsets are auto-committed to Kafka if `autoCommitEnabled` is set to true. | +| `heartbeatIntervalMs` | long| false | 3000 | The interval between heartbeats to the consumer when using Kafka's group management facilities.

    **Note: `heartbeatIntervalMs` must be smaller than `sessionTimeoutMs`**.| +| `sessionTimeoutMs` | long|false | 30000 | The timeout used to detect consumer failures when using Kafka's group management facility. | +| `topic` | String|true | " " (empty string)| The Kafka topic which sends messages to Pulsar. | +| `consumerConfigProperties` | Map| false | " " (empty string) | The consumer configuration properties to be passed to consumers.

    **Note: other properties specified in the connector configuration file take precedence over this configuration**. | +| `keyDeserializationClass` | String|false | org.apache.kafka.common.serialization.StringDeserializer | The deserializer class for Kafka consumers to deserialize keys.
    The deserializer is set by a specific implementation of [`KafkaAbstractSource`](https://github.com/apache/pulsar/blob/master/pulsar-io/kafka/src/main/java/org/apache/pulsar/io/kafka/KafkaAbstractSource.java). +| `valueDeserializationClass` | String|false | org.apache.kafka.common.serialization.ByteArrayDeserializer | The deserializer class for Kafka consumers to deserialize values. + + +### Example + +Before using the Kafka source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "bootstrapServers": "pulsar-kafka:9092", + "groupId": "test-pulsar-io", + "topic": "my-topic", + "sessionTimeoutMs": "10000", + "autoCommitEnabled": false + } + ``` + +* YAML + + ```yaml + configs: + bootstrapServers: "pulsar-kafka:9092" + groupId: "test-pulsar-io" + topic: "my-topic" + sessionTimeoutMs: "10000" + autoCommitEnabled: false + ``` + +## Usage + +Here is an example of using the Kafka source connector with the configuration file as shown previously. + +1. Download a Kafka client and a Kafka connector. + + ```bash + $ wget http://central.maven.org/maven2/org/apache/kafka/kafka-clients/0.10.2.1/kafka-clients-0.10.2.1.jar + + $ wget https://archive.apache.org/dist/pulsar/pulsar-2.4.0/connectors/pulsar-io-kafka-2.4.0.nar + ``` + +2. Create a network. + + ```bash + $ docker network create kafka-pulsar + ``` + +3. Pull a ZooKeeper image and start ZooKeeper. + + ```bash + $ docker pull wurstmeister/zookeeper + + $ docker run -d -it -p 2181:2181 --name pulsar-kafka-zookeeper --network kafka-pulsar wurstmeister/zookeeper + ``` + +4. Pull a Kafka image and start Kafka. + + ```bash + $ docker pull wurstmeister/kafka:2.11-1.0.2 + + $ docker run -d -it --network kafka-pulsar -p 6667:6667 -p 9092:9092 -e KAFKA_ADVERTISED_HOST_NAME=pulsar-kafka -e KAFKA_ZOOKEEPER_CONNECT=pulsar-kafka-zookeeper:2181 --name pulsar-kafka wurstmeister/kafka:2.11-1.0.2 + ``` + +5. Pull a Pulsar image and start Pulsar standalone. 
+ + ```bash + $ docker pull apachepulsar/pulsar:2.4.0 + + $ docker run -d -it --network kafka-pulsar -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-kafka-standalone apachepulsar/pulsar:2.4.0 bin/pulsar standalone + ``` + +6. Create a producer file _kafka-producer.py_. + + ```python + from kafka import KafkaProducer + producer = KafkaProducer(bootstrap_servers='pulsar-kafka:9092') + future = producer.send('my-topic', b'hello world') + future.get() + ``` + +7. Create a consumer file _pulsar-client.py_. + + ```python + import pulsar + + client = pulsar.Client('pulsar://localhost:6650') + consumer = client.subscribe('my-topic', subscription_name='my-aa') + + while True: + msg = consumer.receive() + print msg + print dir(msg) + print("Received message: '%s'" % msg.data()) + consumer.acknowledge(msg) + + client.close() + ``` + +8. Copy the following files to Pulsar. + + ```bash + $ docker cp pulsar-io-kafka-2.4.0.nar pulsar-kafka-standalone:/pulsar + $ docker cp kafkaSourceConfig.yaml pulsar-kafka-standalone:/pulsar/conf + $ docker cp kafka-clients-0.10.2.1.jar pulsar-kafka-standalone:/pulsar/lib + $ docker cp pulsar-client.py pulsar-kafka-standalone:/pulsar/ + $ docker cp kafka-producer.py pulsar-kafka-standalone:/pulsar/ + ``` + +9. Open a new terminal window and start the Kafka source connector in local run mode. + + ```bash + $ docker exec -it pulsar-kafka-standalone /bin/bash + + $ ./bin/pulsar-admin source localrun \ + --archive ./pulsar-io-kafka-2.4.0.nar \ + --classname org.apache.pulsar.io.kafka.KafkaBytesSource \ + --tenant public \ + --namespace default \ + --name kafka \ + --destination-topic-name my-topic \ + --source-config-file ./conf/kafkaSourceConfig.yaml \ + --parallelism 1 + ``` + +10. Open a new terminal window and run the consumer. 
+ + ```bash + $ docker exec -it pulsar-kafka-standalone /bin/bash + + $ pip install kafka-python + + $ python3 kafka-producer.py + ``` + + The following information appears on the consumer terminal window. + + ```bash + Received message: 'hello world' + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-kinesis-sink.md b/site2/website/versioned_docs/version-2.5.0/io-kinesis-sink.md new file mode 100644 index 0000000000000..d754360798701 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-kinesis-sink.md @@ -0,0 +1,73 @@ +--- +id: version-2.5.0-io-kinesis-sink +title: Kinesis sink connector +sidebar_label: Kinesis sink connector +original_id: io-kinesis-sink +--- + +The Kinesis sink connector pulls data from Pulsar and persists data into Amazon Kinesis. + +## Configuration + +The configuration of the Kinesis sink connector has the following property. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`messageFormat`|MessageFormat|true|ONLY_RAW_PAYLOAD|Message format in which Kinesis sink converts Pulsar messages and publishes to Kinesis streams.

    Below are the available options:

  • `ONLY_RAW_PAYLOAD`: Kinesis sink directly publishes Pulsar message payload as a message into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_JSON`: Kinesis sink creates a JSON payload with Pulsar message payload, properties and encryptionCtx, and publishes JSON payload into the configured Kinesis stream.

  • `FULL_MESSAGE_IN_FB`: Kinesis sink creates a flatbuffer serialized payload with Pulsar message payload, properties and encryptionCtx, and publishes flatbuffer payload into the configured Kinesis stream. +`retainOrdering`|boolean|false|false|Whether Pulsar connectors retain ordering when moving messages from Pulsar to Kinesis. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:`AwsCredentialProviderPlugin`:/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/AwsCredentialProviderPlugin.java}.

    It is a factory class which creates an AWSCredentialsProvider that is used by Kinesis sink.

    If it is empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts json-map of credentials in `awsCredentialPluginParam`. +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Built-in plugins + +The following are built-in `AwsCredentialProviderPlugin` plugins: + +* `org.apache.pulsar.io.kinesis.AwsDefaultProviderChainPlugin` + + This plugin takes no configuration, it uses the default AWS provider chain. + + For more information, see [AWS documentation](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default). + +* `org.apache.pulsar.io.kinesis.STSAssumeRoleProviderPlugin` + + This plugin takes a configuration (via the `awsCredentialPluginParam`) that describes a role to assume when running the KCL. + + This configuration takes the form of a small json document like: + + ```json + {"roleArn": "arn...", "roleSessionName": "name"} + ``` + +### Example + +Before using the Kinesis sink connector, you need to create a configuration file through one of the following methods. 
+ +* JSON + + ```json + { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "messageFormat": "ONLY_RAW_PAYLOAD", + "retainOrdering": "true" + } + ``` + +* YAML + + ```yaml + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + messageFormat: "ONLY_RAW_PAYLOAD" + retainOrdering: "true" + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-kinesis-source.md b/site2/website/versioned_docs/version-2.5.0/io-kinesis-source.md new file mode 100644 index 0000000000000..34d64430f2416 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-kinesis-source.md @@ -0,0 +1,77 @@ +--- +id: version-2.5.0-io-kinesis-source +title: Kinesis source connector +sidebar_label: Kinesis source connector +original_id: io-kinesis-source +--- + +The Kinesis source connector pulls data from Amazon Kinesis and persists data into Pulsar. + +This connector uses the [Kinesis Consumer Library](https://github.com/awslabs/amazon-kinesis-client) (KCL) to do the actual consuming of messages. The KCL uses DynamoDB to track state for consumers. + +> Note: currently, the Kinesis source connector only supports raw messages. If you use KMS encrypted messages, the encrypted messages are sent to downstream. This connector will support decrypting messages in the future release. + + +## Configuration + +The configuration of the Kinesis source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +`initialPositionInStream`|InitialPositionInStream|false|LATEST|The position where the connector starts from.

    Below are the available options:

  • `AT_TIMESTAMP`: start from the record at or after the specified timestamp.

  • `LATEST`: start after the most recent data record.

  • `TRIM_HORIZON`: start from the oldest available data record. +`startAtTime`|Date|false|" " (empty string)|If `initialPositionInStream` is set to `AT_TIMESTAMP`, `startAtTime` specifies the point in time to start consumption. +`applicationName`|String|false|Pulsar IO connector|The name of the Amazon Kinesis application.

    By default, the application name is included in the user agent string used to make AWS requests. This can assist with troubleshooting, for example, by distinguishing requests made by separate connector instances. +`checkpointInterval`|long|false|60000|The frequency of the Kinesis stream checkpoint in milliseconds. +`backoffTime`|long|false|3000|The amount of time, in milliseconds, to delay between requests when the connector encounters a throttling exception from AWS Kinesis. +`numRetries`|int|false|3|The number of re-attempts when the connector encounters an exception while trying to set a checkpoint. +`receiveQueueSize`|int|false|1000|The maximum number of AWS records that can be buffered inside the connector.

    Once the `receiveQueueSize` is reached, the connector does not consume any messages from Kinesis until some messages in the queue are successfully consumed. +`dynamoEndpoint`|String|false|" " (empty string)|The Dynamo end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`cloudwatchEndpoint`|String|false|" " (empty string)|The Cloudwatch end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`useEnhancedFanOut`|boolean|false|true|If set to true, it uses Kinesis enhanced fan-out.

    If set to false, it uses polling. +`awsEndpoint`|String|false|" " (empty string)|The Kinesis end-point URL, which can be found at [here](https://docs.aws.amazon.com/general/latest/gr/rande.html). +`awsRegion`|String|false|" " (empty string)|The AWS region.

    **Example**
    us-west-1, us-west-2 +`awsKinesisStreamName`|String|true|" " (empty string)|The Kinesis stream name. +`awsCredentialPluginName`|String|false|" " (empty string)|The fully-qualified class name of implementation of {@inject: github:`AwsCredentialProviderPlugin`:/pulsar-io/kinesis/src/main/java/org/apache/pulsar/io/kinesis/AwsCredentialProviderPlugin.java}.

    `awsCredentialProviderPlugin` has the following built-in plugins:

  • `org.apache.pulsar.io.kinesis.AwsDefaultProviderChainPlugin`:
    this plugin uses the default AWS provider chain.
    For more information, see [using the default credential provider chain](https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default).

  • `org.apache.pulsar.io.kinesis.STSAssumeRoleProviderPlugin`:
    this plugin takes a configuration via the `awsCredentialPluginParam` that describes a role to assume when running the KCL.
    **JSON configuration example**
    `{"roleArn": "arn...", "roleSessionName": "name"}`

    `awsCredentialPluginName` is a factory class which creates an AWSCredentialsProvider that is used by the Kinesis source.

    If `awsCredentialPluginName` set to empty, the Kinesis sink creates a default AWSCredentialsProvider which accepts json-map of credentials in `awsCredentialPluginParam`. +`awsCredentialPluginParam`|String |false|" " (empty string)|The JSON parameter to initialize `awsCredentialsProviderPlugin`. + +### Example + +Before using the Kinesis source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "awsEndpoint": "https://some.endpoint.aws", + "awsRegion": "us-east-1", + "awsKinesisStreamName": "my-stream", + "awsCredentialPluginParam": "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}", + "applicationName": "My test application", + "checkpointInterval": "30000", + "backoffTime": "4000", + "numRetries": "3", + "receiveQueueSize": 2000, + "initialPositionInStream": "TRIM_HORIZON", + "startAtTime": "2019-03-05T19:28:58.000Z" + } + ``` + +* YAML + + ```yaml + configs: + awsEndpoint: "https://some.endpoint.aws" + awsRegion: "us-east-1" + awsKinesisStreamName: "my-stream" + awsCredentialPluginParam: "{\"accessKey\":\"myKey\",\"secretKey\":\"my-Secret\"}" + applicationName: "My test application" + checkpointInterval: 30000 + backoffTime: 4000 + numRetries: 3 + receiveQueueSize: 2000 + initialPositionInStream: "TRIM_HORIZON" + startAtTime: "2019-03-05T19:28:58.000Z" + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-mongo-sink.md b/site2/website/versioned_docs/version-2.5.0/io-mongo-sink.md new file mode 100644 index 0000000000000..ef38fc7defe6b --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-mongo-sink.md @@ -0,0 +1,52 @@ +--- +id: version-2.5.0-io-mongo-sink +title: MongoDB sink connector +sidebar_label: MongoDB sink connector +original_id: io-mongo-sink +--- + +The MongoDB sink connector pulls messages from Pulsar topics +and persists the messages to collections. + +## Configuration + +The configuration of the MongoDB sink connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `mongoUri` | String| true| " " (empty string) | The MongoDB URI to which the connector connects.

    For more information, see [connection string URI format](https://docs.mongodb.com/manual/reference/connection-string/). | +| `database` | String| true| " " (empty string)| The database name to which the collection belongs. | +| `collection` | String| true| " " (empty string)| The collection name to which the connector writes messages. | +| `batchSize` | int|false|100 | The batch size of writing messages to collections. | +| `batchTimeMs` |long|false|1000| The batch operation interval in milliseconds. | + + +### Example + +Before using the Mongo sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "mongoUri": "mongodb://localhost:27017", + "database": "pulsar", + "collection": "messages", + "batchSize": "2", + "batchTimeMs": "500" + } + ``` + +* YAML + + ```yaml + { + mongoUri: "mongodb://localhost:27017" + database: "pulsar" + collection: "messages" + batchSize: 2 + batchTimeMs: 500 + } + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-netty-source.md b/site2/website/versioned_docs/version-2.5.0/io-netty-source.md new file mode 100644 index 0000000000000..46e92d61c3482 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-netty-source.md @@ -0,0 +1,205 @@ +--- +id: version-2.5.0-io-netty-source +title: Netty source connector +sidebar_label: Netty source connector +original_id: io-netty-source +--- + +The Netty source connector opens a port that accepts incoming data via the configured network protocol +and publish it to user-defined Pulsar topics. + +This connector can be used in a containerized (for example, k8s) deployment. Otherwise, if the connector is running in process or thread mode, the instance may be conflicting on listening to ports. + +## Configuration + +The configuration of the Netty source connector has the following properties. 
+ +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `type` |String| true |tcp | The network protocol over which data is transmitted to netty.

    Below are the available options:
  • tcp
  • http
  • udp | +| `host` | String|true | 127.0.0.1 | The host name or address on which the source instance listen. | +| `port` | int|true | 10999 | The port on which the source instance listen. | +| `numberOfThreads` |int| true |1 | The number of threads of Netty TCP server to accept incoming connections and handle the traffic of accepted connections. | + + +### Example + +Before using the Netty source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "type": "tcp", + "host": "127.0.0.1", + "port": "10911", + "numberOfThreads": "1" + } + ``` + +* YAML + + ```yaml + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + ``` + + +## Usage + +The following examples show how to use the Netty source connector with TCP and HTTP. + +### TCP + +1. Start Pulsar standalone. + + ```bash + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + configs: + type: "tcp" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + ``` + +4. Download the Netty source connector. + + ```bash + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + ``` + +5. Start the Netty source connector. + + ```bash + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-{{pulsar:version}}.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + ``` + +6. Consume data. 
+ + ```bash + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + ``` + +7. Open another terminal window to send data to the Netty source. + + ```bash + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ apt-get update + + $ apt-get -y install telnet + + $ root@1d19327b2c67:/pulsar# telnet 127.0.0.1 10999 + Trying 127.0.0.1... + Connected to 127.0.0.1. + Escape character is '^]'. + hello + world + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + ----- got message ----- + hello + + ----- got message ----- + world + ``` + +### HTTP + +1. Start Pulsar standalone. + + ```bash + $ docker pull apachepulsar/pulsar:{version} + + $ docker run -d -it -p 6650:6650 -p 8080:8080 -v $PWD/data:/pulsar/data --name pulsar-netty-standalone apachepulsar/pulsar:{version} bin/pulsar standalone + ``` + +2. Create a configuration file _netty-source-config.yaml_. + + ```yaml + configs: + type: "http" + host: "127.0.0.1" + port: 10999 + numberOfThreads: 1 + ``` + +3. Copy the configuration file _netty-source-config.yaml_ to Pulsar server. + + ```bash + $ docker cp netty-source-config.yaml pulsar-netty-standalone:/pulsar/conf/ + ``` + +4. Download the Netty source connector. + + ```bash + $ docker exec -it pulsar-netty-standalone /bin/bash + curl -O http://mirror-hk.koddos.net/apache/pulsar/pulsar-{version}/connectors/pulsar-io-netty-{version}.nar + ``` + +5. Start the Netty source connector. + + ```bash + $ ./bin/pulsar-admin sources localrun \ + --archive pulsar-io-{{pulsar:version}}.nar \ + --tenant public \ + --namespace default \ + --name netty \ + --destination-topic-name netty-topic \ + --source-config-file netty-source-config.yaml \ + --parallelism 1 + ``` + +6. Consume data. + + ```bash + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ ./bin/pulsar-client consume -t Exclusive -s netty-sub netty-topic -n 0 + ``` + +7. 
Open another terminal window to send data to the Netty source. + + ```bash + $ docker exec -it pulsar-netty-standalone /bin/bash + + $ curl -X POST --data 'hello, world!' http://127.0.0.1:10999/ + ``` + +8. The following information appears on the consumer terminal window. + + ```bash + ----- got message ----- + hello, world! + ``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-overview.md b/site2/website/versioned_docs/version-2.5.0/io-overview.md new file mode 100644 index 0000000000000..eb24b79a3f85d --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-overview.md @@ -0,0 +1,136 @@ +--- +id: version-2.5.0-io-overview +title: Pulsar connector overview +sidebar_label: Overview +original_id: io-overview +--- + +Messaging systems are most powerful when you can easily use them with external systems like databases and other messaging systems. + +**Pulsar IO connectors** enable you to easily create, deploy, and manage connectors that interact with external systems, such as [Apache Cassandra](https://cassandra.apache.org), [Aerospike](https://www.aerospike.com), and many others. + + +## Concept + +Pulsar IO connectors come in two types: **source** and **sink**. + +This diagram illustrates the relationship between source, Pulsar, and sink: + +![Pulsar IO diagram](assets/pulsar-io.png "Pulsar IO connectors (sources and sinks)") + + +### Source + +> Sources **feed data from external systems into Pulsar**. + +Common sources include other messaging systems and firehose-style data pipeline APIs. + +For the complete list of Pulsar built-in source connectors, see [source connector](io-connectors.md#source-connector). + +### Sink + +> Sinks **feed data from Pulsar into external systems**. + +Common sinks include other messaging systems and SQL and NoSQL databases. + +For the complete list of Pulsar built-in sink connectors, see [sink connector](io-connectors.md#sink-connector). 
+ +## Processing guarantee + +Processing guarantees are used to handle errors when writing messages to Pulsar topics. + +> Pulsar connectors and Functions use the **same** processing guarantees as below. + +Delivery semantic | Description +:------------------|:------- +`at-most-once` | Each message sent to a connector is to be **processed once** or **not to be processed**. +`at-least-once` | Each message sent to a connector is to be **processed once** or **more than once**. +`effectively-once` | Each message sent to a connector has **one output associated** with it. + +> Processing guarantees for connectors not just rely on Pulsar guarantee but also **relate to external systems**, that is, **the implementation of source and sink**. + +* Source: Pulsar ensures that writing messages to Pulsar topics respects to the processing guarantees. It is within Pulsar's control. + +* Sink: the processing guarantees rely on the sink implementation. If the sink implementation does not handle retries in an idempotent way, the sink does not respect to the processing guarantees. + +### Set + +When creating a connector, you can set the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +> If `--processing-guarantees` is not specified when creating a connector, the default semantic is `ATLEAST_ONCE`. + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + + + + + +```bash +$ bin/pulsar-admin sources create \ + --processing-guarantees ATMOST_ONCE \ + # Other source configs +``` + +For more information about the options of `pulsar-admin sources create`, see [here](reference-connector-admin.md#create). + + + +```bash +$ bin/pulsar-admin sinks create \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other sink configs +``` + +For more information about the options of `pulsar-admin sinks create`, see [here](reference-connector-admin.md#create-1). 
+ + + +### Update + +After creating a connector, you can update the processing guarantee with the following semantics: + +* ATLEAST_ONCE + +* ATMOST_ONCE + +* EFFECTIVELY_ONCE + +Here takes **Admin CLI** as an example. For more information about **REST API** or **JAVA Admin API**, see [here](io-use.md#create). + + + + + +```bash +$ bin/pulsar-admin sources update \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other source configs +``` + +For more information about the options of `pulsar-admin sources update`, see [here](reference-connector-admin.md#update). + + + +```bash +$ bin/pulsar-admin sinks update \ + --processing-guarantees ATMOST_ONCE \ + # Other sink configs +``` + +For more information about the options of `pulsar-admin sinks update`, see [here](reference-connector-admin.md#update-1). + + + + +## Work with connector + +You can manage Pulsar connectors (for example, create, update, start, stop, restart, reload, delete and perform other operations on connectors) via the [Connector Admin CLI](reference-connector-admin.md) with [sources](reference-connector-admin.md#sources) and [sinks](reference-connector-admin.md#sinks) subcommands. + +Connectors (sources and sinks) and Functions are components of instances, and they all run on Functions workers. When managing a source, sink or function via [Connector Admin CLI](reference-connector-admin.md) or [Functions Admin CLI](functions-cli.md), an instance is started on a worker. For more information, see [Functions worker](functions-worker.md#run-functions-worker-separately). 
+ diff --git a/site2/website/versioned_docs/version-2.5.0/io-quickstart.md b/site2/website/versioned_docs/version-2.5.0/io-quickstart.md new file mode 100644 index 0000000000000..26a9df3f6113c --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-quickstart.md @@ -0,0 +1,824 @@ +--- +id: version-2.5.0-io-quickstart +title: How to connect Pulsar to database +sidebar_label: Get started +original_id: io-quickstart +--- + +This tutorial provides a hands-on look at how you can move data out of Pulsar without writing a single line of code. + +It is helpful to review the [concepts](io-overview.md) for Pulsar I/O while running the steps in this guide to gain a deeper understanding. + +At the end of this tutorial, you are able to: + +- [Connect Pulsar to Cassandra](#connect-pulsar-to-cassandra) + +- [Connect Pulsar to MySQL](#connect-pulsar-to-mysql) + +> #### Tip +> +> * These instructions assume you are running Pulsar in [standalone mode](getting-started-standalone.md). However, all +> the commands used in this tutorial can be used in a multi-node Pulsar cluster without any changes. +> +> * All the instructions are assumed to run at the root directory of a Pulsar binary distribution. + +## Install Pulsar and built-in connector + +Before connecting Pulsar to a database, you need to install Pulsar and the desired built-in connector. + +For more information about **how to install a standalone Pulsar and built-in connectors**, see [here](standalone/#installing-pulsar). + +## Start Pulsar standalone + +1. Start Pulsar locally. + + ```bash + bin/pulsar standalone + ``` + + All the components of a Pulsar service are started in order. + + You can curl those pulsar service endpoints to make sure Pulsar service is up and running correctly. + +2. Check Pulsar binary protocol port. + + ```bash + telnet localhost 6650 + ``` + +3. Check Pulsar Function cluster. 
+ + ```bash + curl -s http://localhost:8080/admin/v2/worker/cluster + ``` + + **Example output** + ```shell + [{"workerId":"c-standalone-fw-localhost-6750","workerHostname":"localhost","port":6750}] + ``` + +4. Make sure a public tenant and a default namespace exist. + + ```bash + curl -s http://localhost:8080/admin/v2/namespaces/public + ``` + + **Example outoupt** + ```shell + ["public/default","public/functions"] + ``` + +5. All built-in connectors should be listed as available. + + ```bash + curl -s http://localhost:8080/admin/v2/functions/connectors + ``` + + **Example outoupt** + + ```json + [{"name":"aerospike","description":"Aerospike database sink","sinkClass":"org.apache.pulsar.io.aerospike.AerospikeStringSink"},{"name":"cassandra","description":"Writes data into Cassandra","sinkClass":"org.apache.pulsar.io.cassandra.CassandraStringSink"},{"name":"kafka","description":"Kafka source and sink connector","sourceClass":"org.apache.pulsar.io.kafka.KafkaStringSource","sinkClass":"org.apache.pulsar.io.kafka.KafkaBytesSink"},{"name":"kinesis","description":"Kinesis sink connector","sinkClass":"org.apache.pulsar.io.kinesis.KinesisSink"},{"name":"rabbitmq","description":"RabbitMQ source connector","sourceClass":"org.apache.pulsar.io.rabbitmq.RabbitMQSource"},{"name":"twitter","description":"Ingest data from Twitter firehose","sourceClass":"org.apache.pulsar.io.twitter.TwitterFireHose"}] + ``` + + If an error occurs when starting Pulsar service, you may see an exception at the terminal running `pulsar/standalone`, + or you can navigate to the `logs` directory under the Pulsar directory to view the logs. + +## Connect Pulsar to Cassandra + +This section demonstrates how to connector Pulsar to Cassandra. + +> #### Tip +> +> * Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). 
+> +> * The Cassandra sink connector reads messages from Pulsar topics and writes the messages into Cassandra tables. For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +### Setup a Cassandra cluster + +This example uses `cassandra` Docker image to start a single-node Cassandra cluster in Docker. + +1. Start a Cassandra cluster. + + ```bash + docker run -d --rm --name=cassandra -p 9042:9042 cassandra + ``` + + > **Note** + > + > Before moving to the next steps, make sure the Cassandra cluster is running. + +2. Make sure the Docker process is running. + + ```bash + docker ps + ``` + +3. Check the Cassandra logs to make sure the Cassandra process is running as expected. + + ```bash + docker logs cassandra + ``` + +4. Check the status of the Cassandra cluster. + + ```bash + docker exec cassandra nodetool status + ``` + + **Example output** + + ``` + Datacenter: datacenter1 + ======================= + Status=Up/Down + |/ State=Normal/Leaving/Joining/Moving + -- Address Load Tokens Owns (effective) Host ID Rack + UN 172.17.0.2 103.67 KiB 256 100.0% af0e4b2f-84e0-4f0b-bb14-bd5f9070ff26 rack1 + ``` + +5. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + $ docker exec -ti cassandra cqlsh localhost + Connected to Test Cluster at localhost:9042. + [cqlsh 5.0.1 | Cassandra 3.11.2 | CQL spec 3.4.4 | Native protocol v4] + Use HELP for help. + cqlsh> + ``` + +6. Create a keyspace `pulsar_test_keyspace`. + + ```bash + cqlsh> CREATE KEYSPACE pulsar_test_keyspace WITH replication = {'class':'SimpleStrategy', 'replication_factor':1}; + ``` + +7. Create a table `pulsar_test_table`. + + ```bash + cqlsh> USE pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> CREATE TABLE pulsar_test_table (key text PRIMARY KEY, col text); + ``` + +### Configure a Cassandra sink + +Now that we have a Cassandra cluster running locally. + +In this section, you need to configure a Cassandra sink connector. 
+ +To run a Cassandra sink connector, you need to prepare a configuration file including the information that Pulsar connector runtime needs to know. + +For example, how Pulsar connector can find the Cassandra cluster, what is the keyspace and the table that Pulsar connector uses for writing Pulsar messages to, and so on. + +You can create a configuration file through one of the following methods. + +* JSON + + ```json + { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + } + ``` + +* YAML + + ``` + configs: + roots: "localhost:9042" + keyspace: "pulsar_test_keyspace" + columnFamily: "pulsar_test_table" + keyname: "key" + columnName: "col" + ``` + +For more information, see [Cassandra sink connector](io-cassandra-sink.md). + +### Create a Cassandra sink + +You can use the [Connector Admin CLI](io-cli.md) +to create a sink connector and perform other operations on them. + +Run the following command to create a Cassandra sink connector with sink type _cassandra_ and the config file _examples/cassandra-sink.yml_ created previously. + +```bash +bin/pulsar-admin sinks create \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink \ + --sink-type cassandra \ + --sink-config-file examples/cassandra-sink.yml \ + --inputs test_cassandra +``` + +Once the command is executed, Pulsar creates the sink connector _cassandra-test-sink_. + +This sink connector runs +as a Pulsar Function and writes the messages produced in the topic _test_cassandra_ to the Cassandra table _pulsar_test_table_. + +### Inspect a Cassandra sink + +You can use the [Connector Admin CLI](io-cli.md) +to monitor a connector and perform other operations on it. + +* Get the information of a Cassandra sink. 
+ + ```bash + bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + ``` + + **Example output** + + ```bash + { + "tenant": "public", + "namespace": "default", + "name": "cassandra-test-sink", + "className": "org.apache.pulsar.io.cassandra.CassandraStringSink", + "inputSpecs": { + "test_cassandra": { + "isRegexPattern": false + } + }, + "configs": { + "roots": "localhost:9042", + "keyspace": "pulsar_test_keyspace", + "columnFamily": "pulsar_test_table", + "keyname": "key", + "columnName": "col" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true, + "archive": "builtin://cassandra" + } + ``` + +* Check the status of a Cassandra sink. + + ```bash + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + ``` + + **Example output** + + ```shell + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + ``` + +### Verify a Cassandra sink + +1. Produce some messages to the input topic of the Cassandra sink _test_cassandra_. + + ```bash + for i in {0..9}; do bin/pulsar-client produce -m "key-$i" -n 1 test_cassandra; done + ``` + +2. Inspect the status of the Cassandra sink _test_cassandra_. + + ```bash + bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink + ``` + + You can see 10 messages are processed by the Cassandra sink _test_cassandra_. 
+ + **Example output** + + ```shell + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 10, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 10, + "lastReceivedTime" : 1551685489136, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + ``` + +3. Use `cqlsh` to connect to the Cassandra cluster. + + ```bash + docker exec -ti cassandra cqlsh localhost + ``` + +4. Check the data of the Cassandra table _pulsar_test_table_. + + ```bash + cqlsh> use pulsar_test_keyspace; + cqlsh:pulsar_test_keyspace> select * from pulsar_test_table; + + key | col + --------+-------- + key-5 | key-5 + key-0 | key-0 + key-9 | key-9 + key-2 | key-2 + key-1 | key-1 + key-3 | key-3 + key-6 | key-6 + key-7 | key-7 + key-4 | key-4 + key-8 | key-8 + ``` + +### Delete a Cassandra Sink + +You can use the [Connector Admin CLI](io-cli.md) +to delete a connector and perform other operations on it. + +```bash +bin/pulsar-admin sinks delete \ + --tenant public \ + --namespace default \ + --name cassandra-test-sink +``` + +## Connect Pulsar to MySQL + +This section demonstrates how to connector Pulsar to MySQL. + +> #### Tip +> +> * Make sure you have Docker installed. If you do not have one, see [install Docker](https://docs.docker.com/docker-for-mac/install/). +> +> * The JDBC sink connector pulls messages from Pulsar topics +and persists the messages to MySQL or SQlite. For more information, see [JDBC sink connector](io-jdbc-sink.md). + + +### Setup a MySQL cluster + +This example uses the MySQL 5.7 docker image to start a single-node MySQL cluster in Docker. + +1. Pull the MySQL 5.7 image from Docker. + + ```bash + $ docker pull mysql:5.7 + ``` + +2. Start MySQL. 
+ + ```bash + $ docker run -d -it --rm \ + --name pulsar-mysql \ + -p 3306:3306 \ + -e MYSQL_ROOT_PASSWORD=jdbc \ + -e MYSQL_USER=mysqluser \ + -e MYSQL_PASSWORD=mysqlpw \ + mysql:5.7 + ``` + + #### Tip + + Flag | Description | This example + ---|---|---| + `-d` | To start a container in detached mode. | / + `-it` | Keep STDIN open even if not attached and allocate a terminal. | / + `--rm` | Remove the container automatically when it exits. | / + `--name` | Assign a name to the container. | This example specifies _pulsar-mysql_ for the container. + `-p` | Publish the port of the container to the host. | This example publishes the port _3306_ of the container to the host. + `-e` | Set environment variables. | This example sets the following variables:
    - The password for the root user is _jdbc_.
    - The name for the normal user is _mysqluser_.
    - The password for the normal user is _mysqlpw_. + + > #### Tip + > + > For more information about Docker commands, see [Docker CLI](https://docs.docker.com/engine/reference/commandline/run/). + +3. Check if MySQL has been started successfully. + + ```bash + $ docker logs -f pulsar-mysql + ``` + + MySQL has been started successfully if the following message appears. + + ```text + 2019-05-11T10:40:58.709964Z 0 [Note] Found ca.pem, server-cert.pem and server-key.pem in data directory. Trying to enable SSL support using them. + 2019-05-11T10:40:58.710155Z 0 [Warning] CA certificate ca.pem is self signed. + 2019-05-11T10:40:58.711921Z 0 [Note] Server hostname (bind-address): '*'; port: 3306 + 2019-05-11T10:40:58.711985Z 0 [Note] IPv6 is available. + 2019-05-11T10:40:58.712695Z 0 [Note] - '::' resolves to '::'; + 2019-05-11T10:40:58.712742Z 0 [Note] Server socket created on IP: '::'. + 2019-05-11T10:40:58.714334Z 0 [Warning] Insecure configuration for --pid-file: Location '/var/run/mysqld' in the path is accessible to all OS users. Consider choosing a different directory. + 2019-05-11T10:40:58.723802Z 0 [Note] Event Scheduler: Loaded 0 events + 2019-05-11T10:40:58.724200Z 0 [Note] mysqld: ready for connections. + Version: '5.7.26' socket: '/var/run/mysqld/mysqld.sock' port: 3306 MySQL Community Server (GPL) + ``` + +4. Access to MySQL. + + ```bash + $ docker exec -it pulsar-mysql /bin/bash + mysql -h localhost -uroot -pjdbc + ``` + +5. Create a MySQL table _pulsar_mysql_jdbc_sink_. + + ```bash + $ create database pulsar_mysql_jdbc_sink; + + $ use pulsar_mysql_jdbc_sink; + + $ create table if not exists pulsar_mysql_jdbc_sink + ( + id INT AUTO_INCREMENT, + name VARCHAR(255) NOT NULL, + primary key (id) + ) + engine=innodb; + ``` + +### Configure a JDBC sink + +Now that we have a MySQL running locally. + +In this section, you need to configure a JDBC sink connector. + +1. Add a configuration file. 
+ + To run a JDBC sink connector, you need to prepare a YAML configuration file including the information that Pulsar connector runtime needs to know. + + For example, how Pulsar connector can find the MySQL cluster, what is the JDBC URL and the table that Pulsar connector uses for writing messages to. + + Create a _pulsar-mysql-jdbc-sink.yaml_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. + + ```bash + configs: + userName: "root" + password: "jdbc" + jdbcUrl: "jdbc:mysql://127.0.0.1:3306/pulsar_mysql_jdbc_sink" + tableName: "pulsar_mysql_jdbc_sink" + ``` + +2. Create a schema. + + Create a _avro-schema_ file, copy the following contents to this file, and place the file in the `pulsar/connectors` folder. + + ```bash + { + "type": "AVRO", + "schema": "{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}", + "properties": {} + } + ``` + + > #### Tip + > + > For more information about AVRO, see [Apache Avro](https://avro.apache.org/docs/1.9.1/). + + +3. Upload a schema to a topic. + + This example uploads the _avro-schema_ schema to the _pulsar-mysql-jdbc-sink-topic_ topic. + + ```bash + $ bin/pulsar-admin schemas upload pulsar-mysql-jdbc-sink-topic -f ./connectors/avro-schema + ``` + +4. Check if the schema has been uploaded successfully. + + ```bash + $ bin/pulsar-admin schemas get pulsar-mysql-jdbc-sink-topic + ``` + + The schema has been uploaded successfully if the following message appears. + + ```bash + {"name":"pulsar-mysql-jdbc-sink-topic","schema":"{\"type\":\"record\",\"name\":\"Test\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"int\"]},{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}","type":"AVRO","properties":{}} + ``` + +### Create a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to create a sink connector and perform other operations on it. 
+ +This example creates a sink connector and specifies the desired information. + +```bash +$ bin/pulsar-admin sinks create \ +--archive ./connectors/pulsar-io-jdbc-{{pulsar:version}}.nar \ +--inputs pulsar-mysql-jdbc-sink-topic \ +--name pulsar-mysql-jdbc-sink \ +--sink-config-file ./connectors/pulsar-mysql-jdbc-sink.yaml \ +--parallelism 1 +``` + +Once the command is executed, Pulsar creates a sink connector _pulsar-mysql-jdbc-sink_. + +This sink connector runs as a Pulsar Function and writes the messages produced in the topic _pulsar-mysql-jdbc-sink-topic_ to the MySQL table _pulsar_mysql_jdbc_sink_. + + #### Tip + + Flag | Description | This example + ---|---|---| + `--archive` | The path to the archive file for the sink. | _pulsar-io-jdbc-{{pulsar:version}}.nar_ | + `--inputs` | The input topic(s) of the sink.

    Multiple topics can be specified as a comma-separated list.|| + `--name` | The name of the sink. | _pulsar-mysql-jdbc-sink_ | + `--sink-config-file` | The path to a YAML config file specifying the configuration of the sink. | _pulsar-mysql-jdbc-sink.yaml_ | + `--parallelism` | The parallelism factor of the sink.

    For example, the number of sink instances to run. | _1_ | + + > #### Tip + > + > For more information about `pulsar-admin sinks create options`, see [here](io-cli.md#sinks). + +The sink has been created successfully if the following message appears. + +```bash +"Created successfully" +``` + +### Inspect a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to monitor a connector and perform other operations on it. + +* List all running JDBC sink(s). + + ```bash + $ bin/pulsar-admin sinks list \ + --tenant public \ + --namespace default + ``` + + > #### Tip + > + > For more information about `pulsar-admin sinks list options`, see [here](io-cli.md/#list-1). + + The result shows that only the _mysql-jdbc-sink_ sink is running. + + ```bash + [ + "pulsar-mysql-jdbc-sink" + ] + ``` + +* Get the information of a JDBC sink. + + ```bash + $ bin/pulsar-admin sinks get \ + --tenant public \ + --namespace default \ + --name pulsar-mysql-jdbc-sink + ``` + + > #### Tip + > + > For more information about `pulsar-admin sinks get options`, see [here](io-cli.md/#get-1). + + The result shows the information of the sink connector, including tenant, namespace, topic and so on. 
+ + ```bash + { + "tenant": "public", + "namespace": "default", + "name": "pulsar-mysql-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.JdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-mysql-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "jdbc", + "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/pulsar_mysql_jdbc_sink", + "userName": "root", + "tableName": "pulsar_mysql_jdbc_sink" + }, + "parallelism": 1, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true + } + ``` + +* Get the status of a JDBC sink + + ```bash + $ bin/pulsar-admin sinks status \ + --tenant public \ + --namespace default \ + --name pulsar-mysql-jdbc-sink + ``` + + > #### Tip + > + > For more information about `pulsar-admin sinks status options`, see [here](io-cli.md/#status-1). + + The result shows the current status of sink connector, including the number of instance, running status, worker ID and so on. + + ```bash + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReadFromPulsar" : 0, + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "numSinkExceptions" : 0, + "latestSinkExceptions" : [ ], + "numWrittenToSink" : 0, + "lastReceivedTime" : 0, + "workerId" : "c-standalone-fw-192.168.2.52-8080" + } + } ] + } + ``` + +### Stop a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to stop a connector and perform other operations on it. + +```bash +$ bin/pulsar-admin sinks stop \ +--tenant public \ +--namespace default \ +--name pulsar-mysql-jdbc-sink \ +--instance-id 0 +``` + +> #### Tip +> +> For more information about `pulsar-admin sinks stop options`, see [here](io-cli.md/#stop-1). + +The sink instance has been stopped successfully if the following message disappears. 
+ +```bash +"Stopped successfully" +``` + +### Restart a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to restart a connector and perform other operations on it. + +```bash +$ bin/pulsar-admin sinks restart \ +--tenant public \ +--namespace default \ +--name pulsar-mysql-jdbc-sink \ +--instance-id 0 +``` + +> #### Tip +> +> For more information about `pulsar-admin sinks restart options`, see [here](io-cli.md/#restart-1). + +The sink instance has been started successfully if the following message appears. + +```bash +"Started successfully" +``` + +> #### Tip +> +> * Optionally, you can run a standalone sink connector using `pulsar-admin sinks localrun options`. +> +> Note that `pulsar-admin sinks localrun options` **runs a sink connector locally**, while `pulsar-admin sinks start options` **starts a sink connector in a cluster**. +> +> * For more information about `pulsar-admin sinks localrun options`, see [here](io-cli.md#localrun-1). + +### Update a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to update a connector and perform other operations on it. + +This example updates the parallelism of the _pulsar-mysql-jdbc-sink_ sink connector to 2. + +```bash +$ bin/pulsar-admin sinks update \ +--name pulsar-mysql-jdbc-sink \ +--parallelism 2 +``` + +> #### Tip +> +> For more information about `pulsar-admin sinks update options`, see [here](io-cli.md/#update-1). + +The sink connector has been updated successfully if the following message appears. + +```bash +"Updated successfully" +``` + +This example double-checks the information. + +```bash +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-mysql-jdbc-sink +``` + +The result shows that the parallelism is 2.
+ +```text +{ + "tenant": "public", + "namespace": "default", + "name": "pulsar-mysql-jdbc-sink", + "className": "org.apache.pulsar.io.jdbc.JdbcAutoSchemaSink", + "inputSpecs": { + "pulsar-mysql-jdbc-sink-topic": { + "isRegexPattern": false + } + }, + "configs": { + "password": "jdbc", + "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/pulsar_mysql_jdbc_sink", + "userName": "root", + "tableName": "pulsar_mysql_jdbc_sink" + }, + "parallelism": 2, + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "autoAck": true +} +``` + +### Delete a JDBC sink + +You can use the [Connector Admin CLI](io-cli.md) +to delete a connector and perform other operations on it. + +This example deletes the _pulsar-mysql-jdbc-sink_ sink connector. + +```text +$ bin/pulsar-admin sinks delete \ +--tenant public \ +--namespace default \ +--name pulsar-mysql-jdbc-sink +``` + +> #### Tip +> +> For more information about `pulsar-admin sinks delete options`, see [here](io-cli.md/#delete-1). + +The sink connector has been deleted successfully if the following message appears. + +```text +"Deleted successfully" +``` + +This example double-checks the status of the sink connector. + +```text +$ bin/pulsar-admin sinks get \ +--tenant public \ +--namespace default \ +--name pulsar-mysql-jdbc-sink +``` + +The results shows that the sink connector does not exist. 
+ +```text +HTTP 404 Not Found + +Reason: Sink pulsar-mysql-jdbc-sink doesn't exist +``` diff --git a/site2/website/versioned_docs/version-2.5.0/io-rabbitmq-sink.md b/site2/website/versioned_docs/version-2.5.0/io-rabbitmq-sink.md new file mode 100644 index 0000000000000..a8cae79f38a06 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-rabbitmq-sink.md @@ -0,0 +1,81 @@ +--- +id: version-2.5.0-io-rabbitmq-sink +title: RabbitMQ sink connector +sidebar_label: RabbitMQ sink connector +original_id: io-rabbitmq-sink +--- + +The RabbitMQ sink connector pulls messages from Pulsar topics +and persist the messages to RabbitMQ queues. + + +## Configuration + +The configuration of the RabbitMQ sink connector has the following properties. + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name. | +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `exchangeName` | String|true | " " (empty string) | The exchange to publish messages. | +| `routingKey` |String|true | " " (empty string) |The routing key used to publish messages. | + + +### Example + +Before using the RabbitMQ sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "exchangeName": "test-exchange", + "routingKey": "test-key" + } + ``` + +* YAML + + ```yaml + configs: + host: "localhost" + port: 5672 + virtualHost: "/" + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + exchangeName: "test-exchange" + routingKey: "test-key" + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-rabbitmq-source.md b/site2/website/versioned_docs/version-2.5.0/io-rabbitmq-source.md new file mode 100644 index 0000000000000..923bb6865c0d2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-rabbitmq-source.md @@ -0,0 +1,78 @@ +--- +id: version-2.5.0-io-rabbitmq-source +title: RabbitMQ source connector +sidebar_label: RabbitMQ source connector +original_id: io-rabbitmq-source +--- + +The RabbitMQ source connector receives messages from RabbitMQ clusters +and writes messages to Pulsar topics. + +## Configuration + +The configuration of the RabbitMQ source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `connectionName` |String| true | " " (empty string) | The connection name.
| +| `host` | String| true | " " (empty string) | The RabbitMQ host. | +| `port` | int |true | 5672 | The RabbitMQ port. | +| `virtualHost` |String|true | / | The virtual host used to connect to RabbitMQ. | +| `username` | String|false | guest | The username used to authenticate to RabbitMQ. | +| `password` | String|false | guest | The password used to authenticate to RabbitMQ. | +| `queueName` | String|true | " " (empty string) | The RabbitMQ queue name that messages should be read from or written to. | +| `requestedChannelMax` | int|false | 0 | The initially requested maximum channel number.

    0 means unlimited. | +| `requestedFrameMax` | int|false |0 | The initially requested maximum frame size in octets.

    0 means unlimited. | +| `connectionTimeout` | int|false | 60000 | The timeout of TCP connection establishment in milliseconds.

    0 means infinite. | +| `handshakeTimeout` | int|false | 10000 | The timeout of AMQP0-9-1 protocol handshake in milliseconds. | +| `requestedHeartbeat` | int|false | 60 | The requested heartbeat timeout in seconds. | +| `prefetchCount` | int|false | 0 | The maximum number of messages that the server delivers.

    0 means unlimited. | +| `prefetchGlobal` | boolean|false | false |Whether the setting should be applied to the entire channel rather than each consumer. | + +### Example + +Before using the RabbitMQ source connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "host": "localhost", + "port": "5672", + "virtualHost": "/", + "username": "guest", + "password": "guest", + "queueName": "test-queue", + "connectionName": "test-connection", + "requestedChannelMax": "0", + "requestedFrameMax": "0", + "connectionTimeout": "60000", + "handshakeTimeout": "10000", + "requestedHeartbeat": "60", + "prefetchCount": "0", + "prefetchGlobal": "false" + } + ``` + +* YAML + + ```yaml + configs: + host: "localhost" + port: 5672 + virtualHost: "/", + username: "guest" + password: "guest" + queueName: "test-queue" + connectionName: "test-connection" + requestedChannelMax: 0 + requestedFrameMax: 0 + connectionTimeout: 60000 + handshakeTimeout: 10000 + requestedHeartbeat: 60 + prefetchCount: 0 + prefetchGlobal: "false" + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-redis-sink.md b/site2/website/versioned_docs/version-2.5.0/io-redis-sink.md new file mode 100644 index 0000000000000..c27a28128d9e6 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-redis-sink.md @@ -0,0 +1,70 @@ +--- +id: version-2.5.0-io-redis-sink +title: Redis sink connector +sidebar_label: Redis sink connector +original_id: io-redis-sink +--- + +The Redis sink connector pulls messages from Pulsar topics +and persists the messages to a Redis database. + + + +## Configuration + +The configuration of the Redis sink connector has the following properties. + + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `redisHosts` |String|true|" " (empty string) | A comma-separated list of Redis hosts to connect to. 
| +| `redisPassword` |String|false|" " (empty string) | The password used to connect to Redis. | +| `redisDatabase` | int|true|0 | The Redis database to connect to. | +| `clientMode` |String| false|Standalone | The client mode when interacting with Redis cluster.

    Below are the available options:
  • Standalone
  • Cluster | +| `autoReconnect` | boolean|false|true | Whether the Redis client automatically reconnect or not. | +| `requestQueue` | int|false|2147483647 | The maximum number of queued requests to Redis. | +| `tcpNoDelay` |boolean| false| false | Whether to enable TCP with no delay or not. | +| `keepAlive` | boolean|false | false |Whether to enable a keepalive to Redis or not. | +| `connectTimeout` |long| false|10000 | The time to wait before timing out when connecting in milliseconds. | +| `operationTimeout` | long|false|10000 | The time before an operation is marked as timed out in milliseconds . | +| `batchTimeMs` | int|false|1000 | The Redis operation time in milliseconds. | +| `batchSize` | int|false|200 | The batch size of writing to Redis database. | + + +### Example + +Before using the Redis sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "redisHosts": "localhost:6379", + "redisPassword": "fake@123", + "redisDatabase": "1", + "clientMode": "Standalone", + "operationTimeout": "2000", + "batchSize": "100", + "batchTimeMs": "1000", + "connectTimeout": "3000" + } + ``` + +* YAML + + ```yaml + { + redisHosts: "localhost:6379" + redisPassword: "fake@123" + redisDatabase: 1 + clientMode: "Standalone" + operationTimeout: 2000 + batchSize: 100 + batchTimeMs: 1000 + connectTimeout: 3000 + } + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-solr-sink.md b/site2/website/versioned_docs/version-2.5.0/io-solr-sink.md new file mode 100644 index 0000000000000..1a14137075c21 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-solr-sink.md @@ -0,0 +1,61 @@ +--- +id: version-2.5.0-io-solr-sink +title: Solr sink connector +sidebar_label: Solr sink connector +original_id: io-solr-sink +--- + +The Solr sink connector pulls messages from Pulsar topics +and persists the messages to Solr collections. 
+ + + +## Configuration + +The configuration of the Solr sink connector has the following properties. + + + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `solrUrl` | String|true|" " (empty string) |
  • Comma-separated zookeeper hosts with chroot used in the SolrCloud mode.
    **Example**
    `localhost:2181,localhost:2182/chroot`

  • URL to connect to Solr used in standalone mode.
    **Example**
    `localhost:8983/solr` | +| `solrMode` | String|true|SolrCloud| The client mode when interacting with the Solr cluster.

    Below are the available options:
  • Standalone
  • SolrCloud| +| `solrCollection` |String|true| " " (empty string) | Solr collection name to which records need to be written. | +| `solrCommitWithinMs` |int| false|10 | The time in milliseconds within which Solr commits updates.| +| `username` |String|false| " " (empty string) | The username for basic authentication.

    **Note: `username` is case-sensitive.** | +| `password` | String|false| " " (empty string) | The password for basic authentication.

    **Note: `password` is case-sensitive.** | + + + +### Example + +Before using the Solr sink connector, you need to create a configuration file through one of the following methods. + +* JSON + + ```json + { + "solrUrl": "localhost:2181,localhost:2182/chroot", + "solrMode": "SolrCloud", + "solrCollection": "techproducts", + "solrCommitWithinMs": 100, + "username": "fakeuser", + "password": "fake@123" + } + ``` + +* YAML + + ```yaml + { + solrUrl: "localhost:2181,localhost:2182/chroot" + solrMode: "SolrCloud" + solrCollection: "techproducts" + solrCommitWithinMs: 100 + username: "fakeuser" + password: "fake@123" + } + ``` + diff --git a/site2/website/versioned_docs/version-2.5.0/io-twitter-source.md b/site2/website/versioned_docs/version-2.5.0/io-twitter-source.md new file mode 100644 index 0000000000000..b91a0c4d6ad0c --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-twitter-source.md @@ -0,0 +1,28 @@ +--- +id: version-2.5.0-io-twitter-source +title: Twitter Firehose source connector +sidebar_label: Twitter Firehose source connector +original_id: io-twitter-source +--- + +The Twitter Firehose source connector receives tweets from Twitter Firehose and +writes the tweets to Pulsar topics. + +## Configuration + +The configuration of the Twitter Firehose source connector has the following properties. + +### Property + +| Name | Type|Required | Default | Description +|------|----------|----------|---------|-------------| +| `consumerKey` | String|true | " " (empty string) | The twitter OAuth consumer key.

    For more information, see [Access tokens](https://developer.twitter.com/en/docs/basics/authentication/guides/access-tokens). | +| `consumerSecret` | String |true | " " (empty string) | The twitter OAuth consumer secret. | +| `token` | String|true | " " (empty string) | The twitter OAuth token. | +| `tokenSecret` | String|true | " " (empty string) | The twitter OAuth secret. | +| `guestimateTweetTime`|Boolean|false|false|Most firehose events have null createdAt time.

    If `guestimateTweetTime` is set to true, the connector estimates the createdTime of each firehose event to be the current time. | +| `clientName` | String |false | openconnector-twitter-source| The twitter firehose client name. | +| `clientHosts` |String| false | Constants.STREAM_HOST | The twitter firehose hosts to which client connects. | +| `clientBufferSize` | int|false | 50000 | The buffer size for buffering tweets fetched from twitter firehose. | + +> For more information about OAuth credentials, see [Twitter developers portal](https://developer.twitter.com/en.html). diff --git a/site2/website/versioned_docs/version-2.5.0/io-twitter.md b/site2/website/versioned_docs/version-2.5.0/io-twitter.md new file mode 100644 index 0000000000000..125797388d8d2 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/io-twitter.md @@ -0,0 +1,7 @@ +--- +id: version-2.5.0-io-twitter +title: Twitter Firehose Connector +sidebar_label: Twitter Firehose Connector +original_id: io-twitter +--- + diff --git a/site2/website/versioned_docs/version-2.5.0/reference-cli-tools.md b/site2/website/versioned_docs/version-2.5.0/reference-cli-tools.md new file mode 100644 index 0000000000000..bd5762a480875 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/reference-cli-tools.md @@ -0,0 +1,731 @@ +--- +id: version-2.5.0-reference-cli-tools +title: Pulsar command-line tools +sidebar_label: Pulsar CLI tools +original_id: reference-cli-tools +--- + +Pulsar offers several command-line tools that you can use for managing Pulsar installations, performance testing, using command-line producers and consumers, and more. + +All Pulsar command-line tools can be run from the `bin` directory of your [installed Pulsar package](getting-started-standalone.md). 
The following tools are currently documented: + +* [`pulsar`](#pulsar) +* [`pulsar-client`](#pulsar-client) +* [`pulsar-daemon`](#pulsar-daemon) +* [`pulsar-perf`](#pulsar-perf) +* [`bookkeeper`](#bookkeeper) +* [`broker-tool`](#broker-tool) + +> ### Getting help +> You can get help for any CLI tool, command, or subcommand using the `--help` flag, or `-h` for short. Here's an example: +> ```shell +> $ bin/pulsar broker --help +> ``` + +## `pulsar` + +The pulsar tool is used to start Pulsar components, such as bookies and ZooKeeper, in the foreground. + +These processes can also be started in the background, using nohup, using the pulsar-daemon tool, which has the same command interface as pulsar. + +Usage: +```bash +$ pulsar command +``` +Commands: +* `bookie` +* `broker` +* `compact-topic` +* `discovery` +* `configuration-store` +* `initialize-cluster-metadata` +* `proxy` +* `standalone` +* `websocket` +* `zookeeper` +* `zookeeper-shell` + +Example: +```bash +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker +``` + +The table below lists the environment variables that you can use to configure the `pulsar` tool. 
+ +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|`conf/log4j2.yaml`| +|`PULSAR_BROKER_CONF`|Configuration file for broker|`conf/broker.conf`| +|`PULSAR_BOOKKEEPER_CONF`|Configuration file for bookie|`conf/bookkeeper.conf`| +|`PULSAR_ZK_CONF`|Configuration file for zookeeper|`conf/zookeeper.conf`| +|`PULSAR_CONFIGURATION_STORE_CONF`|Configuration file for the configuration store|`conf/global_zookeeper.conf`| +|`PULSAR_DISCOVERY_CONF`|Configuration file for discovery service|`conf/discovery.conf`| +|`PULSAR_WEBSOCKET_CONF`|Configuration file for websocket proxy|`conf/websocket.conf`| +|`PULSAR_STANDALONE_CONF`|Configuration file for standalone|`conf/standalone.conf`| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the jvm|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| +|`PULSAR_PID_DIR`|Folder where the pulsar server PID file should be stored|| +|`PULSAR_STOP_TIMEOUT`|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + + +### `bookie` + +Starts up a bookie server + +Usage: +```bash +$ pulsar bookie options +``` + +Options + +|Option|Description|Default| +|---|---|---| +|`-readOnly`|Force start a read-only bookie server|false| +|`-withAutoRecovery`|Start auto-recover service bookie server|false| + + +Example +```bash +$ PULSAR_BOOKKEEPER_CONF=/path/to/bookkeeper.conf pulsar bookie \ + -readOnly \ + -withAutoRecovery +``` + +### `broker` + +Starts up a Pulsar broker + +Usage +```bash +$ pulsar broker options +``` + +Options +|Option|Description|Default| +|---|---|---| +|`-bc` , `--bookie-conf`|Configuration file for BookKeeper|| +|`-rb` , `--run-bookie`|Run a BookKeeper bookie on the same host as the Pulsar broker|false| +|`-ra` , `--run-bookie-autorecovery`|Run a BookKeeper autorecovery daemon on the same host as the Pulsar broker|false| + +Example +```bash +$ PULSAR_BROKER_CONF=/path/to/broker.conf pulsar broker +``` + +### 
`compact-topic` + +Run compaction against a Pulsar topic (in a new process) + +Usage +```bash +$ pulsar compact-topic options +``` +Options +|Flag|Description|Default| +|---|---|---| +|`-t` , `--topic`|The Pulsar topic that you would like to compact|| + +Example +```bash +$ pulsar compact-topic --topic topic-to-compact +``` + +### `discovery` + +Run a discovery server + +Usage +```bash +$ pulsar discovery +``` + +Example +```bash +$ PULSAR_DISCOVERY_CONF=/path/to/discovery.conf pulsar discovery +``` + +### `configuration-store` + +Starts up the Pulsar configuration store + +Usage +```bash +$ pulsar configuration-store +``` + +Example +```bash +$ PULSAR_CONFIGURATION_STORE_CONF=/path/to/configuration_store.conf pulsar configuration-store +``` + +### `initialize-cluster-metadata` + +One-time cluster metadata initialization + +Usage +```bash +$ pulsar initialize-cluster-metadata options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-ub` , `--broker-service-url`|The broker service URL for the new cluster|| +|`-tb` , `--broker-service-url-tls`|The broker service URL for the new cluster with TLS encryption|| +|`-c` , `--cluster`|Cluster name|| +|`--configuration-store`|The configuration store quorum connection string|| +|`-uw` , `--web-service-url`|The web service URL for the new cluster|| +|`-tw` , `--web-service-url-tls`|The web service URL for the new cluster with TLS encryption|| +|`-zk` , `--zookeeper`|The local ZooKeeper quorum connection string|| + + +### `proxy` + +Manages the Pulsar proxy + +Usage +```bash +$ pulsar proxy options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--configuration-store`|Configuration store connection string|| +|`-zk` , `--zookeeper-servers`|Local ZooKeeper connection string|| + +Example +```bash +$ PULSAR_PROXY_CONF=/path/to/proxy.conf pulsar proxy \ + --zookeeper-servers zk-0,zk-1,zk2 \ + --configuration-store zk-0,zk-1,zk-2 +``` + +### `standalone` + +Run a broker service with local bookies and local 
ZooKeeper + +Usage +```bash +$ pulsar standalone options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-a` , `--advertised-address`|The standalone broker advertised address|| +|`--bookkeeper-dir`|Local bookies’ base data directory|data/standalone/bookeeper| +|`--bookkeeper-port`|Local bookies’ base port|3181| +|`--no-broker`|Only start ZooKeeper and BookKeeper services, not the broker|false| +|`--num-bookies`|The number of local bookies|1| +|`--only-broker`|Only start the Pulsar broker service (not ZooKeeper or BookKeeper)|| +|`--wipe-data`|Clean up previous ZooKeeper/BookKeeper data|| +|`--zookeeper-dir`|Local ZooKeeper’s data directory|data/standalone/zookeeper| +|`--zookeeper-port` |Local ZooKeeper’s port|2181| + +Example +```bash +$ PULSAR_STANDALONE_CONF=/path/to/standalone.conf pulsar standalone +``` + +### `websocket` + +Usage +```bash +$ pulsar websocket +``` + +Example +```bash +$ PULSAR_WEBSOCKET_CONF=/path/to/websocket.conf pulsar websocket +``` + +### `zookeeper` + +Starts up a ZooKeeper cluster + +Usage +```bash +$ pulsar zookeeper +``` + +Example +```bash +$ PULSAR_ZK_CONF=/path/to/zookeeper.conf pulsar zookeeper +``` + + +### `zookeeper-shell` + +Connects to a running ZooKeeper cluster using the ZooKeeper shell + +Usage +```bash +$ pulsar zookeeper-shell options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration file for ZooKeeper|| + + + +## `pulsar-client` + +The pulsar-client tool + +Usage +```bash +$ pulsar-client command +``` + +Commands +* `produce` +* `consume` + + +Options +|Flag|Description|Default| +|---|---|---| +|`--auth-params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{\"key1\":\"val1\",\"key2\":\"val2\"}"|{"saslJaasClientSectionName":"PulsarClient", "serverType":"broker"}| +|`--auth-plugin`|Authentication plugin class 
name|org.apache.pulsar.client.impl.auth.AuthenticationSasl| +|`--url`|Broker URL to which to connect|pulsar://localhost:6650/
    ws://localhost:8080 | + + +### `produce` +Send a message or messages to a specific broker and topic + +Usage +```bash +$ pulsar-client produce topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-f`, `--files`|Comma-separated file paths to send; either -m or -f must be specified|[]| +|`-m`, `--messages`|Comma-separated string of messages to send; either -m or -f must be specified|[]| +|`-n`, `--num-produce`|The number of times to send the message(s); the count of messages/files * num-produce should be below 1000|1| +|`-r`, `--rate`|Rate (in messages per second) at which to produce; a value 0 means to produce messages as fast as possible|0.0| + + +### `consume` +Consume messages from a specific broker and topic + +Usage +```bash +$ pulsar-client consume topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--hex`|Display binary messages in hexadecimal format.|false| +|`-n`, `--num-messages`|Number of messages to consume, 0 means to consume forever.|1| +|`-r`, `--rate`|Rate (in messages per second) at which to consume; a value 0 means to consume messages as fast as possible|0.0| +|`-s`, `--subscription-name`|Subscription name|| +|`-t`, `--subscription-type`|The type of the subscription. Possible values: Exclusive, Shared, Failover, Key_Shared.|Exclusive| + + + +## `pulsar-daemon` +A wrapper around the pulsar tool that’s used to start and stop processes, such as ZooKeeper, bookies, and Pulsar brokers, in the background using nohup. + +pulsar-daemon has a similar interface to the pulsar command but adds start and stop commands for various services. For a listing of those services, run pulsar-daemon to see the help output or see the documentation for the pulsar command. + +Usage +```bash +$ pulsar-daemon command +``` + +Commands +* `start` +* `stop` + + +### `start` +Start a service in the background using nohup. 
+ +Usage +```bash +$ pulsar-daemon start service +``` + +### `stop` +Stop a service that’s already been started using start. + +Usage +```bash +$ pulsar-daemon stop service options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|-force|Stop the service forcefully if not stopped by normal shutdown.|false| + + + +## `pulsar-perf` +A tool for performance testing a Pulsar broker. + +Usage +```bash +$ pulsar-perf command +``` + +Commands +* `consume` +* `produce` +* `read` +* `websocket-producer` +* `managed-ledger` +* `monitor-brokers` +* `simulation-client` +* `simulation-controller` +* `help` + +Environment variables + +The table below lists the environment variables that you can use to configure the pulsar-perf tool. + +|Variable|Description|Default| +|---|---|---| +|`PULSAR_LOG_CONF`|Log4j configuration file|conf/log4j2.yaml| +|`PULSAR_CLIENT_CONF`|Configuration file for the client|conf/client.conf| +|`PULSAR_EXTRA_OPTS`|Extra options to be passed to the JVM|| +|`PULSAR_EXTRA_CLASSPATH`|Extra paths for Pulsar's classpath|| + + +### `consume` +Run a consumer + +Usage +``` +$ pulsar-perf consume options +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth_params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}.|| +|`--auth_plugin`|Authentication plugin class name|| +|`--acks-delay-millis`|Acknowlegments grouping delay in millis|100| +|`-k`, `--encryption-key-name`|The private key name to decrypt payload|| +|`-v`, `--encryption-key-value-file`|The file which contains the private key to decrypt payload|| +|`-h`, `--help`|Help message|false| +|`--conf-file`|Configuration file|| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-n`, `--num-consumers`|Number of consumers (per topic)|1| +|`-t`, `--num-topic`|The number of topics|1| +|`-r`, `--rate`|Simulate a 
slow message consumer (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-u`, `--service-url`|Pulsar service URL|| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled|0| +|`-s`, `--subscriber-name`|Subscriber name prefix|sub| +|`-st`, `--subscription-type`|Subscription type. Possible values are Exclusive, Shared, Failover.|Exclusive| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| + + +### `produce` +Run a producer + +Usage +```bash +$ pulsar-perf produce options +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--auth_params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}".|| +|`--auth_plugin`|Authentication plugin class name|| +|`-b`, `--batch-time-window`|Batch messages in a window of the specified number of milliseconds|1| +|`-z`, `--compression`|Compress messages’ payload. Possible values are NONE, LZ4, ZLIB, ZSTD or SNAPPY.|| +|`--conf-file`|Configuration file|| +|`-k`, `--encryption-key-name`|The public key name to encrypt payload|| +|`-v`, `--encryption-key-value-file`|The file which contains the public key to encrypt payload|| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-o`, `--max-outstanding`|Max number of outstanding messages|1000| +|`-p`, `--max-outstanding-across-partitions`|Max number of outstanding messages across partitions|50000| +|`-m`, `--num-messages`|Number of messages to publish in total. 
If set to 0, it will keep publishing.|0| +|`-n`, `--num-producers`|The number of producers (per topic)|1| +|`-t`, `--num-topic`|The number of topics|1| +|`-f`, `--payload-file`|Use payload from an UTF-8 encoded text file and a payload will be randomly selected when publishing messages|| +|`-e`, `--payload-delimiter`|The delimiter used to split lines when using payload from a file|\n| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`-u`, `--service-url`|Pulsar service URL|| +|`-s`, `--size`|Message size (in bytes)|1024| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. If 0, statistics will be disabled.|0| +|`-time`, `--test-duration`|Test duration in secs. If set to 0, it will keep publishing.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--warmup-time`|Warm-up time in seconds|1| + + +### `read` +Run a topic reader + +Usage +```bash +$ pulsar-perf read options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--auth_params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}.|| +|`--auth_plugin`|Authentication plugin class name|| +|`--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single broker|100| +|`-t`, `--num-topic`|The number of topics|1| +|`-r`, `--rate`|Simulate a slow message reader (rate in msg/s)|0| +|`-q`, `--receiver-queue-size`|Size of the receiver queue|1000| +|`-u`, `--service-url`|Pulsar service URL|| +|`-m`, `--start-message-id`|Start message id. This can be either 'earliest', 'latest' or a specific message id by using 'lid:eid'|earliest| +|`-i`, `--stats-interval-seconds`|Statistics interval seconds. 
If 0, statistics will be disabled.|0| +|`--trust-cert-file`|Path for the trusted TLS certificate file|| +|`--use-tls`|Use TLS encryption on the connection|false| + + +### `websocket-producer` +Run a websocket producer + +Usage +```bash +$ pulsar-perf websocket-producer options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--auth_params`|Authentication parameters, whose format is determined by the implementation of method `configure` in authentication plugin class, for example "key1:val1,key2:val2" or "{"key1":"val1","key2":"val2"}.|| +|`--auth_plugin`|Authentication plugin class name|| +|`--conf-file`|Configuration file|| +|`-h`, `--help`|Help message|false| +|`-m`, `--num-messages`|Number of messages to publish in total. If 0, it will keep publishing|0| +|`-t`, `--num-topic`|The number of topics|1| +|`-f`, `--payload-file`|Use payload from a file instead of empty buffer|| +|`-u`, `--proxy-url`|Pulsar Proxy URL, e.g., "ws://localhost:8080/"|| +|`-r`, `--rate`|Publish rate msg/s across topics|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration in secs. If 0, it will keep publishing|0| + + +### `managed-ledger` +Write directly on managed-ledgers + +Usage +```bash +$ pulsar-perf managed-ledger options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-a`, `--ack-quorum`|Ledger ack quorum|1| +|`-dt`, `--digest-type`|BookKeeper digest type. Possible Values: [CRC32, MAC, CRC32C, DUMMY]|CRC32C| +|`-e`, `--ensemble-size`|Ledger ensemble size|1| +|`-h`, `--help`|Help message|false| +|`-c`, `--max-connections`|Max number of TCP connections to a single bookie|1| +|`-o`, `--max-outstanding`|Max number of outstanding requests|1000| +|`-m`, `--num-messages`|Number of messages to publish in total. 
If 0, it will keep publishing|0| +|`-t`, `--num-topic`|Number of managed ledgers|1| +|`-r`, `--rate`|Write rate msg/s across managed ledgers|100| +|`-s`, `--size`|Message size in byte|1024| +|`-time`, `--test-duration`|Test duration in secs. If 0, it will keep publishing|0| +|`--threads`|Number of threads writing|1| +|`-w`, `--write-quorum`|Ledger write quorum|1| +|`-zk`, `--zookeeperServers`|ZooKeeper connection string|| + + +### `monitor-brokers` +Continuously receive broker data and/or load reports + +Usage +```bash +$ pulsar-perf monitor-brokers options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--connect-string`|A connection string for one or more ZooKeeper servers|| +|`-h`, `--help`|Help message|false| + + +### `simulation-client` +Run a simulation server acting as a Pulsar client. Uses the client configuration specified in `conf/client.conf`. + +Usage +```bash +$ pulsar-perf simulation-client options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--port`|Port to listen on for controller|0| +|`--service-url`|Pulsar Service URL|| +|`-h`, `--help`|Help message|false| + +### `simulation-controller` +Run a simulation controller to give commands to servers + +Usage +```bash +$ pulsar-perf simulation-controller options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--client-port`|The port that the clients are listening on|0| +|`--clients`|Comma-separated list of client hostnames|| +|`--cluster`|The cluster to test on|| +|`-h`, `--help`|Help message|false| + + +### `help` +This help message + +Usage +```bash +$ pulsar-perf help +``` + + +## `bookkeeper` +A tool for managing BookKeeper. + +Usage +```bash +$ bookkeeper command +``` + +Commands +* `auto-recovery` +* `bookie` +* `localbookie` +* `upgrade` +* `shell` + + +Environment variables + +The table below lists the environment variables that you can use to configure the bookkeeper tool. 
+ +|Variable|Description|Default| +|---|---|---| +|BOOKIE_LOG_CONF|Log4j configuration file|conf/log4j2.yaml| +|BOOKIE_CONF|BookKeeper configuration file|conf/bk_server.conf| +|BOOKIE_EXTRA_OPTS|Extra options to be passed to the JVM|| +|BOOKIE_EXTRA_CLASSPATH|Extra paths for BookKeeper's classpath|| +|ENTRY_FORMATTER_CLASS|The Java class used to format entries|| +|BOOKIE_PID_DIR|Folder where the BookKeeper server PID file should be stored|| +|BOOKIE_STOP_TIMEOUT|Wait time before forcefully killing the Bookie server instance if attempts to stop it are not successful|| + + +### `auto-recovery` +Runs an auto-recovery service daemon + +Usage +```bash +$ bookkeeper auto-recovery options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| + + +### `bookie` +Starts up a BookKeeper server (aka bookie) + +Usage +```bash +$ bookkeeper bookie options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| +|-readOnly|Force start a read-only bookie server|false| +|-withAutoRecovery|Start auto-recovery service bookie server|false| + + +### `localbookie` +Runs a test ensemble of N bookies locally + +Usage +```bash +$ bookkeeper localbookie N +``` + +### `upgrade` +Upgrade the bookie’s filesystem + +Usage +```bash +$ bookkeeper upgrade options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--conf`|Configuration for the auto-recovery daemon|| +|`-u`, `--upgrade`|Upgrade the bookie’s directories|| + + +### `shell` +Run shell for admin commands. To see a full listing of those commands, run bookkeeper shell without an argument. + +Usage +```bash +$ bookkeeper shell +``` + +Example +```bash +$ bookkeeper shell bookiesanity +``` + +## `broker-tool` + +The `broker-tool` is used for operations on a specific broker. 
+ +Usage +```bash +$ broker-tool command +``` +Commands +* `load-report` +* `help` + +Example +Two ways to get more information about a command as below: + +```bash +$ broker-tool help command +$ broker-tool command --help +``` + +### `load-report` + +Collect the load report of a specific broker. +The command is run on a broker, and used for troubleshooting why broker can’t collect right load report. + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--interval`| Interval to collect load report, in milliseconds || +|`-h`, `--help`| Display help information || + diff --git a/site2/website/versioned_docs/version-2.5.0/reference-configuration.md b/site2/website/versioned_docs/version-2.5.0/reference-configuration.md new file mode 100644 index 0000000000000..86ab90091a419 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/reference-configuration.md @@ -0,0 +1,500 @@ +--- +id: version-2.5.0-reference-configuration +title: Pulsar configuration +sidebar_label: Pulsar configuration +original_id: reference-configuration +--- + + + + +Pulsar configuration can be managed either via a series of configuration files contained in the [`conf`](https://github.com/apache/pulsar/tree/master/conf) directory of a Pulsar [installation](getting-started-standalone.md) + +- [BookKeeper](#bookkeeper) +- [Broker](#broker) +- [Client](#client) +- [Service discovery](#service-discovery) +- [Log4j](#log4j) +- [Log4j shell](#log4j-shell) +- [Standalone](#standalone) +- [WebSocket](#websocket) +- [Pulsar proxy](#pulsar-proxy) +- [ZooKeeper](#zookeeper) + +## BookKeeper + +BookKeeper is a replicated log storage system that Pulsar uses for durable storage of all messages. + + +|Name|Description|Default| +|---|---|---| +|bookiePort|The port on which the bookie server listens.|3181| +|allowLoopback|Whether the bookie is allowed to use a loopback interface as its primary interface (i.e. the interface used to establish its identity). 
By default, loopback interfaces are not allowed as the primary interface. Using a loopback interface as the primary interface usually indicates a configuration error. For example, it’s fairly common in some VPS setups to not configure a hostname or to have the hostname resolve to `127.0.0.1`. If this is the case, then all bookies in the cluster will establish their identities as `127.0.0.1:3181` and only one will be able to join the cluster. For VPSs configured like this, you should explicitly set the listening interface.|false| +|listeningInterface|The network interface on which the bookie listens. If not set, the bookie will listen on all interfaces.|eth0| +|journalDirectory|The directory where Bookkeeper outputs its write-ahead log (WAL)|data/bookkeeper/journal| +|ledgerDirectories|The directory where Bookkeeper outputs ledger snapshots. This could define multiple directories to store snapshots separated by comma, for example `ledgerDirectories=/tmp/bk1-data,/tmp/bk2-data`. Ideally, ledger dirs and the journal dir are each in a different device, which reduces the contention between random I/O and sequential write. It is possible to run with a single disk, but performance will be significantly lower.|data/bookkeeper/ledgers| +|ledgerManagerType|The type of ledger manager used to manage how ledgers are stored, managed, and garbage collected. See [BookKeeper Internals](http://bookkeeper.apache.org/docs/latest/getting-started/concepts) for more info.|hierarchical| +|zkLedgersRootPath|The root ZooKeeper path used to store ledger metadata. This parameter is used by the ZooKeeper-based ledger manager as a root znode to store all ledgers.|/ledgers| +|ledgerStorageClass|Ledger storage implementation class|org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage| +|entryLogFilePreallocationEnabled|Enable or disable entry logger preallocation|true| +|logSizeLimit|Max file size of the entry logger, in bytes. 
A new entry log file will be created when the old one reaches the file size limitation.|2147483648| +|minorCompactionThreshold|Threshold of minor compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a minor compaction. If set to less than zero, the minor compaction is disabled.|0.2| +|minorCompactionInterval|Time interval to run minor compaction, in seconds. If set to less than zero, the minor compaction is disabled.|3600| +|majorCompactionThreshold|The threshold of major compaction. Entry log files whose remaining size percentage reaches below this threshold will be compacted in a major compaction. Those entry log files whose remaining size percentage is still higher than the threshold will never be compacted. If set to less than zero, the major compaction is disabled.|0.5| +|majorCompactionInterval|The time interval to run major compaction, in seconds. If set to less than zero, the major compaction is disabled.|86400| +|compactionMaxOutstandingRequests|Sets the maximum number of entries that can be compacted without flushing. When compacting, the entries are written to the entrylog and the new offsets are cached in memory. Once the entrylog is flushed the index is updated with the new offsets. This parameter controls the number of entries added to the entrylog before a flush is forced. A higher value for this parameter means more memory will be used for offsets. Each offset consists of 3 longs. This parameter should not be modified unless you’re fully aware of the consequences.|100000| +|compactionRate|The rate at which compaction will read entries, in adds per second.|1000| +|isThrottleByBytes|Throttle compaction by bytes or by entries.|false| +|compactionRateByEntries|The rate at which compaction will read entries, in adds per second.|1000| +|compactionRateByBytes|Set the rate at which compaction will re-add entries. 
The unit is bytes added per second.|1000000| +|journalMaxSizeMB|Max file size of journal file, in megabytes. A new journal file will be created when the old one reaches the file size limitation.|2048| +|journalMaxBackups|The max number of old journal files to keep. Keeping a number of old journal files would help data recovery in special cases.|5| +|journalPreAllocSizeMB|How much space to pre-allocate at a time in the journal.|16| +|journalWriteBufferSizeKB|The size of the write buffers used for the journal.|64| +|journalRemoveFromPageCache|Whether pages should be removed from the page cache after force write.|true| +|journalAdaptiveGroupWrites|Whether to group journal force writes, which optimizes group commit for higher throughput.|true| +|journalMaxGroupWaitMSec|The maximum latency to impose on a journal write to achieve grouping.|1| +|journalAlignmentSize|All the journal writes and commits should be aligned to given size|4096| +|journalBufferedWritesThreshold|Maximum writes to buffer to achieve grouping|524288| +|journalFlushWhenQueueEmpty|If we should flush the journal when journal queue is empty|false| +|numJournalCallbackThreads|The number of threads that should handle journal callbacks|8| +|rereplicationEntryBatchSize|The number of max entries to keep in fragment for re-replication|5000| +|gcWaitTime|How long the interval to trigger next garbage collection, in milliseconds. Since garbage collection is running in background, too frequent gc will hurt performance. It is better to give a higher number of gc interval if there is enough disk capacity.|900000| +|gcOverreplicatedLedgerWaitTime|How long the interval to trigger next garbage collection of overreplicated ledgers, in milliseconds. This should not be run very frequently since we read the metadata for all the ledgers on the bookie from zk.|86400000| +|flushInterval|How long the interval to flush ledger index pages to disk, in milliseconds. Flushing index files will introduce much random disk I/O. 
If separating journal dir and ledger dirs each on different devices, flushing would not affect performance. But if putting journal dir and ledger dirs on same device, performance degrade significantly on too frequent flushing. You can consider increment flush interval to get better performance, but you need to pay more time on bookie server restart after failure.|60000| +|bookieDeathWatchInterval|Interval to watch whether bookie is dead or not, in milliseconds|1000| +|zkServers|A list of one of more servers on which zookeeper is running. The server list can be comma separated values, for example: zkServers=zk1:2181,zk2:2181,zk3:2181.|localhost:2181| +|zkTimeout|ZooKeeper client session timeout in milliseconds Bookie server will exit if it received SESSION_EXPIRED because it was partitioned off from ZooKeeper for more than the session timeout JVM garbage collection, disk I/O will cause SESSION_EXPIRED. Increment this value could help avoiding this issue|30000| +|serverTcpNoDelay|This settings is used to enabled/disabled Nagle’s algorithm, which is a means of improving the efficiency of TCP/IP networks by reducing the number of packets that need to be sent over the network. If you are sending many small messages, such that more than one can fit in a single IP packet, setting server.tcpnodelay to false to enable Nagle algorithm can provide better performance.|true| +|openFileLimit|Max number of ledger index files could be opened in bookie server If number of ledger index files reaches this limitation, bookie server started to swap some ledgers from memory to disk. Too frequent swap will affect performance. You can tune this number to gain performance according your requirements.|0| +|pageSize|Size of a index page in ledger cache, in bytes A larger index page can improve performance writing page to disk, which is efficent when you have small number of ledgers and these ledgers have similar number of entries. 
If you have large number of ledgers and each ledger has fewer entries, smaller index page would improve memory usage.|8192| +|pageLimit|How many index pages provided in ledger cache. If number of index pages reaches this limitation, bookie server starts to swap some ledgers from memory to disk. You can increment this value when you found swap became more frequent. But make sure pageLimit*pageSize should not be more than JVM max memory limitation, otherwise you would get OutOfMemoryException. In general, incrementing pageLimit, using smaller index page would gain better performance in larger number of ledgers with fewer entries case. If pageLimit is -1, bookie server will use 1/3 of JVM memory to compute the limitation of number of index pages.|0| +|readOnlyModeEnabled|If all ledger directories configured are full, then support only read requests for clients. If “readOnlyModeEnabled=true” then on all ledger disks full, bookie will be converted to read-only mode and serve only read requests. Otherwise the bookie will be shutdown. By default this will be disabled.|true| +|diskUsageThreshold|For each ledger dir, maximum disk space which can be used. Default is 0.95f. i.e. 95% of disk can be used at most after which nothing will be written to that partition. If all ledger dir partitions are full, then bookie will turn to readonly mode if ‘readOnlyModeEnabled=true’ is set, else it will shutdown. Valid values should be in between 0 and 1 (exclusive).|0.95| +|diskCheckInterval|Disk check interval in milliseconds, interval to check the ledger dirs usage.|10000| +|auditorPeriodicCheckInterval|Interval at which the auditor will do a check of all ledgers in the cluster. By default this runs once a week. The interval is set in seconds. To disable the periodic check completely, set this to 0. 
Note that periodic checking will put extra load on the cluster, so it should not be run more frequently than once a day.|604800| +|auditorPeriodicBookieCheckInterval|The interval between auditor bookie checks. The auditor bookie check, checks ledger metadata to see which bookies should contain entries for each ledger. If a bookie which should contain entries is unavailable, then the ledger containing that entry is marked for recovery. Setting this to 0 disables the periodic check. Bookie checks will still run when a bookie fails. The interval is specified in seconds.|86400| +|numAddWorkerThreads|number of threads that should handle write requests. if zero, the writes would be handled by netty threads directly.|0| +|numReadWorkerThreads|number of threads that should handle read requests. if zero, the reads would be handled by netty threads directly.|8| +|maxPendingReadRequestsPerThread|If read workers threads are enabled, limit the number of pending requests, to avoid the executor queue to grow indefinitely.|2500| +|readBufferSizeBytes|The number of bytes we should use as capacity for BufferedReadChannel.|4096| +|writeBufferSizeBytes|The number of bytes used as capacity for the write buffer|65536| +|useHostNameAsBookieID|Whether the bookie should use its hostname to register with the coordination service (e.g.: zookeeper service). When false, bookie will use its ipaddress for the registration.|false| +|statsProviderClass||org.apache.bookkeeper.stats.prometheus.PrometheusMetricsProvider| +|prometheusStatsHttpPort||8000| +|dbStorage_writeCacheMaxSizeMb|Size of Write Cache. Memory is allocated from JVM direct memory. Write cache is used to buffer entries before flushing into the entry log. For good performance, it should be big enough to hold a substantial amount of entries in the flush interval.|25% of direct memory| +|dbStorage_readAheadCacheMaxSizeMb|Size of Read cache. Memory is allocated from JVM direct memory. 
This read cache is pre-filled doing read-ahead whenever a cache miss happens|25% of direct memory| +|dbStorage_readAheadCacheBatchSize|How many entries to pre-fill in cache after a read cache miss|1000| +|dbStorage_rocksDB_blockCacheSize|Size of RocksDB block-cache. For best performance, this cache should be big enough to hold a significant portion of the index database which can reach ~2GB in some cases|10% of direct memory| +|dbStorage_rocksDB_writeBufferSizeMB||64| +|dbStorage_rocksDB_sstSizeInMB||64| +|dbStorage_rocksDB_blockSize||65536| +|dbStorage_rocksDB_bloomFilterBitsPerKey||10| +|dbStorage_rocksDB_numLevels||-1| +|dbStorage_rocksDB_numFilesInLevel0||4| +|dbStorage_rocksDB_maxSizeInLevel1MB||256| + + + +## Broker + +Pulsar brokers are responsible for handling incoming messages from producers, dispatching messages to consumers, replicating data between clusters, and more. + +|Name|Description|Default| +|---|---|---| +|enablePersistentTopics| Whether persistent topics are enabled on the broker |true| +|enableNonPersistentTopics| Whether non-persistent topics are enabled on the broker |true| +|functionsWorkerEnabled| Whether the Pulsar Functions worker service is enabled in the broker |false| +|zookeeperServers| Zookeeper quorum connection string || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| Broker data port |6650| +|brokerServicePortTls| Broker data port for TLS |6651| +|webServicePort| Port to use to server HTTP request |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|webSocketServiceEnabled| Enable the WebSocket API service in broker |false| +|bindAddress| Hostname or IP address the service binds on, default is 0.0.0.0. |0.0.0.0| +|advertisedAddress| Hostname or IP address the service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. 
|| +|clusterName| Name of the cluster to which this broker belongs to || +|brokerDeduplicationEnabled| Sets the default behavior for message deduplication in the broker. If enabled, the broker will reject messages that were already stored in the topic. This setting can be overridden on a per-namespace basis. |false| +|brokerDeduplicationMaxNumberOfProducers| The maximum number of producers for which information will be stored for deduplication purposes. |10000| +|brokerDeduplicationEntriesInterval| The number of entries after which a deduplication informational snapshot is taken. A larger interval will lead to fewer snapshots being taken, though this would also lengthen the topic recovery time (the time required for entries published after the snapshot to be replayed). |1000| +|brokerDeduplicationProducerInactivityTimeoutMinutes| The time of inactivity (in minutes) after which the broker will discard deduplication information related to a disconnected producer. |360| +|zooKeeperSessionTimeoutMillis| Zookeeper session timeout in milliseconds |30000| +|brokerShutdownTimeoutMs| Time to wait for broker graceful shutdown. After this time elapses, the process will be killed |60000| +|backlogQuotaCheckEnabled| Enable backlog quota check. Enforces action on topic when the quota is reached |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the quota |60| +|backlogQuotaDefaultLimitGB| Default per-topic backlog quota limit |10| +|allowAutoTopicCreation| Enable topic auto creation if a new producer or consumer connected |true| +|allowAutoTopicCreationType| The topic type (partitioned or non-partitioned) that is allowed to be automatically created. 
|Partitioned| +|defaultNumPartitions| The number of partitioned topics that is allowed to be automatically created if `allowAutoTopicCreationType` is partitioned |1| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics |60| +|messageExpiryCheckIntervalInMinutes| How frequently to proactively check and purge expired messages |5| +|brokerServiceCompactionMonitorIntervalInSeconds| Interval between checks to see if topics with compaction policies need to be compacted |60| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable check for minimum allowed client library version |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| Path for the file used to determine the rotation status for the broker when responding to service discovery health checks || +|preferLaterVersions| If true, (and ModularLoadManagerImpl is being used), the load manager will attempt to use only brokers running the latest software version (to minimize impact to bundles) |false| +|tlsEnabled| Enable TLS |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate file || +|tlsAllowInsecureConnection| Accept untrusted TLS certificate from client |false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.2```, ```TLSv1.1```, ```TLSv1``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. 
Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|ttlDurationDefaultInSeconds| The default ttl for namespaces if ttl is not configured at namespace policies. |0| +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymmetric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify which of the token's claims will be used as the authentication "principal" or "role". The default "sub" claim will be used if this is left blank || +|maxUnackedMessagesPerConsumer| Max number of unacknowledged messages allowed to receive messages by a consumer on a shared subscription. Broker will stop sending messages to consumer once, this limit reaches until consumer starts acknowledging messages back. Using a value of 0, is disabling unackedMessage limit check and consumer can receive messages without any restriction |50000| +|maxUnackedMessagesPerSubscription| Max number of unacknowledged messages allowed per shared subscription. Broker will stop dispatching messages to all consumers of the subscription once this limit reaches until consumer starts acknowledging messages back and unack count reaches to limit/2. 
Using a value of 0, is disabling unackedMessage-limit check and dispatcher can dispatch messages without any restriction |200000| +|subscriptionRedeliveryTrackerEnabled| Enable subscription message redelivery tracker |true| +|maxConcurrentLookupRequest| Max number of concurrent lookup request broker allows to throttle heavy incoming lookup traffic |50000| +|maxConcurrentTopicLoadRequest| Max number of concurrent topic loading request broker allows to control number of zk-operations |5000| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names || +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics || +|brokerClientAuthenticationPlugin| Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters || +|brokerClientAuthenticationParameters||| +|athenzDomainNames| Supported Athenz provider domain names(comma separated) for authentication || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to use when connecting to bookies || +|bookkeeperClientAuthenticationParametersName| BookKeeper auth plugin implementation specifics parameters name and values || +|bookkeeperClientAuthenticationParameters||| +|bookkeeperClientTimeoutInSeconds| Timeout for BK add / read operations |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. Using a value of 0, is disabling the speculative reads |0| +|bookkeeperClientHealthCheckEnabled| Enable bookies health check. Bookies that have more than the configured number of failure within the interval will be quarantined for some time. 
During this period, new ledgers won’t be created on these bookies |true| +|bookkeeperClientHealthCheckIntervalSeconds||60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval||5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds ||1800| +|bookkeeperClientRackawarePolicyEnabled| Enable rack-aware bookie selection policy. BK will chose bookies from different racks when forming a new bookie ensemble |true| +|bookkeeperClientRegionawarePolicyEnabled| Enable region-aware bookie selection policy. BK will chose bookies from different regions and racks when forming a new bookie ensemble. If enabled, the value of bookkeeperClientRackawarePolicyEnabled is ignored |false| +|bookkeeperClientReorderReadSequenceEnabled| Enable/disable reordering read sequence on reading entries. |false| +|bookkeeperClientIsolationGroups| Enable bookie isolation by specifying a list of bookie groups to choose from. Any bookie outside the specified groups will not be used by the broker || +|bookkeeperClientSecondaryIsolationGroups| Enable bookie secondary-isolation group if bookkeeperClientIsolationGroups doesn't have enough bookie available. || +|bookkeeperClientMinAvailableBookiesInIsolationGroups| Minimum bookies that should be available as part of bookkeeperClientIsolationGroups else broker will include bookkeeperClientSecondaryIsolationGroups bookies in isolated list. || +|bookkeeperEnableStickyReads | Enable/disable having read operations for a ledger to be sticky to a single bookie. If this flag is enabled, the client will use one single bookie (by preference) to read all entries for a ledger. | true | +|managedLedgerDefaultEnsembleSize| Number of bookies to use when creating a ledger |2| +|managedLedgerDefaultWriteQuorum| Number of copies to store for each message |2| +|managedLedgerDefaultAckQuorum| Number of guaranteed copies (acks to wait before write is complete) |2| +|managedLedgerCacheSizeMB| Amount of memory to use for caching data payload in managed ledger. 
This memory is allocated from JVM direct memory and it’s shared across all the topics running in the same broker. By default, uses 1/5th of available direct memory || +|managedLedgerCacheCopyEntries| Whether we should make a copy of the entry payloads when inserting in cache| false| +|managedLedgerCacheEvictionWatermark| Threshold to which bring down the cache level when eviction is triggered |0.9| +|managedLedgerCacheEvictionFrequency| Configure the cache eviction frequency for the managed ledger cache (evictions/sec) | 100.0 | +|managedLedgerCacheEvictionTimeThresholdMillis| All entries that have stayed in cache for more than the configured time, will be evicted | 1000 | +|managedLedgerCursorBackloggedThreshold| Configure the threshold (in number of entries) from where a cursor should be considered 'backlogged' and thus should be set as inactive. | 1000| +|managedLedgerDefaultMarkDeleteRateLimit| Rate limit the amount of writes per second generated by consumer acking the messages |1.0| +|managedLedgerMaxEntriesPerLedger| Max number of entries to append to a ledger before triggering a rollover. A ledger rollover is triggered on these conditions:
    • Either the max rollover time has been reached
    • or max entries have been written to the ledger and at least min-time has passed
    |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| Minimum time between ledger rollover for a topic |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| Maximum time before forcing a ledger rollover for a topic |240| +|managedLedgerCursorMaxEntriesPerLedger| Max number of entries to append to a cursor ledger |50000| +|managedLedgerCursorRolloverTimeInSeconds| Max time before triggering a rollover on a cursor ledger |14400| +|managedLedgerMaxUnackedRangesToPersist| Max number of “acknowledgment holes” that are going to be persistently stored. When acknowledging out of order, a consumer will leave holes that are supposed to be quickly filled by acking all the messages. The information of which messages are acknowledged is persisted by compressing in “ranges” of messages that were acknowledged. After the max number of ranges is reached, the information will only be tracked in memory and messages will be redelivered in case of crashes. |1000| +|autoSkipNonRecoverableData| Skip reading non-recoverable/unreadable data-ledger under managed-ledger’s list.It helps when data-ledgers gets corrupted at bookkeeper and managed-cursor is stuck at that ledger. |false| +|loadBalancerEnabled| Enable load balancer |true| +|loadBalancerPlacementStrategy| Strategy to assign a new bundle weightedRandomSelection || +|loadBalancerReportUpdateThresholdPercentage| Percentage of change to trigger load report update |10| +|loadBalancerReportUpdateMaxIntervalMinutes| maximum interval to update load report |15| +|loadBalancerHostUsageCheckIntervalMinutes| Frequency of report to collect |1| +|loadBalancerSheddingIntervalMinutes| Load shedding interval. 
Broker periodically checks whether some traffic should be offloaded from some over-loaded broker to other under-loaded brokers |30| +|loadBalancerSheddingGracePeriodMinutes| Prevent the same topics to be shed and moved to other broker more than once within this timeframe |30| +|loadBalancerBrokerMaxTopics| Usage threshold to allocate max number of topics to broker |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| Usage threshold to determine a broker as under-loaded |1| +|loadBalancerBrokerOverloadedThresholdPercentage| Usage threshold to determine a broker as over-loaded |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| Interval to update namespace bundle resource quota |15| +|loadBalancerBrokerComfortLoadLevelPercentage| Usage threshold to determine a broker is having just right level of load |65| +|loadBalancerAutoBundleSplitEnabled| enable/disable namespace bundle auto split |false| +|loadBalancerNamespaceBundleMaxTopics| maximum topics in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxSessions| maximum sessions (producers + consumers) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxMsgRate| maximum msgRate (in + out) in a bundle, otherwise bundle split will be triggered |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| maximum bandwidth (in + out) in a bundle, otherwise bundle split will be triggered |100| +|loadBalancerNamespaceMaximumBundles| maximum number of bundles in a namespace |128| +|replicationMetricsEnabled| Enable replication metrics |true| +|replicationConnectionsPerBroker| Max number of connections to open for each broker in a remote cluster. More connections host-to-host lead to better throughput over high-latency links. 
|16| +|replicationProducerQueueSize| Replicator producer queue size |1000| +|replicatorPrefix| Replicator prefix used for replicator producer name and cursor name pulsar.repl|| +|replicationTlsEnabled| Enable TLS when talking with other clusters to replicate messages |false| +|defaultRetentionTimeInMinutes| Default message retention time || +|defaultRetentionSizeInMB| Default retention size |0| +|keepAliveIntervalSeconds| How often to check whether the connections are still alive |30| +|loadManagerClassName| Name of load manager to use |org.apache.pulsar.broker.loadbalance.impl.SimpleLoadManagerImpl| +|managedLedgerOffloadDriver| Driver to use to offload old data to long term storage (Possible values: S3) || +|managedLedgerOffloadMaxThreads| Maximum number of thread pool threads for ledger offloading |2| +|managedLedgerOffloadDeletionLagMs|Delay between a ledger being successfully offloaded to long term storage and the ledger being deleted from bookkeeper | 14400000| +|managedLedgerOffloadAutoTriggerSizeThresholdBytes|The number of bytes before triggering automatic offload to long term storage |-1 (disabled)| +|s3ManagedLedgerOffloadRegion| For Amazon S3 ledger offload, AWS region || +|s3ManagedLedgerOffloadBucket| For Amazon S3 ledger offload, Bucket to place offloaded ledger into || +|s3ManagedLedgerOffloadServiceEndpoint| For Amazon S3 ledger offload, Alternative endpoint to connect to (useful for testing) || +|s3ManagedLedgerOffloadMaxBlockSizeInBytes| For Amazon S3 ledger offload, Max block size in bytes. 
(64MB by default, 5MB minimum) |67108864| +|s3ManagedLedgerOffloadReadBufferSizeInBytes| For Amazon S3 ledger offload, Read buffer size in bytes (1MB by default) |1048576| +|s3ManagedLedgerOffloadRole| For Amazon S3 ledger offload, provide a role to assume before writing to s3 || +|s3ManagedLedgerOffloadRoleSessionName| For Amazon S3 ledger offload, provide a role session name when using a role |pulsar-s3-offload| + + + + +## Client + +The [`pulsar-client`](reference-cli-tools.md#pulsar-client) CLI tool can be used to publish messages to Pulsar and consume messages from Pulsar topics. This tool can be used in lieu of a client library. + +|Name|Description|Default| +|---|---|---| +|webServiceUrl| The web URL for the cluster. |http://localhost:8080/| +|brokerServiceUrl| The Pulsar protocol URL for the cluster. |pulsar://localhost:6650/| +|authPlugin| The authentication plugin. || +|authParams| The authentication parameters for the cluster, as a comma-separated string. || +|useTls| Whether or not TLS authentication will be enforced in the cluster. 
|false| +|tlsAllowInsecureConnection||| +|tlsTrustCertsFilePath||| + + +## Service discovery + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| Zookeeper quorum connection string (comma-separated) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|zookeeperSessionTimeoutMs| ZooKeeper session timeout |30000| +|servicePort| Port to use to server binary-proto request |6650| +|servicePortTls| Port to use to server binary-proto-tls request |6651| +|webServicePort| Port that discovery service listen on |8080| +|webServicePortTls| Port to use to server HTTPS request |8443| +|bindOnLocalhost| Control whether to bind directly on localhost rather than on normal hostname |false| +|authenticationEnabled| Enable authentication |false| +|authenticationProviders| Authentication provider name list, which is comma separated list of class names (comma-separated) || +|authorizationEnabled| Enforce authorization |false| +|superUserRoles| Role names that are treated as “super-user”, meaning they will be able to do all admin operations and publish/consume from all topics (comma-separated) || +|tlsEnabled| Enable TLS |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || + + + +## Log4j + + +|Name|Default| +|---|---| +|pulsar.root.logger| WARN,CONSOLE| +|pulsar.log.dir| logs| +|pulsar.log.file| pulsar.log| +|log4j.rootLogger| ${pulsar.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ISO8601} - %-5p - [%t:%C{1}@%L] - %m%n| +|log4j.appender.ROLLINGFILE| org.apache.log4j.DailyRollingFileAppender| +|log4j.appender.ROLLINGFILE.Threshold| DEBUG| +|log4j.appender.ROLLINGFILE.File| ${pulsar.log.dir}/${pulsar.log.file}| +|log4j.appender.ROLLINGFILE.layout| 
org.apache.log4j.PatternLayout| +|log4j.appender.ROLLINGFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n| +|log4j.appender.TRACEFILE| org.apache.log4j.FileAppender| +|log4j.appender.TRACEFILE.Threshold| TRACE| +|log4j.appender.TRACEFILE.File| pulsar-trace.log| +|log4j.appender.TRACEFILE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.TRACEFILE.layout.ConversionPattern| %d{ISO8601} - %-5p [%t:%C{1}@%L][%x] - %m%n| + + +## Log4j shell + +|Name|Default| +|---|---| +|bookkeeper.root.logger| ERROR,CONSOLE| +|log4j.rootLogger| ${bookkeeper.root.logger}| +|log4j.appender.CONSOLE| org.apache.log4j.ConsoleAppender| +|log4j.appender.CONSOLE.Threshold| DEBUG| +|log4j.appender.CONSOLE.layout| org.apache.log4j.PatternLayout| +|log4j.appender.CONSOLE.layout.ConversionPattern| %d{ABSOLUTE} %-5p %m%n| +|log4j.logger.org.apache.zookeeper| ERROR| +|log4j.logger.org.apache.bookkeeper| ERROR| +|log4j.logger.org.apache.bookkeeper.bookie.BookieShell| INFO| + + +## Standalone + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| The quorum connection string for local ZooKeeper || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|brokerServicePort| The port on which the standalone broker listens for connections |6650| +|webServicePort| The port used by the standalone broker for HTTP requests |8080| +|bindAddress| The hostname or IP address on which the standalone service binds |0.0.0.0| +|advertisedAddress| The hostname or IP address that the standalone service advertises to the outside world. If not set, the value of `InetAddress.getLocalHost().getHostName()` is used. || +|clusterName| The name of the cluster that this broker belongs to. |standalone| +|zooKeeperSessionTimeoutMillis| The ZooKeeper session timeout, in milliseconds. |30000| +|brokerShutdownTimeoutMs| The time to wait for graceful broker shutdown. After this time elapses, the process will be killed. 
|60000| +|backlogQuotaCheckEnabled| Enable the backlog quota check, which enforces a specified action when the quota is reached. |true| +|backlogQuotaCheckIntervalInSeconds| How often to check for topics that have reached the backlog quota. |60| +|backlogQuotaDefaultLimitGB| The default per-topic backlog quota limit. |10| +|ttlDurationDefaultInSeconds| The default ttl for namespaces if ttl is not configured at namespace policies. |0| +|brokerDeleteInactiveTopicsEnabled| Enable the deletion of inactive topics. |true| +|brokerDeleteInactiveTopicsFrequencySeconds| How often to check for inactive topics, in seconds. |60| +|messageExpiryCheckIntervalInMinutes| How often to proactively check and purged expired messages. |5| +|activeConsumerFailoverDelayTimeMillis| How long to delay rewinding cursor and dispatching messages when active consumer is changed. |1000| +|clientLibraryVersionCheckEnabled| Enable checks for minimum allowed client library version. |false| +|clientLibraryVersionCheckAllowUnversioned| Allow client libraries with no version information |true| +|statusFilePath| The path for the file used to determine the rotation status for the broker when responding to service discovery health checks |/usr/local/apache/htdocs| +|maxUnackedMessagesPerConsumer| The maximum number of unacknowledged messages allowed to be received by consumers on a shared subscription. The broker will stop sending messages to a consumer once this limit is reached or until the consumer begins acknowledging messages. A value of 0 disables the unacked message limit check and thus allows consumers to receive messages without any restrictions. |50000| +|maxUnackedMessagesPerSubscription| The same as above, except per subscription rather than per consumer. |200000| +|authenticationEnabled| Enable authentication for the broker. |false| +|authenticationProviders| A comma-separated list of class names for authentication providers. |false| +|authorizationEnabled| Enforce authorization in brokers. 
|false| +|superUserRoles| Role names that are treated as “superusers.” Superusers are authorized to perform all admin tasks. || +|brokerClientAuthenticationPlugin| The authentication settings of the broker itself. Used when the broker connects to other brokers either in the same cluster or from other clusters. || +|brokerClientAuthenticationParameters| The parameters that go along with the plugin specified using brokerClientAuthenticationPlugin. || +|athenzDomainNames| Supported Athenz authentication provider domain names as a comma-separated list. || +|bookkeeperClientAuthenticationPlugin| Authentication plugin to be used when connecting to bookies (BookKeeper servers). || +|bookkeeperClientAuthenticationParametersName| BookKeeper authentication plugin implementation parameters and values. || +|bookkeeperClientAuthenticationParameters| Parameters associated with the bookkeeperClientAuthenticationParametersName || +|bookkeeperClientTimeoutInSeconds| Timeout for BookKeeper add and read operations. |30| +|bookkeeperClientSpeculativeReadTimeoutInMillis| Speculative reads are initiated if a read request doesn’t complete within a certain time. A value of 0 disables speculative reads. |0| +|bookkeeperClientHealthCheckEnabled| Enable bookie health checks. |true| +|bookkeeperClientHealthCheckIntervalSeconds| The time interval, in seconds, at which health checks are performed. New ledgers are not created during health checks. |60| +|bookkeeperClientHealthCheckErrorThresholdPerInterval| Error threshold for health checks. 
|5| +|bookkeeperClientHealthCheckQuarantineTimeInSeconds| If bookies have more than the allowed number of failures within the time interval specified by bookkeeperClientHealthCheckIntervalSeconds |1800| +|bookkeeperClientRackawarePolicyEnabled| |true| +|bookkeeperClientRegionawarePolicyEnabled| |false| +|bookkeeperClientReorderReadSequenceEnabled| |false| +|bookkeeperClientIsolationGroups||| +|managedLedgerDefaultEnsembleSize| |1| +|managedLedgerDefaultWriteQuorum| |1| +|managedLedgerDefaultAckQuorum| |1| +|managedLedgerCacheSizeMB| |1024| +|managedLedgerCacheEvictionWatermark| |0.9| +|managedLedgerDefaultMarkDeleteRateLimit| |0.1| +|managedLedgerMaxEntriesPerLedger| |50000| +|managedLedgerMinLedgerRolloverTimeMinutes| |10| +|managedLedgerMaxLedgerRolloverTimeMinutes| |240| +|managedLedgerCursorMaxEntriesPerLedger| |50000| +|managedLedgerCursorRolloverTimeInSeconds| |14400| +|autoSkipNonRecoverableData| |false| +|loadBalancerEnabled| |false| +|loadBalancerPlacementStrategy| |weightedRandomSelection| +|loadBalancerReportUpdateThresholdPercentage| |10| +|loadBalancerReportUpdateMaxIntervalMinutes| |15| +|loadBalancerHostUsageCheckIntervalMinutes| |1| +|loadBalancerSheddingIntervalMinutes| |30| +|loadBalancerSheddingGracePeriodMinutes| |30| +|loadBalancerBrokerMaxTopics| |50000| +|loadBalancerBrokerUnderloadedThresholdPercentage| |1| +|loadBalancerBrokerOverloadedThresholdPercentage| |85| +|loadBalancerResourceQuotaUpdateIntervalMinutes| |15| +|loadBalancerBrokerComfortLoadLevelPercentage| |65| +|loadBalancerAutoBundleSplitEnabled| |false| +|loadBalancerNamespaceBundleMaxTopics| |1000| +|loadBalancerNamespaceBundleMaxSessions| |1000| +|loadBalancerNamespaceBundleMaxMsgRate| |1000| +|loadBalancerNamespaceBundleMaxBandwidthMbytes| |100| +|loadBalancerNamespaceMaximumBundles| |128| +|replicationMetricsEnabled| |true| +|replicationConnectionsPerBroker| |16| +|replicationProducerQueueSize| |1000| +|defaultRetentionTimeInMinutes| |0| +|defaultRetentionSizeInMB| |0| 
+|keepAliveIntervalSeconds| |30| + + + + + +## WebSocket + +|Name|Description|Default| +|---|---|---| +|configurationStoreServers ||| +|zooKeeperSessionTimeoutMillis| |30000| +|serviceUrl||| +|serviceUrlTls||| +|brokerServiceUrl||| +|brokerServiceUrlTls||| +|webServicePort||8080| +|webServicePortTls||8443| +|bindAddress||0.0.0.0| +|clusterName ||| +|authenticationEnabled||false| +|authenticationProviders||| +|authorizationEnabled||false| +|superUserRoles ||| +|brokerClientAuthenticationPlugin||| +|brokerClientAuthenticationParameters||| +|tlsEnabled||false| +|tlsAllowInsecureConnection||false| +|tlsCertificateFilePath||| +|tlsKeyFilePath ||| +|tlsTrustCertsFilePath||| + + +## Pulsar proxy + +The [Pulsar proxy](concepts-architecture-overview.md#pulsar-proxy) can be configured in the `conf/proxy.conf` file. + + +|Name|Description|Default| +|---|---|---| +|zookeeperServers| The ZooKeeper quorum connection string (as a comma-separated list) || +|configurationStoreServers| Configuration store connection string (as a comma-separated list) || +|zookeeperSessionTimeoutMs| ZooKeeper session timeout (in milliseconds) |30000| +|servicePort| The port to use for server binary Protobuf requests |6650| +|servicePortTls| The port to use to server binary Protobuf TLS requests |6651| +|statusFilePath| Path for the file used to determine the rotation status for the proxy instance when responding to service discovery health checks || +|authenticationEnabled| Whether authentication is enabled for the Pulsar proxy |false| +|authenticateMetricsEndpoint| Whether the '/metrics' endpoint requires authentication. Defaults to true. 'authenticationEnabled' must also be set for this to take effect. 
|true| +|authenticationProviders| Authentication provider name list (a comma-separated list of class names) || +|authorizationEnabled| Whether authorization is enforced by the Pulsar proxy |false| +|authorizationProvider| Authorization provider as a fully qualified class name |org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider| +|brokerClientAuthenticationPlugin| The authentication plugin used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientAuthenticationParameters| The authentication parameters used by the Pulsar proxy to authenticate with Pulsar brokers || +|brokerClientTrustCertsFilePath| The path to trusted certificates used by the Pulsar proxy to authenticate with Pulsar brokers || +|superUserRoles| Role names that are treated as “super-users,” meaning that they will be able to perform all admin operations || +|forwardAuthorizationCredentials| Whether client authorization credentials are forwarded to the broker for re-authorization. Authentication must be enabled via authenticationEnabled=true for this to take effect. |false| +|maxConcurrentInboundConnections| Max concurrent inbound connections. The proxy will reject requests beyond that. |10000| +|maxConcurrentLookupRequests| Max concurrent outbound connections. The proxy will error out requests beyond that. |50000| +|tlsEnabledInProxy| Whether TLS is enabled for the proxy |false| +|tlsEnabledWithBroker| Whether TLS is enabled when communicating with Pulsar brokers |false| +|tlsCertificateFilePath| Path for the TLS certificate file || +|tlsKeyFilePath| Path for the TLS private key file || +|tlsTrustCertsFilePath| Path for the trusted TLS certificate pem file || +|tlsHostnameVerificationEnabled| Whether the hostname is validated when the proxy creates a TLS connection with brokers |false| +|tlsRequireTrustedClientCertOnConnect| Whether client certificates are required for TLS. Connections are rejected if the client certificate isn’t trusted. 
|false| +|tlsProtocols|Specify the tls protocols the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLSv1.2```, ```TLSv1.1```, ```TLSv1``` || +|tlsCiphers|Specify the tls cipher the broker will use to negotiate during TLS Handshake. Multiple values can be specified, separated by commas. Example:- ```TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256```|| +|tokenSecretKey| Configure the secret key to be used to validate auth tokens. The key can be specified like: `tokenSecretKey=data:base64,xxxxxxxxx` or `tokenSecretKey=file:///my/secret.key`|| +|tokenPublicKey| Configure the public key to be used to validate auth tokens. The key can be specified like: `tokenPublicKey=data:base64,xxxxxxxxx` or `tokenPublicKey=file:///my/secret.key`|| +|tokenPublicAlg| Configure the algorithm to be used to validate auth tokens. This can be any of the asymmetric algorithms supported by Java JWT (https://github.com/jwtk/jjwt#signature-algorithms-keys) |RS256| +|tokenAuthClaim| Specify the token claim that will be used as the authentication "principal" or "role". The "subject" field will be used if this is left blank || + +## ZooKeeper + +ZooKeeper handles a broad range of essential configuration- and coordination-related tasks for Pulsar. The default configuration file for ZooKeeper is in the `conf/zookeeper.conf` file in your Pulsar installation. The following parameters are available: + + +|Name|Description|Default| +|---|---|---| +|tickTime| The tick is the basic unit of time in ZooKeeper, measured in milliseconds and used to regulate things like heartbeats and timeouts. tickTime is the length of a single tick. |2000| +|initLimit| The maximum time, in ticks, that the leader ZooKeeper server allows follower ZooKeeper servers to successfully connect and sync. The tick time is set in milliseconds using the tickTime parameter. 
|10| +|syncLimit| The maximum time, in ticks, that a follower ZooKeeper server is allowed to sync with other ZooKeeper servers. The tick time is set in milliseconds using the tickTime parameter. |5| +|dataDir| The location where ZooKeeper will store in-memory database snapshots as well as the transaction log of updates to the database. |data/zookeeper| +|clientPort| The port on which the ZooKeeper server will listen for connections. |2181| +|autopurge.snapRetainCount| In ZooKeeper, auto purge determines how many recent snapshots of the database stored in dataDir to retain within the time interval specified by autopurge.purgeInterval (while deleting the rest). |3| +|autopurge.purgeInterval| The time interval, in hours, by which the ZooKeeper database purge task is triggered. Setting to a non-zero number will enable auto purge; setting to 0 will disable. Read this guide before enabling auto purge. |1| +|maxClientCnxns| The maximum number of client connections. Increase this if you need to handle more ZooKeeper clients. |60| + + + + +In addition to the parameters in the table above, configuring ZooKeeper for Pulsar involves adding +a `server.N` line to the `conf/zookeeper.conf` file for each node in the ZooKeeper cluster, where `N` is the number of the ZooKeeper node. 
Here's an example for a three-node ZooKeeper cluster: + +```properties +server.1=zk1.us-west.example.com:2888:3888 +server.2=zk2.us-west.example.com:2888:3888 +server.3=zk3.us-west.example.com:2888:3888 +``` + +> We strongly recommend consulting the [ZooKeeper Administrator's Guide](https://zookeeper.apache.org/doc/current/zookeeperAdmin.html) for a more thorough and comprehensive introduction to ZooKeeper configuration diff --git a/site2/website/versioned_docs/version-2.5.0/reference-connector-admin.md b/site2/website/versioned_docs/version-2.5.0/reference-connector-admin.md new file mode 100644 index 0000000000000..7addc1790ec69 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/reference-connector-admin.md @@ -0,0 +1,7 @@ +--- +id: version-2.5.0-reference-connector-admin +title: Connector Admin CLI +sidebar_label: Connector Admin CLI +original_id: reference-connector-admin +--- + diff --git a/site2/website/versioned_docs/version-2.5.0/reference-metrics.md b/site2/website/versioned_docs/version-2.5.0/reference-metrics.md new file mode 100644 index 0000000000000..e0269c3b4a89d --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/reference-metrics.md @@ -0,0 +1,246 @@ +--- +id: version-2.5.0-reference-metrics +title: Pulsar Metrics +sidebar_label: Pulsar Metrics +original_id: reference-metrics +--- + + + +Pulsar exposes metrics in Prometheus format that can be collected and used for monitoring the health of the cluster. + +* [ZooKeeper](#zookeeper) +* [BookKeeper](#bookkeeper) +* [Broker](#broker) + +## Overview + +The metrics exposed by Pulsar are in Prometheus format. The types of metrics are: + +- [Counter](https://prometheus.io/docs/concepts/metric_types/#counter): a cumulative metric that represents a single monotonically increasing counter whose value can only increase or be reset to zero on restart. 
+- [Gauge](https://prometheus.io/docs/concepts/metric_types/#gauge): a *gauge* is a metric that represents a single numerical value that can arbitrarily go up and down. +- [Histogram](https://prometheus.io/docs/concepts/metric_types/#histogram): a histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. +- [Summary](https://prometheus.io/docs/concepts/metric_types/#summary): similar to a histogram, a summary samples observations (usually things like request durations and response sizes). While it also provides a total count of observations and a sum of all observed values, it calculates configurable quantiles over a sliding time window. + +## ZooKeeper + +The ZooKeeper metrics are exposed under "/metrics" at port 8000. You can use a different port +by configuring the `stats_server_port` system property. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| zookeeper_server_znode_count | Gauge | The number of z-nodes stored. | +| zookeeper_server_data_size_bytes | Gauge | The total size of all of z-nodes stored. | +| zookeeper_server_connections | Gauge | The number of currently opened connections. | +| zookeeper_server_watches_count | Gauge | The number of watchers registered. | +| zookeeper_server_ephemerals_count | Gauge | The number of ephemeral z-nodes. | + +### Request metrics + +| Name | Type | Description | +|---|---|---| +| zookeeper_server_requests | Counter | The total number of requests received by a particular server. | +| zookeeper_server_requests_latency_ms | Summary | The requests latency calculated in milliseconds.
    Available labels: *type* (write, read).
    • *write*: the requests that write data to ZooKeeper.
    • *read*: the requests that read data from ZooKeeper.
    | + +## BookKeeper + +The BookKeeper metrics are exposed under "/metrics" at port 8000. You can change the port by updating `prometheusStatsHttpPort` +in `bookkeeper.conf` configuration file. + +### Server metrics + +| Name | Type | Description | +|---|---|---| +| bookie_SERVER_STATUS | Gauge | The server status for bookie server.
    • 1: the bookie is running in writable mode.
    • 0: the bookie is running in readonly mode.
    | +| bookkeeper_server_ADD_ENTRY_count | Counter | The total number of ADD_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_count | Counter | The total number of READ_ENTRY requests received at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_WRITE_BYTES | Counter | The total number of bytes written to the bookie. | +| bookie_READ_BYTES | Counter | The total number of bytes read from the bookie. | +| bookkeeper_server_ADD_ENTRY_REQUEST | Histogram | The histogram of request latency of ADD_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | +| bookkeeper_server_READ_ENTRY_REQUEST | Histogram | The histogram of request latency of READ_ENTRY requests at the bookie. The `success` label is used to distinguish successes and failures. | + +### Journal metrics + +| Name | Type | Description | +|---|---|---| +| bookie_journal_JOURNAL_SYNC_count | Counter | The total number of journal fsync operations happening at the bookie. The `success` label is used to distinguish successes and failures. | +| bookie_journal_JOURNAL_QUEUE_SIZE | Gauge | The total number of requests pending in the journal queue. | +| bookie_journal_JOURNAL_FORCE_WRITE_QUEUE_SIZE | Gauge | The total number of force write (fsync) requests pending in the force-write queue. | +| bookie_journal_JOURNAL_CB_QUEUE_SIZE | Gauge | The total number of callbacks pending in the callback queue. | +| bookie_journal_JOURNAL_ADD_ENTRY | Histogram | The histogram of request latency of adding entries to the journal. | +| bookie_journal_JOURNAL_SYNC | Histogram | The histogram of fsync latency of syncing data to the journal disk. | + +### Storage metrics + +| Name | Type | Description | +|---|---|---| +| bookie_ledgers_count | Gauge | The total number of ledgers stored in the bookie. 
| +| bookie_entries_count | Gauge | The total number of entries stored in the bookie. | +| bookie_write_cache_size | Gauge | The bookie write cache size (in bytes). | +| bookie_read_cache_size | Gauge | The bookie read cache size (in bytes). | +| bookie_DELETED_LEDGER_COUNT | Counter | The total number of ledgers deleted since the bookie has started. | +| bookie_ledger_writable_dirs | Gauge | The number of writable directories in the bookie. | + +## Broker + +The broker metrics are exposed under "/metrics" at port 8080. You can change the port by updating `webServicePort` to a different port +in `broker.conf` configuration file. + +All the metrics exposed by a broker are labelled with `cluster=${pulsar_cluster}`. The value of `${pulsar_cluster}` is the pulsar cluster +name you configured in `broker.conf`. + +Broker has the following kinds of metrics: + +* [Namespace metrics](#namespace-metrics) + * [Replication metrics](#replication-metrics) +* [Topic metrics](#topic-metrics) + * [Replication metrics](#replication-metrics-1) +* [Subscription metrics](#subscription-metrics) +* [Consumer metrics](#consumer-metrics) + +### Namespace metrics + +> Namespace metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to `false`. + +All the namespace metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. + +| Name | Type | Description | +|---|---|---| +| pulsar_topics_count | Gauge | The number of Pulsar topics of the namespace owned by this broker. | +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the namespace served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the namespace connected to this broker. 
| +| pulsar_consumers_count | Gauge | The number of active consumers of the namespace connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the namespace coming into this broker (messages/second). | +| pulsar_rate_out | Gauge | The total message rate of the namespace going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the namespace coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the namespace going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of the topics in this namespace owned by this broker (bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of the topics of this namespace owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this namespace offloaded to the tiered storage (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this namespace (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this namespace (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) that are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a namespace for which the storage write latency is smaller than a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a namespace for which the entry size is smaller than a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | + +#### Replication metrics + +If a namespace is configured to be replicated between multiple Pulsar clusters, the corresponding replication metrics will also be exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics will also be labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the namespace replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the namespace replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the namespace replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the namespace replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the namespace replicating to remote cluster (messages). | + +### Topic metrics + +> Topic metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to true. + +All the topic metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. + +| Name | Type | Description | +|---|---|---| +| pulsar_subscriptions_count | Gauge | The number of Pulsar subscriptions of the topic served by this broker. | +| pulsar_producers_count | Gauge | The number of active producers of the topic connected to this broker. | +| pulsar_consumers_count | Gauge | The number of active consumers of the topic connected to this broker. | +| pulsar_rate_in | Gauge | The total message rate of the topic coming into this broker (messages/second). 
| +| pulsar_rate_out | Gauge | The total message rate of the topic going out from this broker (messages/second). | +| pulsar_throughput_in | Gauge | The total throughput of the topic coming into this broker (bytes/second). | +| pulsar_throughput_out | Gauge | The total throughput of the topic going out from this broker (bytes/second). | +| pulsar_storage_size | Gauge | The total storage size of this topic owned by this broker (bytes). | +| pulsar_storage_backlog_size | Gauge | The total backlog size of this topic owned by this broker (messages). | +| pulsar_storage_offloaded_size | Gauge | The total amount of the data in this topic offloaded to the tiered storage (bytes). | +| pulsar_storage_write_rate | Gauge | The total message batches (entries) written to the storage for this topic (message batches / second). | +| pulsar_storage_read_rate | Gauge | The total message batches (entries) read from the storage for this topic (message batches / second). | +| pulsar_subscription_delayed | Gauge | The total message batches (entries) that are delayed for dispatching. | +| pulsar_storage_write_latency_le_* | Histogram | The entry rate of a topic for which the storage write latency is smaller than a given threshold.
    Available thresholds:
    • pulsar_storage_write_latency_le_0_5: <= 0.5ms
    • pulsar_storage_write_latency_le_1: <= 1ms
    • pulsar_storage_write_latency_le_5: <= 5ms
    • pulsar_storage_write_latency_le_10: <= 10ms
    • pulsar_storage_write_latency_le_20: <= 20ms
    • pulsar_storage_write_latency_le_50: <= 50ms
    • pulsar_storage_write_latency_le_100: <= 100ms
    • pulsar_storage_write_latency_le_200: <= 200ms
    • pulsar_storage_write_latency_le_1000: <= 1s
    • pulsar_storage_write_latency_le_overflow: > 1s
    | +| pulsar_entry_size_le_* | Histogram | The entry rate of a topic for which the entry size is smaller than a given threshold.
    Available thresholds:
    • pulsar_entry_size_le_128: <= 128 bytes
    • pulsar_entry_size_le_512: <= 512 bytes
    • pulsar_entry_size_le_1_kb: <= 1 KB
    • pulsar_entry_size_le_2_kb: <= 2 KB
    • pulsar_entry_size_le_4_kb: <= 4 KB
    • pulsar_entry_size_le_16_kb: <= 16 KB
    • pulsar_entry_size_le_100_kb: <= 100 KB
    • pulsar_entry_size_le_1_mb: <= 1 MB
    • pulsar_entry_size_le_overflow: > 1 MB
    | +| pulsar_in_bytes_total | Counter | The total number of bytes received for this topic | +| pulsar_in_messages_total | Counter | The total number of messages received for this topic | + +#### Replication metrics + +If a namespace that a topic belongs to is configured to be replicated between multiple Pulsar clusters, the corresponding replication metrics will also be exposed when `replicationMetricsEnabled` is enabled. + +All the replication metrics will also be labelled with `remoteCluster=${pulsar_remote_cluster}`. + +| Name | Type | Description | +|---|---|---| +| pulsar_replication_rate_in | Gauge | The total message rate of the topic replicating from remote cluster (messages/second). | +| pulsar_replication_rate_out | Gauge | The total message rate of the topic replicating to remote cluster (messages/second). | +| pulsar_replication_throughput_in | Gauge | The total throughput of the topic replicating from remote cluster (bytes/second). | +| pulsar_replication_throughput_out | Gauge | The total throughput of the topic replicating to remote cluster (bytes/second). | +| pulsar_replication_backlog | Gauge | The total backlog of the topic replicating to remote cluster (messages). | + + +### Subscription metrics + +> Subscription metrics are only exposed when `exposeTopicLevelMetricsInPrometheus` is set to true. + +All the subscription metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. + +| Name | Type | Description | +|---|---|---| +| pulsar_subscription_back_log | Gauge | The total backlog of a subscription (messages). 
| +| pulsar_subscription_delayed | Gauge | The total number of messages that are delayed to be dispatched for a subscription (messages). | +| pulsar_subscription_msg_rate_redeliver | Gauge | The total message rate for messages being redelivered (messages/second). | +| pulsar_subscription_unacked_messages | Gauge | The total number of unacknowledged messages of a subscription (messages). | +| pulsar_subscription_blocked_on_unacked_messages | Gauge | Indicates whether a subscription is blocked on unacknowledged messages or not.
    • 1 means the subscription is blocked waiting for unacknowledged messages to be acked.
    • 0 means the subscription is not blocked waiting for unacknowledged messages to be acked.
    | +| pulsar_subscription_msg_rate_out | Gauge | The total message dispatch rate for a subscription (messages/second). | +| pulsar_subscription_msg_throughput_out | Gauge | The total message dispatch throughput for a subscription (bytes/second). | + +### Consumer metrics + +> Consumer metrics are only exposed when both `exposeTopicLevelMetricsInPrometheus` and `exposeConsumerLevelMetricsInPrometheus` +> are set to true. + +All the consumer metrics are labelled with the following labels: + +- *cluster*: `cluster=${pulsar_cluster}`. `${pulsar_cluster}` is the cluster name that you configured in `broker.conf`. +- *namespace*: `namespace=${pulsar_namespace}`. `${pulsar_namespace}` is the namespace name. +- *topic*: `topic=${pulsar_topic}`. `${pulsar_topic}` is the topic name. +- *subscription*: `subscription=${subscription}`. `${subscription}` is the topic subscription name. +- *consumer_name*: `consumer_name=${consumer_name}`. `${consumer_name}` is the topic consumer name. +- *consumer_id*: `consumer_id=${consumer_id}`. `${consumer_id}` is the topic consumer id. + +| Name | Type | Description | +|---|---|---| +| pulsar_consumer_msg_rate_redeliver | Gauge | The total message rate for message being redelivered (messages/second). | +| pulsar_consumer_unacked_messages | Gauge | The total number of unacknowledged messages of a consumer (messages). | +| pulsar_consumer_blocked_on_unacked_messages | Gauge | Indicate whether a consumer is blocked on unacknowledged messages or not.
    • 1 means the consumer is blocked waiting for unacknowledged messages to be acked.
    • 0 means the consumer is not blocked waiting for unacknowledged messages to be acked.
    | +| pulsar_consumer_msg_rate_out | Gauge | The total message dispatch rate for a consumer (messages/second). | +| pulsar_consumer_msg_throughput_out | Gauge | The total message dispatch throughput for a consumer (bytes/second). | +| pulsar_consumer_available_permits | Gauge | The available permits for a consumer. | + +## Monitor + +You can [set up a Prometheus instance](https://prometheus.io/) to collect all the metrics exposed at Pulsar components and set up +[Grafana](https://grafana.com/) dashboards to display the metrics and monitor your Pulsar cluster. + +The following are some Grafana dashboard examples: + +- [pulsar-grafana](http://pulsar.apache.org/docs/en/deploy-monitoring/#grafana): A grafana dashboard that displays metrics collected in Prometheus for Pulsar clusters running on Kubernetes. +- [apache-pulsar-grafana-dashboard](https://github.com/streamnative/apache-pulsar-grafana-dashboard): A collection of grafana dashboard templates for different Pulsar components running on both Kubernetes and on-premise machines. diff --git a/site2/website/versioned_docs/version-2.5.0/reference-pulsar-admin.md b/site2/website/versioned_docs/version-2.5.0/reference-pulsar-admin.md new file mode 100644 index 0000000000000..fb285ef2e90fc --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/reference-pulsar-admin.md @@ -0,0 +1,2213 @@ +--- +id: version-2.5.0-pulsar-admin +title: Pulsar admin CLI +sidebar_label: Pulsar Admin CLI +original_id: pulsar-admin +--- + +The `pulsar-admin` tool enables you to manage Pulsar installations, including clusters, brokers, namespaces, tenants, and more. 
+ +Usage +```bash +$ pulsar-admin command +``` + +Commands +* `broker-stats` +* `brokers` +* `clusters` +* `functions` +* `functions-worker` +* `namespaces` +* `ns-isolation-policy` +* `sources` + + For more information, see [here](reference-connector-admin.md#sources) +* `sinks` + + For more information, see [here](reference-connector-admin.md#sinks) +* `topics` +* `tenants` +* `resource-quotas` +* `schemas` + +## `broker-stats` + +Operations to collect broker statistics + +```bash +$ pulsar-admin broker-stats subcommand +``` + +Subcommands +* `allocator-stats` +* `topics(destinations)` +* `mbeans` +* `monitoring-metrics` +* `load-report` + + +### `allocator-stats` + +Dump allocator stats + +Usage +```bash +$ pulsar-admin broker-stats allocator-stats allocator-name +``` + +### `topics(destinations)` + +Dump topic stats + +Usage +```bash +$ pulsar-admin broker-stats topics options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + +### `mbeans` + +Dump Mbean stats + +Usage +```bash +$ pulsar-admin broker-stats mbeans options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `monitoring-metrics` + +Dump metrics for monitoring + +Usage +```bash +$ pulsar-admin broker-stats monitoring-metrics options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-i`, `--indent`|Indent JSON output|false| + + +### `load-report` + +Dump broker load-report + +Usage +```bash +$ pulsar-admin broker-stats load-report +``` + + +## `brokers` + +Operations about brokers + +```bash +$ pulsar-admin brokers subcommand +``` + +Subcommands +* `list` +* `namespaces` +* `update-dynamic-config` +* `list-dynamic-config` +* `get-all-dynamic-config` +* `get-internal-config` +* `get-runtime-config` +* `healthcheck` + +### `list` +List active brokers of the cluster + +Usage +```bash +$ pulsar-admin brokers list cluster-name +``` + +### `namespaces` +List namespaces owned by 
the broker + +Usage +```bash +$ pulsar-admin brokers namespaces cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--url`|The URL for the broker|| + + +### `update-dynamic-config` +Update a broker's dynamic service configuration + +Usage +```bash +$ pulsar-admin brokers update-dynamic-config options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| +|`--value`|Value for the configuration parameter value specified using the `--config` flag|| + + +### `list-dynamic-config` +Get list of updatable configuration name + +Usage +```bash +$ pulsar-admin brokers list-dynamic-config +``` + +### `delete-dynamic-config` +Delete dynamic-serviceConfiguration of broker + +Usage +```bash +$ pulsar-admin brokers delete-dynamic-config options +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--config`|Service configuration parameter name|| + + +### `get-all-dynamic-config` +Get all overridden dynamic-configuration values + +Usage +```bash +$ pulsar-admin brokers get-all-dynamic-config +``` + +### `get-internal-config` +Get internal configuration information + +Usage +```bash +$ pulsar-admin brokers get-internal-config +``` + +### `get-runtime-config` +Get runtime configuration values + +Usage +```bash +$ pulsar-admin brokers get-runtime-config +``` + +### `healthcheck` +Run a health check against the broker + +Usage +```bash +$ pulsar-admin brokers healthcheck +``` + + +## `clusters` +Operations about clusters + +Usage +```bash +$ pulsar-admin clusters subcommand +``` + +Subcommands +* `get` +* `create` +* `update` +* `delete` +* `list` +* `update-peer-clusters` +* `get-peer-clusters` +* `get-failure-domain` +* `create-failure-domain` +* `update-failure-domain` +* `delete-failure-domain` +* `list-failure-domains` + + +### `get` +Get the configuration data for the specified cluster + +Usage +```bash +$ pulsar-admin clusters get cluster-name +``` + +### `create` +Provisions a new 
cluster. This operation requires Pulsar super-user privileges. + +Usage +```bash +$ pulsar-admin clusters create cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `update` +Update the configuration for a cluster + +Usage +```bash +$ pulsar-admin clusters update cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-url`|The URL for the broker service.|| +|`--broker-url-secure`|The broker service URL for a secure connection|| +|`--url`|service-url|| +|`--url-secure`|service-url for secure connection|| + + +### `delete` +Deletes an existing cluster + +Usage +```bash +$ pulsar-admin clusters delete cluster-name +``` + +### `list` +List the existing clusters + +Usage +```bash +$ pulsar-admin clusters list +``` + +### `update-peer-clusters` +Update peer cluster names + +Usage +```bash +$ pulsar-admin clusters update-peer-clusters cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--peer-clusters`|Comma separated peer cluster names (Pass empty string "" to delete list)|| + +### `get-peer-clusters` +Get list of peer clusters + +Usage +```bash +$ pulsar-admin clusters get-peer-clusters cluster-name +``` + +### `get-failure-domain` +Get the configuration brokers of a failure domain + +Usage +```bash +$ pulsar-admin clusters get-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `create-failure-domain` +Create a new failure domain for a cluster (updates it if already created) + +Usage +```bash +$ pulsar-admin clusters create-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma 
separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `update-failure-domain` +Update failure domain for a cluster (creates a new one if not exist) + +Usage +```bash +$ pulsar-admin clusters update-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--broker-list`|Comma separated broker list|| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `delete-failure-domain` +Delete an existing failure domain + +Usage +```bash +$ pulsar-admin clusters delete-failure-domain cluster-name options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--domain-name`|The failure domain name, which is a logical domain under a Pulsar cluster|| + +### `list-failure-domains` +List the existing failure domains for a cluster + +Usage +```bash +$ pulsar-admin clusters list-failure-domains cluster-name +``` + + +## `functions` + +A command-line interface for Pulsar Functions + +Usage +```bash +$ pulsar-admin functions subcommand +``` + +Subcommands +* `localrun` +* `create` +* `delete` +* `update` +* `get` +* `restart` +* `stop` +* `start` +* `status` +* `stats` +* `list` +* `querystate` +* `putstate` +* `trigger` + + +### `localrun` +Run the Pulsar Function locally (rather than deploying it to the Pulsar cluster) + + +Usage +```bash +$ pulsar-admin functions localrun options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user 
wants a specific subscription-name for input-topic consumer|| +|`--broker-service-url `|The URL of the Pulsar broker|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--client-auth-params`|Client authentication param|| +|`--client-auth-plugin`|Client authentication plugin using which function-process can connect to broker|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--hostname-verification-enabled`|Enable hostname verification|false| +|`--instance-id-offset`|Start the instanceIds from this offset|0| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--state-storage-service-url`|The URL for the state storage service (by default Apache BookKeeper)|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--inputs] and [--topics-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--timeout-ms`|The message timeout in milliseconds|| +|`--tls-allow-insecure`|Allow insecure tls connection|false| +|`--tls-trust-cert-path`|The tls trust cert file path|| +|`--use-tls`|Use tls connection|false| + + +### `create` +Create a Pulsar Function in cluster mode (i.e.
deploy it on a Pulsar cluster) + +Usage +``` +$ pulsar-admin functions create options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--inputs] and [--topics-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--timeout-ms`|The message timeout in milliseconds|| + + +### `delete` +Delete a Pulsar Function that's running on a Pulsar cluster + +Usage +```bash +$ pulsar-admin functions delete options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `update` +Update a Pulsar Function that's been deployed to a Pulsar cluster + +Usage +```bash +$ pulsar-admin functions update options +``` + + +Options +|Flag|Description|Default| +|---|---|---| +|`--cpu`|The cpu in cores that need to be allocated per
function instance(applicable only to docker runtime)|| +|`--ram`|The ram in bytes that need to be allocated per function instance(applicable only to process/docker runtime)|| +|`--disk`|The disk in bytes that need to be allocated per function instance(applicable only to docker runtime)|| +|`--auto-ack`|Whether or not the framework will automatically acknowledge messages|| +|`--subs-name`|Pulsar source subscription name if user wants a specific subscription-name for input-topic consumer|| +|`--classname`|The function's class name|| +|`--custom-serde-inputs`|The map of input topics to SerDe class names (as a JSON string)|| +|`--custom-schema-inputs`|The map of input topics to Schema class names (as a JSON string)|| +|`--function-config-file`|The path to a YAML config file specifying the function's configuration|| +|`--inputs`|The function's input topic or topics (multiple topics can be specified as a comma-separated list)|| +|`--log-topic`|The topic to which the function's logs are produced|| +|`--jar`|Path to the jar file for the function (if the function is written in Java). It also supports url-path [http/https/file (file protocol assumes that file already exists on worker host)] from which worker can download the package.|| +|`--name`|The function's name|| +|`--namespace`|The function’s namespace|| +|`--output`|The function's output topic (If none is specified, no output is written)|| +|`--output-serde-classname`|The SerDe class to be used for messages output by the function|| +|`--parallelism`|The function’s parallelism factor, i.e. the number of instances of the function to run|1| +|`--processing-guarantees`|The processing guarantees (aka delivery semantics) applied to the function. 
Possible Values: [ATLEAST_ONCE, ATMOST_ONCE, EFFECTIVELY_ONCE]|ATLEAST_ONCE| +|`--py`|Path to the main Python file/Python Wheel file for the function (if the function is written in Python)|| +|`--schema-type`|The builtin schema type or custom schema class name to be used for messages output by the function|| +|`--sliding-interval-count`|The number of messages after which the window slides|| +|`--sliding-interval-duration-ms`|The time duration after which the window slides|| +|`--tenant`|The function’s tenant|| +|`--topics-pattern`|The topic pattern to consume from list of topics under a namespace that match the pattern. [--inputs] and [--topics-pattern] are mutually exclusive. Add SerDe class name for a pattern in --custom-serde-inputs (supported for java fun only)|| +|`--user-config`|User-defined config key/values|| +|`--window-length-count`|The number of messages per window|| +|`--window-length-duration-ms`|The time duration of the window in milliseconds|| +|`--dead-letter-topic`|The topic where all messages which could not be processed successfully are sent|| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--max-message-retries`|How many times should we try to process a message before giving up|| +|`--retain-ordering`|Function consumes and processes messages in order|| +|`--timeout-ms`|The message timeout in milliseconds|| + + +### `get` +Fetch information about a Pulsar Function + +Usage +```bash +$ pulsar-admin functions get options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `restart` +Restart function instance + +Usage +```bash +$ pulsar-admin functions restart options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (restart all
instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stop` +Stops function instance + +Usage +```bash +$ pulsar-admin functions stop options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (stop all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `start` +Starts a stopped function instance + +Usage +```bash +$ pulsar-admin functions start options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (start all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `status` +Check the current status of a Pulsar Function + +Usage +```bash +$ pulsar-admin functions status options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-status of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `stats` +Get the current stats of a Pulsar Function + +Usage +```bash +$ pulsar-admin functions stats options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--instance-id`|The function instanceId (Get-stats of all instances if instance-id is not provided)|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + +### `list` 
+List all of the Pulsar Functions running under a specific tenant and namespace + +Usage +```bash +$ pulsar-admin functions list options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| + + +### `querystate` +Fetch the current state associated with a Pulsar Function running in cluster mode + +Usage +```bash +$ pulsar-admin functions querystate options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`-k`, `--key`|The key for the state you want to fetch|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`-w`, `--watch`|Watch for changes in the value associated with a key for a Pulsar Function|false| + +### `putstate` +Put a key/value pair to the state associated with a Pulsar Function + +Usage +```bash +$ pulsar-admin functions putstate options +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the Pulsar Function|| +|`--name`|The name of a Pulsar Function|| +|`--namespace`|The namespace of a Pulsar Function|| +|`--tenant`|The tenant of a Pulsar Function|| +|`-s`, `--state`|The FunctionState that needs to be put|| + +### `trigger` +Triggers the specified Pulsar Function with a supplied value + +Usage +```bash +$ pulsar-admin functions trigger options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--fqfn`|The Fully Qualified Function Name (FQFN) for the function|| +|`--name`|The function's name|| +|`--namespace`|The function's namespace|| +|`--tenant`|The function's tenant|| +|`--topic`|The specific topic name that the function consumes from that you want to inject the data to|| +|`--trigger-file`|The path to the file that contains the data with which you'd like to trigger the function|| +|`--trigger-value`|The value with which you want to trigger the 
function|| + + +## `functions-worker` +Operations to collect function-worker statistics + +```bash +$ pulsar-admin functions-worker subcommand +``` + +Subcommands + +* `function-stats` +* `get-cluster` +* `get-cluster-leader` +* `get-function-assignments` +* `monitoring-metrics` + +### `function-stats` + +Dump all functions stats running on this broker + +Usage +```bash +$ pulsar-admin functions-worker function-stats +``` + +### `get-cluster` + +Get all workers belonging to this cluster + +Usage +```bash +$ pulsar-admin functions-worker get-cluster +``` + +### `get-cluster-leader` + +Get the leader of the worker cluster + +Usage +```bash +$ pulsar-admin functions-worker get-cluster-leader +``` + +### `get-function-assignments` + +Get the assignments of the functions across the worker cluster + +Usage +```bash +$ pulsar-admin functions-worker get-function-assignments +``` + +### `monitoring-metrics` + +Dump metrics for Monitoring + +Usage +```bash +$ pulsar-admin functions-worker monitoring-metrics +``` + +## `namespaces` + +Operations for managing namespaces + + +```bash +$ pulsar-admin namespaces subcommand +``` + +Subcommands +* `list` +* `topics` +* `policies` +* `create` +* `delete` +* `set-deduplication` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `grant-subscription-permission` +* `revoke-subscription-permission` +* `set-clusters` +* `get-clusters` +* `get-backlog-quotas` +* `set-backlog-quota` +* `remove-backlog-quota` +* `get-persistence` +* `set-persistence` +* `get-message-ttl` +* `set-message-ttl` +* `get-anti-affinity-group` +* `set-anti-affinity-group` +* `get-anti-affinity-namespaces` +* `delete-anti-affinity-group` +* `get-retention` +* `set-retention` +* `unload` +* `split-bundle` +* `set-dispatch-rate` +* `get-dispatch-rate` +* `set-replicator-dispatch-rate` +* `get-replicator-dispatch-rate` +* `set-subscribe-rate` +* `get-subscribe-rate` +* `set-subscription-dispatch-rate` +* `get-subscription-dispatch-rate` +* `clear-backlog` 
+* `unsubscribe` +* `set-encryption-required` +* `set-subscription-auth-mode` +* `get-max-producers-per-topic` +* `set-max-producers-per-topic` +* `get-max-consumers-per-topic` +* `set-max-consumers-per-topic` +* `get-max-consumers-per-subscription` +* `set-max-consumers-per-subscription` +* `get-compaction-threshold` +* `set-compaction-threshold` +* `get-offload-threshold` +* `set-offload-threshold` +* `get-offload-deletion-lag` +* `set-offload-deletion-lag` +* `clear-offload-deletion-lag` +* `get-schema-autoupdate-strategy` +* `set-schema-autoupdate-strategy` + + +### `list` +Get the namespaces for a tenant + +Usage +```bash +$ pulsar-admin namespaces list tenant-name +``` + +### `topics` +Get the list of topics for a namespace + +Usage +```bash +$ pulsar-admin namespaces topics tenant/namespace +``` + +### `policies` +Get the configuration policies of a namespace + +Usage +```bash +$ pulsar-admin namespaces policies tenant/namespace +``` + +### `create` +Create a new namespace + +Usage +```bash +$ pulsar-admin namespaces create tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-b`, `--bundles`|The number of bundles to activate|0| +|`-c`, `--clusters`|List of clusters this namespace will be assigned|| + + +### `delete` +Deletes a namespace. 
The namespace needs to be empty + +Usage +```bash +$ pulsar-admin namespaces delete tenant/namespace +``` + +### `set-deduplication` +Enable or disable message deduplication on a namespace + +Usage +```bash +$ pulsar-admin namespaces set-deduplication tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--enable`, `-e`|Enable message deduplication on the specified namespace|false| +|`--disable`, `-d`|Disable message deduplication on the specified namespace|false| + + +### `permissions` +Get the permissions on a namespace + +Usage +```bash +$ pulsar-admin namespaces permissions tenant/namespace +``` + +### `grant-permission` +Grant permissions on a namespace + +Usage +```bash +$ pulsar-admin namespaces grant-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions on a namespace + +Usage +```bash +$ pulsar-admin namespaces revoke-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| + +### `grant-subscription-permission` +Grant permissions to access subscription admin-api + +Usage +```bash +$ pulsar-admin namespaces grant-subscription-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--roles`|The client roles to which to grant the permissions (comma separated roles)|| +|`--subscription`|The subscription name for which permission will be granted to roles|| + +### `revoke-subscription-permission` +Revoke permissions to access subscription admin-api + +Usage +```bash +$ pulsar-admin namespaces revoke-subscription-permission tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--role`|The client role to which to revoke the permissions|| 
+|`--subscription`|The subscription name for which permission will be revoked to roles|| + +### `set-clusters` +Set replication clusters for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-clusters tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-c`, `--clusters`|Replication clusters ID list (comma-separated values)|| + + +### `get-clusters` +Get replication clusters for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-clusters tenant/namespace +``` + +### `get-backlog-quotas` +Get the backlog quota policies for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-backlog-quotas tenant/namespace +``` + +### `set-backlog-quota` +Set a backlog quota policy for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-backlog-quota tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-l`, `--limit`|The backlog size limit (for example `10M` or `16G`)|| +|`-p`, `--policy`|The retention policy to enforce when the limit is reached. 
The valid options are: `producer_request_hold`, `producer_exception` or `consumer_backlog_eviction`|| + +Example +```bash +$ pulsar-admin namespaces set-backlog-quota my-tenant/my-ns \ +--limit 2G \ +--policy producer_request_hold +``` + +### `remove-backlog-quota` +Remove a backlog quota policy from a namespace + +Usage +```bash +$ pulsar-admin namespaces remove-backlog-quota tenant/namespace +``` + +### `get-persistence` +Get the persistence policies for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-persistence tenant/namespace +``` + +### `set-persistence` +Set the persistence policies for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-persistence tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-a`, `--bookkeeper-ack-quorum`|The number of acks (guaranteed copies) to wait for each entry|0| +|`-e`, `--bookkeeper-ensemble`|The number of bookies to use for a topic|0| +|`-w`, `--bookkeeper-write-quorum`|How many writes to make of each entry|0| +|`-r`, `--ml-mark-delete-max-rate`|Throttling rate of mark-delete operation (0 means no throttle)|| + + +### `get-message-ttl` +Get the message TTL for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-message-ttl tenant/namespace +``` + +### `set-message-ttl` +Set the message TTL for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-message-ttl tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-ttl`, `--messageTTL`|Message TTL in seconds|0| + +### `get-anti-affinity-group` +Get Anti-affinity group name for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-anti-affinity-group tenant/namespace +``` + +### `set-anti-affinity-group` +Set Anti-affinity group name for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-anti-affinity-group tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-g`, `--group`|Anti-affinity group name|| + +###
`get-anti-affinity-namespaces` +Get Anti-affinity namespaces grouped with the given anti-affinity group name + +Usage +```bash +$ pulsar-admin namespaces get-anti-affinity-namespaces options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--cluster`|Cluster name|| +|`-g`, `--group`|Anti-affinity group name|| +|`-p`, `--tenant`|Tenant is only used for authorization. Client has to be admin of any of the tenant to access this api|| + +### `delete-anti-affinity-group` +Remove Anti-affinity group name for a namespace + +Usage +```bash +$ pulsar-admin namespaces delete-anti-affinity-group tenant/namespace +``` + +### `get-retention` +Get the retention policy for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-retention tenant/namespace +``` + +### `set-retention` +Set the retention policy for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-retention tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|The retention size limits (for example 10M, 16G or 3T). 0 means no retention and -1 means infinite size retention|| +|`-t`, `--time`|The retention time in minutes, hours, days, or weeks. Examples: 100m, 13h, 2d, 5w. 0 means no retention and -1 means infinite time retention|| + + +### `unload` +Unload a namespace or namespace bundle from the current serving broker. + +Usage +```bash +$ pulsar-admin namespaces unload tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| + +### `split-bundle` +Split a namespace-bundle from the current serving broker + +Usage +```bash +$ pulsar-admin namespaces split-bundle tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g.
0x00000000_0xffffffff)|| +|`-u`, `--unload`|Unload newly split bundles after splitting old bundle|false| + +### `set-dispatch-rate` +Set message-dispatch-rate for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-dispatch-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwritten if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in seconds (default 1 second will be overwritten if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwritten if not passed)|-1| + +### `get-dispatch-rate` +Get configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage +```bash +$ pulsar-admin namespaces get-dispatch-rate tenant/namespace +``` + +### `set-replicator-dispatch-rate` +Set replicator message-dispatch-rate for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-replicator-dispatch-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwritten if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in seconds (default 1 second will be overwritten if not passed)|1| +|`-md`, `--msg-dispatch-rate`|The message dispatch rate (default -1 will be overwritten if not passed)|-1| + +### `get-replicator-dispatch-rate` +Get replicator configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage +```bash +$ pulsar-admin namespaces get-replicator-dispatch-rate tenant/namespace +``` + +### `set-subscribe-rate` +Set subscribe-rate per consumer for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-subscribe-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-sr`, `--subscribe-rate`|The
subscribe rate (default -1 will be overwritten if not passed)|-1| +|`-st`, `--subscribe-rate-period`|The subscribe rate period in seconds (default 30 seconds will be overwritten if not passed)|30| + +### `get-subscribe-rate` +Get configured subscribe-rate per consumer for all topics of the namespace + +Usage +```bash +$ pulsar-admin namespaces get-subscribe-rate tenant/namespace +``` + +### `set-subscription-dispatch-rate` +Set subscription message-dispatch-rate for all subscription of the namespace + +Usage +```bash +$ pulsar-admin namespaces set-subscription-dispatch-rate tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bd`, `--byte-dispatch-rate`|The byte dispatch rate (default -1 will be overwritten if not passed)|-1| +|`-dt`, `--dispatch-rate-period`|The dispatch rate period in seconds (default 1 second will be overwritten if not passed)|1| +|`-md`, `--sub-msg-dispatch-rate`|The message dispatch rate (default -1 will be overwritten if not passed)|-1| + +### `get-subscription-dispatch-rate` +Get subscription configured message-dispatch-rate for all topics of the namespace (Disabled if value < 0) + +Usage +```bash +$ pulsar-admin namespaces get-subscription-dispatch-rate tenant/namespace +``` + +### `clear-backlog` +Clear the backlog for a namespace + +Usage +```bash +$ pulsar-admin namespaces clear-backlog tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g. 0x00000000_0xffffffff)|| +|`-force`, `--force`|Whether to force a clear backlog without prompt|false| +|`-s`, `--sub`|The subscription name|| + + +### `unsubscribe` +Unsubscribe the given subscription on all destinations on a namespace + +Usage +```bash +$ pulsar-admin namespaces unsubscribe tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|{start-boundary}_{end-boundary} (e.g.
0x00000000_0xffffffff)|| +|`-s`, `--sub`|The subscription name|| + +### `set-encryption-required` +Enable or disable message encryption required for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-encryption-required tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-d`, `--disable`|Disable message encryption required|false| +|`-e`, `--enable`|Enable message encryption required|false| + +### `set-subscription-auth-mode` +Set subscription auth mode on a namespace + +Usage +```bash +$ pulsar-admin namespaces set-subscription-auth-mode tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-m`, `--subscription-auth-mode`|Subscription authorization mode for Pulsar policies. Valid options are: [None, Prefix]|| + +### `get-max-producers-per-topic` +Get maxProducersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-max-producers-per-topic tenant/namespace +``` + +### `set-max-producers-per-topic` +Set maxProducersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-max-producers-per-topic tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-p`, `--max-producers-per-topic`|maxProducersPerTopic for a namespace|0| + +### `get-max-consumers-per-topic` +Get maxConsumersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-max-consumers-per-topic tenant/namespace +``` + +### `set-max-consumers-per-topic` +Set maxConsumersPerTopic for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-max-consumers-per-topic tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-topic`|maxConsumersPerTopic for a namespace|0| + +### `get-max-consumers-per-subscription` +Get maxConsumersPerSubscription for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-max-consumers-per-subscription tenant/namespace +``` + +### 
`set-max-consumers-per-subscription` +Set maxConsumersPerSubscription for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-max-consumers-per-subscription tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--max-consumers-per-subscription`|maxConsumersPerSubscription for a namespace|0| + + +### `get-compaction-threshold` +Get compactionThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-compaction-threshold tenant/namespace +``` + +### `set-compaction-threshold` +Set compactionThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-compaction-threshold tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-t`, `--threshold`|Maximum number of bytes in a topic backlog before compaction is triggered (eg: 10M, 16G, 3T). 0 disables automatic compaction|0| + + +### `get-offload-threshold` +Get offloadThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-offload-threshold tenant/namespace +``` + +### `set-offload-threshold` +Set offloadThreshold for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-offload-threshold tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-s`, `--size`|Maximum number of bytes stored in the pulsar cluster for a topic before data will start being automatically offloaded to longterm storage (eg: 10M, 16G, 3T, 100). Negative values disable automatic offload. 
0 triggers offloading as soon as possible.|-1| + +### `get-offload-deletion-lag` +Get offloadDeletionLag, in minutes, for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-offload-deletion-lag tenant/namespace +``` + +### `set-offload-deletion-lag` +Set offloadDeletionLag for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-offload-deletion-lag tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-l`, `--lag`|Duration to wait after offloading a ledger segment, before deleting the copy of that segment from cluster local storage. (eg: 10m, 5h, 3d, 2w).|-1| + +### `clear-offload-deletion-lag` +Clear offloadDeletionLag for a namespace + +Usage +```bash +$ pulsar-admin namespaces clear-offload-deletion-lag tenant/namespace +``` + +### `get-schema-autoupdate-strategy` +Get the schema auto-update strategy for a namespace + +Usage +```bash +$ pulsar-admin namespaces get-schema-autoupdate-strategy tenant/namespace +``` + +### `set-schema-autoupdate-strategy` +Set the schema auto-update strategy for a namespace + +Usage +```bash +$ pulsar-admin namespaces set-schema-autoupdate-strategy tenant/namespace options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--compatibility`|Compatibility level required for new schemas created via a Producer. Possible values (Full, Backward, Forward, None).|Full| +|`-d`, `--disabled`|Disable automatic schema updates.|false| + + +## `ns-isolation-policy` +Operations for managing namespace isolation policies. + +Usage +```bash +$ pulsar-admin ns-isolation-policy subcommand +``` + +Subcommands +* `set` +* `get` +* `list` +* `delete` +* `brokers` +* `broker` + +### `set` +Create/update a namespace isolation policy for a cluster. This operation requires Pulsar superuser privileges. 
+ +Usage +```bash +$ pulsar-admin ns-isolation-policy set cluster-name policy-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--auto-failover-policy-params`|Comma-separated name=value auto failover policy parameters|[]| +|`--auto-failover-policy-type`|Auto failover policy type name. Currently available options: min_available.|[]| +|`--namespaces`|Comma-separated namespaces regex list|[]| +|`--primary`|Comma-separated primary broker regex list|[]| +|`--secondary`|Comma-separated secondary broker regex list|[]| + + +### `get` +Get the namespace isolation policy of a cluster. This operation requires Pulsar superuser privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy get cluster-name policy-name +``` + +### `list` +List all namespace isolation policies of a cluster. This operation requires Pulsar superuser privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy list cluster-name +``` + +### `delete` +Delete namespace isolation policy of a cluster. This operation requires superuser privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy delete cluster-name policy-name +``` + +### `brokers` +List all brokers with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges. + +Usage +```bash +$ pulsar-admin ns-isolation-policy brokers cluster-name +``` + +### `broker` +Get broker with namespace-isolation policies attached to it. This operation requires Pulsar super-user privileges.
+ +Usage +```bash +$ pulsar-admin ns-isolation-policy broker cluster-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--broker`|Broker name to get namespace-isolation policies attached to it|| + +## `topics` +Operations for managing Pulsar topics (both persistent and non persistent) + +Usage +```bash +$ pulsar-admin topics subcommand +``` + +Subcommands +* `compact` +* `compaction-status` +* `offload` +* `offload-status` +* `create-partitioned-topic` +* `create-missed-partitions` +* `delete-partitioned-topic` +* `create` +* `get-partitioned-topic-metadata` +* `update-partitioned-topic` +* `list` +* `list-in-bundle` +* `terminate` +* `permissions` +* `grant-permission` +* `revoke-permission` +* `lookup` +* `bundle-range` +* `delete` +* `unload` +* `subscriptions` +* `unsubscribe` +* `stats` +* `stats-internal` +* `info-internal` +* `partitioned-stats` +* `skip` +* `clear-backlog` +* `expire-messages` +* `expire-messages-all-subscriptions` +* `peek-messages` +* `reset-cursor` + + +### `compact` +Run compaction on the specified topic (persistent topics only) + +Usage +``` +$ pulsar-admin topics compact persistent://tenant/namespace/topic +``` + +### `compaction-status` +Check the status of a topic compaction (persistent topics only) + +Usage +```bash +$ pulsar-admin topics compaction-status persistent://tenant/namespace/topic +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-w`, `--wait-complete`|Wait for compaction to complete|false| + + +### `offload` +Trigger offload of data from a topic to long-term storage (e.g. 
Amazon S3) + +Usage +```bash +$ pulsar-admin topics offload persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--size-threshold`|The maximum amount of data to keep in BookKeeper for the specific topic|| + + +### `offload-status` +Check the status of data offloading from a topic to long-term storage + +Usage +```bash +$ pulsar-admin topics offload-status persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-w`, `--wait-complete`|Wait for offloading to complete|false| + + +### `create-partitioned-topic` +Create a partitioned topic. A partitioned topic must be created before producers can publish to it. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +Usage +```bash +$ pulsar-admin topics create-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `create-missed-partitions` +Try to create partitions for partitioned topic. The partitions of partition topic has to be created, +can be used by repair partitions when topic auto creation is disabled + +Usage +```bash +$ pulsar-admin topics create-missed-partitions persistent://tenant/namespace/topic +``` + +### `delete-partitioned-topic` +Delete a partitioned topic. This will also delete all the partitions of the topic if they exist.
+ +Usage +```bash +$ pulsar-admin topics delete-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic +``` + +### `create` +Creates a non-partitioned topic. A non-partitioned topic must explicitly be created by the user if allowAutoTopicCreation or createIfMissing is disabled. + +> #### Note +> +> By default, after 60 seconds of creation, topics are considered inactive and deleted automatically to prevent from generating trash data. +> +> To disable this feature, set `brokerDeleteInactiveTopicsEnabled` to `false`. +> +> To change the frequency of checking inactive topics, set `brokerDeleteInactiveTopicsFrequencySeconds` to your desired value. +> +> For more information about these two parameters, see [here](reference-configuration.md#broker). + +Usage +```bash +$ pulsar-admin topics create {persistent|non-persistent}://tenant/namespace/topic +``` + +### `get-partitioned-topic-metadata` +Get the partitioned topic metadata. If the topic is not created or is a non-partitioned topic, this will return an empty topic with zero partitions. + +Usage +```bash +$ pulsar-admin topics get-partitioned-topic-metadata {persistent|non-persistent}://tenant/namespace/topic +``` + +### `update-partitioned-topic` +Update existing non-global partitioned topic. The new number of partitions must be greater than the existing number of partitions.
+ +Usage +```bash +$ pulsar-admin topics update-partitioned-topic {persistent|non-persistent}://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-p`, `--partitions`|The number of partitions for the topic|0| + +### `list` +Get the list of topics under a namespace + +Usage +``` +$ pulsar-admin topics list tenant/cluster/namespace +``` + +### `list-in-bundle` +Get a list of non-persistent topics present under a namespace bundle + +Usage +``` +$ pulsar-admin topics list-in-bundle tenant/namespace options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-b`, `--bundle`|The bundle range|| + + +### `terminate` +Terminate a topic (disallow further messages from being published on the topic) + +Usage +```bash +$ pulsar-admin topics terminate {persistent|non-persistent}://tenant/namespace/topic +``` + +### `permissions` +Get the permissions on a topic. Retrieve the effective permissions for a destination. These permissions are defined by the permissions set at the namespace level combined (union) with any eventual specific permissions set on the topic. + +Usage +```bash +$ pulsar-admin topics permissions topic +``` + +### `grant-permission` +Grant a new permission to a client role on a single topic + +Usage +```bash +$ pulsar-admin topics grant-permission {persistent|non-persistent}://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--actions`|Actions to be granted (`produce` or `consume`)|| +|`--role`|The client role to which to grant the permissions|| + + +### `revoke-permission` +Revoke permissions to a client role on a single topic. If the permission was not set at the topic level, but rather at the namespace level, this operation will return an error (HTTP status code 412).
+ +Usage +```bash +$ pulsar-admin topics revoke-permission topic +``` + +### `lookup` +Look up a topic from the current serving broker + +Usage +```bash +$ pulsar-admin topics lookup topic +``` + +### `bundle-range` +Get the namespace bundle which contains the given topic + +Usage +```bash +$ pulsar-admin topics bundle-range topic +``` + +### `delete` +Delete a topic. The topic cannot be deleted if there are any active subscriptions or producers connected to the topic. + +Usage +```bash +$ pulsar-admin topics delete topic +``` + +### `unload` +Unload a topic + +Usage +```bash +$ pulsar-admin topics unload topic +``` + +### `subscriptions` +Get the list of subscriptions on the topic + +Usage +```bash +$ pulsar-admin topics subscriptions topic +``` + +### `unsubscribe` +Delete a durable subscriber from a topic + +Usage +```bash +$ pulsar-admin topics unsubscribe topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to delete|| + + +### `stats` +Get the stats for the topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. + +Usage +```bash +$ pulsar-admin topics stats topic +``` + +> Note +> The unit of `storageSize` and `averageMsgSize` is Byte. + +### `stats-internal` +Get the internal stats for the topic + +Usage +```bash +$ pulsar-admin topics stats-internal topic +``` + +### `info-internal` +Get the internal metadata info for the topic + +Usage +```bash +$ pulsar-admin topics info-internal topic +``` + +### `partitioned-stats` +Get the stats for the partitioned topic and its connected producers and consumers. All rates are computed over a 1-minute window and are relative to the last completed 1-minute period. 
+ +Usage +```bash +$ pulsar-admin topics partitioned-stats topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`--per-partition`|Get per-partition stats|false| + + +### `skip` +Skip some messages for the subscription + +Usage +```bash +$ pulsar-admin topics skip topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages to skip|0| +|`-s`, `--subscription`|The subscription on which to skip messages|| + + +### `clear-backlog` +Clear backlog (skip all the messages) for the subscription + +Usage +```bash +$ pulsar-admin topics clear-backlog topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|The subscription to clear|| + + +### `expire-messages` +Expire messages that are older than the given expiry time (in seconds) for the subscription. + +Usage +```bash +$ pulsar-admin topics expire-messages topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| +|`-s`, `--subscription`|The subscription to skip messages on|| + + +### `expire-messages-all-subscriptions` +Expire messages older than the given expiry time (in seconds) for all subscriptions + +Usage +```bash +$ pulsar-admin topics expire-messages-all-subscriptions topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-t`, `--expireTime`|Expire messages older than the time (in seconds)|0| + + +### `peek-messages` +Peek some messages for the subscription. + +Usage +```bash +$ pulsar-admin topics peek-messages topic options +``` + +Options +|Flag|Description|Default| +|---|---|---| +|`-n`, `--count`|The number of messages|0| +|`-s`, `--subscription`|Subscription to get messages from|| + + +### `reset-cursor` +Reset position for subscription to a position that is closest to timestamp or messageId. 
+ +Usage +```bash +$ pulsar-admin topics reset-cursor topic options +``` + +Options + +|Flag|Description|Default| +|---|---|---| +|`-s`, `--subscription`|Subscription to reset position on|| +|`-t`, `--time`|The time in minutes to reset back to (or minutes, hours, days, weeks, etc.). Examples: `100m`, `3h`, `2d`, `5w`.|| +|`-m`, `--messageId`| The messageId to reset back to (ledgerId:entryId). || + + + +## `tenants` +Operations for managing tenants + +Usage +```bash +$ pulsar-admin tenants subcommand +``` + +Subcommands +* `list` +* `get` +* `create` +* `update` +* `delete` + +### `list` +List the existing tenants + +Usage +```bash +$ pulsar-admin tenants list +``` + +### `get` +Gets the configuration of a tenant + +Usage +```bash +$ pulsar-admin tenants get tenant-name +``` + +### `create` +Creates a new tenant + +Usage +```bash +$ pulsar-admin tenants create tenant-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + +### `update` +Updates a tenant + +Usage +```bash +$ pulsar-admin tenants update tenant-name options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-r`, `--admin-roles`|Comma-separated admin roles|| +|`-c`, `--allowed-clusters`|Comma-separated allowed clusters|| + + +### `delete` +Deletes an existing tenant + +Usage +```bash +$ pulsar-admin tenants delete tenant-name +``` + + +## `resource-quotas` +Operations for managing resource quotas + +Usage +```bash +$ pulsar-admin resource-quotas subcommand +``` + +Subcommands +* `get` +* `set` +* `reset-namespace-bundle-quota` + + +### `get` +Get the resource quota for a specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage +```bash +$ pulsar-admin resource-quotas get options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. 
This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + +### `set` +Set the resource quota for the specified namespace bundle, or default quota if no namespace/bundle is specified. + +Usage +```bash +$ pulsar-admin resource-quotas set options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-bi`, `--bandwidthIn`|The expected inbound bandwidth (in bytes/second)|0| +|`-bo`, `--bandwidthOut`|Expected outbound bandwidth (in bytes/second)|0| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-d`, `--dynamic`|Allow to be dynamically re-calculated (or not)|false| +|`-mem`, `--memory`|Expected memory usage (in megabytes)|0| +|`-mi`, `--msgRateIn`|Expected incoming messages per second|0| +|`-mo`, `--msgRateOut`|Expected outgoing messages per second|0| +|`-n`, `--namespace`|The namespace as tenant/namespace, for example my-tenant/my-ns. Must be specified together with -b/--bundle.|| + + +### `reset-namespace-bundle-quota` +Reset the specified namespace bundle's resource quota to a default value. + +Usage +```bash +$ pulsar-admin resource-quotas reset-namespace-bundle-quota options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-b`, `--bundle`|A bundle of the form {start-boundary}_{end_boundary}. This must be specified together with -n/--namespace.|| +|`-n`, `--namespace`|The namespace|| + + + +## `schemas` +Operations related to Schemas associated with Pulsar topics. + +Usage +``` +$ pulsar-admin schemas subcommand +``` + +Subcommands +* `upload` +* `delete` +* `get` +* `extract` + + +### `upload` +Upload the schema definition for a topic + +Usage +```bash +$ pulsar-admin schemas upload persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--filename`|The path to the schema definition file.
An example schema file is available under conf directory.|| + + +### `delete` +Delete the schema definition associated with a topic + +Usage +```bash +$ pulsar-admin schemas delete persistent://tenant/namespace/topic +``` + + +### `get` +Retrieve the schema definition associated with a topic (at a given version if version is supplied). + +Usage +```bash +$ pulsar-admin schemas get persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`--version`|The version of the schema definition to retrieve for a topic.|| + +### `extract` +Provide the schema definition for a topic via Java class name contained in a JAR file + +Usage +```bash +$ pulsar-admin schemas extract persistent://tenant/namespace/topic options +``` + +Options +|Flag|Description|Default| +|----|---|---| +|`-c`, `--classname`|The Java class name|| +|`-j`, `--jar`|A path to the JAR file which contains the above Java class|| +|`-t`, `--type`|The type of the schema (avro or json)|| + + diff --git a/site2/website/versioned_docs/version-2.5.0/schema-evolution-compatibility.md b/site2/website/versioned_docs/version-2.5.0/schema-evolution-compatibility.md new file mode 100644 index 0000000000000..54bc20f9696fc --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/schema-evolution-compatibility.md @@ -0,0 +1,953 @@ +--- +id: version-2.5.0-schema-evolution-compatibility +title: Schema evolution and compatibility +sidebar_label: Schema evolution and compatibility +original_id: schema-evolution-compatibility +--- + +Normally, schemas do not stay the same over a long period of time. Instead, they undergo evolutions to satisfy new needs. + +This chapter examines how Pulsar schema evolves and what Pulsar schema compatibility check strategies are. + +## Schema evolution + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +Each `SchemaInfo` stored with a topic has a version.
The version is used to manage the schema changes happening within a topic. + +The message produced with `SchemaInfo` is tagged with a schema version. When a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and use the correct schema information to deserialize data. + +### What is schema evolution? + +Schemas store the details of attributes and types. To satisfy new business requirements, you need to update schemas inevitably over time, which is called **schema evolution**. + +Any schema changes affect downstream consumers. Schema evolution ensures that the downstream consumers can seamlessly handle data encoded with both old schemas and new schemas. + +### How Pulsar schema should evolve? + +The answer is Pulsar schema compatibility check strategy. It determines how schema compares old schemas with new schemas in topics. + +For more information, see [Schema compatibility check strategy](#schema-compatibility-check-strategy). + +### How does Pulsar support schema evolution? + +1. When a producer/consumer/reader connects to a broker, the broker deploys the schema compatibility checker configured by `schemaRegistryCompatibilityCheckers` to enforce schema compatibility check. + + The schema compatibility checker is one instance per schema type. + + Currently, Avro and JSON have their own compatibility checkers, while all the other schema types share the default compatibility checker which disables schema evolution. + +2. The producer/consumer/reader sends its client `SchemaInfo` to the broker. + +3. The broker knows the schema type and locates the schema compatibility checker for that type. + +4. The broker uses the checker to check if the `SchemaInfo` is compatible with the latest schema of the topic by applying its compatibility check strategy. + + Currently, the compatibility check strategy is configured at the namespace level and applied to all the topics within that namespace. 
+ +## Schema compatibility check strategy + +Pulsar has 8 schema compatibility check strategies, which are summarized in the following table. + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +Compatibility check strategy + + + +Definition + + + +Changes allowed + + + +Check against which schema + + + +Upgrade first + +
    + +`ALWAYS_COMPATIBLE` + + + +Disable schema compatibility check. + + + +All changes are allowed + + + +All previous versions + + + +Any order + +
    + +`ALWAYS_INCOMPATIBLE` + + + +Disable schema evolution. + + + +All changes are disabled + + + +None + + + +None + +
    + +`BACKWARD` + + + +Consumers using the schema V3 can process data written by producers using the schema V3 or V2. + + + +* Add optional fields + +* Delete fields + + + +Latest version + + + +Consumers + +
    + +`BACKWARD_TRANSITIVE` + + + +Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. + + + +* Add optional fields + +* Delete fields + + + +All previous versions + + + +Consumers + +
    + +`FORWARD` + + + +Consumers using the schema V3 or V2 can process data written by producers using the schema V3. + + + +* Add fields + +* Delete optional fields + + + +Latest version + + + +Producers + +
    + +`FORWARD_TRANSITIVE` + + + +Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. + + + +* Add fields + +* Delete optional fields + + + +All previous versions + + + +Producers + +
    + +`FULL` + + + +Backward and forward compatible between the schema V3 and V2. + + + +* Modify optional fields + + + +Latest version + + + +Any order + +
    + +`FULL_TRANSITIVE` + + + +Backward and forward compatible among the schema V3, V2, and V1. + + + +* Modify optional fields + + + +All previous versions + + + +Any order + +
    + +### ALWAYS_COMPATIBLE and ALWAYS_INCOMPATIBLE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +Compatibility check strategy + + + +Definition + + + +Note + +
    + +`ALWAYS_COMPATIBLE` + + + +Disable schema compatibility check. + + + +None + +
    + +`ALWAYS_INCOMPATIBLE` + + + +Disable schema evolution, that is, any schema change is rejected. + + + +* For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`. + +* For Avro and JSON, the default schema compatibility check strategy is `FULL`. + +
    + +#### Example + +* Example 1 + + In some situations, an application needs to store events of several different types in the same Pulsar topic. + + In particular, when developing a data model in an `Event Sourcing` style, you might have several kinds of events that affect the state of an entity. + + For example, for a user entity, there are `userCreated`, `userAddressChanged` and `userEnquiryReceived` events. The application requires that those events are always read in the same order. + + Consequently, those events need to go in the same Pulsar partition to maintain order. This application can use `ALWAYS_COMPATIBLE` to allow different kinds of events co-exist in the same topic. + +* Example 2 + + Sometimes we also make incompatible changes. + + For example, you are modifying a field type from `string` to `int`. + + In this case, you need to: + + * Upgrade all producers and consumers to the new schema versions at the same time. + + * Optionally, create a new topic and start migrating applications to use the new topic and the new schema, avoiding the need to handle two incompatible versions in the same topic. + +### BACKWARD and BACKWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`BACKWARD` | Consumers using the new schema can process data written by producers using the **last schema**. | The consumers using the schema V3 can process data written by producers using the schema V3 or V2. | +`BACKWARD_TRANSITIVE` | Consumers using the new schema can process data written by producers using **all previous schemas**. | The consumers using the schema V3 can process data written by producers using the schema V3, V2, or V1. | + +#### Example + +* Example 1 + + Remove a field. 
+ + A consumer constructed to process events without one field can process events written with the old schema containing the field, and the consumer will ignore that field. + +* Example 2 + + You want to load all Pulsar data into a Hive data warehouse and run SQL queries against the data. + + The same SQL queries must continue to work even if the data is changed. To support it, you can evolve the schemas using the `BACKWARD` strategy. + +### FORWARD and FORWARD_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + +| Compatibility check strategy | Definition | Description | +|---|---|---| +`FORWARD` | Consumers using the **last schema** can process data written by producers using a new schema, even though they may not be able to use the full capabilities of the new schema. | The consumers using the schema V3 or V2 can process data written by producers using the schema V3. | +`FORWARD_TRANSITIVE` | Consumers using **all previous schemas** can process data written by producers using a new schema. | The consumers using the schema V3, V2, or V1 can process data written by producers using the schema V3. | + +#### Example + +* Example 1 + + Add a field. + + In most data formats, consumers written to process events without new fields can continue doing so even when they receive new events containing new fields. + +* Example 2 + + If a consumer has an application logic tied to a full version of a schema, the application logic may not be updated instantly when the schema evolves. + + In this case, you need to project data with a new schema onto an old schema that the application understands. + + Consequently, you can evolve the schemas using the `FORWARD` strategy to ensure that the old schema can process data encoded with the new schema.
+ +### FULL and FULL_TRANSITIVE + +Suppose that you have a topic containing three schemas (V1, V2, and V3), V1 is the oldest and V3 is the latest: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +Compatibility check strategy + + + +Definition + + + +Description + + + +Note + +
    + +`FULL` + + + +Schemas are both backward and forward compatible, which means: + +Consumers using the last schema can process data written by producers using the new schema. + +AND + +Consumers using the new schema can process data written by producers using the last schema. + + + +Consumers using the schema V3 can process data written by producers using the schema V3 or V2. + +AND + +Consumers using the schema V3 or V2 can process data written by producers using the schema V3. + + + +* For Avro and JSON, the default schema compatibility check strategy is `FULL`. + +* For all schema types except Avro and JSON, the default schema compatibility check strategy is `ALWAYS_INCOMPATIBLE`. + +
    + +`FULL_TRANSITIVE` + + + +The new schema is backward and forward compatible with all previously registered schemas. + + + +Consumers using the schema V3 can process data written by producers using the schema V3, V2 or V1. + +AND + +Consumers using the schema V3, V2 or V1 can process data written by producers using the schema V3. + + + +None + +
    + +#### Example + +In some data formats, for example, Avro, you can define fields with default values. Consequently, adding or removing a field with a default value is a fully compatible change. + +## Schema verification + +When a producer or a consumer tries to connect to a topic, a broker performs some checks to verify a schema. + +### Producer + +When a producer tries to connect to a topic (suppose ignore the schema auto creation), a broker does the following checks: + +* Check if the schema carried by the producer exists in the schema registry or not. + + * If the schema is already registered, then the producer is connected to a broker and produce messages with that schema. + + * If the schema is not registered, then Pulsar verifies if the schema is allowed to be registered based on the configured compatibility check strategy. + +### Consumer +When a consumer tries to connect to a topic, a broker checks if a carried schema is compatible with a registered schema based on the configured schema compatibility check strategy. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +Compatibility check strategy + + + +Check logic + +
    + +`ALWAYS_COMPATIBLE` + + + +All pass + +
    + +`ALWAYS_INCOMPATIBLE` + + + +No pass + +
    + +`BACKWARD` + + + +Can read the last schema + +
    + +`BACKWARD_TRANSITIVE` + + + +Can read all schemas + +
    + +`FORWARD` + + + +Can read the last schema + +
    + +`FORWARD_TRANSITIVE` + + + +Can read the last schema + +
    + +`FULL` + + + +Can read the last schema + +
    + +`FULL_TRANSITIVE` + + + +Can read all schemas + +
    + +## Order of upgrading clients + +The order of upgrading client applications is determined by the compatibility check strategy. + +For example, the producers using schemas to write data to Pulsar and the consumers using schemas to read data from Pulsar. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +Compatibility check strategy + + + +Upgrade first + + + +Description + +
    + +`ALWAYS_COMPATIBLE` + + + +Any order + + + +The compatibility check is disabled. + +Consequently, you can upgrade the producers and consumers in **any order**. + +
    + +`ALWAYS_INCOMPATIBLE` + + + +None + + + +The schema evolution is disabled. + +
    + +* `BACKWARD` + +* `BACKWARD_TRANSITIVE` + + + +Consumers + + + +There is no guarantee that consumers using the old schema can read data produced using the new schema. + +Consequently, **upgrade all consumers first**, and then start producing new data. + +
    + +* `FORWARD` + +* `FORWARD_TRANSITIVE` + + + +Producers + + + +There is no guarantee that consumers using the new schema can read data produced using the old schema. + +Consequently, **upgrade all producers first** to use the new schema and ensure that the data already produced using the old schemas are not available to consumers, and then upgrade the consumers. + +
    + +* `FULL` + +* `FULL_TRANSITIVE` + + + +Any order + + + +It is guaranteed that consumers using the old schema can read data produced using the new schema and consumers using the new schema can read data produced using the old schema. + +Consequently, you can upgrade the producers and consumers in **any order**. + +
    + + + + diff --git a/site2/website/versioned_docs/version-2.5.0/schema-get-started.md b/site2/website/versioned_docs/version-2.5.0/schema-get-started.md new file mode 100644 index 0000000000000..5d6c5c02e896d --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/schema-get-started.md @@ -0,0 +1,91 @@ +--- +id: version-2.5.0-schema-get-started +title: Get started +sidebar_label: Get started +original_id: schema-get-started +--- + +This chapter introduces Pulsar schemas and explains why they are important. + +## Schema Registry + +Type safety is extremely important in any application built around a message bus like Pulsar. + +Producers and consumers need some kind of mechanism for coordinating types at the topic level to avoid various potential problems arise. For example, serialization and deserialization issues. + +Applications typically adopt one of the following approaches to guarantee type safety in messaging. Both approaches are available in Pulsar, and you're free to adopt one or the other or to mix and match on a per-topic basis. + +### Client-side approach + +Producers and consumers are responsible for not only serializing and deserializing messages (which consist of raw bytes) but also "knowing" which types are being transmitted via which topics. + +If a producer is sending temperature sensor data on the topic `topic-1`, consumers of that topic will run into trouble if they attempt to parse that data as moisture sensor readings. + +Producers and consumers can send and receive messages consisting of raw byte arrays and leave all type safety enforcement to the application on an "out-of-band" basis. + +### Server-side approach + +Producers and consumers inform the system which data types can be transmitted via the topic. + +With this approach, the messaging system enforces type safety and ensures that producers and consumers remain synced. 
+ +Pulsar has a built-in **schema registry** that enables clients to upload data schemas on a per-topic basis. Those schemas dictate which data types are recognized as valid for that topic. + +## Why use schema + +When a schema is not enabled, Pulsar does not parse data; it takes bytes as inputs and sends bytes as outputs. Because data has meaning beyond bytes, you need to parse the data yourself and might encounter parse exceptions, which mainly occur in the following situations: + +* The field does not exist + +* The field type has changed (for example, `string` is changed to `int`) + +There are a few methods to prevent and overcome these exceptions. For example, you can catch exceptions when parsing fails, which makes code hard to maintain; or you can adopt a schema management system that performs schema evolution without breaking downstream applications and enforces type safety to the maximum extent in the language you are using. That solution is Pulsar Schema. + +Pulsar schema enables you to use language-specific types of data when constructing and handling messages, from simple types like `string` to more complex application-specific types. + +**Example** + +You can use the _User_ class to define the messages sent to Pulsar topics. + +``` +public class User { + String name; + int age; +} +``` + +When constructing a producer with the _User_ class, you can specify a schema or not as below. + +### Without schema + +If you construct a producer without specifying a schema, then the producer can only produce messages of type `byte[]`. If you have a POJO class, you need to serialize the POJO into bytes before sending messages. 
+ +**Example** + +``` +Producer<byte[]> producer = client.newProducer() + .topic(topic) + .create(); +User user = new User("Tom", 28); +byte[] message = … // serialize the `user` by yourself; +producer.send(message); +``` +### With schema + +If you construct a producer by specifying a schema, then you can send an object of a class to a topic directly without worrying about how to serialize POJOs into bytes. + +**Example** + +This example constructs a producer with the _JSONSchema_, and you can send _User_ objects to topics directly without worrying about how to serialize them into bytes. + +``` +Producer<User> producer = client.newProducer(JSONSchema.of(User.class)) + .topic(topic) + .create(); +User user = new User("Tom", 28); +producer.send(user); +``` + +### Summary + +When constructing a producer with a schema, you do not need to serialize messages into bytes; instead, Pulsar schema does this job in the background. diff --git a/site2/website/versioned_docs/version-2.5.0/schema-manage.md b/site2/website/versioned_docs/version-2.5.0/schema-manage.md new file mode 100644 index 0000000000000..248299efce8b1 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/schema-manage.md @@ -0,0 +1,809 @@ +--- +id: version-2.5.0-schema-manage +title: Manage schema +sidebar_label: Manage schema +original_id: schema-manage +--- + +This guide demonstrates the ways to manage schemas: + +* Automatically + + * [Schema AutoUpdate](#schema-autoupdate) + +* Manually + + * [Schema manual management](#schema-manual-management) + + * [Custom schema storage](#custom-schema-storage) + +## Schema AutoUpdate + +If a schema passes the schema compatibility check, Pulsar producer automatically updates this schema to the topic it produces by default. + +### AutoUpdate for producer + +For a producer, the `AutoUpdate` happens in the following cases: + +* If a **topic doesn’t have a schema**, Pulsar registers a schema automatically. 
+ +* If a **topic has a schema**: + + * If a **producer doesn’t carry a schema**: + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **disabled** in the namespace to which the topic belongs, the producer is allowed to connect to the topic and produce data. + + * If `isSchemaValidationEnforced` or `schemaValidationEnforced` is **enabled** in the namespace to which the topic belongs, the producer is rejected and disconnected. + + * If a **producer carries a schema**: + + A broker performs the compatibility check based on the configured compatibility check strategy of the namespace to which the topic belongs. + + * If the schema is registered, a producer is connected to a broker. + + * If the schema is not registered: + + * If `isAllowAutoUpdateSchema` sets to **false**, the producer is rejected to connect to a broker. + + * If `isAllowAutoUpdateSchema` sets to **true**: + + * If the schema passes the compatibility check, then the broker registers a new schema automatically for the topic and the producer is connected. + + * If the schema does not pass the compatibility check, then the broker does not register a schema and the producer is rejected to connect to a broker. + +![AutoUpdate Producer](assets/schema-producer.png) + +### AutoUpdate for consumer + +For a consumer, the `AutoUpdate` happens in the following cases: + +* If a **consumer connects to a topic without a schema** (which means the consumer receiving raw bytes), the consumer can connect to the topic successfully without doing any compatibility check. + +* If a **consumer connects to a topic with a schema**. + + * If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` sets to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` sets to **false**, then the consumer is rejected to connect to a broker. 
+ + * If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is rejected to connect to the broker. + +![AutoUpdate Consumer](assets/schema-consumer.png) + + +### Manage AutoUpdate strategy + +You can use the `pulsar-admin` command to manage the `AutoUpdate` strategy as below: + +* [Enable AutoUpdate](#enable-autoupdate) + +* [Disable AutoUpdate](#disable-autoupdate) + +* [Adjust compatibility](#adjust-compatibility) + +#### Enable AutoUpdate + +To enable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --enable tenant/namespace +``` + +#### Disable AutoUpdate + +To disable `AutoUpdate` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-is-allow-auto-update-schema --disable tenant/namespace +``` + +Once the `AutoUpdate` is disabled, you can only register a new schema using the `pulsar-admin` command. + +#### Adjust compatibility + +To adjust the schema compatibility level on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-compatibility-strategy --compatibility tenant/namespace +``` + +### Schema validation + +By default, `schemaValidationEnforced` is **disabled** for producers: + +* This means a producer without a schema can produce any kind of messages to a topic with schemas, which may result in producing trash data to the topic. + +* This allows non-java language clients that don’t support schema can produce messages to a topic with schemas. + +However, if you want a stronger guarantee on the topics with schemas, you can enable `schemaValidationEnforced` across the whole cluster or on a per-namespace basis. 
+ +#### Enable schema validation + +To enable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-validation-enforce --enable tenant/namespace +``` + +#### Disable schema validation + +To disable `schemaValidationEnforced` on a namespace, you can use the `pulsar-admin` command. + +```bash +bin/pulsar-admin namespaces set-schema-validation-enforce --disable tenant/namespace +``` + +## Schema manual management + +To manage schemas, you can use one of the following methods. + + + + + + + + + + + + + + + + + + +
    MethodDescription
    + +**Admin CLI** + +You can use the `pulsar-admin` tool to manage Pulsar schemas, brokers, clusters, sources, sinks, topics, tenants and so on. + +For more information about how to use the `pulsar-admin` tool, see [here](reference-pulsar-admin.md). +
    + +**REST API** + + +Pulsar exposes schema related management API in Pulsar’s admin RESTful API. You can access the admin RESTful endpoint directly to manage schemas. + +For more information about how to use the Pulsar REST API, see [here](http://pulsar.apache.org/admin-rest-api/). +
    + +**Java Admin API** + Pulsar provides a Java admin library.
    + +### Upload a schema + +To upload (register) a new schema for a topic, you can use one of the following methods. + + + + + +Use the `upload` subcommand. + +```bash +$ pulsar-admin schemas upload --filename +``` + +The `schema-definition-file` is in JSON format. + +```json +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} +``` + +The `schema-definition-file` includes the following fields: + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`type` + + The schema type.
    + +`schema` + + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
    + + `properties` + The additional properties associated with the schema.
    + +Here are examples of the `schema-definition-file` for a JSON schema. + +**Example 1** + +```json +{ + "type": "JSON", + "schema": "{\"type\":\"record\",\"name\":\"User\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"file1\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"file2\",\"type\":\"string\",\"default\":null},{\"name\":\"file3\",\"type\":[\"null\",\"string\"],\"default\":\"dfdf\"}]}", + "properties": {} +} +``` + +**Example 2** + +```json +{ + "type": "STRING", + "schema": "", + "properties": { + "key1": "value1" + } +} +``` + + + +Send a `POST` request to this endpoint: {@inject: endpoint|POST|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/uploadSchema} + +The post payload is in JSON format. + +```json +{ + "type": "", + "schema": "", + "properties": {} // the properties associated with the schema +} +``` + +The post payload includes the following fields: + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`type` + + The schema type.
    + +`schema` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
    + + `properties` + The additional properties associated with the schema.
    + + + +```java +void createSchema(String topic, PostSchemaPayload schemaPayload) +``` + +The `PostSchemaPayload` includes the following fields: + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`type` + + The schema type.
    + +`schema` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
    + + `properties` + The additional properties associated with the schema.
    + +Here is an example of `PostSchemaPayload`: + +```java +PulsarAdmin admin = …; + +PostSchemaPayload payload = new PostSchemaPayload(); +payload.setType("INT8"); +payload.setSchema(""); + +admin.createSchema("my-tenant/my-ns/my-topic", payload); +``` + + +### Get a schema (latest) + +To get the latest schema for a topic, you can use one of the following methods. + + + + + +Use the `get` subcommand. + +```bash +$ pulsar-admin schemas get + +{ + "version": 0, + "type": "String", + "timestamp": 0, + "data": "string", + "properties": { + "property1": "string", + "property2": "string" + } +} +``` + + + +Send a `GET` request to this endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/getSchema} + +Here is an example of a response, which is returned in JSON format. + +```json +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} +``` + +The response includes the following fields: + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`version` + + The schema version, which is a long number.
    + +`type` + + The schema type.
    + +`timestamp` + + The timestamp of creating this version of schema.
    + +`data` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
    + + `properties` + The additional properties associated with the schema.
    + + + +```java +SchemaInfo getSchema(String topic) +``` + +The `SchemaInfo` includes the following fields: + + + + + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`name` + + The schema name.
    + +`type` + + The schema type.
    + +`schema` + +A byte array of the schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this byte array should be empty. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition converted to a byte array. +
    + + `properties` + The additional properties associated with the schema.
    + +Here is an example of `SchemaInfo`: + +```java +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic"); +``` + + + +### Get a schema (specific) + +To get a specific version of a schema, you can use one of the following methods. + + + + + +Use the `get` subcommand. + +```bash +$ pulsar-admin schemas get --version= +``` + + + +Send a `GET` request to a schema endpoint: {@inject: endpoint|GET|/admin/v2/schemas/:tenant/:namespace/:topic/schema/:version|operation/getSchema} + +Here is an example of a response, which is returned in JSON format. + +```json +{ + "version": "", + "type": "", + "timestamp": "", + "data": "", + "properties": {} // the properties associated with the schema +} +``` + +The response includes the following fields: + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`version` + + The schema version, which is a long number.
    + +`type` + + The schema type.
    + +`timestamp` + + The timestamp of creating this version of schema.
    + +`data` + +The schema definition data, which is encoded in UTF 8 charset. + +* If the schema is a **primitive** schema, this field should be blank. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition. +
    + + `properties` + The additional properties associated with the schema.
    + + + +```java +SchemaInfo getSchema(String topic, long version) +``` + +The `SchemaInfo` includes the following fields: + + + + + + + + + + + + + + + + + + + + + + +
    FieldDescription
    + +`name` + The schema name.
    + +`type` + The schema type.
    + +`schema` + +A byte array of the schema definition data, which is encoded in UTF 8. + +* If the schema is a **primitive** schema, this byte array should be empty. + +* If the schema is a **struct** schema, this field should be a JSON string of the Avro schema definition converted to a byte array. +
    + + `properties` + The additional properties associated with the schema.
    + +Here is an example of `SchemaInfo`: + +```java +PulsarAdmin admin = …; + +SchemaInfo si = admin.getSchema("my-tenant/my-ns/my-topic", 1L); +``` + + + +### Extract a schema + +To provide a schema via a topic, you can use the following method. + + + + + +Use the `extract` subcommand. + +```bash +$ pulsar-admin schemas extract --classname --jar --type +``` + + + +### Delete a schema + +To delete a schema for a topic, you can use one of the following methods. + +> #### Note +> +> In any case, the **delete** action deletes **all versions** of a schema registered for a topic. + + + + + +Use the `delete` subcommand. + +```bash +$ pulsar-admin schemas delete +``` + + + +Send a `DELETE` request to a schema endpoint: {@inject: endpoint|DELETE|/admin/v2/schemas/:tenant/:namespace/:topic/schema|operation/deleteSchema} + +Here is an example of a response, which is returned in JSON format. + +```json +{ + "version": "", +} +``` + +The response includes the following field: + +Field | Description | +---|---| +`version` | The schema version, which is a long number. | + + + +```java +void deleteSchema(String topic) +``` + +Here is an example of deleting a schema. + +```java +PulsarAdmin admin = …; + +admin.deleteSchema("my-tenant/my-ns/my-topic"); +``` + + + +## Custom schema storage + +By default, Pulsar stores various data types of schemas in [Apache BookKeeper](https://bookkeeper.apache.org) deployed alongside Pulsar. + +However, you can use another storage system if needed. 
+ +### Implement + +To use a non-default (non-BookKeeper) storage system for Pulsar schemas, you need to implement the following Java interfaces: + +* [SchemaStorage interface](#schemastorage-interface) + +* [SchemaStorageFactory interface](#schemastoragefactory-interface) + +#### SchemaStorage interface + +The `SchemaStorage` interface has the following methods: + +```java +public interface SchemaStorage { + // How schemas are updated + CompletableFuture put(String key, byte[] value, byte[] hash); + + // How schemas are fetched from storage + CompletableFuture get(String key, SchemaVersion version); + + // How schemas are deleted + CompletableFuture delete(String key); + + // Utility method for converting a schema version byte array to a SchemaVersion object + SchemaVersion versionFromBytes(byte[] version); + + // Startup behavior for the schema storage client + void start() throws Exception; + + // Shutdown behavior for the schema storage client + void close() throws Exception; +} +``` + +> #### Tip +> +> For a complete example of **schema storage** implementation, see [BookKeeperSchemaStorage](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorage.java) class. + +#### SchemaStorageFactory interface + +The `SchemaStorageFactory` interface has the following method: + +```java +public interface SchemaStorageFactory { + @NotNull + SchemaStorage create(PulsarService pulsar) throws Exception; +} +``` + +> Tip +> +> For a complete example of **schema storage factory** implementation, see [BookKeeperSchemaStorageFactory](https://github.com/apache/pulsar/blob/master/pulsar-broker/src/main/java/org/apache/pulsar/broker/service/schema/BookkeeperSchemaStorageFactory.java) class. + +### Deploy + +To use your custom schema storage implementation, perform the following steps. + +1. Package the implementation in a [JAR](https://docs.oracle.com/javase/tutorial/deployment/jar/basicsindex.html) file. 
+ +2. Add the JAR file to the `lib` folder in your Pulsar binary or source distribution. + +3. Change the `schemaRegistryStorageClassName` configuration in `broker.conf` to your custom factory class. + +4. Start Pulsar. diff --git a/site2/website/versioned_docs/version-2.5.0/schema-understand.md b/site2/website/versioned_docs/version-2.5.0/schema-understand.md new file mode 100644 index 0000000000000..7b4ccfb71de08 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/schema-understand.md @@ -0,0 +1,592 @@ +--- +id: version-2.5.0-schema-understand +title: Understand schema +sidebar_label: Understand schema +original_id: schema-understand +--- + +This chapter explains the basic concepts of Pulsar schema, focuses on the topics of particular importance, and provides additional background. + +## SchemaInfo + +Pulsar schema is defined in a data structure called `SchemaInfo`. + +The `SchemaInfo` is stored and enforced on a per-topic basis and cannot be stored at the namespace or tenant level. + +A `SchemaInfo` consists of the following fields: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +Field + + + +Description + +
    + +`name` + + + +Schema name (a string). + +
    + +`type` + + + +Schema type, which determines how to interpret the schema data. + +* Predefined schema: see [here](schema-understand.md#schema-type). + +* Customized schema: it is left as an empty string. + +
    + +`schema`(`payload`) + + + +Schema data, which is a sequence of 8-bit unsigned bytes and schema-type specific. + +
    + +`properties` + + + +User-defined properties stored as a string/string map. + +Applications can use this bag to carry any application-specific logic. + +Possible properties might be the Git hash associated with the schema or an environment string like `dev` or `prod`. + +
    + +**Example** + +This is the `SchemaInfo` of a string. + +```text +{ + "name": "test-string-schema", + "type": "STRING", + "schema": "", + "properties": {} +} +``` + +## Schema type + +Pulsar supports various schema types, which are mainly divided into two categories: + +* Primitive type + +* Complex type + +### Primitive type + +Currently, Pulsar supports the following primitive types: + +| Primitive Type | Description | +|---|---| +| `BOOLEAN` | A binary value | +| `INT8` | A 8-bit signed integer | +| `INT16` | A 16-bit signed integer | +| `INT32` | A 32-bit signed integer | +| `INT64` | A 64-bit signed integer | +| `FLOAT` | A single precision (32-bit) IEEE 754 floating-point number | +| `DOUBLE` | A double-precision (64-bit) IEEE 754 floating-point number | +| `BYTES` | A sequence of 8-bit unsigned bytes | +| `STRING` | A Unicode character sequence | +| `TIMESTAMP` (`DATE`, `TIME`) | A logic type represents a specific instant in time with millisecond precision.
    It stores the number of milliseconds since `January 1, 1970, 00:00:00 GMT` as an `INT64` value | + +For primitive types, Pulsar does not store any schema data in `SchemaInfo`. The `type` in `SchemaInfo` is used to determine how to serialize and deserialize the data. + +Some of the primitive schema implementations can use `properties` to store implementation-specific tunable settings. For example, a `string` schema can use `properties` to store the encoding charset to serialize and deserialize strings. + +The conversions between **Pulsar schema types** and **language-specific primitive types** are as below. + +| Schema Type | Java Type| Python Type | Go Type | +|---|---|---|---| +| BOOLEAN | boolean | bool | bool | +| INT8 | byte | | int8 | +| INT16 | short | | int16 | +| INT32 | int | | int32 | +| INT64 | long | | int64 | +| FLOAT | float | float | float32 | +| DOUBLE | double | float | float64| +| BYTES | byte[], ByteBuffer, ByteBuf | bytes | []byte | +| STRING | string | str | string| +| TIMESTAMP | java.sql.Timestamp | | | +| TIME | java.sql.Time | | | +| DATE | java.util.Date | | | + +**Example** + +This example demonstrates how to use a string schema. + +1. Create a producer with a string schema and send messages. + + ```text + Producer producer = client.newProducer(Schema.STRING).create(); + producer.newMessage().value("Hello Pulsar!").send(); + ``` + +2. Create a consumer with a string schema and receive messages. + + ```text + Consumer consumer = client.newConsumer(Schema.STRING).create(); + consumer.receive(); + ``` + +### Complex type + +Currently, Pulsar supports the following complex types: + +| Complex Type | Description | +|---|---| +| `keyvalue` | Represents a complex type of a key/value pair. | +| `struct` | Supports **AVRO**, **JSON**, and **Protobuf**. | + +#### keyvalue + +`Keyvalue` schema helps applications define schemas for both key and value. 
+ +For `SchemaInfo` of `keyvalue` schema, Pulsar stores the `SchemaInfo` of key schema and the `SchemaInfo` of value schema together. + +Pulsar provides two methods to encode a key/value pair in messages: + +* `INLINE` + +* `SEPARATED` + +Users can choose the encoding type when constructing the key/value schema. + +##### INLINE + +Key/value pairs will be encoded together in the message payload. + +##### SEPARATED + +Key will be encoded in the message key and the value will be encoded in the message payload. + +**Example** + +This example shows how to construct a key/value schema and then use it to produce and consume messages. + +1. Construct a key/value schema with `INLINE` encoding type. + + ```java + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.INLINE + ); + ``` + +2. Optionally, construct a key/value schema with `SEPARATED` encoding type. + + ```java + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + ``` + +3. Produce messages using a key/value schema. + + ```java + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Producer> producer = client.newProducer(kvSchema) + .topic(TOPIC) + .create(); + + final int key = 100; + final String value = "value-100"; + + // send the key/value message + producer.newMessage() + .value(new KeyValue<>(key, value)) + .send(); + ``` + +4. Consume messages using a key/value schema. + + ```java + Schema> kvSchema = Schema.KeyValue( + Schema.INT32, + Schema.STRING, + KeyValueEncodingType.SEPARATED + ); + + Consumer> consumer = client.newConsumer(kvSchema) + ... 
+ .topic(TOPIC) + .subscriptionName(SubscriptionName).subscribe(); + + // receive key/value pair + Message> msg = consumer.receive(); + KeyValue kv = msg.getValue(); + ``` + +#### struct + +Pulsar uses [Avro Specification](http://avro.apache.org/docs/current/spec.html) to declare the schema definition for `struct` schema. + +This allows Pulsar: + +* to use same tools to manage schema definitions + +* to use different serialization/deserialization methods to handle data + +There are two methods to use `struct` schema: + +* `static` + +* `generic` + +##### static + +You can predefine the `struct` schema, and it can be a POJO in Java, a `struct` in Go, or classes generated by Avro or Protobuf tools. + +**Example** + +Pulsar gets the schema definition from the predefined `struct` using an Avro library. The schema definition is the schema data stored as a part of the `SchemaInfo`. + +1. Create the _User_ class to define the messages sent to Pulsar topics. + + ```text + public class User { + String name; + int age; + } + ``` + +2. Create a producer with a `struct` schema and send messages. + + ```text + Producer producer = client.newProducer(Schema.AVRO(User.class)).create(); + producer.newMessage().value(User.builder().userName("pulsar-user").userId(1L).build()).send(); + ``` + +3. Create a consumer with a `struct` schema and receive messages + + ```text + Consumer consumer = client.newConsumer(Schema.AVRO(User.class)).create(); + User user = consumer.receive(); + ``` + +##### generic + +Sometimes applications do not have pre-defined structs, and you can use this method to define schema and access data. + +You can define the `struct` schema using the `GenericSchemaBuilder`, generate a generic struct using `GenericRecordBuilder` and consume messages into `GenericRecord`. + +**Example** + +1. Use `RecordSchemaBuilder` to build a schema. 
+ + ```text + RecordSchemaBuilder recordSchemaBuilder = SchemaBuilder.record("schemaName"); + recordSchemaBuilder.field("intField").type(SchemaType.INT32); + SchemaInfo schemaInfo = recordSchemaBuilder.build(SchemaType.AVRO); + + Producer<GenericRecord> producer = client.newProducer(Schema.generic(schemaInfo)).create(); + ``` + +2. Use `RecordBuilder` to build the struct records. + + ```text + producer.newMessage().value(schema.newRecordBuilder() + .set("intField", 32) + .build()).send(); + ``` + +### Auto Schema + +If you don't know the schema type of a Pulsar topic in advance, you can use AUTO schema to produce or consume generic records to or from brokers. + +| Auto Schema Type | Description | +|---|---| +| `AUTO_PRODUCE` | This is useful for transferring data **from a producer to a Pulsar topic that has a schema**. | +| `AUTO_CONSUME` | This is useful for transferring data **from a Pulsar topic that has a schema to a consumer**. | + +#### AUTO_PRODUCE + +`AUTO_PRODUCE` schema helps a producer validate whether the bytes sent by the producer are compatible with the schema of a topic. + +**Example** + +Suppose that: + +* You have a producer processing messages from a Kafka topic _K_. + +* You have a Pulsar topic _P_, and you do not know its schema type. + +* Your application reads the messages from _K_ and writes the messages to _P_. + +In this case, you can use `AUTO_PRODUCE` to verify whether the bytes produced by _K_ can be sent to _P_ or not. + +```text +Producer<byte[]> pulsarProducer = client.newProducer(Schema.AUTO_PRODUCE()) + … + .create(); + +byte[] kafkaMessageBytes = … ; + +pulsarProducer.send(kafkaMessageBytes); +``` + +#### AUTO_CONSUME + +`AUTO_CONSUME` schema helps a Pulsar topic validate whether the bytes sent by a Pulsar topic are compatible with a consumer, that is, the Pulsar topic deserializes messages into language-specific objects using the `SchemaInfo` retrieved from broker-side. 
+ +Currently, `AUTO_CONSUME` only supports **AVRO** and **JSON** schemas. 
It deserializes messages into `GenericRecord`. + +**Example** + +Suppose that: + +* You have a Pulsar topic _P_. + +* You have a consumer (for example, MySQL) receiving messages from the topic _P_. + +* You application reads the messages from _P_ and writes the messages to MySQL. + +In this case, you can use `AUTO_CONSUME` to verify whether the bytes produced by _P_ can be sent to MySQL or not. + +```text +Consumer pulsarConsumer = client.newConsumer(Schema.AUTO_CONSUME()) + … + .subscribe(); + +Message msg = consumer.receive() ; +GenericRecord record = msg.getValue(); +… +``` + +## Schema version + +Each `SchemaInfo` stored with a topic has a version. Schema version manages schema changes happening within a topic. + +Messages produced with a given `SchemaInfo` is tagged with a schema version, so when a message is consumed by a Pulsar client, the Pulsar client can use the schema version to retrieve the corresponding `SchemaInfo` and then use the `SchemaInfo` to deserialize data. + +Schemas are versioned in succession. Schema storage happens in a broker that handles the associated topics so that version assignments can be made. + +Once a version is assigned/fetched to/for a schema, all subsequent messages produced by that producer are tagged with the appropriate version. + +**Example** + +The following example illustrates how the schema version works. + +Suppose that a Pulsar [Java client](client-libraries-java.md) created using the code below attempts to connect to Pulsar and begins to send messages: + +```text +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .build(); + +Producer producer = client.newProducer(JSONSchema.of(SensorReading.class)) + .topic("sensor-data") + .sendTimeout(3, TimeUnit.SECONDS) + .create(); +``` + +The table below lists the possible scenarios when this connection attempt occurs and what happens in each scenario: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ScenarioWhat happens
    + +* No schema exists for the topic. + + + +(1) The producer is created using the given schema. + +(2) Since no existing schema is compatible with the `SensorReading` schema, the schema is transmitted to the broker and stored. + +(3) Any consumer created using the same schema or topic can consume messages from the `sensor-data` topic. + +
    + +* A schema already exists. + +* The producer connects using the same schema that is already stored. + + + +(1) The schema is transmitted to the broker. + +(2) The broker determines that the schema is compatible. + +(3) The broker attempts to store the schema in [BookKeeper](concepts-architecture-overview.md#persistent-storage) but then determines that it's already stored, so it is used to tag produced messages. + +
    + +* A schema already exists. + +* The producer connects using a new schema that is compatible. + + + +(1) The schema is transmitted to the broker. + +(2) The broker determines that the schema is compatible and stores the new schema as the current version (with a new version number). + +
    + +## How does schema work + +Pulsar schemas are applied and enforced at the **topic** level (schemas cannot be applied at the namespace or tenant level). + +Producers and consumers upload schemas to brokers, so Pulsar schemas work on the producer side and the consumer side. + +### Producer side + +This diagram illustrates how does schema work on the Producer side. + +![Schema works at the producer side](assets/schema-producer.png) + +1. The application uses a schema instance to construct a producer instance. + + The schema instance defines the schema for the data being produced using the producer instance. + + Take AVRO as an example, Pulsar extract schema definition from the POJO class and construct the `SchemaInfo` that the producer needs to pass to a broker when it connects. + +2. The producer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker looks up the schema in the schema storage to check if it is already a registered schema. + +4. If yes, the broker skips the schema validation since it is a known schema, and returns the schema version to the producer. + +5. If no, the broker verifies whether a schema can be automatically created in this namespace: + + * If `isAllowAutoUpdateSchema` sets to **true**, then a schema can be created, and the broker validates the schema based on the schema compatibility check strategy defined for the topic. + + * If `isAllowAutoUpdateSchema` sets to **false**, then a schema can not be created, and the producer is rejected to connect to the broker. + +**Tip**: + +`isAllowAutoUpdateSchema` can be set via **Pulsar admin API** or **REST API.** + +For how to set `isAllowAutoUpdateSchema` via Pulsar admin API, see [Manage AutoUpdate Strategy](schema-manage.md/#manage-autoupdate-strategy). + +6. If the schema is allowed to be updated, then the compatible strategy check is performed. 
+ + * If the schema is compatible, the broker stores it and returns the schema version to the producer. + + All the messages produced by this producer are tagged with the schema version. + + * If the schema is incompatible, the broker rejects it. + +### Consumer side + +This diagram illustrates how does Schema work on the consumer side. + +![Schema works at the consumer side](assets/schema-consumer.png) + +1. The application uses a schema instance to construct a consumer instance. + + The schema instance defines the schema that the consumer uses for decoding messages received from a broker. + +2. The consumer connects to the broker with the `SchemaInfo` extracted from the passed-in schema instance. + +3. The broker determines whether the topic has one of them (a schema/data/a local consumer and a local producer). + +4. If a topic does not have all of them (a schema/data/a local consumer and a local producer): + + * If `isAllowAutoUpdateSchema` sets to **true**, then the consumer registers a schema and it is connected to a broker. + + * If `isAllowAutoUpdateSchema` sets to **false**, then the consumer is rejected to connect to a broker. + +5. If a topic has one of them (a schema/data/a local consumer and a local producer), then the schema compatibility check is performed. + + * If the schema passes the compatibility check, then the consumer is connected to the broker. + + * If the schema does not pass the compatibility check, then the consumer is rejected to connect to the broker. + +6. The consumer receives messages from the broker. + + If the schema used by the consumer supports schema versioning (for example, AVRO schema), the consumer fetches the `SchemaInfo` of the version tagged in messages and uses the passed-in schema and the schema tagged in messages to decode the messages. 
diff --git a/site2/website/versioned_docs/version-2.5.0/security-encryption.md b/site2/website/versioned_docs/version-2.5.0/security-encryption.md new file mode 100644 index 0000000000000..6aa506b1a865b --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/security-encryption.md @@ -0,0 +1,176 @@ +--- +id: version-2.5.0-security-encryption +title: Pulsar Encryption +sidebar_label: End-to-End Encryption +original_id: security-encryption +--- + +Applications can use Pulsar encryption to encrypt messages at the producer side and decrypt messages at the consumer side. You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. + +## Asymmetric and symmetric encryption + +Pulsar uses dynamically generated symmetric AES key to encrypt messages(data). You can use the application provided ECDSA/RSA key pair to encrypt the AES key(data key), so you do not have to share the secret with everyone. + +Key is a public and private key pair used for encryption or decryption. The producer key is the public key of the key pair, and the consumer key is the private key of the key pair. + +The application configures the producer with the public key. You can use this key to encrypt the AES data key. The encrypted data key is sent as part of message header. Only entities with the private key (in this case the consumer) are able to decrypt the data key which is used to decrypt the message. + +You can encrypt a message with more than one key. Any one of the keys used for encrypting the message is sufficient to decrypt the message. + +Pulsar does not store the encryption key anywhere in the Pulsar service. If you lose or delete the private key, your message is irretrievably lost, and is unrecoverable. 
+ +## Producer +![alt text](assets/pulsar-encryption-producer.jpg "Pulsar Encryption Producer") + +## Consumer +![alt text](assets/pulsar-encryption-consumer.jpg "Pulsar Encryption Consumer") + +## Get started + +1. Enter the commands below to create your ECDSA or RSA public and private key pair. + +```shell +openssl ecparam -name secp521r1 -genkey -param_enc explicit -out test_ecdsa_privkey.pem +openssl ec -in test_ecdsa_privkey.pem -pubout -outform pkcs8 -out test_ecdsa_pubkey.pem +``` + +2. Add the public and private key to the key management and configure your producers to retrieve public keys and consumer clients to retrieve private keys. + +3. Implement CryptoKeyReader::getPublicKey() interface from producer and CryptoKeyReader::getPrivateKey() interface from consumer, which Pulsar client invokes to load the key. + +4. Add encryption key to producer configuration: conf.addEncryptionKey("myapp.key"). + +5. Add CryptoKeyReader implementation to producer or consumer config: conf.setCryptoKeyReader(keyReader). + +6. 
Sample producer application: + +```java +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); + +ProducerConfiguration prodConf = new ProducerConfiguration(); +prodConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +prodConf.addEncryptionKey("myappkey"); + +Producer producer = pulsarClient.createProducer("persistent://my-tenant/my-ns/my-topic", prodConf); + +for (int i = 0; i < 10; i++) { + producer.send("my-message".getBytes()); +} + +pulsarClient.close(); +``` +7. 
Sample Consumer Application: + +```java +class RawFileKeyReader implements CryptoKeyReader { + + String publicKeyFile = ""; + String privateKeyFile = ""; + + RawFileKeyReader(String pubKeyFile, String privKeyFile) { + publicKeyFile = pubKeyFile; + privateKeyFile = privKeyFile; + } + + @Override + public EncryptionKeyInfo getPublicKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(publicKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read public key from file " + publicKeyFile); + e.printStackTrace(); + } + return keyInfo; + } + + @Override + public EncryptionKeyInfo getPrivateKey(String keyName, Map keyMeta) { + EncryptionKeyInfo keyInfo = new EncryptionKeyInfo(); + try { + keyInfo.setKey(Files.readAllBytes(Paths.get(privateKeyFile))); + } catch (IOException e) { + System.out.println("ERROR: Failed to read private key from file " + privateKeyFile); + e.printStackTrace(); + } + return keyInfo; + } +} + +ConsumerConfiguration consConf = new ConsumerConfiguration(); +consConf.setCryptoKeyReader(new RawFileKeyReader("test_ecdsa_pubkey.pem", "test_ecdsa_privkey.pem")); +PulsarClient pulsarClient = PulsarClient.create("http://localhost:8080"); +Consumer consumer = pulsarClient.subscribe("persistent://my-tenant/my-ns/my-topic", "my-subscriber-name", consConf); +Message msg = null; + +for (int i = 0; i < 10; i++) { + msg = consumer.receive(); + // do something + System.out.println("Received: " + new String(msg.getData())); +} + +// Acknowledge the consumption of all messages at once +consumer.acknowledgeCumulative(msg); +pulsarClient.close(); +``` + +## Key rotation +Pulsar generates new AES data key every 4 hours or after publishing a certain number of messages. A producer fetches the asymmetric public key every 4 hours by calling CryptoKeyReader::getPublicKey() to retrieve the latest version. 
+ +## Enable encryption at the producer application +If you produce messages that are consumed across application boundaries, you need to ensure that consumers in other applications have access to one of the private keys that can decrypt the messages. You can do this in two ways: +1. The consumer application provides you access to their public key, which you add to your producer keys. +2. You grant access to one of the private keys from the pairs that producer uses. + +When producers want to encrypt the messages with multiple keys, producers add all such keys to the config. Consumer can decrypt the message as long as the consumer has access to at least one of the keys. + +If you need to encrypt the messages using 2 keys (myapp.messagekey1 and myapp.messagekey2), refer to the following example. + +```java +conf.addEncryptionKey("myapp.messagekey1"); +conf.addEncryptionKey("myapp.messagekey2"); +``` +## Decrypt encrypted messages at the consumer application +Consumers require access to one of the private keys to decrypt messages that the producer produces. If you want to receive encrypted messages, create a public and private key pair and give your public key to the producer application to encrypt messages using your public key. + +## Handle failures +* Producer/Consumer loses access to the key + * Producer action fails indicating the cause of the failure. Application has the option to proceed with sending unencrypted message in such cases. Call conf.setCryptoFailureAction(ProducerCryptoFailureAction) to control the producer behavior. The default behavior is to fail the request. + * If consumption fails due to decryption failure or missing keys in consumer, application has the option to consume the encrypted message or discard it. Call conf.setCryptoFailureAction(ConsumerCryptoFailureAction) to control the consumer behavior. The default behavior is to fail the request. Application is never able to decrypt the messages if the private key is permanently lost. 
+* Batch messaging + * If decryption fails and the message contains batch messages, client is not able to retrieve individual messages in the batch, hence message consumption fails even if conf.setCryptoFailureAction() is set to CONSUME. +* If decryption fails, the message consumption stops and application notices backlog growth in addition to decryption failure messages in the client log. If application does not have access to the private key to decrypt the message, the only option is to skip or discard backlogged messages. diff --git a/site2/website/versioned_docs/version-2.5.0/security-extending.md b/site2/website/versioned_docs/version-2.5.0/security-extending.md new file mode 100644 index 0000000000000..8d725830b3b88 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/security-extending.md @@ -0,0 +1,194 @@ +--- +id: version-2.5.0-security-extending +title: Extending Authentication and Authorization in Pulsar +sidebar_label: Extending +original_id: security-extending +--- + +Pulsar provides a way to use custom authentication and authorization mechanisms. + +## Authentication + +Pulsar supports mutual TLS and Athenz authentication plugins. For how to use these authentication plugins, you can refer to the description in [Security](security-overview.md). + +You can choose to use a custom authentication mechanism by providing the implementation in the form of two plugins. One plugin is for the Client library and the other plugin is for the Pulsar Broker to validate the credentials. + +### Client authentication plugin + +For client library, you need to implement `org.apache.pulsar.client.api.Authentication`. 
By entering the command below you can pass this class when you create a Pulsar client: + +```java +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar://localhost:6650") + .authentication(new MyAuthentication()) + .build(); +``` + +You can use 2 interfaces to implement on the client side: + * `Authentication` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/Authentication.html + * `AuthenticationDataProvider` -> http://pulsar.apache.org/api/client/org/apache/pulsar/client/api/AuthenticationDataProvider.html + + +This in turn needs to provide the client credentials in the form of `org.apache.pulsar.client.api.AuthenticationDataProvider`. This leaves the chance to return different kinds of authentication token for different types of connection or by passing a certificate chain to use for TLS. + + +You can find examples for client authentication providers at: + + * Mutual TLS Auth -- https://github.com/apache/pulsar/tree/master/pulsar-client/src/main/java/org/apache/pulsar/client/impl/auth + * Athenz -- https://github.com/apache/pulsar/tree/master/pulsar-client-auth-athenz/src/main/java/org/apache/pulsar/client/impl/auth + +### Broker authentication plugin + +On broker side, you need the corresponding plugin to validate the credentials that the client passes. Broker can support multiple authentication providers at the same time. 
+ +In `conf/broker.conf` you can choose to specify a list of valid providers: + +```properties +# Authentication provider name list, which is comma separated list of class names +authenticationProviders= +``` +Each provider needs to implement a single interface, `org.apache.pulsar.broker.authentication.AuthenticationProvider`: + +```java +/** + * Provider of authentication mechanism + */ +public interface AuthenticationProvider extends Closeable { + + /** + * Perform initialization for the authentication provider + * + * @param config + * broker config object + * @throws IOException + * if the initialization fails + */ + void initialize(ServiceConfiguration config) throws IOException; + + /** + * @return the authentication method name supported by this provider + */ + String getAuthMethodName(); + + /** + * Validate the authentication for the given credentials with the specified authentication data + * + * @param authData + * provider specific authentication data + * @return the "role" string for the authenticated connection, if the authentication was successful + * @throws AuthenticationException + * if the credentials are not valid + */ + String authenticate(AuthenticationDataSource authData) throws AuthenticationException; + +} +``` + +The following are examples of broker authentication plugins: + + * Mutual TLS -- https://github.com/apache/pulsar/blob/master/pulsar-broker-common/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderTls.java + * Athenz -- https://github.com/apache/pulsar/blob/master/pulsar-broker-auth-athenz/src/main/java/org/apache/pulsar/broker/authentication/AuthenticationProviderAthenz.java + +## Authorization + +Authorization is the operation that checks whether a particular "role" or "principal" has a permission to perform a certain operation. + +By default, Pulsar provides an embedded authorization, though configuring a different one through a plugin is also an alternative choice. 
+ +To provide a custom provider, you need to implement the `org.apache.pulsar.broker.authorization.AuthorizationProvider` interface, put this class in the Pulsar broker classpath and configure the class in `conf/broker.conf`: + + ```properties + # Authorization provider fully qualified class-name + authorizationProvider=org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider + ``` + +```java +/** + * Provider of authorization mechanism + */ +public interface AuthorizationProvider extends Closeable { + + /** + * Perform initialization for the authorization provider + * + * @param config + * broker config object + * @param configCache + * pulsar zk configuration cache service + * @throws IOException + * if the initialization fails + */ + void initialize(ServiceConfiguration conf, ConfigurationCacheService configCache) throws IOException; + + /** + * Check if the specified role has permission to send messages to the specified fully qualified topic name. + * + * @param topicName + * the fully qualified topic name associated with the topic. + * @param role + * the app id used to send messages to the topic. + */ + CompletableFuture canProduceAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData); + + /** + * Check if the specified role has permission to receive messages from the specified fully qualified topic name. + * + * @param topicName + * the fully qualified topic name associated with the topic. + * @param role + * the app id used to receive messages from the topic. + * @param subscription + * the subscription name defined by the client + */ + CompletableFuture canConsumeAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData, String subscription); + + /** + * Check whether the specified role can perform a lookup for the specified topic. + * + * For that the caller needs to have producer or consumer permission. 
+ * + * @param topicName + * @param role + * @return + * @throws Exception + */ + CompletableFuture canLookupAsync(TopicName topicName, String role, + AuthenticationDataSource authenticationData); + + /** + * + * Grant authorization-action permission on a namespace to the given client + * + * @param namespace + * @param actions + * @param role + * @param authDataJson + * additional authdata in json format + * @return CompletableFuture + * @completesWith
    + * IllegalArgumentException when namespace not found
    + * IllegalStateException when failed to grant permission + */ + CompletableFuture grantPermissionAsync(NamespaceName namespace, Set actions, String role, + String authDataJson); + + /** + * Grant authorization-action permission on a topic to the given client + * + * @param topicName + * @param role + * @param authDataJson + * additional authdata in json format + * @return CompletableFuture + * @completesWith
    + * IllegalArgumentException when namespace not found
    + * IllegalStateException when failed to grant permission + */ + CompletableFuture grantPermissionAsync(TopicName topicName, Set actions, String role, + String authDataJson); + +} + +``` diff --git a/site2/website/versioned_docs/version-2.5.0/security-overview.md b/site2/website/versioned_docs/version-2.5.0/security-overview.md new file mode 100644 index 0000000000000..da726eaf7d7a4 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/security-overview.md @@ -0,0 +1,31 @@ +--- +id: version-2.5.0-security-overview +title: Pulsar security overview +sidebar_label: Overview +original_id: security-overview +--- + +As the central message bus for a business, Apache Pulsar is frequently used for storing mission-critical data. Therefore, enabling security features in Pulsar is crucial. + +By default, Pulsar configures no encryption, authentication, or authorization. Any client can communicate to Apache Pulsar via plain text service URLs. So we must ensure that Pulsar accessing via these plain text service URLs is restricted to trusted clients only. In such cases, you can use Network segmentation and/or authorization ACLs to restrict access to trusted IPs. If you use neither, the state of cluster is wide open and anyone can access the cluster. + +Pulsar supports a pluggable authentication mechanism. And Pulsar clients use this mechanism to authenticate with brokers and proxies. You can also configure Pulsar to support multiple authentication sources. + +You had better secure the service components in your Apache Pulsar deployment. + +## Role tokens + +In Pulsar, a *role* is a string, like `admin` or `app1`, which can represent a single client or multiple clients. You can use roles to control permission for clients to produce or consume from certain topics, administer the configuration for tenants, and so on. 
+ +Apache Pulsar uses a [Authentication Provider](#authentication-providers) to establish the identity of a client and then assign a *role token* to that client. This role token is then used for [Authorization and ACLs](security-authorization.md) to determine what the client is authorized to do. + +## Authentication providers + +Currently Pulsar supports the following authentication providers: + +- [TLS Authentication](security-tls-authentication.md) +- [Athenz](security-athenz.md) +- [Kerberos](security-kerberos.md) +- [JSON Web Token Authentication](security-jwt.md) + + diff --git a/site2/website/versioned_docs/version-2.5.0/security-tls-authentication.md b/site2/website/versioned_docs/version-2.5.0/security-tls-authentication.md new file mode 100644 index 0000000000000..9067b3578d94c --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/security-tls-authentication.md @@ -0,0 +1,175 @@ +--- +id: version-2.5.0-security-tls-authentication +title: Authentication using TLS +sidebar_label: Authentication using TLS +original_id: security-tls-authentication +--- + +## TLS authentication overview + +TLS authentication is an extension of [TLS transport encryption](security-tls-transport.md). Not only servers have keys and certs that the client uses to verify the identity of servers, clients also have keys and certs that the server uses to verify the identity of clients. You must have TLS transport encryption configured on your cluster before you can use TLS authentication. This guide assumes you already have TLS transport encryption configured. + +### Create client certificates + +Client certificates are generated using the certificate authority. Server certificates are also generated with the same certificate authority. + +The biggest difference between client certs and server certs is that the **common name** for the client certificate is the **role token** which that client is authenticated as. 
+ +First, you need to enter the following command to generate the key: + +```bash +$ openssl genrsa -out admin.key.pem 2048 +``` + +Similar to the broker, the client expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so you need to convert it by entering the following command: + +```bash +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in admin.key.pem -out admin.key-pk8.pem -nocrypt +``` + +Next, enter the command below to generate the certificate request. When you are asked for a **common name**, enter the **role token** that you want this key pair to authenticate a client as. + +```bash +$ openssl req -config openssl.cnf \ + -key admin.key.pem -new -sha256 -out admin.csr.pem +``` +> Note +> If openssl.cnf is not specified, read [Certificate authority](http://pulsar.apache.org/docs/en/security-tls-transport/#certificate-authority) to get the openssl.cnf. + +Then, enter the command below to sign the request with the certificate authority. Note that the client cert uses the **usr_cert** extension, which allows the cert to be used for client authentication. + +```bash +$ openssl ca -config openssl.cnf -extensions usr_cert \ + -days 1000 -notext -md sha256 \ + -in admin.csr.pem -out admin.cert.pem +``` + +You can get a cert, `admin.cert.pem`, and a key, `admin.key-pk8.pem` from this command. With `ca.cert.pem`, clients can use this cert and this key to authenticate themselves to brokers and proxies as the role token ``admin``. + +> Note +> If the "unable to load CA private key" error occurs and the reason of this error is "No such file or directory: /etc/pki/CA/private/cakey.pem" in this step, try the command below: +> +> ```bash +> $ cd /etc/pki/tls/misc/CA +> $ ./CA -newca +> ``` +> +> to generate `cakey.pem`. 
+ +## Enable TLS authentication on brokers + +To configure brokers to authenticate clients, add the following parameters to `broker.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#broker-configuration): + +```properties +# Configuration to enable authentication +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# operations and publish/consume from all topics +superUserRoles=admin + +# Authentication settings of the broker itself. Used when the broker connects to other brokers, either in same or other clusters +brokerClientTlsEnabled=true +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/my-ca/admin.cert.pem,tlsKeyFile:/path/my-ca/admin.key-pk8.pem +brokerClientTrustCertsFilePath=/path/my-ca/certs/ca.cert.pem +``` + +## Enable TLS authentication on proxies + +To configure proxies to authenticate clients, add the following parameters to `proxy.conf`, alongside [the configuration to enable tls transport](security-tls-transport.md#proxy-configuration): + +The proxy should have its own client key pair for connecting to brokers. You need to configure the role token for this key pair in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. + +```properties +# For clients connecting to the proxy +authenticationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderTls + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters=tlsCertFile:/path/to/proxy.cert.pem,tlsKeyFile:/path/to/proxy.key-pk8.pem +``` + +## Client configuration + +When you use TLS authentication, client connects via TLS transport. 
You need to configure the client to use ```https://``` and 8443 port for the web service URL, ```pulsar+ssl://``` and 6651 port for the broker service URL. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. + +You need to add the following parameters to that file to use TLS authentication with the CLI tools of Pulsar: + +```properties +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +authPlugin=org.apache.pulsar.client.impl.auth.AuthenticationTls +authParams=tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem +``` + +### Java client + +```java +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .authentication("org.apache.pulsar.client.impl.auth.AuthenticationTls", + "tlsCertFile:/path/to/my-role.cert.pem,tlsKeyFile:/path/to/my-role.key-pk8.pem") + .build(); +``` + +### Python client + +```python +from pulsar import Client, AuthenticationTLS + +auth = AuthenticationTLS("/path/to/my-role.cert.pem", "/path/to/my-role.key-pk8.pem") +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False, + authentication=auth) +``` + +### C++ client + +```c++ +#include + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); + +pulsar::AuthenticationPtr auth = 
pulsar::AuthTls::create("/path/to/my-role.cert.pem", + "/path/to/my-role.key-pk8.pem"); +config.setAuth(auth); + +pulsar::Client client("pulsar+ssl://broker.example.com:6651/", config); +``` + +### Node.js client + +```JavaScript +const Pulsar = require('pulsar-client'); + +(async () => { + const auth = new Pulsar.AuthenticationTls({ + certificatePath: '/path/to/my-role.cert.pem', + privateKeyPath: '/path/to/my-role.key-pk8.pem', + }); + + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + authentication: auth, + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + }); +})(); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/security-tls-transport.md b/site2/website/versioned_docs/version-2.5.0/security-tls-transport.md new file mode 100644 index 0000000000000..5e5c127c835f0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/security-tls-transport.md @@ -0,0 +1,243 @@ +--- +id: version-2.5.0-security-tls-transport +title: Transport Encryption using TLS +sidebar_label: Transport Encryption using TLS +original_id: security-tls-transport +--- + +## TLS overview + +By default, Apache Pulsar clients communicate with the Apache Pulsar service in plain text. This means that all data is sent in the clear. You can use TLS to encrypt this traffic to protect the traffic from the snooping of a man-in-the-middle attacker. + +You can also configure TLS for both encryption and authentication. Use this guide to configure just TLS transport encryption and refer to [here](security-tls-authentication.md) for TLS authentication configuration. Alternatively, you can use [another authentication mechanism](security-athenz.md) on top of TLS transport encryption. + +> Note that enabling TLS may impact the performance due to encryption overhead. + +## TLS concepts + +TLS is a form of [public key cryptography](https://en.wikipedia.org/wiki/Public-key_cryptography). 
Using key pairs consisting of a public key and a private key can perform the encryption. The public key encrypts the messages and the private key decrypts the messages. + +To use TLS transport encryption, you need two kinds of key pairs, **server key pairs** and a **certificate authority**. + +You can use a third kind of key pair, **client key pairs**, for [client authentication](security-tls-authentication.md). + +You should store the **certificate authority** private key in a very secure location (a fully encrypted, disconnected, air gapped computer). As for the certificate authority public key, the **trust cert**, you can freely share it. + +For both client and server key pairs, the administrator first generates a private key and a certificate request, then uses the certificate authority private key to sign the certificate request, finally generates a certificate. This certificate is the public key for the server/client key pair. + +For TLS transport encryption, the clients can use the **trust cert** to verify that the server has a key pair that the certificate authority signed when the clients are talking to the server. A man-in-the-middle attacker does not have access to the certificate authority, so they couldn't create a server with such a key pair. + +For TLS authentication, the server uses the **trust cert** to verify that the client has a key pair that the certificate authority signed. The common name of the **client cert** is then used as the client's role token (see [Overview](security-overview.md)). + +## Create TLS certificates + +Creating TLS certificates for Pulsar involves creating a [certificate authority](#certificate-authority) (CA), [server certificate](#server-certificate), and [client certificate](#client-certificate). + +Follow the guide below to set up a certificate authority. You can also refer to plenty of resources on the internet for more details. 
We recommend [this guide](https://jamielinux.com/docs/openssl-certificate-authority/index.html) for your detailed reference. + +### Certificate authority + +1. Create the certificate for the CA. You can use CA to sign both the broker and client certificates. This ensures that each party will trust the others. You should store CA in a very secure location (ideally completely disconnected from networks, air gapped, and fully encrypted). + +2. Enter the following command to create a directory for your CA, and place [this openssl configuration file](https://github.com/apache/pulsar/tree/master/site2/website/static/examples/openssl.cnf) in the directory. You may want to modify the default answers for company name and department in the configuration file. Export the location of the CA directory to the environment variable, CA_HOME. The configuration file uses this environment variable to find the rest of the files and directories that the CA needs. + +```bash +$ mkdir my-ca +$ cd my-ca +$ wget https://raw.githubusercontent.com/apache/pulsar/master/site2/website/static/examples/openssl.cnf +$ export CA_HOME=$(pwd) +``` + +3. Enter the commands below to create the necessary directories, keys and certs. + +```bash +$ mkdir certs crl newcerts private +$ chmod 700 private/ +$ touch index.txt +$ echo 1000 > serial +$ openssl genrsa -aes256 -out private/ca.key.pem 4096 +$ chmod 400 private/ca.key.pem +$ openssl req -config openssl.cnf -key private/ca.key.pem \ + -new -x509 -days 7300 -sha256 -extensions v3_ca \ + -out certs/ca.cert.pem +$ chmod 444 certs/ca.cert.pem +``` + +4. After you answer the question prompts, CA-related files are stored in the `./my-ca` directory. Within that directory: + +* `certs/ca.cert.pem` is the public certificate. This public certificate is meant to be distributed to all parties involved. +* `private/ca.key.pem` is the private key. 
You only need it when you are signing a new certificate for either broker or clients and you must safely guard this private key. + +### Server certificate + +Once you have created a CA certificate, you can create certificate requests and sign them with the CA. + +The following commands ask you a few questions and then create the certificates. When you are asked for the common name, you should match the hostname of the broker. You can also use a wildcard to match a group of broker hostnames, for example, `*.broker.usw.example.com`. This ensures that multiple machines can reuse the same certificate. + +> #### Tips +> +> Sometimes matching the hostname is not possible or makes no sense, +> such as when you create the brokers with random hostnames, or you +> plan to connect to the hosts via their IP. In these cases, you +> should configure the client to disable TLS hostname verification. For more +> details, you can see [the host verification section in client configuration](#hostname-verification). + +1. Enter the command below to generate the key. + +```bash +$ openssl genrsa -out broker.key.pem 2048 +``` + +The broker expects the key to be in [PKCS 8](https://en.wikipedia.org/wiki/PKCS_8) format, so enter the following command to convert it. + +```bash +$ openssl pkcs8 -topk8 -inform PEM -outform PEM \ + -in broker.key.pem -out broker.key-pk8.pem -nocrypt +``` + +2. Enter the following command to generate the certificate request. + +```bash +$ openssl req -config openssl.cnf \ + -key broker.key.pem -new -sha256 -out broker.csr.pem +``` + +3. Sign it with the certificate authority by entering the command below. + +```bash +$ openssl ca -config openssl.cnf -extensions server_cert \ + -days 1000 -notext -md sha256 \ + -in broker.csr.pem -out broker.cert.pem +``` + +At this point, you have a cert, `broker.cert.pem`, and a key, `broker.key-pk8.pem`, which you can use along with `ca.cert.pem` to configure TLS transport encryption for your broker and proxy nodes. 
+ +## Broker Configuration + +To configure a Pulsar [broker](reference-terminology.md#broker) to use TLS transport encryption, you need to make some changes to `broker.conf`, which is located in the `conf` directory of your [Pulsar installation](getting-started-standalone.md). + +Add these values to the configuration file (substituting the appropriate certificate paths where necessary): + +```properties +tlsEnabled=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem +``` + +> You can find a full list of parameters available in the `conf/broker.conf` file, +> as well as the default values for those parameters, in [Broker Configuration](reference-configuration.md#broker) +> +### TLS Protocol Version and Cipher + +You can configure the broker (and proxy) to require specific TLS protocol versions and ciphers for TLS negotiation. You can use the TLS protocol versions and ciphers to stop clients from requesting downgraded TLS protocol versions or ciphers that may have weaknesses. + +Both the TLS protocol versions and cipher properties can take multiple values, separated by commas. The possible values for protocol version and ciphers depend on the TLS provider that you are using. Pulsar uses OpenSSL if the OpenSSL is available, but if the OpenSSL is not available, Pulsar defaults back to the JDK implementation. + +```properties +tlsProtocols=TLSv1.2,TLSv1.1 +tlsCiphers=TLS_DH_RSA_WITH_AES_256_GCM_SHA384,TLS_DH_RSA_WITH_AES_256_CBC_SHA +``` + +OpenSSL currently supports ```SSL2```, ```SSL3```, ```TLSv1```, ```TLSv1.1``` and ```TLSv1.2``` for the protocol version. You can acquire a list of supported ciphers from the openssl ciphers command, i.e. ```openssl ciphers -tls1_2```. 
+ +For JDK 8, you can obtain a list of supported values from the documentation: +- [TLS protocol](https://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#SSLContext) +- [Ciphers](https://docs.oracle.com/javase/8/docs/technotes/guides/security/StandardNames.html#ciphersuites) + +## Proxy Configuration + +Proxies need to configure TLS in two directions, for clients connecting to the proxy, and for the proxy connecting to brokers. + +```properties +# For clients connecting to the proxy +tlsEnabledInProxy=true +tlsCertificateFilePath=/path/to/broker.cert.pem +tlsKeyFilePath=/path/to/broker.key-pk8.pem +tlsTrustCertsFilePath=/path/to/ca.cert.pem + +# For the proxy to connect to brokers +tlsEnabledWithBroker=true +brokerClientTrustCertsFilePath=/path/to/ca.cert.pem +``` + +## Client configuration + +When you enable the TLS transport encryption, you need to configure the client to use ```https://``` and port 8443 for the web service URL, and ```pulsar+ssl://``` and port 6651 for the broker service URL. + +As the server certificate that you generated above does not belong to any of the default trust chains, you also need to either specify the path to the **trust cert** (recommended), or tell the client to allow untrusted server certs. + +#### Hostname verification + +Hostname verification is a TLS security feature whereby a client can refuse to connect to a server if the "CommonName" does not match the hostname to which the client is connecting. By default, Pulsar clients disable hostname verification, as it requires that each broker has a DNS record and a unique cert. + +Moreover, as the administrator has full control of the certificate authority, a bad actor is unlikely to be able to pull off a man-in-the-middle attack. "allowInsecureConnection" allows the client to connect to servers whose cert has not been signed by an approved CA. 
The client disables "allowInsecureConnection" by default, and you should always disable "allowInsecureConnection" in production environments. As long as you disable "allowInsecureConnection", a man-in-the-middle attack requires that the attacker has access to the CA. + +One scenario where you may want to enable hostname verification is where you have multiple proxy nodes behind a VIP, and the VIP has a DNS record, for example, pulsar.mycompany.com. In this case, you can generate a TLS cert with pulsar.mycompany.com as the "CommonName," and then enable hostname verification on the client. + +The examples below show hostname verification being disabled for the Java client, though you can omit this as the client disables the hostname verification by default. C++/Python/Node.js clients do not allow configuring this at the moment. + +### CLI tools + +[Command-line tools](reference-cli-tools.md) like [`pulsar-admin`](reference-pulsar-admin.md), [`pulsar-perf`](reference-cli-tools.md#pulsar-perf), and [`pulsar-client`](reference-cli-tools.md#pulsar-client) use the `conf/client.conf` config file in a Pulsar installation. 
+ +You need to add the following parameters to that file to use TLS transport with the CLI tools of Pulsar: + +```properties +webServiceUrl=https://broker.example.com:8443/ +brokerServiceUrl=pulsar+ssl://broker.example.com:6651/ +useTls=true +tlsAllowInsecureConnection=false +tlsTrustCertsFilePath=/path/to/ca.cert.pem +tlsEnableHostnameVerification=false +``` + +### Java client + +```java +import org.apache.pulsar.client.api.PulsarClient; + +PulsarClient client = PulsarClient.builder() + .serviceUrl("pulsar+ssl://broker.example.com:6651/") + .enableTls(true) + .tlsTrustCertsFilePath("/path/to/ca.cert.pem") + .enableTlsHostnameVerification(false) // false by default, in any case + .allowTlsInsecureConnection(false) // false by default, in any case + .build(); +``` + +### Python client + +```python +from pulsar import Client + +client = Client("pulsar+ssl://broker.example.com:6651/", + tls_trust_certs_file_path="/path/to/ca.cert.pem", + tls_allow_insecure_connection=False) # defaults to false from v2.2.0 onwards +``` + +### C++ client + +```c++ +#include <pulsar/Client.h> + +pulsar::ClientConfiguration config; +config.setUseTls(true); +config.setTlsTrustCertsFilePath("/path/to/ca.cert.pem"); +config.setTlsAllowInsecureConnection(false); // defaults to false from v2.2.0 onwards + +pulsar::Client client("pulsar+ssl://broker.example.com:6651/", config); +``` + +### Node.js client + +```JavaScript +const Pulsar = require('pulsar-client'); + +(async () => { + const client = new Pulsar.Client({ + serviceUrl: 'pulsar+ssl://broker.example.com:6651/', + tlsTrustCertsFilePath: '/path/to/ca.cert.pem', + }); +})(); +``` diff --git a/site2/website/versioned_docs/version-2.5.0/security-token-admin.md b/site2/website/versioned_docs/version-2.5.0/security-token-admin.md new file mode 100644 index 0000000000000..b5eb2a16f2759 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/security-token-admin.md @@ -0,0 +1,159 @@ +--- +id: version-2.5.0-security-token-admin +title: Token 
authentication admin +sidebar_label: Token authentication admin +original_id: security-token-admin +--- + +## Token Authentication Overview + +Pulsar supports authenticating clients using security tokens that are based on +[JSON Web Tokens](https://jwt.io/introduction/) ([RFC-7519](https://tools.ietf.org/html/rfc7519)). + +Tokens are used to identify a Pulsar client and associate with some "principal" (or "role") which +will be then granted permissions to do some actions (eg: publish or consume from a topic). + +A user will typically be given a token string by an administrator (or some automated service). + +The compact representation of a signed JWT is a string that looks like: + +``` + eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJKb2UifQ.ipevRNuRP6HflG8cFKnmUPtypruRC4fb1DWtoLL62SY + ``` + +The application will specify the token when creating the client instance. An alternative is to pass +a "token supplier", that is to say a function that returns the token when the client library +will need one. + +> #### Always use TLS transport encryption +> Sending a token is equivalent to sending a password over the wire. It is strongly recommended to +> always use TLS encryption when talking to the Pulsar service. See +> [Transport Encryption using TLS](security-tls-transport.md) + +## Secret vs Public/Private keys + +JWT supports two different kinds of keys in order to generate and validate the tokens: + + * Symmetric : + - there is a single ***Secret*** key that is used both to generate and validate + * Asymmetric: there is a pair of keys. + - ***Private*** key is used to generate tokens + - ***Public*** key is used to validate tokens + +### Secret key + +When using a secret key, the administrator will create the key and he will +use it to generate the client tokens. This key will be also configured to +the brokers to allow them to validate the clients. + +#### Creating a secret key + +> Output file will be generated in the root of your pulsar installation directory. 
You can also provide an absolute path for the output file. +```shell +$ bin/pulsar tokens create-secret-key --output my-secret.key +``` +To generate a base64-encoded secret key: +```shell +$ bin/pulsar tokens create-secret-key --output /opt/my-secret.key --base64 +``` + +### Public/Private keys + +With public/private, we need to create a pair of keys. Pulsar supports all algorithms supported by the Java JWT library shown [here](https://github.com/jwtk/jjwt#signature-algorithms-keys) + +#### Creating a key pair + +> Output file will be generated in the root of your pulsar installation directory. You can also provide an absolute path for the output file. +```shell +$ bin/pulsar tokens create-key-pair --output-private-key my-private.key --output-public-key my-public.key +``` + + * `my-private.key` will be stored in a safe location and only used by administrator to generate + new tokens. + * `my-public.key` will be distributed to all Pulsar brokers. This file can be publicly shared without + any security concern. + +## Generating tokens + +A token is the credential associated with a user. The association is done through the "principal", +or "role". In case of JWT tokens, this field is typically referred to as **subject**, though +it's exactly the same concept. + +The generated token is then required to have a **subject** field set. + +```shell +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user +``` + +This will print the token string on stdout. + +Similarly, one can create a token by passing the "private" key: + +```shell +$ bin/pulsar tokens create --private-key file:///path/to/my-private.key \ + --subject test-user +``` + +Finally, a token can also be created with a pre-defined TTL. After that time, +the token will be automatically invalidated. 
+ +```shell +$ bin/pulsar tokens create --secret-key file:///path/to/my-secret.key \ + --subject test-user \ + --expiry-time 1y +``` + +## Authorization + +The token itself doesn't have any permission associated. That will be determined by the +authorization engine. Once the token is created, one can grant permission for this token to do certain +actions. Eg. : + +```shell +$ bin/pulsar-admin namespaces grant-permission my-tenant/my-namespace \ + --role test-user \ + --actions produce,consume +``` + +## Enabling Token Authentication ... + +### ... on Brokers + +To configure brokers to authenticate clients, put the following in `broker.conf`: + +```properties +# Configuration to enable authentication and authorization +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken + +# If using secret key +tokenSecretKey=file:///path/to/secret.key +# The key can also be passed inline: +# tokenSecretKey=data:base64,FLFyW0oLJ2Fi22KKCm21J18mbAdztfSHN/lAT5ucEKU= + +# If using public/private +# tokenPublicKey=file:///path/to/public.key +``` + +### ... on Proxies + +To configure proxies to authenticate clients, put the following in `proxy.conf`: + +The proxy will have its own token used when talking to brokers. The role token for this +key pair should be configured in the ``proxyRoles`` of the brokers. See the [authorization guide](security-authorization.md) for more details. 
+ +```properties +# For clients connecting to the proxy +authenticationEnabled=true +authorizationEnabled=true +authenticationProviders=org.apache.pulsar.broker.authentication.AuthenticationProviderToken +tokenSecretKey=file:///path/to/secret.key + +# For the proxy to connect to brokers +brokerClientAuthenticationPlugin=org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters=token:eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ0ZXN0LXVzZXIifQ.9OHgE9ZUDeBTZs7nSMEFIuGNEX18FLR3qvy8mqxSxXw +# Or, alternatively, read token from file +# brokerClientAuthenticationParameters=file:///path/to/proxy-token.txt +``` \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/sql-deployment-configurations.md b/site2/website/versioned_docs/version-2.5.0/sql-deployment-configurations.md new file mode 100644 index 0000000000000..bfcd2c45df9dc --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/sql-deployment-configurations.md @@ -0,0 +1,156 @@ +--- +id: version-2.5.0-sql-deployment-configurations +title: Pulsar SQL configuration and deployment +sidebar_label: Configuration and deployment +original_id: sql-deployment-configurations +--- + +You can configure Presto Pulsar connector and deploy a cluster with the following instruction. + +## Configure Presto Pulsar Connector +You can configure Presto Pulsar Connector in the `${project.root}/conf/presto/catalog/pulsar.properties` properties file. The configuration for the connector and the default values are as follows. 
+ +```properties +# name of the connector to be displayed in the catalog +connector.name=pulsar + +# the url of Pulsar broker service +pulsar.broker-service-url=http://localhost:8080 + +# URI of Zookeeper cluster +pulsar.zookeeper-uri=localhost:2181 + +# minimum number of entries to read at a single time +pulsar.entry-read-batch-size=100 + +# default number of splits to use per query +pulsar.target-num-splits=4 +``` + +You can connect Presto to a Pulsar cluster with multiple hosts. To configure multiple hosts for brokers, add multiple URLs to `pulsar.broker-service-url`. To configure multiple hosts for ZooKeeper, add multiple URIs to `pulsar.zookeeper-uri`. The following is an example. + +``` +pulsar.broker-service-url=http://localhost:8080,localhost:8081,localhost:8082 +pulsar.zookeeper-uri=localhost1,localhost2:2181 +``` + +## Query data from existing Presto clusters + +If you already have a Presto cluster, you can copy the Presto Pulsar connector plugin to your existing cluster. Download the archived plugin package with the following command. + +```bash +$ wget pulsar:binary_release_url +``` + +## Deploy a new cluster + +Since Pulsar SQL is powered by [Presto](https://prestodb.io), the configuration for deployment is the same for the Pulsar SQL worker. + +> Note +> For how to set up a standalone single node environment, refer to [Query data](sql-getting-started.md). + +You can use the same CLI args as the Presto launcher. 
+ +```bash +$ ./bin/pulsar sql-worker --help +Usage: launcher [options] command + +Commands: run, start, stop, restart, kill, status + +Options: + -h, --help show this help message and exit + -v, --verbose Run verbosely + --etc-dir=DIR Defaults to INSTALL_PATH/etc + --launcher-config=FILE + Defaults to INSTALL_PATH/bin/launcher.properties + --node-config=FILE Defaults to ETC_DIR/node.properties + --jvm-config=FILE Defaults to ETC_DIR/jvm.config + --config=FILE Defaults to ETC_DIR/config.properties + --log-levels-file=FILE + Defaults to ETC_DIR/log.properties + --data-dir=DIR Defaults to INSTALL_PATH + --pid-file=FILE Defaults to DATA_DIR/var/run/launcher.pid + --launcher-log-file=FILE + Defaults to DATA_DIR/var/log/launcher.log (only in + daemon mode) + --server-log-file=FILE + Defaults to DATA_DIR/var/log/server.log (only in + daemon mode) + -D NAME=VALUE Set a Java system property + +``` + +The default configuration for the cluster is located in `${project.root}/conf/presto`. You can customize your deployment by modifying the default configuration. + +You can set the worker to read from a different configuration directory, or set a different directory to write data. + +```bash +$ ./bin/pulsar sql-worker run --etc-dir /tmp/incubator-pulsar/conf/presto --data-dir /tmp/presto-1 +``` + +You can start the worker as a daemon process. + +```bash +$ ./bin/pulsar sql-worker start +``` + +### Deploy a cluster on multiple nodes + +You can deploy a Pulsar SQL cluster or Presto cluster on multiple nodes. The following example shows how to deploy a cluster on a three-node cluster. + +1. Copy the Pulsar binary distribution to three nodes. + +The first node runs as Presto coordinator. The minimal configuration requirement in the `${project.root}/conf/presto/config.properties` file is as follows. 
+ +```properties +coordinator=true +node-scheduler.include-coordinator=true +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery-server.enabled=true +discovery.uri=<coordinator-url> +``` + +The other two nodes serve as worker nodes. You can use the following configuration for worker nodes. + +```properties +coordinator=false +http-server.http.port=8080 +query.max-memory=50GB +query.max-memory-per-node=1GB +discovery.uri=<coordinator-url> +``` + +2. Modify `pulsar.broker-service-url` and `pulsar.zookeeper-uri` configuration in the `${project.root}/conf/presto/catalog/pulsar.properties` file accordingly for the three nodes. + +3. Start the coordinator node. + +``` +$ ./bin/pulsar sql-worker run +``` + +4. Start worker nodes. + +``` +$ ./bin/pulsar sql-worker run +``` + +5. Start the SQL CLI and check the status of your cluster. + +```bash +$ ./bin/pulsar sql --server <coordinator-url> +``` + +6. Check the status of your nodes. + +```bash +presto> SELECT * FROM system.runtime.nodes; + node_id | http_uri | node_version | coordinator | state +---------+-------------------------+--------------+-------------+-------- + 1 | http://192.168.2.1:8081 | testversion | true | active + 3 | http://192.168.2.2:8081 | testversion | false | active + 2 | http://192.168.2.3:8081 | testversion | false | active +``` + +For more information about deployment in Presto, refer to [Presto deployment](https://prestodb.io/docs/current/installation/deployment.html). 
\ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/sql-getting-started.md b/site2/website/versioned_docs/version-2.5.0/sql-getting-started.md new file mode 100644 index 0000000000000..c3014599c4b01 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/sql-getting-started.md @@ -0,0 +1,144 @@ +--- +id: version-2.5.0-sql-getting-started +title: Query data with Pulsar SQL +sidebar_label: Query data +original_id: sql-getting-started +--- + +Before querying data in Pulsar, you need to install Pulsar and built-in connectors. + +## Requirements +1. Install [Pulsar](getting-started-standalone.md#install-pulsar-standalone). +2. Install Pulsar [built-in connectors](getting-started-standalone.md#install-builtin-connectors-optional). + +## Query data in Pulsar +To query data in Pulsar with Pulsar SQL, complete the following steps. + +1. Start a Pulsar standalone cluster. + +```bash +./bin/pulsar standalone +``` + +2. Start a Pulsar SQL worker. + +```bash +./bin/pulsar sql-worker run +``` + +3. After initializing Pulsar standalone cluster and the SQL worker, run SQL CLI. + +```bash +./bin/pulsar sql +``` + +4. Test with SQL commands. 
+ +```bash +presto> show catalogs; + Catalog +--------- + pulsar + system +(2 rows) + +Query 20180829_211752_00004_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + + +presto> show schemas in pulsar; + Schema +----------------------- + information_schema + public/default + public/functions + sample/standalone/ns1 +(4 rows) + +Query 20180829_211818_00005_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [4 rows, 89B] [21 rows/s, 471B/s] + + +presto> show tables in pulsar."public/default"; + Table +------- +(0 rows) + +Query 20180829_211839_00006_7qpwh, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:00 [0 rows, 0B] [0 rows/s, 0B/s] + +``` + +Since there is no data in Pulsar, no records are returned. + +5. 
Start the built-in connector _DataGeneratorSource_ and ingest some mock data. + +```bash +./bin/pulsar-admin sources create --name generator --destinationTopicName generator_test --source-type data-generator +``` + +And then you can query a topic in the namespace "public/default". + +```bash +presto> show tables in pulsar."public/default"; + Table +---------------- + generator_test +(1 row) + +Query 20180829_213202_00000_csyeu, FINISHED, 1 node +Splits: 19 total, 19 done (100.00%) +0:02 [1 rows, 38B] [0 rows/s, 17B/s] +``` + +You can now query the data within the topic "generator_test". + +```bash +presto> select * from pulsar."public/default".generator_test; + + firstname | middlename | lastname | email | username | password | telephonenumber | age | companyemail | nationalidentitycardnumber | +-------------+-------------+-------------+----------------------------------+--------------+----------+-----------------+-----+-----------------------------------------------+----------------------------+ + Genesis | Katherine | Wiley | genesis.wiley@gmail.com | genesisw | y9D2dtU3 | 959-197-1860 | 71 | genesis.wiley@interdemconsulting.eu | 880-58-9247 | + Brayden | | Stanton | brayden.stanton@yahoo.com | braydens | ZnjmhXik | 220-027-867 | 81 | brayden.stanton@supermemo.eu | 604-60-7069 | + Benjamin | Julian | Velasquez | benjamin.velasquez@yahoo.com | benjaminv | 8Bc7m3eb | 298-377-0062 | 21 | benjamin.velasquez@hostesltd.biz | 213-32-5882 | + Michael | Thomas | Donovan | donovan@mail.com | michaeld | OqBm9MLs | 078-134-4685 | 55 | michael.donovan@memortech.eu | 443-30-3442 | + Brooklyn | Avery | Roach | brooklynroach@yahoo.com | broach | IxtBLafO | 387-786-2998 | 68 | brooklyn.roach@warst.biz | 085-88-3973 | + Skylar | | Bradshaw | skylarbradshaw@yahoo.com | skylarb | p6eC6cKy | 210-872-608 | 96 | skylar.bradshaw@flyhigh.eu | 453-46-0334 | +. +. +. +``` + +You can query the mock data. 
+ +## Query your own data +If you want to query your own data, you need to ingest your own data first. You can write a simple producer and write custom-defined data to Pulsar. The following is an example. + +```java +public class Test { + + public static class Foo { + private int field1 = 1; + private String field2; + private long field3; + } + + public static void main(String[] args) throws Exception { + PulsarClient pulsarClient = PulsarClient.builder().serviceUrl("pulsar://localhost:6650").build(); + Producer<Foo> producer = pulsarClient.newProducer(AvroSchema.of(Foo.class)).topic("test_topic").create(); + + for (int i = 0; i < 1000; i++) { + Foo foo = new Foo(); + foo.setField1(i); + foo.setField2("foo" + i); + foo.setField3(System.currentTimeMillis()); + producer.newMessage().value(foo).send(); + } + producer.close(); + pulsarClient.close(); + } +} +``` diff --git a/site2/website/versioned_docs/version-2.5.0/sql-overview.md b/site2/website/versioned_docs/version-2.5.0/sql-overview.md new file mode 100644 index 0000000000000..4097a180d1e72 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/sql-overview.md @@ -0,0 +1,18 @@ +--- +id: version-2.5.0-sql-overview +title: Pulsar SQL Overview +sidebar_label: Overview +original_id: sql-overview +--- + +Apache Pulsar is used to store streams of event data, and the event data is structured with predefined fields. With the implementation of the [Schema Registry](schema-get-started.md), you can store structured data in Pulsar and query the data by using [Presto](https://prestosql.io/). + +As the core of Pulsar SQL, Presto Pulsar connector enables Presto workers within a Presto cluster to query data from Pulsar. + +![The Pulsar consumer and reader interfaces](assets/pulsar-sql-arch-2.png) + +The query performance is efficient and highly scalable, because Pulsar adopts [a two-level, segment-based architecture](concepts-architecture-overview.md#apache-bookkeeper). 
+ +Topics in Pulsar are stored as segments in [Apache BookKeeper](https://bookkeeper.apache.org/). Each topic segment is replicated to some BookKeeper nodes, which enables concurrent reads and high read throughput. You can configure the number of BookKeeper nodes, and the default number is `3`. In the Presto Pulsar connector, data is read directly from BookKeeper, so Presto workers can read concurrently from a horizontally scalable number of BookKeeper nodes. + +![The Pulsar consumer and reader interfaces](assets/pulsar-sql-arch-1.png) \ No newline at end of file diff --git a/site2/website/versioned_docs/version-2.5.0/sql-rest-api.md b/site2/website/versioned_docs/version-2.5.0/sql-rest-api.md new file mode 100644 index 0000000000000..aa345f01aa0d0 --- /dev/null +++ b/site2/website/versioned_docs/version-2.5.0/sql-rest-api.md @@ -0,0 +1,186 @@ +--- +id: version-2.5.0-sql-rest-api +title: Pulsar SQL REST APIs +sidebar_label: REST APIs +original_id: sql-rest-api +--- + +This section lists resources that make up the Presto REST API v1. + +## Request for Presto services + +All requests for Presto services should use Presto REST API v1 version. + +To request services, use the explicit URL `http://presto.service:8081/v1`. You need to replace `presto.service:8081` with your real Presto address before sending requests. + +`POST` requests require the `X-Presto-User` header. If you use authentication, you must use the same `username` that is specified in the authentication configuration. If you do not use authentication, you can specify anything for `username`. + +```properties +X-Presto-User: username +``` + +For more information about headers, refer to [PrestoHeaders](https://github.com/prestosql/presto/blob/master/presto-client/src/main/java/io/prestosql/client/PrestoHeaders.java). + +## Schema + +You can use a statement in the HTTP body. All data is received as a JSON document that might contain a `nextUri` link. 
If the received JSON document contains a `nextUri` link, the request continues with the `nextUri` link until the received data does not contain a `nextUri` link. If no error is returned, the query completes successfully. If an `error` field is displayed in `stats`, it means the query fails. + +The following is an example of `show catalogs`. The query continues until the received JSON document does not contain a `nextUri` link. Since no `error` is displayed in `stats`, it means that the query completes successfully. + +```powershell +➜ ~ curl --header "X-Presto-User: test-user" --request POST --data 'show catalogs' http://localhost:8081/v1/statement +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "stats" : { + "queued" : true, + "nodes" : 0, + "userTimeMillis" : 0, + "cpuTimeMillis" : 0, + "wallTimeMillis" : 0, + "processedBytes" : 0, + "processedRows" : 0, + "runningSplits" : 0, + "queuedTimeMillis" : 0, + "queuedSplits" : 0, + "completedSplits" : 0, + "totalSplits" : 0, + "scheduled" : false, + "peakMemoryBytes" : 0, + "state" : "QUEUED", + "elapsedTimeMillis" : 0 + }, + "id" : "20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1" +} + +➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/1 +{ + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "nextUri" : "http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2", + "id" : "20191113_033653_00006_dg6hb", + "stats" : { + "state" : "PLANNING", + "totalSplits" : 0, + "queued" : false, + "userTimeMillis" : 0, + "completedSplits" : 0, + "scheduled" : false, + "wallTimeMillis" : 0, + "runningSplits" : 0, + "queuedSplits" : 0, + "cpuTimeMillis" : 0, + "processedRows" : 0, + "processedBytes" : 0, + "nodes" : 0, + "queuedTimeMillis" : 1, + "elapsedTimeMillis" : 2, + "peakMemoryBytes" : 0 + } +} + +➜ ~ curl http://localhost:8081/v1/statement/20191113_033653_00006_dg6hb/2 
+{ + "id" : "20191113_033653_00006_dg6hb", + "data" : [ + [ + "pulsar" + ], + [ + "system" + ] + ], + "infoUri" : "http://localhost:8081/ui/query.html?20191113_033653_00006_dg6hb", + "columns" : [ + { + "typeSignature" : { + "rawType" : "varchar", + "arguments" : [ + { + "kind" : "LONG_LITERAL", + "value" : 6 + } + ], + "literalArguments" : [], + "typeArguments" : [] + }, + "name" : "Catalog", + "type" : "varchar(6)" + } + ], + "stats" : { + "wallTimeMillis" : 104, + "scheduled" : true, + "userTimeMillis" : 14, + "progressPercentage" : 100, + "totalSplits" : 19, + "nodes" : 1, + "cpuTimeMillis" : 16, + "queued" : false, + "queuedTimeMillis" : 1, + "state" : "FINISHED", + "peakMemoryBytes" : 0, + "elapsedTimeMillis" : 111, + "processedBytes" : 0, + "processedRows" : 0, + "queuedSplits" : 0, + "rootStage" : { + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1, + "subStages" : [ + { + "cpuTimeMillis" : 14, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 17, + "subStages" : [ + { + "wallTimeMillis" : 7, + "subStages" : [], + "stageId" : "2", + "done" : true, + "nodes" : 1, + "totalSplits" : 1, + "processedBytes" : 22, + "processedRows" : 2, + "queuedSplits" : 0, + "userTimeMillis" : 1, + "cpuTimeMillis" : 1, + "runningSplits" : 0, + "state" : "FINISHED", + "completedSplits" : 1 + } + ], + "wallTimeMillis" : 92, + "nodes" : 1, + "done" : true, + "stageId" : "1", + "userTimeMillis" : 12, + "processedRows" : 2, + "processedBytes" : 51, + "queuedSplits" : 0, + "totalSplits" : 17 + } + ], + "wallTimeMillis" : 5, + "done" : true, + "nodes" : 1, + "stageId" : "0", + "userTimeMillis" : 1, + "processedRows" : 2, + "processedBytes" : 22, + "totalSplits" : 1, + "queuedSplits" : 0 + }, + "runningSplits" : 0, + "completedSplits" : 19 + } +} +``` + +> Note +> +> Since the response data is not in sync with the query state from the perspective of clients, you cannot rely on the response data to determine whether the query 
completes. + +For more information about the Presto REST API, refer to [Presto HTTP Protocol](https://github.com/prestosql/presto/wiki/HTTP-Protocol). diff --git a/site2/website/versioned_sidebars/version-2.5.0-sidebars.json b/site2/website/versioned_sidebars/version-2.5.0-sidebars.json new file mode 100644 index 0000000000000..f61588847baf4 --- /dev/null +++ b/site2/website/versioned_sidebars/version-2.5.0-sidebars.json @@ -0,0 +1,145 @@ +{ + "version-2.5.0-docs": { + "Get started": [ + "version-2.5.0-pulsar-2.0", + "version-2.5.0-standalone", + "version-2.5.0-standalone-docker", + "version-2.5.0-client-libraries" + ], + "Concepts and Architecture": [ + "version-2.5.0-concepts-overview", + "version-2.5.0-concepts-messaging", + "version-2.5.0-concepts-architecture-overview", + "version-2.5.0-concepts-clients", + "version-2.5.0-concepts-replication", + "version-2.5.0-concepts-multi-tenancy", + "version-2.5.0-concepts-authentication", + "version-2.5.0-concepts-topic-compaction", + "version-2.5.0-concepts-tiered-storage", + "version-2.5.0-concepts-schema-registry" + ], + "Pulsar Schema": [ + "version-2.5.0-schema-get-started", + "version-2.5.0-schema-understand", + "version-2.5.0-schema-evolution-compatibility", + "version-2.5.0-schema-manage" + ], + "Pulsar Functions": [ + "version-2.5.0-functions-overview", + "version-2.5.0-functions-worker", + "version-2.5.0-functions-runtime", + "version-2.5.0-functions-develop", + "version-2.5.0-functions-debug", + "version-2.5.0-functions-deploy", + "version-2.5.0-functions-configure", + "version-2.5.0-functions-monitor", + "version-2.5.0-functions-secure", + "version-2.5.0-functions-troubleshoot", + "version-2.5.0-functions-cli" + ], + "Pulsar IO": [ + "version-2.5.0-io-overview", + "version-2.5.0-io-quickstart", + "version-2.5.0-io-use", + "version-2.5.0-io-debug", + "version-2.5.0-io-connectors", + "version-2.5.0-io-cdc", + "version-2.5.0-io-develop", + "version-2.5.0-io-cli" + ], + "Pulsar SQL": [ + "version-2.5.0-sql-overview", 
+ "version-2.5.0-sql-getting-started", + "version-2.5.0-sql-deployment-configurations", + "version-2.5.0-sql-rest-api" + ], + "Deployment": [ + "version-2.5.0-deploy-aws", + "version-2.5.0-deploy-kubernetes", + "version-2.5.0-deploy-bare-metal", + "version-2.5.0-deploy-bare-metal-multi-cluster", + "version-2.5.0-deploy-dcos", + "version-2.5.0-deploy-monitoring" + ], + "Administration": [ + "version-2.5.0-administration-zk-bk", + "version-2.5.0-administration-geo", + "version-2.5.0-administration-dashboard", + "version-2.5.0-administration-pulsar-manager", + "version-2.5.0-administration-stats", + "version-2.5.0-administration-load-balance", + "version-2.5.0-administration-proxy", + "version-2.5.0-administration-upgrade" + ], + "Security": [ + "version-2.5.0-security-overview", + "version-2.5.0-security-tls-transport", + "version-2.5.0-security-tls-authentication", + "version-2.5.0-security-jwt", + "version-2.5.0-security-athenz", + "version-2.5.0-security-kerberos", + "version-2.5.0-security-authorization", + "version-2.5.0-security-encryption", + "version-2.5.0-security-extending" + ], + "Client libraries": [ + "version-2.5.0-client-libraries-java", + "version-2.5.0-client-libraries-go", + "version-2.5.0-client-libraries-python", + "version-2.5.0-client-libraries-cpp", + "version-2.5.0-client-libraries-node", + "version-2.5.0-client-libraries-websocket" + ], + "Admin API": [ + "version-2.5.0-admin-api-overview", + "version-2.5.0-admin-api-clusters", + "version-2.5.0-admin-api-tenants", + "version-2.5.0-admin-api-brokers", + "version-2.5.0-admin-api-namespaces", + "version-2.5.0-admin-api-permissions", + "version-2.5.0-admin-api-persistent-topics", + "version-2.5.0-admin-api-non-persistent-topics", + "version-2.5.0-admin-api-partitioned-topics", + "version-2.5.0-admin-api-non-partitioned-topics", + "version-2.5.0-admin-api-schemas", + "version-2.5.0-admin-api-functions" + ], + "Adaptors": [ + "version-2.5.0-adaptors-kafka", + "version-2.5.0-adaptors-spark", + 
"version-2.5.0-adaptors-storm" + ], + "Cookbooks": [ + "version-2.5.0-cookbooks-tiered-storage", + "version-2.5.0-cookbooks-compaction", + "version-2.5.0-cookbooks-deduplication", + "version-2.5.0-cookbooks-non-persistent", + "version-2.5.0-cookbooks-partitioned", + "version-2.5.0-cookbooks-retention-expiry", + "version-2.5.0-cookbooks-encryption", + "version-2.5.0-cookbooks-message-queue", + "version-2.5.0-cookbooks-bookkeepermetadata" + ], + "Development": [ + "version-2.5.0-develop-tools", + "version-2.5.0-develop-binary-protocol", + "version-2.5.0-develop-schema", + "version-2.5.0-develop-load-manager", + "version-2.5.0-develop-cpp" + ], + "Reference": [ + "version-2.5.0-reference-terminology", + "version-2.5.0-reference-cli-tools", + "version-2.5.0-pulsar-admin", + "version-2.5.0-reference-connector-admin", + "version-2.5.0-reference-configuration", + "version-2.5.0-reference-metrics" + ] + }, + "version-2.5.0-docs-other": { + "First Category": [ + "version-2.5.0-doc4", + "version-2.5.0-doc5" + ] + } +} diff --git a/site2/website/versions.json b/site2/website/versions.json index 5343a50826453..c5381e05ecc95 100644 --- a/site2/website/versions.json +++ b/site2/website/versions.json @@ -1,4 +1,5 @@ [ + "2.5.0", "2.4.2", "2.4.1", "2.4.0",