From e7f2632a13e527275ed4e382a2729ae29d9d87f6 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Fri, 8 Apr 2022 15:57:23 -0600 Subject: [PATCH] Added the en/reference folder --- .github/workflows/deploy-github.yml | 1 + .gitignore | 1 - docs/en/reference/development/_category_.yml | 8 + .../development/adding_test_queries.md | 157 + docs/en/reference/development/architecture.md | 202 + docs/en/reference/development/browse-code.md | 13 + .../reference/development/build-cross-arm.md | 38 + .../reference/development/build-cross-osx.md | 62 + .../development/build-cross-riscv.md | 30 + docs/en/reference/development/build-osx.md | 154 + docs/en/reference/development/build.md | 181 + .../development/continuous-integration.md | 193 + docs/en/reference/development/contrib.md | 107 + .../development/developer-instruction.md | 278 ++ docs/en/reference/development/style.md | 832 ++++ docs/en/reference/development/tests.md | 297 ++ docs/en/reference/engines/_category_.yml | 8 + .../engines/database-engines/atomic.md | 61 + .../engines/database-engines/index.md | 25 + .../engines/database-engines/lazy.md | 16 + .../database-engines/materialized-mysql.md | 290 ++ .../materialized-postgresql.md | 279 ++ .../engines/database-engines/mysql.md | 151 + .../engines/database-engines/postgresql.md | 139 + .../engines/database-engines/replicated.md | 123 + .../engines/database-engines/sqlite.md | 80 + .../reference/engines/table-engines/index.md | 89 + .../integrations/ExternalDistributed.md | 56 + .../integrations/embedded-rocksdb.md | 84 + .../table-engines/integrations/hdfs.md | 230 + .../table-engines/integrations/hive.md | 410 ++ .../table-engines/integrations/index.md | 23 + .../table-engines/integrations/jdbc.md | 95 + .../table-engines/integrations/kafka.md | 198 + .../integrations/materialized-postgresql.md | 59 + .../table-engines/integrations/mongodb.md | 79 + .../table-engines/integrations/mysql.md | 152 + .../table-engines/integrations/odbc.md | 131 + .../table-engines/integrations/postgresql.md | 178 + .../table-engines/integrations/rabbitmq.md | 175 + .../engines/table-engines/integrations/s3.md | 163 + .../table-engines/integrations/sqlite.md | 62 + .../engines/table-engines/log-family/index.md | 46 + .../engines/table-engines/log-family/log.md | 15 + .../table-engines/log-family/stripelog.md | 93 + .../table-engines/log-family/tinylog.md | 14 + .../mergetree-family/aggregatingmergetree.md | 104 + .../mergetree-family/collapsingmergetree.md | 307 ++ .../custom-partitioning-key.md | 136 + .../mergetree-family/graphitemergetree.md | 260 + .../table-engines/mergetree-family/index.md | 16 + .../mergetree-family/mergetree.md | 953 ++++ .../mergetree-family/replacingmergetree.md | 70 + .../mergetree-family/replication.md | 295 ++ .../mergetree-family/summingmergetree.md | 140 + .../versionedcollapsingmergetree.md | 237 + .../engines/table-engines/special/buffer.md | 77 + .../table-engines/special/dictionary.md | 101 + .../table-engines/special/distributed.md | 229 + .../table-engines/special/external-data.md | 65 + .../engines/table-engines/special/file.md | 89 + .../engines/table-engines/special/generate.md | 59 + .../engines/table-engines/special/index.md | 14 + .../engines/table-engines/special/join.md | 130 + .../table-engines/special/materializedview.md | 10 + .../engines/table-engines/special/memory.md | 18 + .../engines/table-engines/special/merge.md | 85 + .../engines/table-engines/special/null.md | 15 + .../engines/table-engines/special/set.md | 23 + .../engines/table-engines/special/url.md | 92 + 
.../engines/table-engines/special/view.md | 10 + .../reference/getting-started/_category_.yml | 8 + .../example-datasets/_category_.yml | 8 + .../example-datasets/amplab-benchmark.md | 127 + .../example-datasets/brown-benchmark.md | 416 ++ .../example-datasets/cell-towers.md | 131 + .../example-datasets/criteo.md | 78 + .../example-datasets/github-events.md | 10 + .../getting-started/example-datasets/menus.md | 354 ++ .../example-datasets/metrica.md | 78 + .../example-datasets/nyc-taxi.md | 392 ++ .../example-datasets/ontime.md | 408 ++ .../example-datasets/opensky.md | 420 ++ .../example-datasets/recipes.md | 339 ++ .../example-datasets/star-schema.md | 371 ++ .../example-datasets/uk-price-paid.md | 648 +++ .../example-datasets/wikistat.md | 32 + docs/en/reference/getting-started/install.md | 312 ++ .../reference/getting-started/playground.md | 46 + docs/en/reference/images/column-oriented.gif | Bin 0 -> 43771 bytes docs/en/reference/images/logo.svg | 1 + docs/en/reference/images/play.png | Bin 0 -> 26602 bytes docs/en/reference/images/row-oriented.gif | Bin 0 -> 39281 bytes docs/en/reference/interfaces/cli.md | 181 + docs/en/reference/interfaces/cpp.md | 10 + docs/en/reference/interfaces/formats.md | 1707 +++++++ docs/en/reference/interfaces/grpc.md | 99 + docs/en/reference/interfaces/http.md | 664 +++ docs/en/reference/interfaces/index.md | 28 + docs/en/reference/interfaces/jdbc.md | 14 + docs/en/reference/interfaces/mysql.md | 53 + docs/en/reference/interfaces/odbc.md | 12 + docs/en/reference/interfaces/tcp.md | 10 + .../third-party/client-libraries.md | 74 + .../reference/interfaces/third-party/gui.md | 247 + .../reference/interfaces/third-party/index.md | 17 + .../interfaces/third-party/integrations.md | 112 + .../reference/interfaces/third-party/proxy.md | 44 + docs/en/reference/operations/_category_.yml | 4 + docs/en/reference/operations/access-rights.md | 154 + docs/en/reference/operations/backup.md | 41 + docs/en/reference/operations/caches.md | 29 + .../reference/operations/clickhouse-keeper.md | 325 ++ .../operations/configuration-files.md | 159 + .../external-authenticators/index.md | 16 + .../external-authenticators/kerberos.md | 118 + .../external-authenticators/ldap.md | 182 + .../external-authenticators/ssl-x509.md | 24 + docs/en/reference/operations/index.md | 25 + docs/en/reference/operations/monitoring.md | 44 + .../reference/operations/named-collections.md | 230 + docs/en/reference/operations/opentelemetry.md | 65 + .../optimizing-performance/index.md | 8 + .../sampling-query-profiler.md | 62 + .../reference/operations/performance-test.md | 83 + docs/en/reference/operations/quotas.md | 120 + docs/en/reference/operations/requirements.md | 59 + .../server-configuration-parameters/index.md | 16 + .../settings.md | 1630 +++++++ .../settings/constraints-on-settings.md | 73 + .../en/reference/operations/settings/index.md | 58 + .../settings/merge-tree-settings.md | 383 ++ .../settings/permissions-for-queries.md | 59 + .../operations/settings/query-complexity.md | 315 ++ .../operations/settings/settings-profiles.md | 80 + .../operations/settings/settings-users.md | 164 + .../reference/operations/settings/settings.md | 4232 +++++++++++++++++ docs/en/reference/operations/ssl-zookeeper.md | 73 + docs/en/reference/operations/storing-data.md | 318 ++ .../system-tables/asynchronous_metric_log.md | 39 + .../system-tables/asynchronous_metrics.md | 38 + .../operations/system-tables/clusters.md | 71 + .../operations/system-tables/columns.md | 89 + 
.../operations/system-tables/contributors.md | 41 + .../operations/system-tables/crash-log.md | 48 + .../operations/system-tables/current-roles.md | 11 + .../system-tables/data_skipping_indices.md | 47 + .../system-tables/data_type_families.md | 36 + .../operations/system-tables/databases.md | 37 + .../system-tables/detached_parts.md | 11 + .../operations/system-tables/dictionaries.md | 88 + .../operations/system-tables/disks.md | 27 + .../system-tables/distributed_ddl_queue.md | 64 + .../system-tables/distribution_queue.md | 50 + .../operations/system-tables/enabled-roles.md | 12 + .../operations/system-tables/errors.md | 36 + .../operations/system-tables/events.md | 34 + .../operations/system-tables/functions.md | 33 + .../operations/system-tables/grants.md | 24 + .../system-tables/graphite_retentions.md | 17 + .../operations/system-tables/index.md | 74 + .../system-tables/information_schema.md | 210 + .../operations/system-tables/licenses.md | 39 + .../system-tables/merge_tree_settings.md | 54 + .../operations/system-tables/merges.md | 25 + .../operations/system-tables/metric_log.md | 51 + .../operations/system-tables/metrics.md | 41 + .../operations/system-tables/mutations.md | 49 + .../operations/system-tables/numbers.md | 32 + .../operations/system-tables/numbers_mt.md | 30 + .../reference/operations/system-tables/one.md | 23 + .../system-tables/opentelemetry_span_log.md | 53 + .../operations/system-tables/part_log.md | 69 + .../operations/system-tables/parts.md | 168 + .../operations/system-tables/parts_columns.md | 148 + .../operations/system-tables/processes.md | 61 + .../operations/system-tables/query_log.md | 189 + .../system-tables/query_thread_log.md | 119 + .../system-tables/query_views_log.md | 86 + .../operations/system-tables/quota_limits.md | 21 + .../operations/system-tables/quota_usage.md | 32 + .../operations/system-tables/quotas.md | 28 + .../operations/system-tables/quotas_usage.md | 35 + .../operations/system-tables/replicas.md | 132 + .../system-tables/replicated_fetches.md | 70 + .../system-tables/replication_queue.md | 91 + .../operations/system-tables/role-grants.md | 21 + .../operations/system-tables/roles.md | 15 + .../operations/system-tables/row_policies.md | 34 + .../operations/system-tables/session_log.md | 77 + .../operations/system-tables/settings.md | 53 + .../settings_profile_elements.md | 30 + .../system-tables/settings_profiles.md | 24 + .../operations/system-tables/stack_trace.md | 91 + .../system-tables/storage_policies.md | 17 + .../operations/system-tables/table_engines.md | 38 + .../operations/system-tables/tables.md | 125 + .../operations/system-tables/text_log.md | 53 + .../operations/system-tables/time_zones.md | 30 + .../operations/system-tables/trace_log.md | 57 + .../operations/system-tables/users.md | 34 + .../operations/system-tables/zookeeper.md | 75 + .../operations/system-tables/zookeeper_log.md | 129 + docs/en/reference/operations/tips.md | 279 ++ .../reference/operations/troubleshooting.md | 144 + docs/en/reference/operations/update.md | 32 + .../utilities/clickhouse-benchmark.md | 163 + .../utilities/clickhouse-compressor.md | 28 + .../operations/utilities/clickhouse-copier.md | 188 + .../operations/utilities/clickhouse-format.md | 109 + .../operations/utilities/clickhouse-local.md | 118 + .../utilities/clickhouse-obfuscator.md | 42 + .../reference/operations/utilities/index.md | 15 + .../operations/utilities/odbc-bridge.md | 38 + .../en/reference/sql-reference/_category_.yml | 4 + .../aggregate-functions/combinators.md | 287 ++ 
.../aggregate-functions/index.md | 60 + .../parametric-functions.md | 766 +++ .../aggregate-functions/reference/any.md | 13 + .../aggregate-functions/reference/anyheavy.md | 30 + .../aggregate-functions/reference/anylast.md | 8 + .../aggregate-functions/reference/argmax.md | 50 + .../aggregate-functions/reference/argmin.md | 50 + .../aggregate-functions/reference/avg.md | 66 + .../reference/avgweighted.md | 99 + .../reference/categoricalinformationvalue.md | 13 + .../aggregate-functions/reference/corr.md | 13 + .../aggregate-functions/reference/count.md | 72 + .../aggregate-functions/reference/covarpop.md | 13 + .../reference/covarsamp.md | 13 + .../aggregate-functions/reference/deltasum.md | 73 + .../reference/deltasumtimestamp.md | 45 + .../aggregate-functions/reference/entropy.md | 43 + .../reference/exponentialmovingaverage.md | 148 + .../reference/grouparray.md | 14 + .../reference/grouparrayinsertat.md | 91 + .../reference/grouparraymovingavg.md | 78 + .../reference/grouparraymovingsum.md | 76 + .../reference/grouparraysample.md | 81 + .../reference/grouparraysorted.md | 48 + .../reference/groupbitand.md | 46 + .../reference/groupbitmap.md | 44 + .../reference/groupbitmapand.md | 46 + .../reference/groupbitmapor.md | 46 + .../reference/groupbitmapxor.md | 46 + .../reference/groupbitor.md | 46 + .../reference/groupbitxor.md | 46 + .../reference/groupuniqarray.md | 12 + .../aggregate-functions/reference/index.md | 76 + .../reference/intervalLengthSum.md | 108 + .../aggregate-functions/reference/kurtpop.md | 25 + .../aggregate-functions/reference/kurtsamp.md | 27 + .../reference/mannwhitneyutest.md | 74 + .../aggregate-functions/reference/max.md | 24 + .../aggregate-functions/reference/maxmap.md | 28 + .../reference/meanztest.md | 70 + .../aggregate-functions/reference/median.md | 46 + .../aggregate-functions/reference/min.md | 23 + .../aggregate-functions/reference/minmap.md | 28 + .../aggregate-functions/reference/quantile.md | 68 + .../reference/quantilebfloat16.md | 68 + .../reference/quantiledeterministic.md | 67 + .../reference/quantileexact.md | 270 ++ .../reference/quantileexactweighted.md | 67 + .../reference/quantiles.md | 115 + .../reference/quantiletdigest.md | 57 + .../reference/quantiletdigestweighted.md | 62 + .../reference/quantiletiming.md | 88 + .../reference/quantiletimingweighted.md | 121 + .../aggregate-functions/reference/rankCorr.md | 57 + .../reference/simplelinearregression.md | 42 + .../aggregate-functions/reference/skewpop.md | 25 + .../aggregate-functions/reference/skewsamp.md | 27 + .../aggregate-functions/reference/sparkbar.md | 64 + .../reference/stddevpop.md | 11 + .../reference/stddevsamp.md | 11 + .../reference/stochasticlinearregression.md | 75 + .../reference/stochasticlogisticregression.md | 55 + .../reference/studentttest.md | 73 + .../aggregate-functions/reference/sum.md | 7 + .../aggregate-functions/reference/sumcount.md | 46 + .../aggregate-functions/reference/sumkahan.md | 40 + .../aggregate-functions/reference/summap.md | 48 + .../reference/sumwithoverflow.md | 9 + .../aggregate-functions/reference/topk.md | 42 + .../reference/topkweighted.md | 43 + .../aggregate-functions/reference/uniq.md | 39 + .../reference/uniqcombined.md | 53 + .../reference/uniqcombined64.md | 7 + .../reference/uniqexact.md | 26 + .../reference/uniqhll12.md | 40 + .../reference/uniqthetasketch.md | 39 + .../aggregate-functions/reference/varpop.md | 13 + .../aggregate-functions/reference/varsamp.md | 15 + .../reference/welchttest.md | 72 + 
docs/en/reference/sql-reference/ansi.md | 190 + .../data-types/aggregatefunction.md | 66 + .../sql-reference/data-types/array.md | 99 + .../sql-reference/data-types/boolean.md | 10 + .../sql-reference/data-types/date.md | 43 + .../sql-reference/data-types/date32.md | 40 + .../sql-reference/data-types/datetime.md | 148 + .../sql-reference/data-types/datetime64.md | 106 + .../sql-reference/data-types/decimal.md | 114 + .../sql-reference/data-types/domains/index.md | 30 + .../sql-reference/data-types/domains/ipv4.md | 82 + .../sql-reference/data-types/domains/ipv6.md | 84 + .../sql-reference/data-types/enum.md | 130 + .../sql-reference/data-types/fixedstring.md | 61 + .../sql-reference/data-types/float.md | 92 + .../reference/sql-reference/data-types/geo.md | 107 + .../sql-reference/data-types/index.md | 14 + .../sql-reference/data-types/int-uint.md | 37 + .../data-types/lowcardinality.md | 59 + .../reference/sql-reference/data-types/map.md | 111 + .../data-types/multiword-types.md | 29 + .../nested-data-structures/index.md | 8 + .../nested-data-structures/nested.md | 106 + .../sql-reference/data-types/nullable.md | 72 + .../data-types/simpleaggregatefunction.md | 42 + .../special-data-types/expression.md | 10 + .../data-types/special-data-types/index.md | 10 + .../data-types/special-data-types/interval.md | 84 + .../data-types/special-data-types/nothing.md | 24 + .../data-types/special-data-types/set.md | 10 + .../sql-reference/data-types/string.md | 24 + .../sql-reference/data-types/tuple.md | 78 + .../sql-reference/data-types/uuid.md | 75 + .../external-dictionaries/_category_.yml | 8 + .../external-dicts-dict-hierarchical.md | 67 + .../external-dicts-dict-layout.md | 557 +++ .../external-dicts-dict-lifetime.md | 119 + .../external-dicts-dict-polygon.md | 131 + .../external-dicts-dict-sources.md | 818 ++++ .../external-dicts-dict-structure.md | 174 + .../external-dicts-dict.md | 50 + .../external-dictionaries/external-dicts.md | 60 + .../sql-reference/dictionaries/index.md | 16 + .../dictionaries/internal-dicts.md | 52 + .../sql-reference/distributed-ddl.md | 22 + .../functions/arithmetic-functions.md | 157 + .../functions/array-functions.md | 1717 +++++++ .../sql-reference/functions/array-join.md | 34 + .../sql-reference/functions/bit-functions.md | 447 ++ .../functions/bitmap-functions.md | 537 +++ .../functions/comparison-functions.md | 34 + .../functions/conditional-functions.md | 211 + .../functions/date-time-functions.md | 1233 +++++ .../functions/encoding-functions.md | 442 ++ .../functions/encryption-functions.md | 361 ++ .../functions/ext-dict-functions.md | 444 ++ .../sql-reference/functions/files.md | 35 + .../functions/functions-for-nulls.md | 311 ++ .../functions/geo/coordinates.md | 147 + .../sql-reference/functions/geo/geohash.md | 114 + .../sql-reference/functions/geo/h3.md | 1029 ++++ .../sql-reference/functions/geo/index.md | 8 + .../sql-reference/functions/geo/s2.md | 376 ++ .../sql-reference/functions/hash-functions.md | 1496 ++++++ .../sql-reference/functions/in-functions.md | 11 + .../sql-reference/functions/index.md | 252 + .../sql-reference/functions/introspection.md | 478 ++ .../functions/ip-address-functions.md | 448 ++ .../sql-reference/functions/json-functions.md | 466 ++ .../functions/logical-functions.md | 194 + .../functions/machine-learning-functions.md | 18 + .../sql-reference/functions/math-functions.md | 550 +++ .../sql-reference/functions/nlp-functions.md | 133 + .../functions/other-functions.md | 2505 ++++++++++ .../functions/random-functions.md | 105 + 
.../functions/rounding-functions.md | 201 + .../functions/splitting-merging-functions.md | 339 ++ .../functions/string-functions.md | 1077 +++++ .../functions/string-replace-functions.md | 95 + .../functions/string-search-functions.md | 819 ++++ .../functions/time-window-functions.md | 112 + .../functions/tuple-functions.md | 1060 +++++ .../functions/tuple-map-functions.md | 434 ++ .../functions/type-conversion-functions.md | 1743 +++++++ .../sql-reference/functions/url-functions.md | 467 ++ .../sql-reference/functions/uuid-functions.md | 266 ++ .../functions/ym-dict-functions.md | 152 + docs/en/reference/sql-reference/index.md | 16 + .../sql-reference/operators/exists.md | 45 + .../reference/sql-reference/operators/in.md | 251 + .../sql-reference/operators/index.md | 377 ++ .../sql-reference/statements/alter/column.md | 260 + .../sql-reference/statements/alter/comment.md | 58 + .../statements/alter/constraint.md | 23 + .../sql-reference/statements/alter/delete.md | 28 + .../sql-reference/statements/alter/index.md | 62 + .../statements/alter/index/index.md | 23 + .../statements/alter/order-by.md | 18 + .../statements/alter/partition.md | 328 ++ .../statements/alter/projection.md | 25 + .../sql-reference/statements/alter/quota.md | 39 + .../sql-reference/statements/alter/role.md | 16 + .../statements/alter/row-policy.md | 19 + .../statements/alter/sample-by.md | 20 + .../sql-reference/statements/alter/setting.md | 60 + .../statements/alter/settings-profile.md | 16 + .../sql-reference/statements/alter/ttl.md | 85 + .../sql-reference/statements/alter/update.md | 29 + .../sql-reference/statements/alter/user.md | 63 + .../sql-reference/statements/alter/view.md | 44 + .../sql-reference/statements/attach.md | 81 + .../sql-reference/statements/check-table.md | 69 + .../statements/create/database.md | 58 + .../statements/create/dictionary.md | 99 + .../statements/create/function.md | 59 + .../sql-reference/statements/create/index.md | 21 + .../sql-reference/statements/create/quota.md | 39 + .../sql-reference/statements/create/role.md | 47 + .../statements/create/row-policy.md | 90 + .../statements/create/settings-profile.md | 26 + .../sql-reference/statements/create/table.md | 445 ++ .../sql-reference/statements/create/user.md | 104 + .../sql-reference/statements/create/view.md | 365 ++ .../statements/describe-table.md | 69 + .../sql-reference/statements/detach.md | 74 + .../sql-reference/statements/drop.md | 116 + .../sql-reference/statements/exchange.md | 43 + .../sql-reference/statements/exists.md | 12 + .../sql-reference/statements/explain.md | 454 ++ .../sql-reference/statements/grant.md | 483 ++ .../sql-reference/statements/index.md | 31 + .../sql-reference/statements/insert-into.md | 200 + .../sql-reference/statements/kill.md | 70 + .../sql-reference/statements/misc.md | 20 + .../sql-reference/statements/optimize.md | 200 + .../sql-reference/statements/rename.md | 62 + .../sql-reference/statements/revoke.md | 48 + .../sql-reference/statements/select/all.md | 21 + .../statements/select/array-join.md | 300 ++ .../statements/select/distinct.md | 109 + .../sql-reference/statements/select/except.md | 69 + .../sql-reference/statements/select/format.md | 17 + .../sql-reference/statements/select/from.md | 45 + .../statements/select/group-by.md | 286 ++ .../sql-reference/statements/select/having.md | 13 + .../sql-reference/statements/select/index.md | 285 ++ .../statements/select/intersect.md | 73 + .../statements/select/into-outfile.md | 38 + .../sql-reference/statements/select/join.md | 338 ++ 
.../statements/select/limit-by.md | 74 + .../sql-reference/statements/select/limit.md | 66 + .../sql-reference/statements/select/offset.md | 88 + .../statements/select/order-by.md | 486 ++ .../statements/select/prewhere.md | 27 + .../sql-reference/statements/select/sample.md | 114 + .../sql-reference/statements/select/union.md | 87 + .../sql-reference/statements/select/where.md | 57 + .../sql-reference/statements/select/with.md | 70 + .../sql-reference/statements/set-role.md | 48 + .../reference/sql-reference/statements/set.md | 20 + .../sql-reference/statements/show.md | 499 ++ .../sql-reference/statements/system.md | 371 ++ .../sql-reference/statements/truncate.md | 22 + .../reference/sql-reference/statements/use.md | 16 + .../sql-reference/statements/watch.md | 109 + docs/en/reference/sql-reference/syntax.md | 208 + .../sql-reference/table-functions/cluster.md | 59 + .../table-functions/dictionary.md | 59 + .../sql-reference/table-functions/file.md | 130 + .../sql-reference/table-functions/generate.md | 41 + .../sql-reference/table-functions/hdfs.md | 102 + .../table-functions/hdfsCluster.md | 59 + .../sql-reference/table-functions/index.md | 41 + .../sql-reference/table-functions/input.md | 44 + .../sql-reference/table-functions/jdbc.md | 38 + .../sql-reference/table-functions/merge.md | 27 + .../sql-reference/table-functions/mysql.md | 114 + .../sql-reference/table-functions/null.md | 43 + .../sql-reference/table-functions/numbers.md | 27 + .../sql-reference/table-functions/odbc.md | 104 + .../table-functions/postgresql.md | 134 + .../sql-reference/table-functions/remote.md | 98 + .../sql-reference/table-functions/s3.md | 157 + .../table-functions/s3Cluster.md | 49 + .../sql-reference/table-functions/sqlite.md | 45 + .../sql-reference/table-functions/url.md | 47 + .../sql-reference/table-functions/view.md | 68 + .../sql-reference/window-functions/index.md | 57 + docs/en/reference/whats-new/_category_.yml | 8 + docs/en/reference/whats-new/changelog/2017.md | 266 ++ docs/en/reference/whats-new/changelog/2018.md | 1061 +++++ docs/en/reference/whats-new/changelog/2019.md | 2072 ++++++++ docs/en/reference/whats-new/changelog/2020.md | 3532 ++++++++++++++ docs/en/reference/whats-new/changelog/2021.md | 2053 ++++++++ .../en/reference/whats-new/changelog/index.md | 7 + docs/en/reference/whats-new/index.md | 8 + docs/en/reference/whats-new/roadmap.md | 10 + .../reference/whats-new/security-changelog.md | 127 + 493 files changed, 81391 insertions(+), 1 deletion(-) create mode 100644 docs/en/reference/development/_category_.yml create mode 100644 docs/en/reference/development/adding_test_queries.md create mode 100644 docs/en/reference/development/architecture.md create mode 100644 docs/en/reference/development/browse-code.md create mode 100644 docs/en/reference/development/build-cross-arm.md create mode 100644 docs/en/reference/development/build-cross-osx.md create mode 100644 docs/en/reference/development/build-cross-riscv.md create mode 100644 docs/en/reference/development/build-osx.md create mode 100644 docs/en/reference/development/build.md create mode 100644 docs/en/reference/development/continuous-integration.md create mode 100644 docs/en/reference/development/contrib.md create mode 100644 docs/en/reference/development/developer-instruction.md create mode 100644 docs/en/reference/development/style.md create mode 100644 docs/en/reference/development/tests.md create mode 100644 docs/en/reference/engines/_category_.yml create mode 100644 docs/en/reference/engines/database-engines/atomic.md 
create mode 100644 docs/en/reference/engines/database-engines/index.md create mode 100644 docs/en/reference/engines/database-engines/lazy.md create mode 100644 docs/en/reference/engines/database-engines/materialized-mysql.md create mode 100644 docs/en/reference/engines/database-engines/materialized-postgresql.md create mode 100644 docs/en/reference/engines/database-engines/mysql.md create mode 100644 docs/en/reference/engines/database-engines/postgresql.md create mode 100644 docs/en/reference/engines/database-engines/replicated.md create mode 100644 docs/en/reference/engines/database-engines/sqlite.md create mode 100644 docs/en/reference/engines/table-engines/index.md create mode 100644 docs/en/reference/engines/table-engines/integrations/ExternalDistributed.md create mode 100644 docs/en/reference/engines/table-engines/integrations/embedded-rocksdb.md create mode 100644 docs/en/reference/engines/table-engines/integrations/hdfs.md create mode 100644 docs/en/reference/engines/table-engines/integrations/hive.md create mode 100644 docs/en/reference/engines/table-engines/integrations/index.md create mode 100644 docs/en/reference/engines/table-engines/integrations/jdbc.md create mode 100644 docs/en/reference/engines/table-engines/integrations/kafka.md create mode 100644 docs/en/reference/engines/table-engines/integrations/materialized-postgresql.md create mode 100644 docs/en/reference/engines/table-engines/integrations/mongodb.md create mode 100644 docs/en/reference/engines/table-engines/integrations/mysql.md create mode 100644 docs/en/reference/engines/table-engines/integrations/odbc.md create mode 100644 docs/en/reference/engines/table-engines/integrations/postgresql.md create mode 100644 docs/en/reference/engines/table-engines/integrations/rabbitmq.md create mode 100644 docs/en/reference/engines/table-engines/integrations/s3.md create mode 100644 docs/en/reference/engines/table-engines/integrations/sqlite.md create mode 100644 docs/en/reference/engines/table-engines/log-family/index.md create mode 100644 docs/en/reference/engines/table-engines/log-family/log.md create mode 100644 docs/en/reference/engines/table-engines/log-family/stripelog.md create mode 100644 docs/en/reference/engines/table-engines/log-family/tinylog.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/aggregatingmergetree.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/collapsingmergetree.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/custom-partitioning-key.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/graphitemergetree.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/index.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/mergetree.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/replacingmergetree.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/replication.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/summingmergetree.md create mode 100644 docs/en/reference/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md create mode 100644 docs/en/reference/engines/table-engines/special/buffer.md create mode 100644 docs/en/reference/engines/table-engines/special/dictionary.md create mode 100644 docs/en/reference/engines/table-engines/special/distributed.md create mode 100644 docs/en/reference/engines/table-engines/special/external-data.md 
create mode 100644 docs/en/reference/engines/table-engines/special/file.md create mode 100644 docs/en/reference/engines/table-engines/special/generate.md create mode 100644 docs/en/reference/engines/table-engines/special/index.md create mode 100644 docs/en/reference/engines/table-engines/special/join.md create mode 100644 docs/en/reference/engines/table-engines/special/materializedview.md create mode 100644 docs/en/reference/engines/table-engines/special/memory.md create mode 100644 docs/en/reference/engines/table-engines/special/merge.md create mode 100644 docs/en/reference/engines/table-engines/special/null.md create mode 100644 docs/en/reference/engines/table-engines/special/set.md create mode 100644 docs/en/reference/engines/table-engines/special/url.md create mode 100644 docs/en/reference/engines/table-engines/special/view.md create mode 100644 docs/en/reference/getting-started/_category_.yml create mode 100644 docs/en/reference/getting-started/example-datasets/_category_.yml create mode 100644 docs/en/reference/getting-started/example-datasets/amplab-benchmark.md create mode 100644 docs/en/reference/getting-started/example-datasets/brown-benchmark.md create mode 100644 docs/en/reference/getting-started/example-datasets/cell-towers.md create mode 100644 docs/en/reference/getting-started/example-datasets/criteo.md create mode 100644 docs/en/reference/getting-started/example-datasets/github-events.md create mode 100644 docs/en/reference/getting-started/example-datasets/menus.md create mode 100644 docs/en/reference/getting-started/example-datasets/metrica.md create mode 100644 docs/en/reference/getting-started/example-datasets/nyc-taxi.md create mode 100644 docs/en/reference/getting-started/example-datasets/ontime.md create mode 100644 docs/en/reference/getting-started/example-datasets/opensky.md create mode 100644 docs/en/reference/getting-started/example-datasets/recipes.md create mode 100644 docs/en/reference/getting-started/example-datasets/star-schema.md create mode 100644 docs/en/reference/getting-started/example-datasets/uk-price-paid.md create mode 100644 docs/en/reference/getting-started/example-datasets/wikistat.md create mode 100644 docs/en/reference/getting-started/install.md create mode 100644 docs/en/reference/getting-started/playground.md create mode 100644 docs/en/reference/images/column-oriented.gif create mode 100644 docs/en/reference/images/logo.svg create mode 100644 docs/en/reference/images/play.png create mode 100644 docs/en/reference/images/row-oriented.gif create mode 100644 docs/en/reference/interfaces/cli.md create mode 100644 docs/en/reference/interfaces/cpp.md create mode 100644 docs/en/reference/interfaces/formats.md create mode 100644 docs/en/reference/interfaces/grpc.md create mode 100644 docs/en/reference/interfaces/http.md create mode 100644 docs/en/reference/interfaces/index.md create mode 100644 docs/en/reference/interfaces/jdbc.md create mode 100644 docs/en/reference/interfaces/mysql.md create mode 100644 docs/en/reference/interfaces/odbc.md create mode 100644 docs/en/reference/interfaces/tcp.md create mode 100644 docs/en/reference/interfaces/third-party/client-libraries.md create mode 100644 docs/en/reference/interfaces/third-party/gui.md create mode 100644 docs/en/reference/interfaces/third-party/index.md create mode 100644 docs/en/reference/interfaces/third-party/integrations.md create mode 100644 docs/en/reference/interfaces/third-party/proxy.md create mode 100644 docs/en/reference/operations/_category_.yml create mode 100644 
docs/en/reference/operations/access-rights.md create mode 100644 docs/en/reference/operations/backup.md create mode 100644 docs/en/reference/operations/caches.md create mode 100644 docs/en/reference/operations/clickhouse-keeper.md create mode 100644 docs/en/reference/operations/configuration-files.md create mode 100644 docs/en/reference/operations/external-authenticators/index.md create mode 100644 docs/en/reference/operations/external-authenticators/kerberos.md create mode 100644 docs/en/reference/operations/external-authenticators/ldap.md create mode 100644 docs/en/reference/operations/external-authenticators/ssl-x509.md create mode 100644 docs/en/reference/operations/index.md create mode 100644 docs/en/reference/operations/monitoring.md create mode 100644 docs/en/reference/operations/named-collections.md create mode 100644 docs/en/reference/operations/opentelemetry.md create mode 100644 docs/en/reference/operations/optimizing-performance/index.md create mode 100644 docs/en/reference/operations/optimizing-performance/sampling-query-profiler.md create mode 100644 docs/en/reference/operations/performance-test.md create mode 100644 docs/en/reference/operations/quotas.md create mode 100644 docs/en/reference/operations/requirements.md create mode 100644 docs/en/reference/operations/server-configuration-parameters/index.md create mode 100644 docs/en/reference/operations/server-configuration-parameters/settings.md create mode 100644 docs/en/reference/operations/settings/constraints-on-settings.md create mode 100644 docs/en/reference/operations/settings/index.md create mode 100644 docs/en/reference/operations/settings/merge-tree-settings.md create mode 100644 docs/en/reference/operations/settings/permissions-for-queries.md create mode 100644 docs/en/reference/operations/settings/query-complexity.md create mode 100644 docs/en/reference/operations/settings/settings-profiles.md create mode 100644 docs/en/reference/operations/settings/settings-users.md create mode 100644 docs/en/reference/operations/settings/settings.md create mode 100644 docs/en/reference/operations/ssl-zookeeper.md create mode 100644 docs/en/reference/operations/storing-data.md create mode 100644 docs/en/reference/operations/system-tables/asynchronous_metric_log.md create mode 100644 docs/en/reference/operations/system-tables/asynchronous_metrics.md create mode 100644 docs/en/reference/operations/system-tables/clusters.md create mode 100644 docs/en/reference/operations/system-tables/columns.md create mode 100644 docs/en/reference/operations/system-tables/contributors.md create mode 100644 docs/en/reference/operations/system-tables/crash-log.md create mode 100644 docs/en/reference/operations/system-tables/current-roles.md create mode 100644 docs/en/reference/operations/system-tables/data_skipping_indices.md create mode 100644 docs/en/reference/operations/system-tables/data_type_families.md create mode 100644 docs/en/reference/operations/system-tables/databases.md create mode 100644 docs/en/reference/operations/system-tables/detached_parts.md create mode 100644 docs/en/reference/operations/system-tables/dictionaries.md create mode 100644 docs/en/reference/operations/system-tables/disks.md create mode 100644 docs/en/reference/operations/system-tables/distributed_ddl_queue.md create mode 100644 docs/en/reference/operations/system-tables/distribution_queue.md create mode 100644 docs/en/reference/operations/system-tables/enabled-roles.md create mode 100644 docs/en/reference/operations/system-tables/errors.md create mode 100644 
docs/en/reference/operations/system-tables/events.md create mode 100644 docs/en/reference/operations/system-tables/functions.md create mode 100644 docs/en/reference/operations/system-tables/grants.md create mode 100644 docs/en/reference/operations/system-tables/graphite_retentions.md create mode 100644 docs/en/reference/operations/system-tables/index.md create mode 100644 docs/en/reference/operations/system-tables/information_schema.md create mode 100644 docs/en/reference/operations/system-tables/licenses.md create mode 100644 docs/en/reference/operations/system-tables/merge_tree_settings.md create mode 100644 docs/en/reference/operations/system-tables/merges.md create mode 100644 docs/en/reference/operations/system-tables/metric_log.md create mode 100644 docs/en/reference/operations/system-tables/metrics.md create mode 100644 docs/en/reference/operations/system-tables/mutations.md create mode 100644 docs/en/reference/operations/system-tables/numbers.md create mode 100644 docs/en/reference/operations/system-tables/numbers_mt.md create mode 100644 docs/en/reference/operations/system-tables/one.md create mode 100644 docs/en/reference/operations/system-tables/opentelemetry_span_log.md create mode 100644 docs/en/reference/operations/system-tables/part_log.md create mode 100644 docs/en/reference/operations/system-tables/parts.md create mode 100644 docs/en/reference/operations/system-tables/parts_columns.md create mode 100644 docs/en/reference/operations/system-tables/processes.md create mode 100644 docs/en/reference/operations/system-tables/query_log.md create mode 100644 docs/en/reference/operations/system-tables/query_thread_log.md create mode 100644 docs/en/reference/operations/system-tables/query_views_log.md create mode 100644 docs/en/reference/operations/system-tables/quota_limits.md create mode 100644 docs/en/reference/operations/system-tables/quota_usage.md create mode 100644 docs/en/reference/operations/system-tables/quotas.md create mode 100644 docs/en/reference/operations/system-tables/quotas_usage.md create mode 100644 docs/en/reference/operations/system-tables/replicas.md create mode 100644 docs/en/reference/operations/system-tables/replicated_fetches.md create mode 100644 docs/en/reference/operations/system-tables/replication_queue.md create mode 100644 docs/en/reference/operations/system-tables/role-grants.md create mode 100644 docs/en/reference/operations/system-tables/roles.md create mode 100644 docs/en/reference/operations/system-tables/row_policies.md create mode 100644 docs/en/reference/operations/system-tables/session_log.md create mode 100644 docs/en/reference/operations/system-tables/settings.md create mode 100644 docs/en/reference/operations/system-tables/settings_profile_elements.md create mode 100644 docs/en/reference/operations/system-tables/settings_profiles.md create mode 100644 docs/en/reference/operations/system-tables/stack_trace.md create mode 100644 docs/en/reference/operations/system-tables/storage_policies.md create mode 100644 docs/en/reference/operations/system-tables/table_engines.md create mode 100644 docs/en/reference/operations/system-tables/tables.md create mode 100644 docs/en/reference/operations/system-tables/text_log.md create mode 100644 docs/en/reference/operations/system-tables/time_zones.md create mode 100644 docs/en/reference/operations/system-tables/trace_log.md create mode 100644 docs/en/reference/operations/system-tables/users.md create mode 100644 docs/en/reference/operations/system-tables/zookeeper.md create mode 100644 
docs/en/reference/operations/system-tables/zookeeper_log.md create mode 100644 docs/en/reference/operations/tips.md create mode 100644 docs/en/reference/operations/troubleshooting.md create mode 100644 docs/en/reference/operations/update.md create mode 100644 docs/en/reference/operations/utilities/clickhouse-benchmark.md create mode 100644 docs/en/reference/operations/utilities/clickhouse-compressor.md create mode 100644 docs/en/reference/operations/utilities/clickhouse-copier.md create mode 100644 docs/en/reference/operations/utilities/clickhouse-format.md create mode 100644 docs/en/reference/operations/utilities/clickhouse-local.md create mode 100644 docs/en/reference/operations/utilities/clickhouse-obfuscator.md create mode 100644 docs/en/reference/operations/utilities/index.md create mode 100644 docs/en/reference/operations/utilities/odbc-bridge.md create mode 100644 docs/en/reference/sql-reference/_category_.yml create mode 100644 docs/en/reference/sql-reference/aggregate-functions/combinators.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/index.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/parametric-functions.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/any.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/anyheavy.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/anylast.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/argmax.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/argmin.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/avg.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/avgweighted.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/corr.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/count.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/covarpop.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/covarsamp.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/deltasum.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/deltasumtimestamp.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/entropy.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/grouparray.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/grouparrayinsertat.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingavg.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingsum.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysample.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysorted.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupbitand.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmap.md create mode 100644 
docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapand.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapor.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapxor.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupbitor.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupbitxor.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/groupuniqarray.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/index.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/intervalLengthSum.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/kurtpop.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/kurtsamp.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/mannwhitneyutest.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/max.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/maxmap.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/meanztest.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/median.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/min.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/minmap.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantile.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantilebfloat16.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantiledeterministic.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantileexact.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantileexactweighted.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantiles.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigest.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantiletiming.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/quantiletimingweighted.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/rankCorr.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/simplelinearregression.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/skewpop.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/skewsamp.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/sparkbar.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/stddevpop.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/stddevsamp.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlinearregression.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md create mode 100644 
docs/en/reference/sql-reference/aggregate-functions/reference/studentttest.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/sum.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/sumcount.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/sumkahan.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/summap.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/sumwithoverflow.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/topk.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/topkweighted.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/uniq.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined64.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/uniqexact.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/uniqhll12.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/uniqthetasketch.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/varpop.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/varsamp.md create mode 100644 docs/en/reference/sql-reference/aggregate-functions/reference/welchttest.md create mode 100644 docs/en/reference/sql-reference/ansi.md create mode 100644 docs/en/reference/sql-reference/data-types/aggregatefunction.md create mode 100644 docs/en/reference/sql-reference/data-types/array.md create mode 100644 docs/en/reference/sql-reference/data-types/boolean.md create mode 100644 docs/en/reference/sql-reference/data-types/date.md create mode 100644 docs/en/reference/sql-reference/data-types/date32.md create mode 100644 docs/en/reference/sql-reference/data-types/datetime.md create mode 100644 docs/en/reference/sql-reference/data-types/datetime64.md create mode 100644 docs/en/reference/sql-reference/data-types/decimal.md create mode 100644 docs/en/reference/sql-reference/data-types/domains/index.md create mode 100644 docs/en/reference/sql-reference/data-types/domains/ipv4.md create mode 100644 docs/en/reference/sql-reference/data-types/domains/ipv6.md create mode 100644 docs/en/reference/sql-reference/data-types/enum.md create mode 100644 docs/en/reference/sql-reference/data-types/fixedstring.md create mode 100644 docs/en/reference/sql-reference/data-types/float.md create mode 100644 docs/en/reference/sql-reference/data-types/geo.md create mode 100644 docs/en/reference/sql-reference/data-types/index.md create mode 100644 docs/en/reference/sql-reference/data-types/int-uint.md create mode 100644 docs/en/reference/sql-reference/data-types/lowcardinality.md create mode 100644 docs/en/reference/sql-reference/data-types/map.md create mode 100644 docs/en/reference/sql-reference/data-types/multiword-types.md create mode 100644 docs/en/reference/sql-reference/data-types/nested-data-structures/index.md create mode 100644 docs/en/reference/sql-reference/data-types/nested-data-structures/nested.md create mode 100644 docs/en/reference/sql-reference/data-types/nullable.md create mode 100644 docs/en/reference/sql-reference/data-types/simpleaggregatefunction.md create mode 100644 
docs/en/reference/sql-reference/data-types/special-data-types/expression.md create mode 100644 docs/en/reference/sql-reference/data-types/special-data-types/index.md create mode 100644 docs/en/reference/sql-reference/data-types/special-data-types/interval.md create mode 100644 docs/en/reference/sql-reference/data-types/special-data-types/nothing.md create mode 100644 docs/en/reference/sql-reference/data-types/special-data-types/set.md create mode 100644 docs/en/reference/sql-reference/data-types/string.md create mode 100644 docs/en/reference/sql-reference/data-types/tuple.md create mode 100644 docs/en/reference/sql-reference/data-types/uuid.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/_category_.yml create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md create mode 100644 docs/en/reference/sql-reference/dictionaries/external-dictionaries/external-dicts.md create mode 100644 docs/en/reference/sql-reference/dictionaries/index.md create mode 100644 docs/en/reference/sql-reference/dictionaries/internal-dicts.md create mode 100644 docs/en/reference/sql-reference/distributed-ddl.md create mode 100644 docs/en/reference/sql-reference/functions/arithmetic-functions.md create mode 100644 docs/en/reference/sql-reference/functions/array-functions.md create mode 100644 docs/en/reference/sql-reference/functions/array-join.md create mode 100644 docs/en/reference/sql-reference/functions/bit-functions.md create mode 100644 docs/en/reference/sql-reference/functions/bitmap-functions.md create mode 100644 docs/en/reference/sql-reference/functions/comparison-functions.md create mode 100644 docs/en/reference/sql-reference/functions/conditional-functions.md create mode 100644 docs/en/reference/sql-reference/functions/date-time-functions.md create mode 100644 docs/en/reference/sql-reference/functions/encoding-functions.md create mode 100644 docs/en/reference/sql-reference/functions/encryption-functions.md create mode 100644 docs/en/reference/sql-reference/functions/ext-dict-functions.md create mode 100644 docs/en/reference/sql-reference/functions/files.md create mode 100644 docs/en/reference/sql-reference/functions/functions-for-nulls.md create mode 100644 docs/en/reference/sql-reference/functions/geo/coordinates.md create mode 100644 docs/en/reference/sql-reference/functions/geo/geohash.md create mode 100644 docs/en/reference/sql-reference/functions/geo/h3.md create mode 100644 docs/en/reference/sql-reference/functions/geo/index.md create mode 100644 docs/en/reference/sql-reference/functions/geo/s2.md create mode 100644 docs/en/reference/sql-reference/functions/hash-functions.md create mode 100644 docs/en/reference/sql-reference/functions/in-functions.md create mode 100644 
docs/en/reference/sql-reference/functions/index.md create mode 100644 docs/en/reference/sql-reference/functions/introspection.md create mode 100644 docs/en/reference/sql-reference/functions/ip-address-functions.md create mode 100644 docs/en/reference/sql-reference/functions/json-functions.md create mode 100644 docs/en/reference/sql-reference/functions/logical-functions.md create mode 100644 docs/en/reference/sql-reference/functions/machine-learning-functions.md create mode 100644 docs/en/reference/sql-reference/functions/math-functions.md create mode 100644 docs/en/reference/sql-reference/functions/nlp-functions.md create mode 100644 docs/en/reference/sql-reference/functions/other-functions.md create mode 100644 docs/en/reference/sql-reference/functions/random-functions.md create mode 100644 docs/en/reference/sql-reference/functions/rounding-functions.md create mode 100644 docs/en/reference/sql-reference/functions/splitting-merging-functions.md create mode 100644 docs/en/reference/sql-reference/functions/string-functions.md create mode 100644 docs/en/reference/sql-reference/functions/string-replace-functions.md create mode 100644 docs/en/reference/sql-reference/functions/string-search-functions.md create mode 100644 docs/en/reference/sql-reference/functions/time-window-functions.md create mode 100644 docs/en/reference/sql-reference/functions/tuple-functions.md create mode 100644 docs/en/reference/sql-reference/functions/tuple-map-functions.md create mode 100644 docs/en/reference/sql-reference/functions/type-conversion-functions.md create mode 100644 docs/en/reference/sql-reference/functions/url-functions.md create mode 100644 docs/en/reference/sql-reference/functions/uuid-functions.md create mode 100644 docs/en/reference/sql-reference/functions/ym-dict-functions.md create mode 100644 docs/en/reference/sql-reference/index.md create mode 100644 docs/en/reference/sql-reference/operators/exists.md create mode 100644 docs/en/reference/sql-reference/operators/in.md create mode 100644 docs/en/reference/sql-reference/operators/index.md create mode 100644 docs/en/reference/sql-reference/statements/alter/column.md create mode 100644 docs/en/reference/sql-reference/statements/alter/comment.md create mode 100644 docs/en/reference/sql-reference/statements/alter/constraint.md create mode 100644 docs/en/reference/sql-reference/statements/alter/delete.md create mode 100644 docs/en/reference/sql-reference/statements/alter/index.md create mode 100644 docs/en/reference/sql-reference/statements/alter/index/index.md create mode 100644 docs/en/reference/sql-reference/statements/alter/order-by.md create mode 100644 docs/en/reference/sql-reference/statements/alter/partition.md create mode 100644 docs/en/reference/sql-reference/statements/alter/projection.md create mode 100644 docs/en/reference/sql-reference/statements/alter/quota.md create mode 100644 docs/en/reference/sql-reference/statements/alter/role.md create mode 100644 docs/en/reference/sql-reference/statements/alter/row-policy.md create mode 100644 docs/en/reference/sql-reference/statements/alter/sample-by.md create mode 100644 docs/en/reference/sql-reference/statements/alter/setting.md create mode 100644 docs/en/reference/sql-reference/statements/alter/settings-profile.md create mode 100644 docs/en/reference/sql-reference/statements/alter/ttl.md create mode 100644 docs/en/reference/sql-reference/statements/alter/update.md create mode 100644 docs/en/reference/sql-reference/statements/alter/user.md create mode 100644 
docs/en/reference/sql-reference/statements/alter/view.md create mode 100644 docs/en/reference/sql-reference/statements/attach.md create mode 100644 docs/en/reference/sql-reference/statements/check-table.md create mode 100644 docs/en/reference/sql-reference/statements/create/database.md create mode 100644 docs/en/reference/sql-reference/statements/create/dictionary.md create mode 100644 docs/en/reference/sql-reference/statements/create/function.md create mode 100644 docs/en/reference/sql-reference/statements/create/index.md create mode 100644 docs/en/reference/sql-reference/statements/create/quota.md create mode 100644 docs/en/reference/sql-reference/statements/create/role.md create mode 100644 docs/en/reference/sql-reference/statements/create/row-policy.md create mode 100644 docs/en/reference/sql-reference/statements/create/settings-profile.md create mode 100644 docs/en/reference/sql-reference/statements/create/table.md create mode 100644 docs/en/reference/sql-reference/statements/create/user.md create mode 100644 docs/en/reference/sql-reference/statements/create/view.md create mode 100644 docs/en/reference/sql-reference/statements/describe-table.md create mode 100644 docs/en/reference/sql-reference/statements/detach.md create mode 100644 docs/en/reference/sql-reference/statements/drop.md create mode 100644 docs/en/reference/sql-reference/statements/exchange.md create mode 100644 docs/en/reference/sql-reference/statements/exists.md create mode 100644 docs/en/reference/sql-reference/statements/explain.md create mode 100644 docs/en/reference/sql-reference/statements/grant.md create mode 100644 docs/en/reference/sql-reference/statements/index.md create mode 100644 docs/en/reference/sql-reference/statements/insert-into.md create mode 100644 docs/en/reference/sql-reference/statements/kill.md create mode 100644 docs/en/reference/sql-reference/statements/misc.md create mode 100644 docs/en/reference/sql-reference/statements/optimize.md create mode 100644 docs/en/reference/sql-reference/statements/rename.md create mode 100644 docs/en/reference/sql-reference/statements/revoke.md create mode 100644 docs/en/reference/sql-reference/statements/select/all.md create mode 100644 docs/en/reference/sql-reference/statements/select/array-join.md create mode 100644 docs/en/reference/sql-reference/statements/select/distinct.md create mode 100644 docs/en/reference/sql-reference/statements/select/except.md create mode 100644 docs/en/reference/sql-reference/statements/select/format.md create mode 100644 docs/en/reference/sql-reference/statements/select/from.md create mode 100644 docs/en/reference/sql-reference/statements/select/group-by.md create mode 100644 docs/en/reference/sql-reference/statements/select/having.md create mode 100644 docs/en/reference/sql-reference/statements/select/index.md create mode 100644 docs/en/reference/sql-reference/statements/select/intersect.md create mode 100644 docs/en/reference/sql-reference/statements/select/into-outfile.md create mode 100644 docs/en/reference/sql-reference/statements/select/join.md create mode 100644 docs/en/reference/sql-reference/statements/select/limit-by.md create mode 100644 docs/en/reference/sql-reference/statements/select/limit.md create mode 100644 docs/en/reference/sql-reference/statements/select/offset.md create mode 100644 docs/en/reference/sql-reference/statements/select/order-by.md create mode 100644 docs/en/reference/sql-reference/statements/select/prewhere.md create mode 100644 docs/en/reference/sql-reference/statements/select/sample.md create mode 
100644 docs/en/reference/sql-reference/statements/select/union.md create mode 100644 docs/en/reference/sql-reference/statements/select/where.md create mode 100644 docs/en/reference/sql-reference/statements/select/with.md create mode 100644 docs/en/reference/sql-reference/statements/set-role.md create mode 100644 docs/en/reference/sql-reference/statements/set.md create mode 100644 docs/en/reference/sql-reference/statements/show.md create mode 100644 docs/en/reference/sql-reference/statements/system.md create mode 100644 docs/en/reference/sql-reference/statements/truncate.md create mode 100644 docs/en/reference/sql-reference/statements/use.md create mode 100644 docs/en/reference/sql-reference/statements/watch.md create mode 100644 docs/en/reference/sql-reference/syntax.md create mode 100644 docs/en/reference/sql-reference/table-functions/cluster.md create mode 100644 docs/en/reference/sql-reference/table-functions/dictionary.md create mode 100644 docs/en/reference/sql-reference/table-functions/file.md create mode 100644 docs/en/reference/sql-reference/table-functions/generate.md create mode 100644 docs/en/reference/sql-reference/table-functions/hdfs.md create mode 100644 docs/en/reference/sql-reference/table-functions/hdfsCluster.md create mode 100644 docs/en/reference/sql-reference/table-functions/index.md create mode 100644 docs/en/reference/sql-reference/table-functions/input.md create mode 100644 docs/en/reference/sql-reference/table-functions/jdbc.md create mode 100644 docs/en/reference/sql-reference/table-functions/merge.md create mode 100644 docs/en/reference/sql-reference/table-functions/mysql.md create mode 100644 docs/en/reference/sql-reference/table-functions/null.md create mode 100644 docs/en/reference/sql-reference/table-functions/numbers.md create mode 100644 docs/en/reference/sql-reference/table-functions/odbc.md create mode 100644 docs/en/reference/sql-reference/table-functions/postgresql.md create mode 100644 docs/en/reference/sql-reference/table-functions/remote.md create mode 100644 docs/en/reference/sql-reference/table-functions/s3.md create mode 100644 docs/en/reference/sql-reference/table-functions/s3Cluster.md create mode 100644 docs/en/reference/sql-reference/table-functions/sqlite.md create mode 100644 docs/en/reference/sql-reference/table-functions/url.md create mode 100644 docs/en/reference/sql-reference/table-functions/view.md create mode 100644 docs/en/reference/sql-reference/window-functions/index.md create mode 100644 docs/en/reference/whats-new/_category_.yml create mode 100644 docs/en/reference/whats-new/changelog/2017.md create mode 100644 docs/en/reference/whats-new/changelog/2018.md create mode 100644 docs/en/reference/whats-new/changelog/2019.md create mode 100644 docs/en/reference/whats-new/changelog/2020.md create mode 100644 docs/en/reference/whats-new/changelog/2021.md create mode 100644 docs/en/reference/whats-new/changelog/index.md create mode 100644 docs/en/reference/whats-new/index.md create mode 100644 docs/en/reference/whats-new/roadmap.md create mode 100644 docs/en/reference/whats-new/security-changelog.md diff --git a/.github/workflows/deploy-github.yml b/.github/workflows/deploy-github.yml index 7d72ad9b99a..c135dc420f3 100644 --- a/.github/workflows/deploy-github.yml +++ b/.github/workflows/deploy-github.yml @@ -39,6 +39,7 @@ jobs: uses: JamesIves/github-pages-deploy-action@v4.3.0 with: repository-name: ClickHouse/clickhouse-docs-content + token: ${{ secrets.GITHUB_TOKEN }} branch: gh-pages folder: . 
env: diff --git a/.gitignore b/.gitignore index fb9a46a1b5a..c11afe4cd36 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ node_modules .docusaurus build -docs/en/reference **/.DS_Store diff --git a/docs/en/reference/development/_category_.yml b/docs/en/reference/development/_category_.yml new file mode 100644 index 00000000000..9e622150f74 --- /dev/null +++ b/docs/en/reference/development/_category_.yml @@ -0,0 +1,8 @@ +position: 100 +label: 'Building ClickHouse' +collapsible: true +collapsed: true +link: + type: generated-index + title: Building ClickHouse + slug: /en/development \ No newline at end of file diff --git a/docs/en/reference/development/adding_test_queries.md b/docs/en/reference/development/adding_test_queries.md new file mode 100644 index 00000000000..9b993a96ed5 --- /dev/null +++ b/docs/en/reference/development/adding_test_queries.md @@ -0,0 +1,157 @@ +--- +sidebar_label: Adding Test Queries +sidebar_position: 63 +description: Instructions on how to add a test case to ClickHouse continuous integration +--- + +# How to add test queries to ClickHouse CI + +ClickHouse has hundreds (or even thousands) of features. Every commit gets checked by a complex set of tests containing many thousands of test cases. + +The core functionality is very well tested, but some corner cases and unusual combinations of features may remain uncovered by ClickHouse CI. + +Most of the bugs/regressions we see happen in that 'grey area' where test coverage is poor. + +We are very interested in covering with tests as many of the scenarios and feature combinations used in real life as possible. + +## Why add tests + +Why and when you should add a test case to the ClickHouse code: +1) you use some complicated scenarios or feature combinations, or you have a corner case which is probably not widely used +2) you see that certain behavior changes between versions without notice in the changelog +3) you just want to help improve ClickHouse quality and ensure the features you use will not be broken in future releases +4) once the test is added/accepted, you can be sure the corner case you check will never be accidentally broken +5) you will be part of a great open-source community +6) your name will be visible in the `system.contributors` table! +7) you will make the world a bit better :) + +### Steps to do + +#### Prerequisite + +This assumes you have a Linux machine (you can use Docker or virtual machines on other operating systems), a modern browser and internet connection, and some basic Linux and SQL skills. + +No highly specialized knowledge is needed (you don't need to know C++ or the details of how ClickHouse CI works).
+ + +#### Preparation + +1) [create GitHub account](https://github.com/join) (if you don't have one yet) +2) [setup git](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/set-up-git) +```bash +# for Ubuntu +sudo apt-get update +sudo apt-get install git + +git config --global user.name "John Doe" # fill with your name +git config --global user.email "email@example.com" # fill with your email + +``` +3) [fork ClickHouse project](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) - just open [https://github.com/ClickHouse/ClickHouse](https://github.com/ClickHouse/ClickHouse) and press the fork button in the top right corner: +![fork repo](https://github-images.s3.amazonaws.com/help/bootcamp/Bootcamp-Fork.png) + +4) clone your fork to some folder on your PC, for example, `~/workspace/ClickHouse` +``` +mkdir ~/workspace && cd ~/workspace +git clone https://github.com/<your GitHub username>/ClickHouse +cd ClickHouse +git remote add upstream https://github.com/ClickHouse/ClickHouse +``` + +#### New branch for the test + +1) create a new branch from the latest ClickHouse master +``` +cd ~/workspace/ClickHouse +git fetch upstream +git checkout -b name_for_a_branch_with_my_test upstream/master +``` + +#### Install & run ClickHouse + +1) install `clickhouse-server` (follow the [official docs](https://clickhouse.com/docs/en/getting-started/install/)) +2) install the test configurations (they use a ZooKeeper mock implementation and adjust some settings) +``` +cd ~/workspace/ClickHouse/tests/config +sudo ./install.sh +``` +3) run clickhouse-server +``` +sudo systemctl restart clickhouse-server +``` + +#### Creating the test file + + +1) find the number for your test - look for the file with the biggest number in `tests/queries/0_stateless/` + +```sh +$ cd ~/workspace/ClickHouse +$ ls tests/queries/0_stateless/[0-9]*.reference | tail -n 1 +tests/queries/0_stateless/01520_client_print_query_id.reference +``` +Currently, the last number for the test is `01520`, so my test will have the number `01521`. + +2) create an SQL file with the next number and a name describing the feature you test + +```sh +touch tests/queries/0_stateless/01521_dummy_test.sql +``` + +3) edit the SQL file with your favorite editor (see the hints on creating tests below) +```sh +vim tests/queries/0_stateless/01521_dummy_test.sql +``` + + +4) run the test and put its output into the reference file: +``` +clickhouse-client -nmT < tests/queries/0_stateless/01521_dummy_test.sql | tee tests/queries/0_stateless/01521_dummy_test.reference +``` + +5) ensure everything is correct; if the test output is incorrect (due to some bug, for example), adjust the reference file using a text editor.
+ +#### How to create a good test + +- A test should be + - minimal - create only tables related to the tested functionality, remove unrelated columns and parts of the query + - fast - should not take longer than a few seconds (ideally under a second) + - correct - fails when the feature is not working + - deterministic + - isolated / stateless + - don't rely on the environment + - don't rely on timing when possible +- try to cover corner cases (zeros / Nulls / empty sets / throwing exceptions) +- to test that a query returns an error, you can put a special comment after the query: `-- { serverError 60 }` or `-- { clientError 20 }` +- don't switch databases (unless necessary) +- you can create several table replicas on the same node if needed +- you can use one of the test cluster definitions when needed (see `system.clusters`) +- use `numbers` / `numbers_mt` / `zeros` / `zeros_mt` and similar table functions to initialize data when applicable +- clean up the created objects after the test and also before it (DROP IF EXISTS) - in case of some leftover state +- prefer the synchronous mode of operations (mutations, merges, etc.) +- use other SQL files in the `0_stateless` folder as examples +- ensure the feature / feature combination you want to test is not already covered by existing tests + +#### Test naming rules + +It is important to name tests correctly, so that subsets of tests can be switched off in a clickhouse-test invocation. + +| Tester flag | What should be in the test name | When the flag should be added | +|---|---|---| +| `--[no-]zookeeper` | "zookeeper" or "replica" | Test uses tables from the ReplicatedMergeTree family | +| `--[no-]shard` | "shard" or "distributed" or "global" | Test uses connections to 127.0.0.2 or similar | +| `--[no-]long` | "long" or "deadlock" or "race" | Test runs longer than 60 seconds | + +#### Commit / push / create PR + +1) commit & push your changes +```sh +cd ~/workspace/ClickHouse +git add tests/queries/0_stateless/01521_dummy_test.sql +git add tests/queries/0_stateless/01521_dummy_test.reference +git commit # use some nice commit message when possible +git push origin HEAD +``` +2) use the link shown during the push to create a PR into the main repo +3) adjust the PR title and contents; in `Changelog category (leave one)` keep +`Build/Testing/Packaging Improvement`, and fill in the rest of the fields if you want. diff --git a/docs/en/reference/development/architecture.md b/docs/en/reference/development/architecture.md new file mode 100644 index 00000000000..b5cb6c321ac --- /dev/null +++ b/docs/en/reference/development/architecture.md @@ -0,0 +1,202 @@ +--- +sidebar_label: Architecture Overview +sidebar_position: 62 +--- + +# Overview of ClickHouse Architecture + +ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during query execution it is processed by arrays (vectors or chunks of columns). +Whenever possible, operations are dispatched on arrays, rather than on individual values. This is called “vectorized query execution”, and it helps lower the cost of actual data processing. + +> This idea is nothing new. It dates back to the `APL` (A programming language, 1957) and its descendants: `A+` (APL dialect), `J` (1990), `K` (1993), and `Q` (programming language from Kx Systems, 2003). Array programming is used in scientific data processing. Neither is this idea something new in relational databases: for example, it is used in the `VectorWise` system (also known as Actian Vector Analytic Database by Actian Corporation).
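+ +To make the idea of vectorized execution concrete, here is a deliberately simplified sketch (illustrative only - the names and signatures below are not the real ClickHouse interfaces) of filtering a whole numeric column with a byte mask, similar in spirit to the `IColumn` filter method described below: + +``` cpp +#include <cstddef> +#include <cstdint> +#include <vector> + +/// Illustrative sketch only - not the real IColumn interface. +/// A "column" here is just a contiguous array of values, and the filter +/// is a byte mask with one entry per row. +std::vector<uint64_t> filterColumn(const std::vector<uint64_t> & data, const std::vector<uint8_t> & mask) +{ +    std::vector<uint64_t> result; +    result.reserve(data.size()); + +    /// One tight loop over the whole chunk instead of a virtual call per row; +    /// this is the kind of loop a compiler can auto-vectorize with SIMD. +    for (size_t i = 0; i < data.size(); ++i) +        if (mask[i]) +            result.push_back(data[i]); + +    return result; +} +```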
+ +There are two different approaches for speeding up query processing: vectorized query execution and runtime code generation. The latter removes all indirection and dynamic dispatch. Neither of these approaches is strictly better than the other. Runtime code generation can be better when it fuses many operations, thus fully utilizing CPU execution units and the pipeline. Vectorized query execution can be less practical because it involves temporary vectors that must be written to the cache and read back. If the temporary data does not fit in the L2 cache, this becomes an issue. But vectorized query execution more easily utilizes the SIMD capabilities of the CPU. A [research paper](http://15721.courses.cs.cmu.edu/spring2016/papers/p5-sompolski.pdf) written by our friends shows that it is better to combine both approaches. ClickHouse uses vectorized query execution and has limited initial support for runtime code generation. + +## Columns {#columns} + +`IColumn` interface is used to represent columns in memory (actually, chunks of columns). This interface provides helper methods for the implementation of various relational operators. Almost all operations are immutable: they do not modify the original column, but create a new modified one. For example, the `IColumn :: filter` method accepts a filter byte mask. It is used for the `WHERE` and `HAVING` relational operators. Additional examples: the `IColumn :: permute` method to support `ORDER BY`, the `IColumn :: cut` method to support `LIMIT`. + +Various `IColumn` implementations (`ColumnUInt8`, `ColumnString`, and so on) are responsible for the memory layout of columns. The memory layout is usually a contiguous array. For the integer type of columns, it is just one contiguous array, like `std :: vector`. For `String` and `Array` columns, it is two vectors: one for all array elements, placed contiguously, and a second one for offsets to the beginning of each array. There is also `ColumnConst` that stores just one value in memory, but looks like a column. + +## Field {#field} + +Nevertheless, it is possible to work with individual values as well. To represent an individual value, the `Field` is used. `Field` is just a discriminated union of `UInt64`, `Int64`, `Float64`, `String` and `Array`. `IColumn` has the `operator []` method to get the n-th value as a `Field`, and the `insert` method to append a `Field` to the end of a column. These methods are not very efficient, because they require dealing with temporary `Field` objects representing an individual value. There are more efficient methods, such as `insertFrom`, `insertRangeFrom`, and so on. + +`Field` does not have enough information about a specific data type for a table. For example, `UInt8`, `UInt16`, `UInt32`, and `UInt64` are all represented as `UInt64` in a `Field`. + +## Leaky Abstractions {#leaky-abstractions} + +`IColumn` has methods for common relational transformations of data, but they do not meet all needs. For example, `ColumnUInt64` does not have a method to calculate the sum of two columns, and `ColumnString` does not have a method to run a substring search. These countless routines are implemented outside of `IColumn`. + +Various functions on columns can be implemented in a generic, non-efficient way using `IColumn` methods to extract `Field` values, or in a specialized way using knowledge of inner memory layout of data in a specific `IColumn` implementation. 
It is implemented by casting functions to a specific `IColumn` type and deal with internal representation directly. For example, `ColumnUInt64` has the `getData` method that returns a reference to an internal array, then a separate routine reads or fills that array directly. We have “leaky abstractions” to allow efficient specializations of various routines. + +## Data Types {#data_types} + +`IDataType` is responsible for serialization and deserialization: for reading and writing chunks of columns or individual values in binary or text form. `IDataType` directly corresponds to data types in tables. For example, there are `DataTypeUInt32`, `DataTypeDateTime`, `DataTypeString` and so on. + +`IDataType` and `IColumn` are only loosely related to each other. Different data types can be represented in memory by the same `IColumn` implementations. For example, `DataTypeUInt32` and `DataTypeDateTime` are both represented by `ColumnUInt32` or `ColumnConstUInt32`. In addition, the same data type can be represented by different `IColumn` implementations. For example, `DataTypeUInt8` can be represented by `ColumnUInt8` or `ColumnConstUInt8`. + +`IDataType` only stores metadata. For instance, `DataTypeUInt8` does not store anything at all (except virtual pointer `vptr`) and `DataTypeFixedString` stores just `N` (the size of fixed-size strings). + +`IDataType` has helper methods for various data formats. Examples are methods to serialize a value with possible quoting, to serialize a value for JSON, and to serialize a value as part of the XML format. There is no direct correspondence to data formats. For example, the different data formats `Pretty` and `TabSeparated` can use the same `serializeTextEscaped` helper method from the `IDataType` interface. + +## Block {#block} + +A `Block` is a container that represents a subset (chunk) of a table in memory. It is just a set of triples: `(IColumn, IDataType, column name)`. During query execution, data is processed by `Block`s. If we have a `Block`, we have data (in the `IColumn` object), we have information about its type (in `IDataType`) that tells us how to deal with that column, and we have the column name. It could be either the original column name from the table or some artificial name assigned for getting temporary results of calculations. + +When we calculate some function over columns in a block, we add another column with its result to the block, and we do not touch columns for arguments of the function because operations are immutable. Later, unneeded columns can be removed from the block, but not modified. It is convenient for the elimination of common subexpressions. + +Blocks are created for every processed chunk of data. Note that for the same type of calculation, the column names and types remain the same for different blocks, and only column data changes. It is better to split block data from the block header because small block sizes have a high overhead of temporary strings for copying shared_ptrs and column names. + +## Block Streams {#block-streams} + +Block streams are for processing data. We use streams of blocks to read data from somewhere, perform data transformations, or write data to somewhere. `IBlockInputStream` has the `read` method to fetch the next block while available. `IBlockOutputStream` has the `write` method to push the block somewhere. + +Streams are responsible for: + +1. Reading or writing to a table. The table just returns a stream for reading or writing blocks. +2. Implementing data formats. 
For example, if you want to output data to a terminal in `Pretty` format, you create a block output stream where you push blocks, and it formats them. +3. Performing data transformations. Let’s say you have `IBlockInputStream` and want to create a filtered stream. You create `FilterBlockInputStream` and initialize it with your stream. Then when you pull a block from `FilterBlockInputStream`, it pulls a block from your stream, filters it, and returns the filtered block to you. Query execution pipelines are represented this way. + +There are more sophisticated transformations. For example, when you pull from `AggregatingBlockInputStream`, it reads all data from its source, aggregates it, and then returns a stream of aggregated data for you. Another example: `UnionBlockInputStream` accepts many input sources in the constructor and also a number of threads. It launches multiple threads and reads from multiple sources in parallel. + +> Block streams use the “pull” approach to control flow: when you pull a block from the first stream, it consequently pulls the required blocks from nested streams, and the entire execution pipeline will work. Neither “pull” nor “push” is the best solution, because control flow is implicit, and that limits the implementation of various features like simultaneous execution of multiple queries (merging many pipelines together). This limitation could be overcome with coroutines or just running extra threads that wait for each other. We may have more possibilities if we make control flow explicit: if we locate the logic for passing data from one calculation unit to another outside of those calculation units. Read this [article](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) for more thoughts. + +We should note that the query execution pipeline creates temporary data at each step. We try to keep block size small enough so that temporary data fits in the CPU cache. With that assumption, writing and reading temporary data is almost free in comparison with other calculations. We could consider an alternative, which is to fuse many operations in the pipeline together. It could make the pipeline as short as possible and remove much of the temporary data, which could be an advantage, but it also has drawbacks. For example, a split pipeline makes it easy to implement caching intermediate data, stealing intermediate data from similar queries running at the same time, and merging pipelines for similar queries. + +## Formats {#formats} + +Data formats are implemented with block streams. There are “presentational” formats only suitable for the output of data to the client, such as `Pretty` format, which provides only `IBlockOutputStream`. And there are input/output formats, such as `TabSeparated` or `JSONEachRow`. + +There are also row streams: `IRowInputStream` and `IRowOutputStream`. They allow you to pull/push data by individual rows, not by blocks. And they are only needed to simplify the implementation of row-oriented formats. The wrappers `BlockInputStreamFromRowInputStream` and `BlockOutputStreamFromRowOutputStream` allow you to convert row-oriented streams to regular block-oriented streams. + +## I/O {#io} + +For byte-oriented input/output, there are `ReadBuffer` and `WriteBuffer` abstract classes. They are used instead of C++ `iostream`s. Don’t worry: every mature C++ project is using something other than `iostream`s for good reasons. 
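+ +As a rough sketch of the buffer-and-cursor model described in the next paragraphs (a hypothetical simplification, not the actual ClickHouse `WriteBuffer` interface), the core of a write buffer is just a chunk of memory, a position cursor, and a virtual method that is called when the buffer runs out of space: + +``` cpp +#include <algorithm> +#include <cstddef> +#include <cstring> + +/// Illustrative sketch only: a minimal write buffer with a memory region, +/// a cursor, and a virtual next() that flushes the accumulated bytes somewhere. +class SimpleWriteBuffer +{ +public: +    SimpleWriteBuffer(char * buf, size_t size) : begin(buf), pos(buf), end(buf + size) {} +    virtual ~SimpleWriteBuffer() = default; + +    void write(const char * data, size_t size) +    { +        while (size > 0) +        { +            if (pos == end) +                next(); /// flush and reset the cursor; rarely called +            size_t chunk = std::min<size_t>(size, end - pos); +            std::memcpy(pos, data, chunk); +            pos += chunk; +            data += chunk; +            size -= chunk; +        } +    } + +protected: +    virtual void next() = 0; /// e.g. write the bytes to a file descriptor or a socket + +    char * begin; +    char * pos; +    char * end; +}; +```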
+ +`ReadBuffer` and `WriteBuffer` are just a contiguous buffer and a cursor pointing to the position in that buffer. Implementations may own or not own the memory for the buffer. There is a virtual method to fill the buffer with the following data (for `ReadBuffer`) or to flush the buffer somewhere (for `WriteBuffer`). The virtual methods are rarely called. + +Implementations of `ReadBuffer`/`WriteBuffer` are used for working with files and file descriptors and network sockets, for implementing compression (`CompressedWriteBuffer` is initialized with another WriteBuffer and performs compression before writing data to it), and for other purposes – the names `ConcatReadBuffer`, `LimitReadBuffer`, and `HashingWriteBuffer` speak for themselves. + +Read/WriteBuffers only deal with bytes. There are functions from `ReadHelpers` and `WriteHelpers` header files to help with formatting input/output. For example, there are helpers to write a number in decimal format. + +Let’s look at what happens when you want to write a result set in `JSON` format to stdout. You have a result set ready to be fetched from `IBlockInputStream`. You create `WriteBufferFromFileDescriptor(STDOUT_FILENO)` to write bytes to stdout. You create `JSONRowOutputStream`, initialized with that `WriteBuffer`, to write rows in `JSON` to stdout. You create `BlockOutputStreamFromRowOutputStream` on top of it, to represent it as `IBlockOutputStream`. Then you call `copyData` to transfer data from `IBlockInputStream` to `IBlockOutputStream`, and everything works. Internally, `JSONRowOutputStream` will write various JSON delimiters and call the `IDataType::serializeTextJSON` method with a reference to `IColumn` and the row number as arguments. Consequently, `IDataType::serializeTextJSON` will call a method from `WriteHelpers.h`: for example, `writeText` for numeric types and `writeJSONString` for `DataTypeString`. + +## Tables {#tables} + +The `IStorage` interface represents tables. Different implementations of that interface are different table engines. Examples are `StorageMergeTree`, `StorageMemory`, and so on. Instances of these classes are just tables. + +The key `IStorage` methods are `read` and `write`. There are also `alter`, `rename`, `drop`, and so on. The `read` method accepts the following arguments: the set of columns to read from a table, the `AST` query to consider, and the desired number of streams to return. It returns one or multiple `IBlockInputStream` objects and information about the stage of data processing that was completed inside a table engine during query execution. + +In most cases, the read method is only responsible for reading the specified columns from a table, not for any further data processing. All further data processing is done by the query interpreter and is outside the responsibility of `IStorage`. + +But there are notable exceptions: + +- The AST query is passed to the `read` method, and the table engine can use it to derive index usage and to read fewer data from a table. +- Sometimes the table engine can process data itself to a specific stage. For example, `StorageDistributed` can send a query to remote servers, ask them to process data to a stage where data from different remote servers can be merged, and return that preprocessed data. The query interpreter then finishes processing the data. + +The table’s `read` method can return multiple `IBlockInputStream` objects to allow parallel data processing. These multiple block input streams can read from a table in parallel. 
Then you can wrap these streams with various transformations (such as expression evaluation or filtering) that can be calculated independently and create a `UnionBlockInputStream` on top of them, to read from multiple streams in parallel. + +There are also `TableFunction`s. These are functions that return a temporary `IStorage` object to use in the `FROM` clause of a query. + +To get a quick idea of how to implement your table engine, look at something simple, like `StorageMemory` or `StorageTinyLog`. + +> As the result of the `read` method, `IStorage` returns `QueryProcessingStage` – information about what parts of the query were already calculated inside storage. + +## Parsers {#parsers} + +A hand-written recursive descent parser parses a query. For example, `ParserSelectQuery` just recursively calls the underlying parsers for various parts of the query. Parsers create an `AST`. The `AST` is represented by nodes, which are instances of `IAST`. + +> Parser generators are not used for historical reasons. + +## Interpreters {#interpreters} + +Interpreters are responsible for creating the query execution pipeline from an `AST`. There are simple interpreters, such as `InterpreterExistsQuery` and `InterpreterDropQuery`, or the more sophisticated `InterpreterSelectQuery`. The query execution pipeline is a combination of block input or output streams. For example, the result of interpreting the `SELECT` query is the `IBlockInputStream` to read the result set from; the result of the INSERT query is the `IBlockOutputStream` to write data for insertion to, and the result of interpreting the `INSERT SELECT` query is the `IBlockInputStream` that returns an empty result set on the first read, but that copies data from `SELECT` to `INSERT` at the same time. + +`InterpreterSelectQuery` uses `ExpressionAnalyzer` and `ExpressionActions` machinery for query analysis and transformations. This is where most rule-based query optimizations are done. `ExpressionAnalyzer` is quite messy and should be rewritten: various query transformations and optimizations should be extracted to separate classes to allow modular transformations of query. + +## Functions {#functions} + +There are ordinary functions and aggregate functions. For aggregate functions, see the next section. + +Ordinary functions do not change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`’s of data to implement vectorized query execution. + +There are some miscellaneous functions, like [blockSize](../sql-reference/functions/other-functions.md#function-blocksize), [rowNumberInBlock](../sql-reference/functions/other-functions.md#function-rownumberinblock), and [runningAccumulate](../sql-reference/functions/other-functions.md#runningaccumulate), that exploit block processing and violate the independence of rows. + +ClickHouse has strong typing, so there’s no implicit type conversion. If a function does not support a specific combination of types, it throws an exception. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function. + +Implementing a function may be slightly inconvenient because a function explicitly dispatches supported data types and supported `IColumns`. 
For example, the `plus` function has code generated by instantiation of a C++ template for each combination of numeric types, and constant or non-constant left and right arguments. + +It is an excellent place to implement runtime code generation to avoid template code bloat. Also, it makes it possible to add fused functions like fused multiply-add or to make multiple comparisons in one loop iteration. + +Due to vectorized query execution, functions are not short-circuited. For example, if you write `WHERE f(x) AND g(y)`, both sides are calculated, even for rows, when `f(x)` is zero (except when `f(x)` is a zero constant expression). But if the selectivity of the `f(x)` condition is high, and calculation of `f(x)` is much cheaper than `g(y)`, it’s better to implement multi-pass calculation. It would first calculate `f(x)`, then filter columns by the result, and then calculate `g(y)` only for smaller, filtered chunks of data. + +## Aggregate Functions {#aggregate-functions} + +Aggregate functions are stateful functions. They accumulate passed values into some state and allow you to get results from that state. They are managed with the `IAggregateFunction` interface. States can be rather simple (the state for `AggregateFunctionCount` is just a single `UInt64` value) or quite complex (the state of `AggregateFunctionUniqCombined` is a combination of a linear array, a hash table, and a `HyperLogLog` probabilistic data structure). + +States are allocated in `Arena` (a memory pool) to deal with multiple states while executing a high-cardinality `GROUP BY` query. States can have a non-trivial constructor and destructor: for example, complicated aggregation states can allocate additional memory themselves. It requires some attention to creating and destroying states and properly passing their ownership and destruction order. + +Aggregation states can be serialized and deserialized to pass over the network during distributed query execution or to write them on the disk where there is not enough RAM. They can even be stored in a table with the `DataTypeAggregateFunction` to allow incremental aggregation of data. + +> The serialized data format for aggregate function states is not versioned right now. It is ok if aggregate states are only stored temporarily. But we have the `AggregatingMergeTree` table engine for incremental aggregation, and people are already using it in production. It is the reason why backward compatibility is required when changing the serialized format for any aggregate function in the future. + +## Server {#server} + +The server implements several different interfaces: + +- An HTTP interface for any foreign clients. +- A TCP interface for the native ClickHouse client and for cross-server communication during distributed query execution. +- An interface for transferring data for replication. + +Internally, it is just a primitive multithreaded server without coroutines or fibers. Since the server is not designed to process a high rate of simple queries but to process a relatively low rate of complex queries, each of them can process a vast amount of data for analytics. + +The server initializes the `Context` class with the necessary environment for query execution: the list of available databases, users and access rights, settings, clusters, the process list, the query log, and so on. Interpreters use this environment. + +We maintain full backward and forward compatibility for the server TCP protocol: old clients can talk to new servers, and new clients can talk to old servers. 
But we do not want to maintain it eternally, and we are removing support for old versions after about one year. + +:::note +For most external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data, and it uses custom framing for compressed data. We haven’t released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical. +::: + +## Distributed Query Execution {#distributed-query-execution} + +Servers in a cluster setup are mostly independent. You can create a `Distributed` table on one or all servers in a cluster. The `Distributed` table does not store data itself – it only provides a “view” to all local tables on multiple nodes of a cluster. When you SELECT from a `Distributed` table, it rewrites that query, chooses remote nodes according to load balancing settings, and sends the query to them. The `Distributed` table requests remote servers to process a query just up to a stage where intermediate results from different servers can be merged. Then it receives the intermediate results and merges them. The distributed table tries to distribute as much work as possible to remote servers and does not send much intermediate data over the network. + +Things become more complicated when you have subqueries in IN or JOIN clauses, and each of them uses a `Distributed` table. We have different strategies for the execution of these queries. + +There is no global query plan for distributed query execution. Each node has its local query plan for its part of the job. We only have simple one-pass distributed query execution: we send queries for remote nodes and then merge the results. But this is not feasible for complicated queries with high cardinality GROUP BYs or with a large amount of temporary data for JOIN. In such cases, we need to “reshuffle” data between servers, which requires additional coordination. ClickHouse does not support that kind of query execution, and we need to work on it. + +## Merge Tree {#merge-tree} + +`MergeTree` is a family of storage engines that supports indexing by primary key. The primary key can be an arbitrary tuple of columns or expressions. Data in a `MergeTree` table is stored in “parts”. Each part stores data in the primary key order, so data is ordered lexicographically by the primary key tuple. All the table columns are stored in separate `column.bin` files in these parts. The files consist of compressed blocks. Each block is usually from 64 KB to 1 MB of uncompressed data, depending on the average value size. The blocks consist of column values placed contiguously one after the other. Column values are in the same order for each column (the primary key defines the order), so when you iterate by many columns, you get values for the corresponding rows. + +The primary key itself is “sparse”. It does not address every single row, but only some ranges of data. A separate `primary.idx` file has the value of the primary key for each N-th row, where N is called `index_granularity` (usually, N = 8192). Also, for each column, we have `column.mrk` files with “marks”, which are offsets to each N-th row in the data file. Each mark is a pair: the offset in the file to the beginning of the compressed block, and the offset in the decompressed block to the beginning of data. 
Usually, compressed blocks are aligned by marks, and the offset in the decompressed block is zero. Data for `primary.idx` always resides in memory, and data for `column.mrk` files is cached. + +When we are going to read something from a part in `MergeTree`, we look at `primary.idx` data and locate ranges that could contain requested data, then look at `column.mrk` data and calculate offsets for where to start reading those ranges. Because of sparseness, excess data may be read. ClickHouse is not suitable for a high load of simple point queries, because the entire range with `index_granularity` rows must be read for each key, and the entire compressed block must be decompressed for each column. We made the index sparse because we must be able to maintain trillions of rows per single server without noticeable memory consumption for the index. Also, because the primary key is sparse, it is not unique: it cannot check the existence of the key in the table at INSERT time. You could have many rows with the same key in a table. + +When you `INSERT` a bunch of data into `MergeTree`, that bunch is sorted by primary key order and forms a new part. There are background threads that periodically select some parts and merge them into a single sorted part to keep the number of parts relatively low. That’s why it is called `MergeTree`. Of course, merging leads to “write amplification”. All parts are immutable: they are only created and deleted, but not modified. When SELECT is executed, it holds a snapshot of the table (a set of parts). After merging, we also keep old parts for some time to make a recovery after failure easier, so if we see that some merged part is probably broken, we can replace it with its source parts. + +`MergeTree` is not an LSM tree because it does not contain MEMTABLE and LOG: inserted data is written directly to the filesystem. This behavior makes MergeTree much more suitable to insert data in batches. Therefore frequently inserting small amounts of rows is not ideal for MergeTree. For example, a couple of rows per second is OK, but doing it a thousand times a second is not optimal for MergeTree. However, there is an async insert mode for small inserts to overcome this limitation. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications + +There are MergeTree engines that are doing additional work during background merges. Examples are `CollapsingMergeTree` and `AggregatingMergeTree`. This could be treated as special support for updates. Keep in mind that these are not real updates because users usually have no control over the time when background merges are executed, and data in a `MergeTree` table is almost always stored in more than one part, not in completely merged form. + +## Replication {#replication} + +Replication in ClickHouse can be configured on a per-table basis. You could have some replicated and some non-replicated tables on the same server. You could also have tables replicated in different ways, such as one table with two-factor replication and another with three-factor. + +Replication is implemented in the `ReplicatedMergeTree` storage engine. The path in `ZooKeeper` is specified as a parameter for the storage engine. All tables with the same path in `ZooKeeper` become replicas of each other: they synchronize their data and maintain consistency. Replicas can be added and removed dynamically simply by creating or dropping a table. + +Replication uses an asynchronous multi-master scheme. 
You can insert data into any replica that has a session with `ZooKeeper`, and data is replicated to all other replicas asynchronously. Because ClickHouse does not support UPDATEs, replication is conflict-free. As there is no quorum acknowledgment of inserts, just-inserted data might be lost if one node fails. + +Metadata for replication is stored in ZooKeeper. There is a replication log that lists what actions to do. Actions are: get part; merge parts; drop a partition, and so on. Each replica copies the replication log to its queue and then executes the actions from the queue. For example, on insertion, the “get the part” action is created in the log, and every replica downloads that part. Merges are coordinated between replicas to get byte-identical results. All parts are merged in the same way on all replicas. One of the leaders initiates a new merge first and writes “merge parts” actions to the log. Multiple replicas (or all) can be leaders at the same time. A replica can be prevented from becoming a leader using the `merge_tree` setting `replicated_can_become_leader`. The leaders are responsible for scheduling background merges. + +Replication is physical: only compressed parts are transferred between nodes, not queries. Merges are processed on each replica independently in most cases to lower the network costs by avoiding network amplification. Large merged parts are sent over the network only in cases of significant replication lag. + +Besides, each replica stores its state in ZooKeeper as the set of parts and its checksums. When the state on the local filesystem diverges from the reference state in ZooKeeper, the replica restores its consistency by downloading missing and broken parts from other replicas. When there is some unexpected or broken data in the local filesystem, ClickHouse does not remove it, but moves it to a separate directory and forgets it. + +:::note +The ClickHouse cluster consists of independent shards, and each shard consists of replicas. The cluster is **not elastic**, so after adding a new shard, data is not rebalanced between shards automatically. Instead, the cluster load is supposed to be adjusted to be uneven. This implementation gives you more control, and it is ok for relatively small clusters, such as tens of nodes. But for clusters with hundreds of nodes that we are using in production, this approach becomes a significant drawback. We should implement a table engine that spans across the cluster with dynamically replicated regions that could be split and balanced between clusters automatically. +::: + +[Original article](https://clickhouse.com/docs/en/development/architecture/) diff --git a/docs/en/reference/development/browse-code.md b/docs/en/reference/development/browse-code.md new file mode 100644 index 00000000000..da924c359ff --- /dev/null +++ b/docs/en/reference/development/browse-code.md @@ -0,0 +1,13 @@ +--- +sidebar_label: Source Code Browser +sidebar_position: 72 +description: Various ways to browse and edit the source code +--- + +# Browse ClickHouse Source Code + +You can use the **Woboq** online code browser available [here](https://clickhouse.com/codebrowser/ClickHouse/src/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily. + +Also, you can browse sources on [GitHub](https://github.com/ClickHouse/ClickHouse) as usual. + +If you’re interested what IDE to use, we recommend CLion, QT Creator, VS Code and KDevelop (with caveats). You can use any favorite IDE. 
Vim and Emacs also count. diff --git a/docs/en/reference/development/build-cross-arm.md b/docs/en/reference/development/build-cross-arm.md new file mode 100644 index 00000000000..305c09ae217 --- /dev/null +++ b/docs/en/reference/development/build-cross-arm.md @@ -0,0 +1,38 @@ +--- +sidebar_position: 67 +sidebar_label: Build on Linux for AARCH64 (ARM64) +--- + +# How to Build ClickHouse on Linux for AARCH64 (ARM64) Architecture + +This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on another Linux machine with AARCH64 CPU architecture. +This is intended for continuous integration checks that run on Linux servers. + +The cross-build for AARCH64 is based on the [Build instructions](../development/build.md), follow them first. + +## Install Clang-13 + +Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup or do +``` +sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" +``` + +## Install Cross-Compilation Toolset {#install-cross-compilation-toolset} + +``` bash +cd ClickHouse +mkdir -p build-aarch64/cmake/toolchain/linux-aarch64 +wget 'https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en' -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz +tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build-aarch64/cmake/toolchain/linux-aarch64 --strip-components=1 +``` + +## Build ClickHouse {#build-clickhouse} + +``` bash +cd ClickHouse +mkdir build-arm64 +CC=clang-13 CXX=clang++-13 cmake . -Bbuild-arm64 -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-aarch64.cmake +ninja -C build-arm64 +``` + +The resulting binary will run only on Linux with the AARCH64 CPU architecture. diff --git a/docs/en/reference/development/build-cross-osx.md b/docs/en/reference/development/build-cross-osx.md new file mode 100644 index 00000000000..1dbd0ec6430 --- /dev/null +++ b/docs/en/reference/development/build-cross-osx.md @@ -0,0 +1,62 @@ +--- +sidebar_position: 66 +sidebar_label: Build on Linux for Mac OS X +--- + +# How to Build ClickHouse on Linux for Mac OS X + +This is for the case when you have a Linux machine and want to use it to build `clickhouse` binary that will run on OS X. +This is intended for continuous integration checks that run on Linux servers. If you want to build ClickHouse directly on Mac OS X, then proceed with [another instruction](../development/build-osx.md). + +The cross-build for Mac OS X is based on the [Build instructions](../development/build.md), follow them first. + +## Install Clang-13 + +Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup. +For example the commands for Bionic are like: + +``` bash +sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-13 main" >> /etc/apt/sources.list +sudo apt-get install clang-13 +``` + +## Install Cross-Compilation Toolset {#install-cross-compilation-toolset} + +Let’s remember the path where we install `cctools` as ${CCTOOLS} + +``` bash +mkdir ${CCTOOLS} +cd ${CCTOOLS} + +git clone https://github.com/tpoechtrager/apple-libtapi.git +cd apple-libtapi +INSTALLPREFIX=${CCTOOLS} ./build.sh +./install.sh +cd .. 
+ +git clone https://github.com/tpoechtrager/cctools-port.git +cd cctools-port/cctools +./configure --prefix=$(readlink -f ${CCTOOLS}) --with-libtapi=$(readlink -f ${CCTOOLS}) --target=x86_64-apple-darwin +make install +``` + +Also, we need to download the macOS SDK into the working tree. + +``` bash +cd ClickHouse +wget 'https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.15.sdk.tar.xz' +mkdir -p build-darwin/cmake/toolchain/darwin-x86_64 +tar xJf MacOSX10.15.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --strip-components=1 +``` + +## Build ClickHouse {#build-clickhouse} + +``` bash +cd ClickHouse +mkdir build-darwin +cd build-darwin +CC=clang-13 CXX=clang++-13 cmake -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/aarch64-apple-darwin-ar -DCMAKE_INSTALL_NAME_TOOL=${CCTOOLS}/bin/aarch64-apple-darwin-install_name_tool -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/aarch64-apple-darwin-ranlib -DLINKER_NAME=${CCTOOLS}/bin/aarch64-apple-darwin-ld -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake .. +ninja +``` + +The resulting binary will have a Mach-O executable format and can’t be run on Linux. diff --git a/docs/en/reference/development/build-cross-riscv.md b/docs/en/reference/development/build-cross-riscv.md new file mode 100644 index 00000000000..94c0f47a05d --- /dev/null +++ b/docs/en/reference/development/build-cross-riscv.md @@ -0,0 +1,30 @@ +--- +sidebar_position: 68 +sidebar_label: Build on Linux for RISC-V 64 +--- + +# How to Build ClickHouse on Linux for RISC-V 64 Architecture + +As of writing (11.11.2021), building for RISC-V is considered highly experimental. Not all features can be enabled. + +This is for the case when you have a Linux machine and want to use it to build a `clickhouse` binary that will run on another Linux machine with the RISC-V 64 CPU architecture. This is intended for continuous integration checks that run on Linux servers. + +The cross-build for RISC-V 64 is based on the [Build instructions](../development/build.md); follow them first. + +## Install Clang-13 + +Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup or do +``` +sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" +``` + +## Build ClickHouse {#build-clickhouse} + +``` bash +cd ClickHouse +mkdir build-riscv64 +CC=clang-13 CXX=clang++-13 cmake . -Bbuild-riscv64 -G Ninja -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-riscv64.cmake -DGLIBC_COMPATIBILITY=OFF -DENABLE_LDAP=OFF -DOPENSSL_NO_ASM=ON -DENABLE_JEMALLOC=ON -DENABLE_PARQUET=OFF -DENABLE_ORC=OFF -DUSE_UNWIND=OFF -DENABLE_GRPC=OFF -DENABLE_HDFS=OFF -DENABLE_MYSQL=OFF +ninja -C build-riscv64 +``` + +The resulting binary will run only on Linux with the RISC-V 64 CPU architecture. diff --git a/docs/en/reference/development/build-osx.md b/docs/en/reference/development/build-osx.md new file mode 100644 index 00000000000..05ef10ad020 --- /dev/null +++ b/docs/en/reference/development/build-osx.md @@ -0,0 +1,154 @@ +--- +sidebar_position: 65 +sidebar_label: Build on Mac OS X +description: How to build ClickHouse on Mac OS X +--- + +# How to Build ClickHouse on Mac OS X + +:::info You don't have to build ClickHouse yourself! +You can install pre-built ClickHouse as described in [Quick Start](https://clickhouse.com/#quick-start). Follow **macOS (Intel)** or **macOS (Apple silicon)** installation instructions. +::: + +Build should work on x86_64 (Intel) and arm64 (Apple silicon) based macOS 10.15 (Catalina) and higher with Homebrew's vanilla Clang. +It is always recommended to use the vanilla `clang` compiler.
+ +:::note +It is possible to use XCode's `apple-clang` or `gcc`, but it's strongly discouraged. +::: + +## Install Homebrew {#install-homebrew} + +``` bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" +# ...and follow the printed instructions on any additional steps required to complete the installation. +``` + +## Install Xcode and Command Line Tools {#install-xcode-and-command-line-tools} + +Install the latest [Xcode](https://apps.apple.com/am/app/xcode/id497799835?mt=12) from App Store. + +Open it at least once to accept the end-user license agreement and automatically install the required components. + +Then, make sure that the latest Command Line Tools are installed and selected in the system: + +``` bash +sudo rm -rf /Library/Developer/CommandLineTools +sudo xcode-select --install +``` + +## Install Required Compilers, Tools, and Libraries {#install-required-compilers-tools-and-libraries} + +``` bash +brew update +brew install cmake ninja libtool gettext llvm gcc binutils +``` + +## Checkout ClickHouse Sources {#checkout-clickhouse-sources} + +``` bash +git clone --recursive git@github.com:ClickHouse/ClickHouse.git +# ...alternatively, you can use https://github.com/ClickHouse/ClickHouse.git as the repo URL. +``` + +## Build ClickHouse {#build-clickhouse} + +To build using Homebrew's vanilla Clang compiler (the only **recommended** way): + +``` bash +cd ClickHouse +rm -rf build +mkdir build +cd build +cmake -DCMAKE_C_COMPILER=$(brew --prefix llvm)/bin/clang -DCMAKE_CXX_COMPILER=$(brew --prefix llvm)/bin/clang++ -DCMAKE_AR=$(brew --prefix llvm)/bin/llvm-ar -DCMAKE_RANLIB=$(brew --prefix llvm)/bin/llvm-ranlib -DOBJCOPY_PATH=$(brew --prefix llvm)/bin/llvm-objcopy -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --config RelWithDebInfo +# The resulting binary will be created at: ./programs/clickhouse +``` + +To build using Xcode's native AppleClang compiler in Xcode IDE (this option is only for development builds and workflows, and is **not recommended** unless you know what you are doing): + +``` bash +cd ClickHouse +rm -rf build +mkdir build +cd build +XCODE_IDE=1 ALLOW_APPLECLANG=1 cmake -G Xcode -DCMAKE_BUILD_TYPE=Debug -DENABLE_JEMALLOC=OFF .. +cmake --open . +# ...then, in Xcode IDE select ALL_BUILD scheme and start the building process. +# The resulting binary will be created at: ./programs/Debug/clickhouse +``` + +To build using Homebrew's vanilla GCC compiler (this option is only for development experiments, and is **absolutely not recommended** unless you really know what you are doing): + +``` bash +cd ClickHouse +rm -rf build +mkdir build +cd build +cmake -DCMAKE_C_COMPILER=$(brew --prefix gcc)/bin/gcc-11 -DCMAKE_CXX_COMPILER=$(brew --prefix gcc)/bin/g++-11 -DCMAKE_AR=$(brew --prefix gcc)/bin/gcc-ar-11 -DCMAKE_RANLIB=$(brew --prefix gcc)/bin/gcc-ranlib-11 -DOBJCOPY_PATH=$(brew --prefix binutils)/bin/objcopy -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --config RelWithDebInfo +# The resulting binary will be created at: ./programs/clickhouse +``` + +## Caveats {#caveats} + +If you intend to run `clickhouse-server`, make sure to increase the system’s maxfiles variable. + +:::note +You’ll need to use sudo. 
+::: + +To do so, create the `/Library/LaunchDaemons/limit.maxfiles.plist` file with the following content: + +``` xml +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +  <dict> +    <key>Label</key> +    <string>limit.maxfiles</string> +    <key>ProgramArguments</key> +    <array> +      <string>launchctl</string> +      <string>limit</string> +      <string>maxfiles</string> +      <string>524288</string> +      <string>524288</string> +    </array> +    <key>RunAtLoad</key> +    <true/> +    <key>ServiceIPC</key> +    <false/> +  </dict> +</plist> +``` + +Give the file correct permissions: + +``` bash +sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist +``` + +Validate that the file is correct: + +``` bash +plutil /Library/LaunchDaemons/limit.maxfiles.plist +``` + +Load the file (or reboot): + +``` bash +sudo launchctl load -w /Library/LaunchDaemons/limit.maxfiles.plist +``` + +To check if it’s working, use the `ulimit -n` or `launchctl limit maxfiles` commands. + +## Running ClickHouse server + +``` bash +cd ClickHouse +./build/programs/clickhouse-server --config-file ./programs/server/config.xml +``` + +[Original article](https://clickhouse.com/docs/en/development/build_osx/) diff --git a/docs/en/reference/development/build.md b/docs/en/reference/development/build.md new file mode 100644 index 00000000000..b128412a55e --- /dev/null +++ b/docs/en/reference/development/build.md @@ -0,0 +1,181 @@ +--- +sidebar_position: 64 +sidebar_label: Build on Linux +description: How to build ClickHouse on Linux +--- + +# How to Build ClickHouse on Linux + +Supported platforms: + +- x86_64 +- AArch64 +- Power9 (experimental) + +## Normal Build for Development on Ubuntu + +The following tutorial is based on the Ubuntu Linux system. With appropriate changes, it should also work on any other Linux distribution. + +### Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja} + +``` bash +$ sudo apt-get install git cmake python ninja-build +``` + +Or use cmake3 instead of cmake on older systems. + +### Install the latest clang (recommended) + +On Ubuntu/Debian you can use the automatic installation script (check the [official webpage](https://apt.llvm.org/)) + +```bash +sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" +``` + +For other Linux distributions, check the availability of [prebuilt packages](https://releases.llvm.org/download.html) or build clang [from sources](https://clang.llvm.org/get_started.html). + +#### Use the latest clang for Builds + +``` bash +$ export CC=clang-14 +$ export CXX=clang++-14 +``` + +In this example we use version 14, which is the latest as of Feb 2022. + +GCC can also be used, though it is discouraged. + +### Checkout ClickHouse Sources {#checkout-clickhouse-sources} + +``` bash +$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git +``` + +or + +``` bash +$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git +``` + +### Build ClickHouse {#build-clickhouse} + +``` bash +$ cd ClickHouse +$ mkdir build +$ cd build +$ cmake .. +$ ninja +``` + +To create an executable, run `ninja clickhouse`. +This will create the `programs/clickhouse` executable, which can be used with `client` or `server` arguments. + +## How to Build ClickHouse on Any Linux {#how-to-build-clickhouse-on-any-linux} + +The build requires the following components: + +- Git (used only to check out the sources, not needed for the build) +- CMake 3.10 or newer +- Ninja +- C++ compiler: clang-13 or newer +- Linker: lld + +If all the components are installed, you may build in the same way as the steps above.
+ +Example for Ubuntu Eoan: +``` bash +sudo apt update +sudo apt install git cmake ninja-build clang++ python +git clone --recursive https://github.com/ClickHouse/ClickHouse.git +mkdir build && cd build +cmake ../ClickHouse +ninja +``` + +Example for OpenSUSE Tumbleweed: +``` bash +sudo zypper install git cmake ninja clang-c++ python lld +git clone --recursive https://github.com/ClickHouse/ClickHouse.git +mkdir build && cd build +cmake ../ClickHouse +ninja +``` + +Example for Fedora Rawhide: +``` bash +sudo yum update +yum --nogpg install git cmake make clang-c++ python3 +git clone --recursive https://github.com/ClickHouse/ClickHouse.git +mkdir build && cd build +cmake ../ClickHouse +make -j $(nproc) +``` + +Here is an example of how to build `clang` and all the llvm infrastructure from sources: + +``` + git clone git@github.com:llvm/llvm-project.git + mkdir llvm-build && cd llvm-build + cmake -DCMAKE_BUILD_TYPE:STRING=Release -DLLVM_ENABLE_PROJECTS=all ../llvm-project/llvm/ + make -j16 + sudo make install + hash clang + clang --version +``` + +You can install the older clang like clang-11 from packages and then use it to build the new clang from sources. + +Here is an example of how to install the new `cmake` from the official website: + +``` +wget https://github.com/Kitware/CMake/releases/download/v3.22.2/cmake-3.22.2-linux-x86_64.sh +chmod +x cmake-3.22.2-linux-x86_64.sh +./cmake-3.22.2-linux-x86_64.sh +export PATH=/home/milovidov/work/cmake-3.22.2-linux-x86_64/bin/:${PATH} +hash cmake +``` + +## How to Build ClickHouse Debian Package {#how-to-build-clickhouse-debian-package} + +### Install Git {#install-git} + +``` bash +$ sudo apt-get update +$ sudo apt-get install git python debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring +``` + +### Checkout ClickHouse Sources {#checkout-clickhouse-sources-1} + +``` bash +$ git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git +$ cd ClickHouse +``` + +### Run Release Script {#run-release-script} + +``` bash +$ ./release +``` + +## You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse} + +ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour. + +They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request. + +To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green checkmark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”. + +## Faster builds for development: Split build configuration {#split-build} + +Normally, ClickHouse is statically linked into a single static `clickhouse` binary with minimal dependencies. This is convenient for distribution, but it means that on every change the entire binary needs to be linked, which is slow and may be inconvenient for development. There is an alternative configuration which instead creates dynamically loaded shared libraries and separate binaries `clickhouse-server`, `clickhouse-client` etc., allowing for faster incremental builds. To use it, add the following flags to your `cmake` invocation: +``` +-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1 +``` + +Note that the split build has several drawbacks: +* There is no single `clickhouse` binary, and you have to run `clickhouse-server`, `clickhouse-client`, etc. 
+* Risk of segfault if you run any of the programs while rebuilding the project.
+* You cannot run the integration tests since they only work with a single complete binary.
+* You can't easily copy the binaries elsewhere. Instead of moving a single binary you'll need to copy all binaries and libraries.
+
+[Original article](https://clickhouse.com/docs/en/development/build/)
diff --git a/docs/en/reference/development/continuous-integration.md b/docs/en/reference/development/continuous-integration.md
new file mode 100644
index 00000000000..b3172d103f0
--- /dev/null
+++ b/docs/en/reference/development/continuous-integration.md
@@ -0,0 +1,193 @@
+---
+sidebar_position: 62
+sidebar_label: Continuous Integration Checks
+description: When you submit a pull request, some automated checks are run for your code by the ClickHouse continuous integration (CI) system
+---
+
+# Continuous Integration Checks
+
+When you submit a pull request, some automated checks are run for your code by
+the ClickHouse [continuous integration (CI) system](tests.md#test-automation).
+This happens after a repository maintainer (someone from the ClickHouse team) has
+screened your code and added the `can be tested` label to your pull request.
+The results of the checks are listed on the GitHub pull request page as
+described in the [GitHub checks
+documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-status-checks).
+If a check is failing, you might be required to fix it. This page gives an
+overview of checks you may encounter, and what you can do to fix them.
+
+If it looks like the check failure is not related to your changes, it may be
+some transient failure or an infrastructure problem. Push an empty commit to
+the pull request to restart the CI checks:
+```
+git reset
+git commit --allow-empty
+git push
+```
+
+If you are not sure what to do, ask a maintainer for help.
+
+
+## Merge With Master
+
+Verifies that the PR can be merged to master. If not, it will fail with the
+message 'Cannot fetch mergecommit'. To fix this check, resolve the conflict as
+described in the [GitHub
+documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/resolving-a-merge-conflict-on-github),
+or merge the `master` branch to your pull request branch using git.
+
+
+## Docs check
+
+Tries to build the ClickHouse documentation website. It can fail if you changed
+something in the documentation. The most probable reason is that some cross-link in
+the documentation is wrong. Go to the check report and look for `ERROR` and `WARNING` messages.
+
+### Report Details
+
+- [Status page example](https://clickhouse-test-reports.s3.yandex.net/12550/eabcc293eb02214caa6826b7c15f101643f67a6b/docs_check.html)
+- `docs_output.txt` contains the building log. [Successful result example](https://clickhouse-test-reports.s3.yandex.net/12550/eabcc293eb02214caa6826b7c15f101643f67a6b/docs_check/docs_output.txt)
+
+
+## Description Check
+
+Check that the description of your pull request conforms to the template
+[PULL_REQUEST_TEMPLATE.md](https://github.com/ClickHouse/ClickHouse/blob/master/.github/PULL_REQUEST_TEMPLATE.md).
+You have to specify a changelog category for your change (e.g., Bug Fix), and
+write a user-readable message describing the change for [CHANGELOG.md](../whats-new/changelog/index.md).
+
+
+## Push To Dockerhub
+
+Builds the Docker images used for builds and tests, then pushes them to DockerHub.
+ + +## Marker Check + +This check means that the CI system started to process the pull request. When it has 'pending' status, it means that not all checks have been started yet. After all checks have been started, it changes status to 'success'. + + +## Style Check + +Performs some simple regex-based checks of code style, using the [`utils/check-style/check-style`](https://github.com/ClickHouse/ClickHouse/blob/master/utils/check-style/check-style) binary (note that it can be run locally). +If it fails, fix the style errors following the [code style guide](style.md). + +### Report Details +- [Status page example](https://clickhouse-test-reports.s3.yandex.net/12550/659c78c7abb56141723af6a81bfae39335aa8cb2/style_check.html) +- `output.txt` contains the check resulting errors (invalid tabulation etc), blank page means no errors. [Successful result example](https://clickhouse-test-reports.s3.yandex.net/12550/659c78c7abb56141723af6a81bfae39335aa8cb2/style_check/output.txt). + + +## Fast Test +Normally this is the first check that is ran for a PR. It builds ClickHouse and +runs most of [stateless functional tests](tests.md#functional-tests), omitting +some. If it fails, further checks are not started until it is fixed. Look at +the report to see which tests fail, then reproduce the failure locally as +described [here](tests.md#functional-test-locally). + +### Report Details +[Status page example](https://clickhouse-test-reports.s3.yandex.net/12550/67d716b5cc3987801996c31a67b31bf141bc3486/fast_test.html) + +#### Status Page Files +- `runlog.out.log` is the general log that includes all other logs. +- `test_log.txt` +- `submodule_log.txt` contains the messages about cloning and checkouting needed submodules. +- `stderr.log` +- `stdout.log` +- `clickhouse-server.log` +- `clone_log.txt` +- `install_log.txt` +- `clickhouse-server.err.log` +- `build_log.txt` +- `cmake_log.txt` contains messages about the C/C++ and Linux flags check. + +#### Status Page Columns + +- *Test name* contains the name of the test (without the path e.g. all types of tests will be stripped to the name). +- *Test status* -- one of _Skipped_, _Success_, or _Fail_. +- *Test time, sec.* -- empty on this test. + + +## Build Check {#build-check} + +Builds ClickHouse in various configurations for use in further steps. You have to fix the builds that fail. Build logs often has enough information to fix the error, but you might have to reproduce the failure locally. The `cmake` options can be found in the build log, grepping for `cmake`. Use these options and follow the [general build process](../development/build.md). + +### Report Details + +[Status page example](https://clickhouse-builds.s3.yandex.net/12550/67d716b5cc3987801996c31a67b31bf141bc3486/clickhouse_build_check/report.html). + +- **Compiler**: `gcc-9` or `clang-10` (or `clang-10-xx` for other architectures e.g. `clang-10-freebsd`). +- **Build type**: `Debug` or `RelWithDebInfo` (cmake). +- **Sanitizer**: `none` (without sanitizers), `address` (ASan), `memory` (MSan), `undefined` (UBSan), or `thread` (TSan). +- **Splitted** `splitted` is a [split build](../development/build.md#split-build) +- **Status**: `success` or `fail` +- **Build log**: link to the building and files copying log, useful when build failed. +- **Build time**. +- **Artifacts**: build result files (with `XXX` being the server version e.g. `20.8.1.4344`). 
+ - `clickhouse-client_XXX_all.deb` + - `clickhouse-common-static-dbg_XXX[+asan, +msan, +ubsan, +tsan]_amd64.deb` + - `clickhouse-common-staticXXX_amd64.deb` + - `clickhouse-server_XXX_all.deb` + - `clickhouse_XXX_amd64.buildinfo` + - `clickhouse_XXX_amd64.changes` + - `clickhouse`: Main built binary. + - `clickhouse-odbc-bridge` + - `unit_tests_dbms`: GoogleTest binary with ClickHouse unit tests. + - `shared_build.tgz`: build with shared libraries. + - `performance.tgz`: Special package for performance tests. + + +## Special Build Check +Performs static analysis and code style checks using `clang-tidy`. The report is similar to the [build check](#build-check). Fix the errors found in the build log. + + +## Functional Stateless Tests +Runs [stateless functional tests](tests.md#functional-tests) for ClickHouse +binaries built in various configurations -- release, debug, with sanitizers, +etc. Look at the report to see which tests fail, then reproduce the failure +locally as described [here](tests.md#functional-test-locally). Note that you +have to use the correct build configuration to reproduce -- a test might fail +under AddressSanitizer but pass in Debug. Download the binary from [CI build +checks page](../development/build.md#you-dont-have-to-build-clickhouse), or build it locally. + + +## Functional Stateful Tests +Runs [stateful functional tests](tests.md#functional-tests). Treat them in the same way as the functional stateless tests. The difference is that they require `hits` and `visits` tables from the [clickstream dataset](../example-datasets/metrica.md) to run. + + +## Integration Tests +Runs [integration tests](tests.md#integration-tests). + + +## Testflows Check +Runs some tests using Testflows test system. See [here](https://github.com/ClickHouse/ClickHouse/tree/master/tests/testflows#running-tests-locally) how to run them locally. + + +## Stress Test +Runs stateless functional tests concurrently from several clients to detect +concurrency-related errors. If it fails: + + * Fix all other test failures first; + * Look at the report to find the server logs and check them for possible causes + of error. + + +## Split Build Smoke Test + +Checks that the server build in [split build](../development/build.md#split-build) +configuration can start and run simple queries. If it fails: + + * Fix other test errors first; + * Build the server in [split build](../development/build.md#split-build) configuration + locally and check whether it can start and run `select 1`. + + +## Compatibility Check +Checks that `clickhouse` binary runs on distributions with old libc versions. If it fails, ask a maintainer for help. + + +## AST Fuzzer +Runs randomly generated queries to catch program errors. If it fails, ask a maintainer for help. + + +## Performance Tests +Measure changes in query performance. This is the longest check that takes just below 6 hours to run. The performance test report is described in detail [here](https://github.com/ClickHouse/ClickHouse/tree/master/docker/test/performance-comparison#how-to-read-the-report). 
diff --git a/docs/en/reference/development/contrib.md b/docs/en/reference/development/contrib.md new file mode 100644 index 00000000000..7cbe32fdd8b --- /dev/null +++ b/docs/en/reference/development/contrib.md @@ -0,0 +1,107 @@ +--- +sidebar_position: 71 +sidebar_label: Third-Party Libraries +description: A list of third-party libraries used +--- + +# Third-Party Libraries Used + +The list of third-party libraries: + +| Library name | License type | +|:-|:-| +| abseil-cpp | [Apache](https://github.com/ClickHouse-Extras/abseil-cpp/blob/4f3b686f86c3ebaba7e4e926e62a79cb1c659a54/LICENSE) | +| AMQP-CPP | [Apache](https://github.com/ClickHouse-Extras/AMQP-CPP/blob/1a6c51f4ac51ac56610fa95081bd2f349911375a/LICENSE) | +| arrow | [Apache](https://github.com/ClickHouse-Extras/arrow/blob/078e21bad344747b7656ef2d7a4f7410a0a303eb/LICENSE.txt) | +| avro | [Apache](https://github.com/ClickHouse-Extras/avro/blob/e43c46e87fd32eafdc09471e95344555454c5ef8/LICENSE.txt) | +| aws | [Apache](https://github.com/ClickHouse-Extras/aws-sdk-cpp/blob/7d48b2c8193679cc4516e5bd68ae4a64b94dae7d/LICENSE.txt) | +| aws-c-common | [Apache](https://github.com/ClickHouse-Extras/aws-c-common/blob/736a82d1697c108b04a277e66438a7f4e19b6857/LICENSE) | +| aws-c-event-stream | [Apache](https://github.com/ClickHouse-Extras/aws-c-event-stream/blob/3bc33662f9ccff4f4cbcf9509cc78c26e022fde0/LICENSE) | +| aws-checksums | [Apache](https://github.com/ClickHouse-Extras/aws-checksums/blob/519d6d9093819b6cf89ffff589a27ef8f83d0f65/LICENSE) | +| base64 | [BSD 2-clause](https://github.com/ClickHouse-Extras/Turbo-Base64/blob/af9b331f2b4f30b41c70f3a571ff904a8251c1d3/LICENSE) | +| boost | [Boost](https://github.com/ClickHouse-Extras/boost/blob/9cf09dbfd55a5c6202dedbdf40781a51b02c2675/LICENSE_1_0.txt) | +| boringssl | [BSD](https://github.com/ClickHouse-Extras/boringssl/blob/a6a2e2ab3e44d97ce98e51c558e989f211de7eb3/LICENSE) | +| brotli | [MIT](https://github.com/google/brotli/blob/63be8a99401992075c23e99f7c84de1c653e39e2/LICENSE) | +| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/a00ccd91b3746ef2ab51d40fe3265829949d1ace/LICENSE) | +| cassandra | [Apache](https://github.com/ClickHouse-Extras/cpp-driver/blob/eb9b68dadbb4417a2c132ad4a1c2fa76e65e6fc1/LICENSE.txt) | +| cctz | [Apache](https://github.com/ClickHouse-Extras/cctz/blob/c0f1bcb97fd2782f7c3f972fadd5aad5affac4b8/LICENSE.txt) | +| cityhash102 | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/cityhash102/COPYING) | +| cppkafka | [BSD 2-clause](https://github.com/mfontanini/cppkafka/blob/5a119f689f8a4d90d10a9635e7ee2bee5c127de1/LICENSE) | +| croaring | [Apache](https://github.com/RoaringBitmap/CRoaring/blob/2c867e9f9c9e2a3a7032791f94c4c7ae3013f6e0/LICENSE) | +| curl | [Apache](https://github.com/curl/curl/blob/3b8bbbbd1609c638a3d3d0acb148a33dedb67be3/docs/LICENSE-MIXING.md) | +| cyrus-sasl | [BSD 2-clause](https://github.com/ClickHouse-Extras/cyrus-sasl/blob/e6466edfd638cc5073debe941c53345b18a09512/COPYING) | +| double-conversion | [BSD 3-clause](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | +| dragonbox | [Apache](https://github.com/ClickHouse-Extras/dragonbox/blob/923705af6fd953aa948fc175f6020b15f7359838/LICENSE-Apache2-LLVM) | +| fast_float | [Apache](https://github.com/fastfloat/fast_float/blob/7eae925b51fd0f570ccd5c880c12e3e27a23b86f/LICENSE) | +| fastops | [MIT](https://github.com/ClickHouse-Extras/fastops/blob/88752a5e03cf34639a4a37a4b41d8b463fffd2b5/LICENSE) | +| flatbuffers | 
[Apache](https://github.com/ClickHouse-Extras/flatbuffers/blob/eb3f827948241ce0e701516f16cd67324802bce9/LICENSE.txt) | +| fmtlib | [Unknown](https://github.com/fmtlib/fmt/blob/c108ee1d590089ccf642fc85652b845924067af2/LICENSE.rst) | +| gcem | [Apache](https://github.com/kthohr/gcem/blob/8d4f1b5d76ea8f6ff12f3f4f34cda45424556b00/LICENSE) | +| googletest | [BSD 3-clause](https://github.com/google/googletest/blob/e7e591764baba0a0c3c9ad0014430e7a27331d16/LICENSE) | +| grpc | [Apache](https://github.com/ClickHouse-Extras/grpc/blob/60c986e15cae70aade721d26badabab1f822fdd6/LICENSE) | +| h3 | [Apache](https://github.com/ClickHouse-Extras/h3/blob/c7f46cfd71fb60e2fefc90e28abe81657deff735/LICENSE) | +| hyperscan | [Boost](https://github.com/ClickHouse-Extras/hyperscan/blob/e9f08df0213fc637aac0a5bbde9beeaeba2fe9fa/LICENSE) | +| icu | [Public Domain](https://github.com/unicode-org/icu/blob/a56dde820dc35665a66f2e9ee8ba58e75049b668/icu4c/LICENSE) | +| icudata | [Public Domain](https://github.com/ClickHouse-Extras/icudata/blob/72d9a4a7febc904e2b0a534ccb25ae40fac5f1e5/LICENSE) | +| jemalloc | [BSD 2-clause](https://github.com/ClickHouse-Extras/jemalloc/blob/e6891d9746143bf2cf617493d880ba5a0b9a3efd/COPYING) | +| krb5 | [MIT](https://github.com/ClickHouse-Extras/krb5/blob/5149dea4e2be0f67707383d2682b897c14631374/src/lib/gssapi/LICENSE) | +| libc-headers | [LGPL](https://github.com/ClickHouse-Extras/libc-headers/blob/a720b7105a610acbd7427eea475a5b6810c151eb/LICENSE) | +| libcpuid | [BSD 2-clause](https://github.com/ClickHouse-Extras/libcpuid/blob/8db3b8d2d32d22437f063ce692a1b9bb15e42d18/COPYING) | +| libcxx | [Apache](https://github.com/ClickHouse-Extras/libcxx/blob/2fa892f69acbaa40f8a18c6484854a6183a34482/LICENSE.TXT) | +| libcxxabi | [Apache](https://github.com/ClickHouse-Extras/libcxxabi/blob/df8f1e727dbc9e2bedf2282096fa189dc3fe0076/LICENSE.TXT) | +| libdivide | [zLib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | +| libfarmhash | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libfarmhash/COPYING) | +| libgsasl | [LGPL](https://github.com/ClickHouse-Extras/libgsasl/blob/383ee28e82f69fa16ed43b48bd9c8ee5b313ab84/LICENSE) | +| libhdfs3 | [Apache](https://github.com/ClickHouse-Extras/libhdfs3/blob/095b9d48b400abb72d967cb0539af13b1e3d90cf/LICENSE.txt) | +| libmetrohash | [Apache](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | +| libpq | [Unknown](https://github.com/ClickHouse-Extras/libpq/blob/e071ea570f8985aa00e34f5b9d50a3cfe666327e/COPYRIGHT) | +| libpqxx | [BSD 3-clause](https://github.com/ClickHouse-Extras/libpqxx/blob/357608d11b7a1961c3fb7db2ef9a5dbb2e87da77/COPYING) | +| librdkafka | [MIT](https://github.com/ClickHouse-Extras/librdkafka/blob/b8554f1682062c85ba519eb54ef2f90e02b812cb/LICENSE.murmur2) | +| libunwind | [Apache](https://github.com/ClickHouse-Extras/libunwind/blob/6b816d2fba3991f8fd6aaec17d92f68947eab667/LICENSE.TXT) | +| libuv | [BSD](https://github.com/ClickHouse-Extras/libuv/blob/e2e9b7e9f978ce8a1367b5fe781d97d1ce9f94ab/LICENSE) | +| llvm | [Apache](https://github.com/ClickHouse-Extras/llvm/blob/e5751459412bce1391fb7a2e9bbc01e131bf72f1/llvm/LICENSE.TXT) | +| lz4 | [BSD](https://github.com/lz4/lz4/blob/f39b79fb02962a1cd880bbdecb6dffba4f754a11/LICENSE) | +| mariadb-connector-c | [LGPL](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/5f4034a3a6376416504f17186c55fe401c6d8e5e/COPYING.LIB) | +| miniselect | 
[Boost](https://github.com/danlark1/miniselect/blob/be0af6bd0b6eb044d1acc4f754b229972d99903a/LICENSE_1_0.txt) | +| msgpack-c | [Boost](https://github.com/msgpack/msgpack-c/blob/46684265d50b5d1b062d4c5c428ba08462844b1d/LICENSE_1_0.txt) | +| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | +| NuRaft | [Apache](https://github.com/ClickHouse-Extras/NuRaft/blob/7ecb16844af6a9c283ad432d85ecc2e7d1544676/LICENSE) | +| openldap | [Unknown](https://github.com/ClickHouse-Extras/openldap/blob/0208811b6043ca06fda8631a5e473df1ec515ccb/LICENSE) | +| orc | [Apache](https://github.com/ClickHouse-Extras/orc/blob/0a936f6bbdb9303308973073f8623b5a8d82eae1/LICENSE) | +| poco | [Boost](https://github.com/ClickHouse-Extras/poco/blob/7351c4691b5d401f59e3959adfc5b4fa263b32da/LICENSE) | +| protobuf | [BSD 3-clause](https://github.com/ClickHouse-Extras/protobuf/blob/75601841d172c73ae6bf4ce8121f42b875cdbabd/LICENSE) | +| rapidjson | [MIT](https://github.com/ClickHouse-Extras/rapidjson/blob/c4ef90ccdbc21d5d5a628d08316bfd301e32d6fa/bin/jsonschema/LICENSE) | +| re2 | [BSD 3-clause](https://github.com/google/re2/blob/13ebb377c6ad763ca61d12dd6f88b1126bd0b911/LICENSE) | +| replxx | [BSD 3-clause](https://github.com/ClickHouse-Extras/replxx/blob/c81be6c68b146f15f2096b7ef80e3f21fe27004c/LICENSE.md) | +| rocksdb | [BSD 3-clause](https://github.com/ClickHouse-Extras/rocksdb/blob/b6480c69bf3ab6e298e0d019a07fd4f69029b26a/LICENSE.leveldb) | +| s2geometry | [Apache](https://github.com/ClickHouse-Extras/s2geometry/blob/20ea540d81f4575a3fc0aea585aac611bcd03ede/LICENSE) | +| sentry-native | [MIT](https://github.com/ClickHouse-Extras/sentry-native/blob/94644e92f0a3ff14bd35ed902a8622a2d15f7be4/LICENSE) | +| simdjson | [Apache](https://github.com/simdjson/simdjson/blob/8df32cea3359cb30120795da6020b3b73da01d38/LICENSE) | +| snappy | [Public Domain](https://github.com/google/snappy/blob/3f194acb57e0487531c96b97af61dcbd025a78a3/COPYING) | +| sparsehash-c11 | [BSD 3-clause](https://github.com/sparsehash/sparsehash-c11/blob/cf0bffaa456f23bc4174462a789b90f8b6f5f42f/LICENSE) | +| stats | [Apache](https://github.com/kthohr/stats/blob/b6dd459c10a88c7ea04693c007e9e35820c5d9ad/LICENSE) | +| thrift | [Apache](https://github.com/apache/thrift/blob/010ccf0a0c7023fea0f6bf4e4078ebdff7e61982/LICENSE) | +| unixodbc | [LGPL](https://github.com/ClickHouse-Extras/UnixODBC/blob/b0ad30f7f6289c12b76f04bfb9d466374bb32168/COPYING) | +| xz | [Public Domain](https://github.com/xz-mirror/xz/blob/869b9d1b4edd6df07f819d360d306251f8147353/COPYING) | +| zlib-ng | [zLib](https://github.com/ClickHouse-Extras/zlib-ng/blob/6a5e93b9007782115f7f7e5235dedc81c4f1facb/LICENSE.md) | +| zstd | [BSD](https://github.com/facebook/zstd/blob/a488ba114ec17ea1054b9057c26a046fc122b3b6/LICENSE) | + +The list of third-party libraries can be obtained by the following query: + +``` sql +SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en'; +``` + +[Example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==) + +## Guidelines for adding new third-party libraries and maintaining custom changes in them {#adding-third-party-libraries} + +1. All external third-party code should reside in the dedicated directories under `contrib` directory of ClickHouse repo. Prefer Git submodules, when available. +2. 
Fork/mirror the official repo in [Clickhouse-extras](https://github.com/ClickHouse-Extras). Prefer official GitHub repos, when available. +3. Branch from the branch you want to integrate, e.g., `master` -> `clickhouse/master`, or `release/vX.Y.Z` -> `clickhouse/release/vX.Y.Z`. +4. All forks in [Clickhouse-extras](https://github.com/ClickHouse-Extras) can be automatically synchronized with upstreams. `clickhouse/...` branches will remain unaffected, since virtually nobody is going to use that naming pattern in their upstream repos. +5. Add submodules under `contrib` of ClickHouse repo that refer the above forks/mirrors. Set the submodules to track the corresponding `clickhouse/...` branches. +6. Every time the custom changes have to be made in the library code, a dedicated branch should be created, like `clickhouse/my-fix`. Then this branch should be merged into the branch, that is tracked by the submodule, e.g., `clickhouse/master` or `clickhouse/release/vX.Y.Z`. +7. No code should be pushed in any branch of the forks in [Clickhouse-extras](https://github.com/ClickHouse-Extras), whose names do not follow `clickhouse/...` pattern. +8. Always write the custom changes with the official repo in mind. Once the PR is merged from (a feature/fix branch in) your personal fork into the fork in [Clickhouse-extras](https://github.com/ClickHouse-Extras), and the submodule is bumped in ClickHouse repo, consider opening another PR from (a feature/fix branch in) the fork in [Clickhouse-extras](https://github.com/ClickHouse-Extras) to the official repo of the library. This will make sure, that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, 3) the change will not remain a maintenance burden solely on ClickHouse developers. +9. When a submodule needs to start using a newer code from the original branch (e.g., `master`), and since the custom changes might be merged in the branch it is tracking (e.g., `clickhouse/master`) and so it may diverge from its original counterpart (i.e., `master`), a careful merge should be carried out first, i.e., `master` -> `clickhouse/master`, and only then the submodule can be bumped in ClickHouse. diff --git a/docs/en/reference/development/developer-instruction.md b/docs/en/reference/development/developer-instruction.md new file mode 100644 index 00000000000..291e57fef66 --- /dev/null +++ b/docs/en/reference/development/developer-instruction.md @@ -0,0 +1,278 @@ +--- +sidebar_position: 61 +sidebar_label: Getting Started +description: Prerequisites and an overview of how to build ClickHouse +--- + +# Getting Started Guide for Building ClickHouse + +The building of ClickHouse is supported on Linux, FreeBSD and Mac OS X. + +If you use Windows, you need to create a virtual machine with Ubuntu. To start working with a virtual machine please install VirtualBox. You can download Ubuntu from the website: https://www.ubuntu.com/#download. Please create a virtual machine from the downloaded image (you should reserve at least 4GB of RAM for it). To run a command-line terminal in Ubuntu, please locate a program containing the word “terminal” in its name (gnome-terminal, konsole etc.) or just press Ctrl+Alt+T. + +ClickHouse cannot work or build on a 32-bit system. You should acquire access to a 64-bit system and you can continue reading. + +## Creating a Repository on GitHub {#creating-a-repository-on-github} + +To start working with ClickHouse repository you will need a GitHub account. 
+ +You probably already have one, but if you do not, please register at https://github.com. In case you do not have SSH keys, you should generate them and then upload them on GitHub. It is required for sending over your patches. It is also possible to use the same SSH keys that you use with any other SSH servers - probably you already have those. + +Create a fork of ClickHouse repository. To do that please click on the “fork” button in the upper right corner at https://github.com/ClickHouse/ClickHouse. It will fork your own copy of ClickHouse/ClickHouse to your account. + +The development process consists of first committing the intended changes into your fork of ClickHouse and then creating a “pull request” for these changes to be accepted into the main repository (ClickHouse/ClickHouse). + +To work with git repositories, please install `git`. + +To do that in Ubuntu you would run in the command line terminal: + + sudo apt update + sudo apt install git + +A brief manual on using Git can be found here: https://education.github.com/git-cheat-sheet-education.pdf. +For a detailed manual on Git see https://git-scm.com/book/en/v2. + +## Cloning a Repository to Your Development Machine {#cloning-a-repository-to-your-development-machine} + +Next, you need to download the source files onto your working machine. This is called “to clone a repository” because it creates a local copy of the repository on your working machine. + +In the command line terminal run: + + git clone --recursive git@github.com:your_github_username/ClickHouse.git + cd ClickHouse + +Note: please, substitute *your_github_username* with what is appropriate! + +This command will create a directory `ClickHouse` containing the working copy of the project. + +It is important that the path to the working directory contains no whitespaces as it may lead to problems with running the build system. + +Please note that ClickHouse repository uses `submodules`. That is what the references to additional repositories are called (i.e. external libraries on which the project depends). It means that when cloning the repository you need to specify the `--recursive` flag as in the example above. If the repository has been cloned without submodules, to download them you need to run the following: + + git submodule init + git submodule update + +You can check the status with the command: `git submodule status`. + +If you get the following error message: + + Permission denied (publickey). + fatal: Could not read from remote repository. + + Please make sure you have the correct access rights + and the repository exists. + +It generally means that the SSH keys for connecting to GitHub are missing. These keys are normally located in `~/.ssh`. For SSH keys to be accepted you need to upload them in the settings section of GitHub UI. + +You can also clone the repository via https protocol: + + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + +This, however, will not let you send your changes to the server. You can still use it temporarily and add the SSH keys later replacing the remote address of the repository with `git remote` command. + +You can also add original ClickHouse repo’s address to your local repository to pull updates from there: + + git remote add upstream git@github.com:ClickHouse/ClickHouse.git + +After successfully running this command you will be able to pull updates from the main ClickHouse repo by running `git pull upstream master`. 
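+
+To double-check that the remotes are configured as expected, you can list them and fetch the upstream branches (a quick sanity check; `origin` should point to your fork and `upstream` to ClickHouse/ClickHouse):
+
+    git remote -v
+    git fetch upstream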
+ +### Working with Submodules {#working-with-submodules} + +Working with submodules in git could be painful. Next commands will help to manage it: + + # ! each command accepts + # Update remote URLs for submodules. Barely rare case + git submodule sync + # Add new submodules + git submodule init + # Update existing submodules to the current state + git submodule update + # Two last commands could be merged together + git submodule update --init + +The next commands would help you to reset all submodules to the initial state (!WARNING! - any changes inside will be deleted): + + # Synchronizes submodules' remote URL with .gitmodules + git submodule sync + # Update the registered submodules with initialize not yet initialized + git submodule update --init + # Reset all changes done after HEAD + git submodule foreach git reset --hard + # Clean files from .gitignore + git submodule foreach git clean -xfd + # Repeat last 4 commands for all submodule + git submodule foreach git submodule sync + git submodule foreach git submodule update --init + git submodule foreach git submodule foreach git reset --hard + git submodule foreach git submodule foreach git clean -xfd + +## Build System {#build-system} + +ClickHouse uses CMake and Ninja for building. + +CMake - a meta-build system that can generate Ninja files (build tasks). +Ninja - a smaller build system with a focus on the speed used to execute those cmake generated tasks. + +To install on Ubuntu, Debian or Mint run `sudo apt install cmake ninja-build`. + +On CentOS, RedHat run `sudo yum install cmake ninja-build`. + +If you use Arch or Gentoo, you probably know it yourself how to install CMake. + +For installing CMake and Ninja on Mac OS X first install Homebrew and then install everything else via brew: + + /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + brew install cmake ninja + +Next, check the version of CMake: `cmake --version`. If it is below 3.12, you should install a newer version from the website: https://cmake.org/download/. + +## C++ Compiler {#c-compiler} + +Compilers Clang starting from version 11 is supported for building ClickHouse. + +Clang should be used instead of gcc. Though, our continuous integration (CI) platform runs checks for about a dozen of build combinations. + +On Ubuntu/Debian you can use the automatic installation script (check [official webpage](https://apt.llvm.org/)) + +```bash +sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" +``` + +Mac OS X build is also supported. Just run `brew install llvm` + + +## The Building Process {#the-building-process} + +Now that you are ready to build ClickHouse we recommend you to create a separate directory `build` inside `ClickHouse` that will contain all of the build artefacts: + + mkdir build + cd build + +You can have several different directories (build_release, build_debug, etc.) for different types of build. + +While inside the `build` directory, configure your build by running CMake. Before the first run, you need to define environment variables that specify compiler. + + export CC=clang CXX=clang++ + cmake .. + +If you installed clang using the automatic installation script above, also specify the version of clang installed in the first command, e.g. `export CC=clang-13 CXX=clang++-13`. The clang version will be in the script output. + +The `CC` variable specifies the compiler for C (short for C Compiler), and `CXX` variable instructs which C++ compiler is to be used for building. 
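+
+For example, if the installation script above installed clang version 13, the configuration step would look like this (adjust the suffix to whatever version the script actually reported):
+
+    export CC=clang-13 CXX=clang++-13
+    cmake ..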
+
+For a faster build, you can resort to the `debug` build type, which is a build with no optimizations. For that, supply the parameter `-D CMAKE_BUILD_TYPE=Debug`:
+
+    cmake -D CMAKE_BUILD_TYPE=Debug ..
+
+You can change the type of build by running this command in the `build` directory.
+
+Run ninja to build:
+
+    ninja clickhouse-server clickhouse-client
+
+Only the required binaries are going to be built in this example.
+
+If you need to build all the binaries (utilities and tests), run ninja with no parameters:
+
+    ninja
+
+A full build requires about 30GB of free disk space, or 15GB to build the main binaries.
+
+When only a limited amount of RAM is available on the build machine, you should limit the number of build tasks run in parallel with the `-j` param:
+
+    ninja -j 1 clickhouse-server clickhouse-client
+
+On machines with 4GB of RAM it is recommended to specify `-j 1`; for 8GB of RAM, `-j 2` is recommended.
+
+If you get the message: `ninja: error: loading 'build.ninja': No such file or directory`, it means that generating a build configuration has failed and you need to inspect the messages above.
+
+Upon the successful start of the building process, you’ll see the build progress - the number of processed tasks and the total number of tasks.
+
+While building, messages about protobuf files in the libhdfs3 library like `libprotobuf WARNING` may show up. They affect nothing and are safe to ignore.
+
+Upon successful build you get an executable file `ClickHouse/build/programs/clickhouse`:
+
+    ls -l programs/clickhouse
+
+## Running the Built Executable of ClickHouse {#running-the-built-executable-of-clickhouse}
+
+To run the server under the current user you need to navigate to `ClickHouse/programs/server/` (located outside of `build`) and run:
+
+    ../../build/programs/clickhouse server
+
+In this case, ClickHouse will use config files located in the current directory. You can run `clickhouse server` from any directory, specifying the path to a config file as a command-line parameter `--config-file`.
+
+To connect to ClickHouse with clickhouse-client in another terminal, navigate to `ClickHouse/build/programs/` and run `./clickhouse client`.
+
+If you get a `Connection refused` message on Mac OS X or FreeBSD, try specifying host address 127.0.0.1:
+
+    clickhouse client --host 127.0.0.1
+
+You can replace the production version of the ClickHouse binary installed on your system with your custom-built ClickHouse binary. To do that, install ClickHouse on your machine following the instructions from the official website. Next, run the following:
+
+    sudo service clickhouse-server stop
+    sudo cp ClickHouse/build/programs/clickhouse /usr/bin/
+    sudo service clickhouse-server start
+
+Note that `clickhouse-client`, `clickhouse-server` and others are symlinks to the commonly shared `clickhouse` binary.
+
+You can also run your custom-built ClickHouse binary with the config file from the ClickHouse package installed on your system:
+
+    sudo service clickhouse-server stop
+    sudo -u clickhouse ClickHouse/build/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml
+
+## IDE (Integrated Development Environment) {#ide-integrated-development-environment}
+
+If you do not know which IDE to use, we recommend that you use CLion. CLion is commercial software, but it offers a 30-day free trial period. It is also free of charge for students. CLion can be used both on Linux and on Mac OS X.
+
+KDevelop and QTCreator are other great alternative IDEs for developing ClickHouse.
KDevelop comes in as a very handy IDE although unstable. If KDevelop crashes after a while upon opening project, you should click “Stop All” button as soon as it has opened the list of project’s files. After doing so KDevelop should be fine to work with. + +As simple code editors, you can use Sublime Text or Visual Studio Code, or Kate (all of which are available on Linux). + +Just in case, it is worth mentioning that CLion creates `build` path on its own, it also on its own selects `debug` for build type, for configuration it uses a version of CMake that is defined in CLion and not the one installed by you, and finally, CLion will use `make` to run build tasks instead of `ninja`. This is normal behaviour, just keep that in mind to avoid confusion. + +## Writing Code {#writing-code} + +The description of ClickHouse architecture can be found here: https://clickhouse.com/docs/en/development/architecture/ + +The Code Style Guide: https://clickhouse.com/docs/en/development/style/ + +Adding third-party libraries: https://clickhouse.com/docs/en/development/contrib/#adding-third-party-libraries + +Writing tests: https://clickhouse.com/docs/en/development/tests/ + +List of tasks: https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aopen+is%3Aissue+label%3Ahacktoberfest + +## Test Data {#test-data} + +Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized data of web analytics. It requires additionally some 3GB of free disk space. Note that this data is not required to accomplish most of the development tasks. + + sudo apt install wget xz-utils + + wget https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz + wget https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz + + xz -v -d hits_v1.tsv.xz + xz -v -d visits_v1.tsv.xz + + clickhouse-client + + CREATE DATABASE IF NOT EXISTS test + + CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage 
FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); + + CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP 
UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); + + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv + +## Creating Pull Request {#creating-pull-request} + +Navigate to your fork repository in GitHub’s UI. If you have been developing in a branch, you need to select that branch. There will be a “Pull request” button located on the screen. In essence, this means “create a request for accepting my changes into the main repository”. + +A pull request can be created even if the work is not completed yet. In this case please put the word “WIP” (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating release changelogs. + +Testing will commence as soon as ClickHouse employees label your PR with a tag “can be tested”. The results of some first checks (e.g. 
code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour. + +The system will prepare ClickHouse binary builds for your pull request individually. To retrieve these builds click the “Details” link next to “ClickHouse build check” entry in the list of checks. There you will find direct links to the built .deb packages of ClickHouse which you can deploy even on your production servers (if you have no fear). + +Most probably some of the builds will fail at first times. This is due to the fact that we check builds both with gcc as well as with clang, with almost all of existing warnings (always with the `-Werror` flag) enabled for clang. On that same page, you can find all of the build logs so that you do not have to build ClickHouse in all of the possible ways. diff --git a/docs/en/reference/development/style.md b/docs/en/reference/development/style.md new file mode 100644 index 00000000000..82cd9273680 --- /dev/null +++ b/docs/en/reference/development/style.md @@ -0,0 +1,832 @@ +--- +sidebar_position: 69 +sidebar_label: C++ Guide +description: A list of recommendations regarding coding style, naming convention, formatting and more +--- + +# How to Write C++ Code + +## General Recommendations {#general-recommendations} + +**1.** The following are recommendations, not requirements. + +**2.** If you are editing code, it makes sense to follow the formatting of the existing code. + +**3.** Code style is needed for consistency. Consistency makes it easier to read the code, and it also makes it easier to search the code. + +**4.** Many of the rules do not have logical reasons; they are dictated by established practices. + +## Formatting {#formatting} + +**1.** Most of the formatting will be done automatically by `clang-format`. + +**2.** Indents are 4 spaces. Configure your development environment so that a tab adds four spaces. + +**3.** Opening and closing curly brackets must be on a separate line. + +``` cpp +inline void readBoolText(bool & x, ReadBuffer & buf) +{ + char tmp = '0'; + readChar(tmp, buf); + x = tmp != '0'; +} +``` + +**4.** If the entire function body is a single `statement`, it can be placed on a single line. Place spaces around curly braces (besides the space at the end of the line). + +``` cpp +inline size_t mask() const { return buf_size() - 1; } +inline size_t place(HashValue x) const { return x & mask(); } +``` + +**5.** For functions. Don’t put spaces around brackets. + +``` cpp +void reinsert(const Value & x) +``` + +``` cpp +memcpy(&buf[place_value], &x, sizeof(x)); +``` + +**6.** In `if`, `for`, `while` and other expressions, a space is inserted in front of the opening bracket (as opposed to function calls). + +``` cpp +for (size_t i = 0; i < rows; i += storage.index_granularity) +``` + +**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, …) and the ternary operator `?:`. + +``` cpp +UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); +UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); +UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); +``` + +**8.** If a line feed is entered, put the operator on a new line and increase the indent before it. + +``` cpp +if (elapsed_ns) + message << " (" + << rows_read_on_server * 1000000000 / elapsed_ns << " rows/s., " + << bytes_read_on_server * 1000.0 / elapsed_ns << " MB/s.) "; +``` + +**9.** You can use spaces for alignment within a line, if desired. 
+
+``` cpp
+dst.ClickLogID         = click.LogID;
+dst.ClickEventID       = click.EventID;
+dst.ClickGoodEvent     = click.GoodEvent;
+```
+
+**10.** Don’t use spaces around the operators `.`, `->`.
+
+If necessary, the operator can be wrapped to the next line. In this case, the offset in front of it is increased.
+
+**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, …) from the argument.
+
+**12.** Put a space after a comma, but not before it. The same rule goes for a semicolon inside a `for` expression.
+
+**13.** Do not use spaces to separate the `[]` operator.
+
+**14.** In a `template <...>` expression, use a space between `template` and `<`; no spaces after `<` or before `>`.
+
+``` cpp
+template <typename TKey, typename TValue>
+struct AggregatedStatElement
+{}
+```
+
+**15.** In classes and structures, write `public`, `private`, and `protected` on the same level as `class/struct`, and indent the rest of the code.
+
+``` cpp
+template <typename T>
+class MultiVersion
+{
+public:
+    /// Version of object for usage. shared_ptr manages lifetime of version.
+    using Version = std::shared_ptr<const T>;
+    ...
+}
+```
+
+**16.** If the same `namespace` is used for the entire file, and there isn’t anything else significant, an offset is not necessary inside `namespace`.
+
+**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, …
+
+But if the inner `statement` contains curly brackets or `else`, the external block should be written in curly brackets.
+
+``` cpp
+/// Finish write.
+for (auto & stream : streams)
+    stream.second->finalize();
+```
+
+**18.** There shouldn’t be any spaces at the ends of lines.
+
+**19.** Source files are UTF-8 encoded.
+
+**20.** Non-ASCII characters can be used in string literals.
+
+``` cpp
+<< ", " << (timer.elapsed() / chunks_stats.hits) << " μsec/hit.";
+```
+
+**21.** Do not write multiple expressions in a single line.
+
+**22.** Group sections of code inside functions and separate them with no more than one empty line.
+
+**23.** Separate functions, classes, and so on with one or two empty lines.
+
+**24.** A `const` (related to a value) must be written before the type name.
+
+``` cpp
+//correct
+const char * pos
+const std::string & s
+//incorrect
+char const * pos
+```
+
+**25.** When declaring a pointer or reference, the `*` and `&` symbols should be separated by spaces on both sides.
+
+``` cpp
+//correct
+const char * pos
+//incorrect
+const char* pos
+const char *pos
+```
+
+**26.** When using template types, alias them with the `using` keyword (except in the simplest cases).
+
+In other words, the template parameters are specified only in `using` and aren’t repeated in the code.
+
+`using` can be declared locally, such as inside a function.
+
+``` cpp
+//correct
+using FileStreams = std::map<std::string, std::shared_ptr<Stream>>;
+FileStreams streams;
+//incorrect
+std::map<std::string, std::shared_ptr<Stream>> streams;
+```
+
+**27.** Do not declare several variables of different types in one statement.
+
+``` cpp
+//incorrect
+int x, *y;
+```
+
+**28.** Do not use C-style casts.
+
+``` cpp
+//incorrect
+std::cerr << (int)c << std::endl;
+//correct
+std::cerr << static_cast<int>(c) << std::endl;
+```
+
+**29.** In classes and structs, group members and functions separately inside each visibility scope.
+
+**30.** For small classes and structs, it is not necessary to separate the method declaration from the implementation.
+
+The same is true for small methods in any classes or structs.
+ +For templated classes and structs, do not separate the method declarations from the implementation (because otherwise they must be defined in the same translation unit). + +**31.** You can wrap lines at 140 characters, instead of 80. + +**32.** Always use the prefix increment/decrement operators if postfix is not required. + +``` cpp +for (Names::const_iterator it = column_names.begin(); it != column_names.end(); ++it) +``` + +## Comments {#comments} + +**1.** Be sure to add comments for all non-trivial parts of code. + +This is very important. Writing the comment might help you realize that the code isn’t necessary, or that it is designed wrong. + +``` cpp +/** Part of piece of memory, that can be used. + * For example, if internal_buffer is 1MB, and there was only 10 bytes loaded to buffer from file for reading, + * then working_buffer will have size of only 10 bytes + * (working_buffer.end() will point to position right after those 10 bytes available for read). + */ +``` + +**2.** Comments can be as detailed as necessary. + +**3.** Place comments before the code they describe. In rare cases, comments can come after the code, on the same line. + +``` cpp +/** Parses and executes the query. +*/ +void executeQuery( + ReadBuffer & istr, /// Where to read the query from (and data for INSERT, if applicable) + WriteBuffer & ostr, /// Where to write the result + Context & context, /// DB, tables, data types, engines, functions, aggregate functions... + BlockInputStreamPtr & query_plan, /// Here could be written the description on how query was executed + QueryProcessingStage::Enum stage = QueryProcessingStage::Complete /// Up to which stage process the SELECT query + ) +``` + +**4.** Comments should be written in English only. + +**5.** If you are writing a library, include detailed comments explaining it in the main header file. + +**6.** Do not add comments that do not provide additional information. In particular, do not leave empty comments like this: + +``` cpp +/* +* Procedure Name: +* Original procedure name: +* Author: +* Date of creation: +* Dates of modification: +* Modification authors: +* Original file name: +* Purpose: +* Intent: +* Designation: +* Classes used: +* Constants: +* Local variables: +* Parameters: +* Date of creation: +* Purpose: +*/ +``` + +The example is borrowed from the resource http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. + +**7.** Do not write garbage comments (author, creation date ..) at the beginning of each file. + +**8.** Single-line comments begin with three slashes: `///` and multi-line comments begin with `/**`. These comments are considered “documentation”. + +Note: You can use Doxygen to generate documentation from these comments. But Doxygen is not generally used because it is more convenient to navigate the code in the IDE. + +**9.** Multi-line comments must not have empty lines at the beginning and end (except the line that closes a multi-line comment). + +**10.** For commenting out code, use basic comments, not “documenting” comments. + +**11.** Delete the commented out parts of the code before committing. + +**12.** Do not use profanity in comments or code. + +**13.** Do not use uppercase letters. Do not use excessive punctuation. + +``` cpp +/// WHAT THE FAIL??? +``` + +**14.** Do not use comments to make delimeters. + +``` cpp +///****************************************************** +``` + +**15.** Do not start discussions in comments. + +``` cpp +/// Why did you do this stuff? 
+```
+
+**16.** There’s no need to write a comment at the end of a block describing what it was about.
+
+``` cpp
+/// for
+```
+
+## Names {#names}
+
+**1.** Use lowercase letters with underscores in the names of variables and class members.
+
+``` cpp
+size_t max_block_size;
+```
+
+**2.** For the names of functions (methods), use camelCase beginning with a lowercase letter.
+
+``` cpp
+std::string getName() const override { return "Memory"; }
+```
+
+**3.** For the names of classes (structs), use CamelCase beginning with an uppercase letter. Prefixes other than `I` are not used for interfaces.
+
+``` cpp
+class StorageMemory : public IStorage
+```
+
+**4.** `using` aliases are named the same way as classes.
+
+**5.** Names of template type arguments: in simple cases, use `T`; `T`, `U`; `T1`, `T2`.
+
+For more complex cases, either follow the rules for class names, or add the prefix `T`.
+
+``` cpp
+template <typename TKey, typename TValue>
+struct AggregatedStatElement
+```
+
+**6.** Names of template constant arguments: either follow the rules for variable names, or use `N` in simple cases.
+
+``` cpp
+template <bool without_www>
+struct ExtractDomain
+```
+
+**7.** For abstract classes (interfaces) you can add the `I` prefix.
+
+``` cpp
+class IBlockInputStream
+```
+
+**8.** If you use a variable locally, you can use the short name.
+
+In all other cases, use a name that describes the meaning.
+
+``` cpp
+bool info_successfully_loaded = false;
+```
+
+**9.** Names of `define`s and global constants use ALL_CAPS with underscores.
+
+``` cpp
+#define MAX_SRC_TABLE_NAMES_TO_STORE 1000
+```
+
+**10.** File names should use the same style as their contents.
+
+If a file contains a single class, name the file the same way as the class (CamelCase).
+
+If the file contains a single function, name the file the same way as the function (camelCase).
+
+**11.** If the name contains an abbreviation, then:
+
+- For variable names, the abbreviation should use lowercase letters `mysql_connection` (not `mySQL_connection`).
+- For names of classes and functions, keep the uppercase letters in the abbreviation `MySQLConnection` (not `MySqlConnection`).
+
+**12.** Constructor arguments that are used just to initialize the class members should be named the same way as the class members, but with an underscore at the end.
+
+``` cpp
+FileQueueProcessor(
+    const std::string & path_,
+    const std::string & prefix_,
+    std::shared_ptr<FileHandler> handler_)
+    : path(path_),
+    prefix(prefix_),
+    handler(handler_),
+    log(&Logger::get("FileQueueProcessor"))
+{
+}
+```
+
+The underscore suffix can be omitted if the argument is not used in the constructor body.
+
+**13.** There is no difference in the names of local variables and class members (no prefixes required).
+
+``` cpp
+timer (not m_timer)
+```
+
+**14.** For the constants in an `enum`, use CamelCase with a capital letter. ALL_CAPS is also acceptable. If the `enum` is non-local, use an `enum class`.
+
+``` cpp
+enum class CompressionMethod
+{
+    QuickLZ = 0,
+    LZ4 = 1,
+};
+```
+
+**15.** All names must be in English. Transliteration of Hebrew words is not allowed.
+
+    not T_PAAMAYIM_NEKUDOTAYIM
+
+**16.** Abbreviations are acceptable if they are well known (when you can easily find the meaning of the abbreviation in Wikipedia or in a search engine).
+
+    `AST`, `SQL`.
+
+    Not `NVDH` (some random letters)
+
+Incomplete words are acceptable if the shortened version is in common use.
+
+You can also use an abbreviation if the full name is included next to it in the comments.
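+
+As a quick, combined illustration of the naming rules above, a snippet might look like the following sketch (all class, member, and macro names here are invented for the example and do not come from the ClickHouse codebase):
+
+``` cpp
+#include <cstddef>
+#include <string>
+
+#define MAX_RETRY_COUNT 3 /// define: ALL_CAPS with underscores
+
+class IDataSource /// interface: CamelCase with the `I` prefix
+{
+public:
+    virtual std::string getName() const = 0; /// method: camelCase
+    virtual ~IDataSource() = default;
+};
+
+class MySQLDataSource : public IDataSource /// class: CamelCase, abbreviation keeps its case
+{
+public:
+    explicit MySQLDataSource(size_t max_block_size_) /// constructor argument: trailing underscore
+        : max_block_size(max_block_size_)
+    {
+    }
+
+    std::string getName() const override { return "MySQL"; }
+
+private:
+    size_t max_block_size = 0; /// member: lowercase with underscores
+};
+```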
+ +**17.** File names with C++ source code must have the `.cpp` extension. Header files must have the `.h` extension. + +## How to Write Code {#how-to-write-code} + +**1.** Memory management. + +Manual memory deallocation (`delete`) can only be used in library code. + +In library code, the `delete` operator can only be used in destructors. + +In application code, memory must be freed by the object that owns it. + +Examples: + +- The easiest way is to place an object on the stack, or make it a member of another class. +- For a large number of small objects, use containers. +- For automatic deallocation of a small number of objects that reside in the heap, use `shared_ptr/unique_ptr`. + +**2.** Resource management. + +Use `RAII` and see above. + +**3.** Error handling. + +Use exceptions. In most cases, you only need to throw an exception, and do not need to catch it (because of `RAII`). + +In offline data processing applications, it’s often acceptable to not catch exceptions. + +In servers that handle user requests, it’s usually enough to catch exceptions at the top level of the connection handler. + +In thread functions, you should catch and keep all exceptions to rethrow them in the main thread after `join`. + +``` cpp +/// If there weren't any calculations yet, calculate the first block synchronously +if (!started) +{ + calculate(); + started = true; +} +else /// If calculations are already in progress, wait for the result + pool.wait(); + +if (exception) + exception->rethrow(); +``` + +Never hide exceptions without handling. Never just blindly put all exceptions to log. + +``` cpp +//Not correct +catch (...) {} +``` + +If you need to ignore some exceptions, do so only for specific ones and rethrow the rest. + +``` cpp +catch (const DB::Exception & e) +{ + if (e.code() == ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION) + return nullptr; + else + throw; +} +``` + +When using functions with response codes or `errno`, always check the result and throw an exception in case of error. + +``` cpp +if (0 != close(fd)) + throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE); +``` + +You can use assert to check invariants in code. + +**4.** Exception types. + +There is no need to use complex exception hierarchy in application code. The exception text should be understandable to a system administrator. + +**5.** Throwing exceptions from destructors. + +This is not recommended, but it is allowed. + +Use the following options: + +- Create a function (`done()` or `finalize()`) that will do all the work in advance that might lead to an exception. If that function was called, there should be no exceptions in the destructor later. +- Tasks that are too complex (such as sending messages over the network) can be put in separate method that the class user will have to call before destruction. +- If there is an exception in the destructor, it’s better to log it than to hide it (if the logger is available). +- In simple applications, it is acceptable to rely on `std::terminate` (for cases of `noexcept` by default in C++11) to handle exceptions. + +**6.** Anonymous code blocks. + +You can create a separate code block inside a single function in order to make certain variables local, so that the destructors are called when exiting the block. + +``` cpp +Block block = data.in->read(); + +{ + std::lock_guard lock(mutex); + data.ready = true; + data.block = block; +} + +ready_any.set(); +``` + +**7.** Multithreading. 
+
+In offline data processing programs:
+
+- Try to get the best possible performance on a single CPU core. You can then parallelize your code if necessary.
+
+In server applications:
+
+- Use the thread pool to process requests. At this point, we haven’t had any tasks that required userspace context switching.
+
+Fork is not used for parallelization.
+
+**8.** Syncing threads.
+
+Often it is possible to make different threads use different memory cells (even better: different cache lines) and to not use any thread synchronization (except `joinAll`).
+
+If synchronization is required, in most cases, it is sufficient to use a mutex under `lock_guard`.
+
+In other cases, use system synchronization primitives. Do not use busy wait.
+
+Atomic operations should be used only in the simplest cases.
+
+Do not try to implement lock-free data structures unless it is your primary area of expertise.
+
+**9.** Pointers vs references.
+
+In most cases, prefer references.
+
+**10.** const.
+
+Use constant references, pointers to constants, `const_iterator`, and const methods.
+
+Consider `const` to be default and use non-`const` only when necessary.
+
+When passing variables by value, using `const` usually does not make sense.
+
+**11.** unsigned.
+
+Use `unsigned` if necessary.
+
+**12.** Numeric types.
+
+Use the types `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, and `Int64`, as well as `size_t`, `ssize_t`, and `ptrdiff_t`.
+
+Don’t use these types for numbers: `signed/unsigned long`, `long long`, `short`, `signed/unsigned char`, `char`.
+
+**13.** Passing arguments.
+
+Pass complex values by value if they are going to be moved and use `std::move`; pass by reference if you want to update a value in a loop.
+
+If a function captures ownership of an object created in the heap, make the argument type `shared_ptr` or `unique_ptr`.
+
+**14.** Return values.
+
+In most cases, just use `return`. Do not write `return std::move(res)`.
+
+If the function allocates an object on the heap and returns it, use `shared_ptr` or `unique_ptr`.
+
+In rare cases (updating a value in a loop) you might need to return the value via an argument. In this case, the argument should be a reference.
+
+``` cpp
+using AggregateFunctionPtr = std::shared_ptr<IAggregateFunction>;
+
+/** Allows creating an aggregate function by its name.
+  */
+class AggregateFunctionFactory
+{
+public:
+    AggregateFunctionFactory();
+    AggregateFunctionPtr get(const String & name, const DataTypes & argument_types) const;
+```
+
+**15.** namespace.
+
+There is no need to use a separate `namespace` for application code.
+
+Small libraries do not need this, either.
+
+For medium to large libraries, put everything in a `namespace`.
+
+In the library’s `.h` file, you can use `namespace detail` to hide implementation details not needed for the application code.
+
+In a `.cpp` file, you can use a `static` or anonymous namespace to hide symbols.
+
+Also, a `namespace` can be used for an `enum` to prevent the corresponding names from falling into an external `namespace` (but it’s better to use an `enum class`).
+
+**16.** Deferred initialization.
+
+If arguments are required for initialization, then you normally shouldn’t write a default constructor.
+
+If later you’ll need to delay initialization, you can add a default constructor that will create an invalid object. Or, for a small number of objects, you can use `shared_ptr/unique_ptr`.
+ +``` cpp +Loader(DB::Connection * connection_, const std::string & query, size_t max_block_size_); + +/// For deferred initialization +Loader() {} +``` + +**17.** Virtual functions. + +If the class is not intended for polymorphic use, you do not need to make functions virtual. This also applies to the destructor. + +**18.** Encodings. + +Use UTF-8 everywhere. Use `std::string` and `char *`. Do not use `std::wstring` and `wchar_t`. + +**19.** Logging. + +See the examples everywhere in the code. + +Before committing, delete all meaningless and debug logging, and any other types of debug output. + +Logging in cycles should be avoided, even on the Trace level. + +Logs must be readable at any logging level. + +Logging should only be used in application code, for the most part. + +Log messages must be written in English. + +The log should preferably be understandable for the system administrator. + +Do not use profanity in the log. + +Use UTF-8 encoding in the log. In rare cases you can use non-ASCII characters in the log. + +**20.** Input-output. + +Don’t use `iostreams` in internal cycles that are critical for application performance (and never use `stringstream`). + +Use the `DB/IO` library instead. + +**21.** Date and time. + +See the `DateLUT` library. + +**22.** include. + +Always use `#pragma once` instead of include guards. + +**23.** using. + +`using namespace` is not used. You can use `using` with something specific. But make it local inside a class or function. + +**24.** Do not use `trailing return type` for functions unless necessary. + +``` cpp +auto f() -> void +``` + +**25.** Declaration and initialization of variables. + +``` cpp +//right way +std::string s = "Hello"; +std::string s{"Hello"}; + +//wrong way +auto s = std::string{"Hello"}; +``` + +**26.** For virtual functions, write `virtual` in the base class, but write `override` instead of `virtual` in descendent classes. + +## Unused Features of C++ {#unused-features-of-c} + +**1.** Virtual inheritance is not used. + +**2.** Exception specifiers from C++03 are not used. + +## Platform {#platform} + +**1.** We write code for a specific platform. + +But other things being equal, cross-platform or portable code is preferred. + +**2.** Language: C++20 (see the list of available [C++20 features](https://en.cppreference.com/w/cpp/compiler_support#C.2B.2B20_features)). + +**3.** Compiler: `clang`. At this time (April 2021), the code is compiled using clang version 11. (It can also be compiled using `gcc` version 10, but it's untested and not suitable for production usage). + +The standard library is used (`libc++`). + +**4.**OS: Linux Ubuntu, not older than Precise. + +**5.**Code is written for x86_64 CPU architecture. + +The CPU instruction set is the minimum supported set among our servers. Currently, it is SSE 4.2. + +**6.** Use `-Wall -Wextra -Werror` compilation flags. Also `-Weverything` is used with few exceptions. + +**7.** Use static linking with all libraries except those that are difficult to connect to statically (see the output of the `ldd` command). + +**8.** Code is developed and debugged with release settings. + +## Tools {#tools} + +**1.** KDevelop is a good IDE. + +**2.** For debugging, use `gdb`, `valgrind` (`memcheck`), `strace`, `-fsanitize=...`, or `tcmalloc_minimal_debug`. + +**3.** For profiling, use `Linux Perf`, `valgrind` (`callgrind`), or `strace -cf`. + +**4.** Sources are in Git. + +**5.** Assembly uses `CMake`. + +**6.** Programs are released using `deb` packages. 
+ +**7.** Commits to master must not break the build. + +Though only selected revisions are considered workable. + +**8.** Make commits as often as possible, even if the code is only partially ready. + +Use branches for this purpose. + +If your code in the `master` branch is not buildable yet, exclude it from the build before the `push`. You’ll need to finish it or remove it within a few days. + +**9.** For non-trivial changes, use branches and publish them on the server. + +**10.** Unused code is removed from the repository. + +## Libraries {#libraries} + +**1.** The C++20 standard library is used (experimental extensions are allowed), as well as `boost` and `Poco` frameworks. + +**2.** It is not allowed to use libraries from OS packages. It is also not allowed to use pre-installed libraries. All libraries should be placed in form of source code in `contrib` directory and built with ClickHouse. See [Guidelines for adding new third-party libraries](contrib.md#adding-third-party-libraries) for details. + +**3.** Preference is always given to libraries that are already in use. + +## General Recommendations {#general-recommendations-1} + +**1.** Write as little code as possible. + +**2.** Try the simplest solution. + +**3.** Don’t write code until you know how it’s going to work and how the inner loop will function. + +**4.** In the simplest cases, use `using` instead of classes or structs. + +**5.** If possible, do not write copy constructors, assignment operators, destructors (other than a virtual one, if the class contains at least one virtual function), move constructors or move assignment operators. In other words, the compiler-generated functions must work correctly. You can use `default`. + +**6.** Code simplification is encouraged. Reduce the size of your code where possible. + +## Additional Recommendations {#additional-recommendations} + +**1.** Explicitly specifying `std::` for types from `stddef.h` + +is not recommended. In other words, we recommend writing `size_t` instead `std::size_t`, because it’s shorter. + +It is acceptable to add `std::`. + +**2.** Explicitly specifying `std::` for functions from the standard C library + +is not recommended. In other words, write `memcpy` instead of `std::memcpy`. + +The reason is that there are similar non-standard functions, such as `memmem`. We do use these functions on occasion. These functions do not exist in `namespace std`. + +If you write `std::memcpy` instead of `memcpy` everywhere, then `memmem` without `std::` will look strange. + +Nevertheless, you can still use `std::` if you prefer it. + +**3.** Using functions from C when the same ones are available in the standard C++ library. + +This is acceptable if it is more efficient. + +For example, use `memcpy` instead of `std::copy` for copying large chunks of memory. + +**4.** Multiline function arguments. 
+
+Any of the following wrapping styles are allowed:
+
+``` cpp
+function(
+  T1 x1,
+  T2 x2)
+```
+
+``` cpp
+function(
+  size_t left, size_t right,
+  const & RangesInDataParts ranges,
+  size_t limit)
+```
+
+``` cpp
+function(size_t left, size_t right,
+  const & RangesInDataParts ranges,
+  size_t limit)
+```
+
+``` cpp
+function(size_t left, size_t right,
+      const & RangesInDataParts ranges,
+      size_t limit)
+```
+
+``` cpp
+function(
+      size_t left,
+      size_t right,
+      const & RangesInDataParts ranges,
+      size_t limit)
+```
+
+[Original article](https://clickhouse.com/docs/en/development/style/)
diff --git a/docs/en/reference/development/tests.md b/docs/en/reference/development/tests.md
new file mode 100644
index 00000000000..29b69f0b697
--- /dev/null
+++ b/docs/en/reference/development/tests.md
@@ -0,0 +1,297 @@
+---
+sidebar_position: 70
+sidebar_label: Testing
+description: Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way.
+---
+
+# ClickHouse Testing
+
+## Functional Tests
+
+Functional tests are the most simple and convenient to use. Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way.
+
+Each functional test sends one or multiple queries to the running ClickHouse server and compares the result with the reference.
+
+Tests are located in the `queries` directory. There are two subdirectories: `stateless` and `stateful`. Stateless tests run queries without any preloaded test data - they often create small synthetic datasets on the fly, within the test itself. Stateful tests require preloaded test data from ClickHouse and it is available to the general public.
+
+Each test can be one of two types: `.sql` and `.sh`. A `.sql` test is a simple SQL script that is piped to `clickhouse-client --multiquery --testmode`. A `.sh` test is a script that is run by itself. SQL tests are generally preferable to `.sh` tests. You should use `.sh` tests only when you have to test some feature that cannot be exercised from pure SQL, such as piping some input data into `clickhouse-client` or testing `clickhouse-local`.
+
+### Running a Test Locally {#functional-test-locally}
+
+Start the ClickHouse server locally, listening on the default port (9000). To
+run, for example, the test `01428_hash_set_nan_key`, change to the repository
+folder and run the following command:
+
+```
+PATH=$PATH:<path to clickhouse-client> tests/clickhouse-test 01428_hash_set_nan_key
+```
+
+For more options, see `tests/clickhouse-test --help`. You can simply run all tests or run a subset of tests filtered by a substring in the test name: `./clickhouse-test substring`. There are also options to run tests in parallel or in randomized order.
+
+### Adding a New Test
+
+To add a new test, create a `.sql` or `.sh` file in the `queries/0_stateless` directory, check it manually and then generate the `.reference` file in the following way: `clickhouse-client -n --testmode < 00000_test.sql > 00000_test.reference` or `./00000_test.sh > ./00000_test.reference`.
+
+Tests should use (create, drop, etc) only tables in the `test` database that is assumed to be created beforehand; also tests can use temporary tables.
+
+### Choosing the Test Name
+
+The name of the test starts with a five-digit prefix followed by a descriptive name, such as `00422_hash_function_constexpr.sql`. To choose the prefix, find the largest prefix already present in the directory, and increment it by one.
In the meantime, some other tests might be added with the same numeric prefix, but this is OK and does not lead to any problems, you don't have to change it later. + +Some tests are marked with `zookeeper`, `shard` or `long` in their names. `zookeeper` is for tests that are using ZooKeeper. `shard` is for tests that requires server to listen `127.0.0.*`; `distributed` or `global` have the same meaning. `long` is for tests that run slightly longer that one second. You can disable these groups of tests using `--no-zookeeper`, `--no-shard` and `--no-long` options, respectively. Make sure to add a proper prefix to your test name if it needs ZooKeeper or distributed queries. + +### Checking for an Error that Must Occur + +Sometimes you want to test that a server error occurs for an incorrect query. We support special annotations for this in SQL tests, in the following form: +``` +select x; -- { serverError 49 } +``` +This test ensures that the server returns an error with code 49 about unknown column `x`. If there is no error, or the error is different, the test will fail. If you want to ensure that an error occurs on the client side, use `clientError` annotation instead. + +Do not check for a particular wording of error message, it may change in the future, and the test will needlessly break. Check only the error code. If the existing error code is not precise enough for your needs, consider adding a new one. + +### Testing a Distributed Query + +If you want to use distributed queries in functional tests, you can leverage `remote` table function with `127.0.0.{1..2}` addresses for the server to query itself; or you can use predefined test clusters in server configuration file like `test_shard_localhost`. Remember to add the words `shard` or `distributed` to the test name, so that it is run in CI in correct configurations, where the server is configured to support distributed queries. + + +## Known Bugs {#known-bugs} + +If we know some bugs that can be easily reproduced by functional tests, we place prepared functional tests in `tests/queries/bugs` directory. These tests will be moved to `tests/queries/0_stateless` when bugs are fixed. + +## Integration Tests {#integration-tests} + +Integration tests allow testing ClickHouse in clustered configuration and ClickHouse interaction with other servers like MySQL, Postgres, MongoDB. They are useful to emulate network splits, packet drops, etc. These tests are run under Docker and create multiple containers with various software. + +See `tests/integration/README.md` on how to run these tests. + +Note that integration of ClickHouse with third-party drivers is not tested. Also, we currently do not have integration tests with our JDBC and ODBC drivers. + +## Unit Tests {#unit-tests} + +Unit tests are useful when you want to test not the ClickHouse as a whole, but a single isolated library or class. You can enable or disable build of tests with `ENABLE_TESTS` CMake option. Unit tests (and other test programs) are located in `tests` subdirectories across the code. To run unit tests, type `ninja test`. Some tests use `gtest`, but some are just programs that return non-zero exit code on test failure. + +It’s not necessary to have unit tests if the code is already covered by functional tests (and functional tests are usually much more simple to use). 
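+
+A minimal gtest-style unit test might look like the following sketch (the function and test names are invented for illustration, and it assumes the usual `gtest_main` entry point used by gtest-based test binaries):
+
+``` cpp
+#include <gtest/gtest.h>
+
+/// Hypothetical function under test.
+static int addPositive(int a, int b)
+{
+    return (a > 0 && b > 0) ? a + b : 0;
+}
+
+TEST(AddPositive, SumsPositiveArguments)
+{
+    EXPECT_EQ(addPositive(2, 3), 5);
+}
+
+TEST(AddPositive, ReturnsZeroForNonPositiveArguments)
+{
+    EXPECT_EQ(addPositive(-2, 3), 0);
+    EXPECT_EQ(addPositive(2, 0), 0);
+}
+```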
+ +You can run individual gtest checks by calling the executable directly, for example: + +```bash +$ ./src/unit_tests_dbms --gtest_filter=LocalAddress* +``` + +## Performance Tests {#performance-tests} + +Performance tests allow to measure and compare performance of some isolated part of ClickHouse on synthetic queries. Tests are located at `tests/performance`. Each test is represented by `.xml` file with description of test case. Tests are run with `docker/tests/performance-comparison` tool . See the readme file for invocation. + +Each test run one or multiple queries (possibly with combinations of parameters) in a loop. Some tests can contain preconditions on preloaded test dataset. + +If you want to improve performance of ClickHouse in some scenario, and if improvements can be observed on simple queries, it is highly recommended to write a performance test. It always makes sense to use `perf top` or other perf tools during your tests. + +## Test Tools and Scripts {#test-tools-and-scripts} + +Some programs in `tests` directory are not prepared tests, but are test tools. For example, for `Lexer` there is a tool `src/Parsers/tests/lexer` that just do tokenization of stdin and writes colorized result to stdout. You can use these kind of tools as a code examples and for exploration and manual testing. + +## Miscellaneous Tests {#miscellaneous-tests} + +There are tests for machine learned models in `tests/external_models`. These tests are not updated and must be transferred to integration tests. + +There is separate test for quorum inserts. This test run ClickHouse cluster on separate servers and emulate various failure cases: network split, packet drop (between ClickHouse nodes, between ClickHouse and ZooKeeper, between ClickHouse server and client, etc.), `kill -9`, `kill -STOP` and `kill -CONT` , like [Jepsen](https://aphyr.com/tags/Jepsen). Then the test checks that all acknowledged inserts was written and all rejected inserts was not. + +Quorum test was written by separate team before ClickHouse was open-sourced. This team no longer work with ClickHouse. Test was accidentally written in Java. For these reasons, quorum test must be rewritten and moved to integration tests. + +## Manual Testing {#manual-testing} + +When you develop a new feature, it is reasonable to also test it manually. You can do it with the following steps: + +Build ClickHouse. Run ClickHouse from the terminal: change directory to `programs/clickhouse-server` and run it with `./clickhouse-server`. It will use configuration (`config.xml`, `users.xml` and files within `config.d` and `users.d` directories) from the current directory by default. To connect to ClickHouse server, run `programs/clickhouse-client/clickhouse-client`. + +Note that all clickhouse tools (server, client, etc) are just symlinks to a single binary named `clickhouse`. You can find this binary at `programs/clickhouse`. All tools can also be invoked as `clickhouse tool` instead of `clickhouse-tool`. + +Alternatively you can install ClickHouse package: either stable release from ClickHouse repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo clickhouse start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`. 
+ +When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary: + +``` bash +$ sudo clickhouse stop +$ sudo cp ./clickhouse /usr/bin/ +$ sudo clickhouse start +``` + +Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal: + +``` bash +$ sudo clickhouse stop +$ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml +``` + +Example with gdb: + +``` bash +$ sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml +``` + +If the system clickhouse-server is already running and you do not want to stop it, you can change port numbers in your `config.xml` (or override them in a file in `config.d` directory), provide appropriate data path, and run it. + +`clickhouse` binary has almost no dependencies and works across wide range of Linux distributions. To quick and dirty test your changes on a server, you can simply `scp` your fresh built `clickhouse` binary to your server and then run it as in examples above. + +## Build Tests {#build-tests} + +Build tests allow to check that build is not broken on various alternative configurations and on some foreign systems. These tests are automated as well. + +Examples: +- cross-compile for Darwin x86_64 (Mac OS X) +- cross-compile for FreeBSD x86_64 +- cross-compile for Linux AArch64 +- build on Ubuntu with libraries from system packages (discouraged) +- build with shared linking of libraries (discouraged) + +For example, build with system packages is bad practice, because we cannot guarantee what exact version of packages a system will have. But this is really needed by Debian maintainers. For this reason we at least have to support this variant of build. Another example: shared linking is a common source of trouble, but it is needed for some enthusiasts. + +Though we cannot run all tests on all variant of builds, we want to check at least that various build variants are not broken. For this purpose we use build tests. + +We also test that there are no translation units that are too long to compile or require too much RAM. + +We also test that there are no too large stack frames. + +## Testing for Protocol Compatibility {#testing-for-protocol-compatibility} + +When we extend ClickHouse network protocol, we test manually that old clickhouse-client works with new clickhouse-server and new clickhouse-client works with old clickhouse-server (simply by running binaries from corresponding packages). + +We also test some cases automatically with integrational tests: +- if data written by old version of ClickHouse can be successfully read by the new version; +- do distributed queries work in a cluster with different ClickHouse versions. + +## Help from the Compiler {#help-from-the-compiler} + +Main ClickHouse code (that is located in `dbms` directory) is built with `-Wall -Wextra -Werror` and with some additional enabled warnings. Although these options are not enabled for third-party libraries. + +Clang has even more useful warnings - you can look for them with `-Weverything` and pick something to default build. + +For production builds, clang is used, but we also test make gcc builds. For development, clang is usually more convenient to use. You can build on your own machine with debug mode (to save battery of your laptop), but please note that compiler is able to generate more warnings with `-O3` due to better control flow and inter-procedure analysis. 
When building with clang in debug mode, debug version of `libc++` is used that allows to catch more errors at runtime. + +## Sanitizers {#sanitizers} + +### Address sanitizer +We run functional, integration, stress and unit tests under ASan on per-commit basis. + +### Thread sanitizer +We run functional, integration, stress and unit tests under TSan on per-commit basis. + +### Memory sanitizer +We run functional, integration, stress and unit tests under MSan on per-commit basis. + +### Undefined behaviour sanitizer +We run functional, integration, stress and unit tests under UBSan on per-commit basis. The code of some third-party libraries is not sanitized for UB. + +### Valgrind (Memcheck) +We used to run functional tests under Valgrind overnight, but don't do it anymore. It takes multiple hours. Currently there is one known false positive in `re2` library, see [this article](https://research.swtch.com/sparse). + +## Fuzzing {#fuzzing} + +ClickHouse fuzzing is implemented both using [libFuzzer](https://llvm.org/docs/LibFuzzer.html) and random SQL queries. +All the fuzz testing should be performed with sanitizers (Address and Undefined). + +LibFuzzer is used for isolated fuzz testing of library code. Fuzzers are implemented as part of test code and have “_fuzzer” name postfixes. +Fuzzer example can be found at `src/Parsers/tests/lexer_fuzzer.cpp`. LibFuzzer-specific configs, dictionaries and corpus are stored at `tests/fuzz`. +We encourage you to write fuzz tests for every functionality that handles user input. + +Fuzzers are not built by default. To build fuzzers both `-DENABLE_FUZZING=1` and `-DENABLE_TESTS=1` options should be set. +We recommend to disable Jemalloc while building fuzzers. Configuration used to integrate ClickHouse fuzzing to +Google OSS-Fuzz can be found at `docker/fuzz`. + +We also use simple fuzz test to generate random SQL queries and to check that the server does not die executing them. +You can find it in `00746_sql_fuzzy.pl`. This test should be run continuously (overnight and longer). + +We also use sophisticated AST-based query fuzzer that is able to find huge amount of corner cases. It does random permutations and substitutions in queries AST. It remembers AST nodes from previous tests to use them for fuzzing of subsequent tests while processing them in random order. You can learn more about this fuzzer in [this blog article](https://clickhouse.com/blog/en/2021/fuzzing-clickhouse/). + +## Stress test + +Stress tests are another case of fuzzing. It runs all functional tests in parallel in random order with a single server. Results of the tests are not checked. + +It is checked that: +- server does not crash, no debug or sanitizer traps are triggered; +- there are no deadlocks; +- the database structure is consistent; +- server can successfully stop after the test and start again without exceptions. + +There are five variants (Debug, ASan, TSan, MSan, UBSan). + +## Thread Fuzzer + +Thread Fuzzer (please don't mix up with Thread Sanitizer) is another kind of fuzzing that allows to randomize thread order of execution. It helps to find even more special cases. + +## Security Audit + +Our Security Team did some basic overview of ClickHouse capabilities from the security standpoint. + +## Static Analyzers {#static-analyzers} + +We run `clang-tidy` on per-commit basis. `clang-static-analyzer` checks are also enabled. `clang-tidy` is also used for some style checks. + +We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. 
You will find usage instructions in the `tests/instructions/` directory.
+
+If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of the box.
+
+We also use `shellcheck` for static analysis of shell scripts.
+
+## Hardening {#hardening}
+
+In debug builds we use a custom allocator that does ASLR of user-level allocations.
+
+We also manually protect memory regions that are expected to be readonly after allocation.
+
+In debug builds we also involve a customization of libc that ensures that no "harmful" (obsolete, insecure, not thread-safe) functions are called.
+
+Debug assertions are used extensively.
+
+In debug builds, if an exception with a "logical error" code (which implies a bug) is thrown, the program is terminated prematurely. This allows exceptions to be used in release builds while turning them into assertions in debug builds.
+
+The debug version of jemalloc is used for debug builds.
+The debug version of libc++ is used for debug builds.
+
+## Runtime Integrity Checks
+
+Data stored on disk is checksummed. Data in MergeTree tables is checksummed in three ways simultaneously* (compressed data blocks, uncompressed data blocks, the total checksum across blocks). Data transferred over the network between client and server or between servers is also checksummed. Replication ensures bit-identical data on replicas.
+
+This is required to protect from faulty hardware (bit rot on storage media, bit flips in RAM on the server, bit flips in RAM of the network controller, bit flips in RAM of the network switch, bit flips in RAM of the client, bit flips on the wire). Note that bit flips are common and likely to occur even with ECC RAM and in the presence of TCP checksums (if you manage to run thousands of servers processing petabytes of data each day). [See the video (Russian)](https://www.youtube.com/watch?v=ooBAQIe0KlQ).
+
+ClickHouse provides diagnostics that will help ops engineers find faulty hardware.
+
+\* and it is not slow.
+
+## Code Style {#code-style}
+
+Code style rules are described [here](style.md).
+
+To check for some common style violations, you can use the `utils/check-style` script.
+
+To force the proper style of your code, you can use `clang-format`. The file `.clang-format` is located at the sources root. It mostly corresponds to our actual code style. But it’s not recommended to apply `clang-format` to existing files because it makes formatting worse. You can use the `clang-format-diff` tool that you can find in the clang source repository.
+
+Alternatively you can try the `uncrustify` tool to reformat your code. Configuration is in `uncrustify.cfg` in the sources root. It is less tested than `clang-format`.
+
+`CLion` has its own code formatter that has to be tuned for our code style.
+
+We also use `codespell` to find typos in code. It is automated as well.
+
+## Test Coverage {#test-coverage}
+
+We also track test coverage, but only for functional tests and only for clickhouse-server. It is performed on a daily basis.
+
+## Tests for Tests
+
+There is an automated check for flaky tests. It runs all new tests 100 times (for functional tests) or 10 times (for integration tests). If the test fails at least once, it is considered flaky.
+
+## Testflows
+
+[Testflows](https://testflows.com/) is an enterprise-grade open-source testing framework, which is used to test a subset of ClickHouse.
+
+## Test Automation {#test-automation}
+
+We run tests with [GitHub Actions](https://github.com/features/actions).
+
+Build jobs and tests are run in Sandbox on a per-commit basis.
Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored for several months. When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you. + +We do not use Travis CI due to the limit on time and computational power. +We do not use Jenkins. It was used before and now we are happy we are not using Jenkins. + +[Original article](https://clickhouse.com/docs/en/development/tests/) diff --git a/docs/en/reference/engines/_category_.yml b/docs/en/reference/engines/_category_.yml new file mode 100644 index 00000000000..a82c53bc65e --- /dev/null +++ b/docs/en/reference/engines/_category_.yml @@ -0,0 +1,8 @@ +position: 30 +label: 'Database & Table Engines' +collapsible: true +collapsed: true +link: + type: generated-index + title: Database & Table Engines + slug: /en/table-engines \ No newline at end of file diff --git a/docs/en/reference/engines/database-engines/atomic.md b/docs/en/reference/engines/database-engines/atomic.md new file mode 100644 index 00000000000..878307121aa --- /dev/null +++ b/docs/en/reference/engines/database-engines/atomic.md @@ -0,0 +1,61 @@ +--- +sidebar_label: Atomic +sidebar_position: 10 +--- + +# Atomic + +It supports non-blocking [DROP TABLE](#drop-detach-table) and [RENAME TABLE](#rename-table) queries and atomic [EXCHANGE TABLES](#exchange-tables) queries. `Atomic` database engine is used by default. + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE test [ENGINE = Atomic]; +``` + +## Specifics and recommendations {#specifics-and-recommendations} + +### Table UUID {#table-uuid} + +All tables in database `Atomic` have persistent [UUID](../../sql-reference/data-types/uuid.md) and store data in directory `/clickhouse_path/store/xxx/xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy/`, where `xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy` is UUID of the table. +Usually, the UUID is generated automatically, but the user can also explicitly specify the UUID in the same way when creating the table (this is not recommended). + +For example: + +```sql +CREATE TABLE name UUID '28f1c61c-2970-457a-bffe-454156ddcfef' (n UInt64) ENGINE = ...; +``` + +:::note +You can use the [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil) setting to display the UUID with the `SHOW CREATE` query. +::: + +### RENAME TABLE {#rename-table} + +[RENAME](../../sql-reference/statements/rename.md) queries are performed without changing the UUID or moving table data. These queries do not wait for the completion of queries using the table and are executed instantly. + +### DROP/DETACH TABLE {#drop-detach-table} + +On `DROP TABLE` no data is removed, database `Atomic` just marks table as dropped by moving metadata to `/clickhouse_path/metadata_dropped/` and notifies background thread. Delay before final table data deletion is specified by the [database_atomic_delay_before_drop_table_sec](../../operations/server-configuration-parameters/settings.md#database_atomic_delay_before_drop_table_sec) setting. +You can specify synchronous mode using `SYNC` modifier. Use the [database_atomic_wait_for_drop_and_detach_synchronously](../../operations/settings/settings.md#database_atomic_wait_for_drop_and_detach_synchronously) setting to do this. In this case `DROP` waits for running `SELECT`, `INSERT` and other queries which are using the table to finish. 
Table will be actually removed when it's not in use. + +### EXCHANGE TABLES/DICTIONARIES {#exchange-tables} + +[EXCHANGE](../../sql-reference/statements/exchange.md) query swaps tables or dictionaries atomically. For instance, instead of this non-atomic operation: + +```sql +RENAME TABLE new_table TO tmp, old_table TO new_table, tmp TO old_table; +``` +you can use one atomic query: + +``` sql +EXCHANGE TABLES new_table AND old_table; +``` + +### ReplicatedMergeTree in Atomic Database {#replicatedmergetree-in-atomic-database} + +For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables, it is recommended not to specify engine parameters - path in ZooKeeper and replica name. In this case, configuration parameters [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name) will be used. If you want to specify engine parameters explicitly, it is recommended to use `{uuid}` macros. This is useful so that unique paths are automatically generated for each table in ZooKeeper. + +## See Also + +- [system.databases](../../operations/system-tables/databases.md) system table diff --git a/docs/en/reference/engines/database-engines/index.md b/docs/en/reference/engines/database-engines/index.md new file mode 100644 index 00000000000..0cee580abcd --- /dev/null +++ b/docs/en/reference/engines/database-engines/index.md @@ -0,0 +1,25 @@ +--- +toc_folder_title: Database Engines +toc_priority: 27 +toc_title: Introduction +--- + +# Database Engines {#database-engines} + +Database engines allow you to work with tables. By default, ClickHouse uses the [Atomic](../../engines/database-engines/atomic.md) database engine, which provides configurable [table engines](../../engines/table-engines/index.md) and an [SQL dialect](../../sql-reference/syntax.md). + +Here is a complete list of available database engines. Follow the links for more details: + +- [Atomic](../../engines/database-engines/atomic.md) + +- [MySQL](../../engines/database-engines/mysql.md) + +- [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) + +- [Lazy](../../engines/database-engines/lazy.md) + +- [PostgreSQL](../../engines/database-engines/postgresql.md) + +- [Replicated](../../engines/database-engines/replicated.md) + +- [SQLite](../../engines/database-engines/sqlite.md) diff --git a/docs/en/reference/engines/database-engines/lazy.md b/docs/en/reference/engines/database-engines/lazy.md new file mode 100644 index 00000000000..b95ade19df4 --- /dev/null +++ b/docs/en/reference/engines/database-engines/lazy.md @@ -0,0 +1,16 @@ +--- +sidebar_label: Lazy +sidebar_position: 20 +--- + +# Lazy {#lazy} + +Keeps tables in RAM only `expiration_time_in_seconds` seconds after last access. Can be used only with \*Log tables. + +It’s optimized for storing many small \*Log tables, for which there is a long time interval between accesses. 
+ +## Creating a Database {#creating-a-database} + + CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds); + +[Original article](https://clickhouse.com/docs/en/database_engines/lazy/) diff --git a/docs/en/reference/engines/database-engines/materialized-mysql.md b/docs/en/reference/engines/database-engines/materialized-mysql.md new file mode 100644 index 00000000000..df072682097 --- /dev/null +++ b/docs/en/reference/engines/database-engines/materialized-mysql.md @@ -0,0 +1,290 @@ +--- +sidebar_label: MaterializedMySQL +sidebar_position: 70 +--- + +# [experimental] MaterializedMySQL + +:::warning +This is an experimental feature that should not be used in production. +::: + +Creates a ClickHouse database with all the tables existing in MySQL, and all the data in those tables. The ClickHouse server works as MySQL replica. It reads `binlog` and performs DDL and DML queries. + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] +ENGINE = MaterializedMySQL('host:port', ['database' | database], 'user', 'password') [SETTINGS ...] +[TABLE OVERRIDE table1 (...), TABLE OVERRIDE table2 (...)] +``` + +**Engine Parameters** + +- `host:port` — MySQL server endpoint. +- `database` — MySQL database name. +- `user` — MySQL user. +- `password` — User password. + +**Engine Settings** + +- `max_rows_in_buffer` — Maximum number of rows that data is allowed to cache in memory (for single table and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `65 505`. +- `max_bytes_in_buffer` — Maximum number of bytes that data is allowed to cache in memory (for single table and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `1 048 576`. +- `max_flush_data_time` — Maximum number of milliseconds that data is allowed to cache in memory (for database and the cache data unable to query). When this time is exceeded, the data will be materialized. Default: `1000`. +- `max_wait_time_when_mysql_unavailable` — Retry interval when MySQL is not available (milliseconds). Negative value disables retry. Default: `1000`. +- `allows_query_when_mysql_lost` — Allows to query a materialized table when MySQL is lost. Default: `0` (`false`). +- `materialized_mysql_tables_list` — a comma-separated list of mysql database tables, which will be replicated by MaterializedMySQL database engine. Default value: empty list — means whole tables will be replicated. + +```sql +CREATE DATABASE mysql ENGINE = MaterializedMySQL('localhost:3306', 'db', 'user', '***') + SETTINGS + allows_query_when_mysql_lost=true, + max_wait_time_when_mysql_unavailable=10000; +``` + +**Settings on MySQL-server Side** + +For the correct work of `MaterializedMySQL`, there are few mandatory `MySQL`-side configuration settings that must be set: + +- `default_authentication_plugin = mysql_native_password` since `MaterializedMySQL` can only authorize with this method. +- `gtid_mode = on` since GTID based logging is a mandatory for providing correct `MaterializedMySQL` replication. + +:::note +While turning on `gtid_mode` you should also specify `enforce_gtid_consistency = on`. +::: + +## Virtual Columns {#virtual-columns} + +When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree](../../engines/table-engines/mergetree-family/replacingmergetree.md) tables are used with virtual `_sign` and `_version` columns. + +- `_version` — Transaction counter. 
Type [UInt64](../../sql-reference/data-types/int-uint.md). +- `_sign` — Deletion mark. Type [Int8](../../sql-reference/data-types/int-uint.md). Possible values: + - `1` — Row is not deleted, + - `-1` — Row is deleted. + +## Data Types Support {#data_types-support} + +| MySQL | ClickHouse | +|-------------------------|--------------------------------------------------------------| +| TINY | [Int8](../../sql-reference/data-types/int-uint.md) | +| SHORT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INT24 | [Int32](../../sql-reference/data-types/int-uint.md) | +| LONG | [UInt32](../../sql-reference/data-types/int-uint.md) | +| LONGLONG | [UInt64](../../sql-reference/data-types/int-uint.md) | +| FLOAT | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NEWDECIMAL | [Decimal](../../sql-reference/data-types/decimal.md) | +| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) | +| DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) | +| YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) | +| TIME | [Int64](../../sql-reference/data-types/int-uint.md) | +| ENUM | [Enum](../../sql-reference/data-types/enum.md) | +| STRING | [String](../../sql-reference/data-types/string.md) | +| VARCHAR, VAR_STRING | [String](../../sql-reference/data-types/string.md) | +| BLOB | [String](../../sql-reference/data-types/string.md) | +| GEOMETRY | [String](../../sql-reference/data-types/string.md) | +| BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) | +| BIT | [UInt64](../../sql-reference/data-types/int-uint.md) | +| SET | [UInt64](../../sql-reference/data-types/int-uint.md) | + +[Nullable](../../sql-reference/data-types/nullable.md) is supported. + +The data of TIME type in MySQL is converted to microseconds in ClickHouse. + +Other types are not supported. If MySQL table contains a column of such type, ClickHouse throws exception "Unhandled data type" and stops replication. + +## Specifics and Recommendations {#specifics-and-recommendations} + +### Compatibility Restrictions {#compatibility-restrictions} + +Apart of the data types limitations there are few restrictions comparing to `MySQL` databases, that should be resolved before replication will be possible: + +- Each table in `MySQL` should contain `PRIMARY KEY`. + +- Replication for tables, those are containing rows with `ENUM` field values out of range (specified in `ENUM` signature) will not work. + +### DDL Queries {#ddl-queries} + +MySQL DDL queries are converted into the corresponding ClickHouse DDL queries ([ALTER](../../sql-reference/statements/alter/index.md), [CREATE](../../sql-reference/statements/create/index.md), [DROP](../../sql-reference/statements/drop.md), [RENAME](../../sql-reference/statements/rename.md)). If ClickHouse cannot parse some DDL query, the query is ignored. + +### Data Replication {#data-replication} + +`MaterializedMySQL` does not support direct `INSERT`, `DELETE` and `UPDATE` queries. However, they are supported in terms of data replication: + +- MySQL `INSERT` query is converted into `INSERT` with `_sign=1`. + +- MySQL `DELETE` query is converted into `INSERT` with `_sign=-1`. + +- MySQL `UPDATE` query is converted into `INSERT` with `_sign=-1` and `INSERT` with `_sign=1` if the primary key has been changed, or + `INSERT` with `_sign=1` if not. 
+ +### Selecting from MaterializedMySQL Tables {#select} + +`SELECT` query from `MaterializedMySQL` tables has some specifics: + +- If `_version` is not specified in the `SELECT` query, the + [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier is used, so only rows with + `MAX(_version)` are returned for each primary key value. + +- If `_sign` is not specified in the `SELECT` query, `WHERE _sign=1` is used by default. So the deleted rows are not + included into the result set. + +- The result includes columns comments in case they exist in MySQL database tables. + +### Index Conversion {#index-conversion} + +MySQL `PRIMARY KEY` and `INDEX` clauses are converted into `ORDER BY` tuples in ClickHouse tables. + +ClickHouse has only one physical order, which is determined by `ORDER BY` clause. To create a new physical order, use +[materialized views](../../sql-reference/statements/create/view.md#materialized). + +**Notes** + +- Rows with `_sign=-1` are not deleted physically from the tables. +- Cascade `UPDATE/DELETE` queries are not supported by the `MaterializedMySQL` engine, as they are not visible in the + MySQL binlog. +- Replication can be easily broken. +- Manual operations on database and tables are forbidden. +- `MaterializedMySQL` is affected by the [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) + setting. Data is merged in the corresponding table in the `MaterializedMySQL` database when a table in the MySQL + server changes. + +### Table Overrides {#table-overrides} + +Table overrides can be used to customize the ClickHouse DDL queries, allowing you to make schema optimizations for your +application. This is especially useful for controlling partitioning, which is important for the overall performance of +MaterializedMySQL. + +These are the schema conversion manipulations you can do with table overrides for MaterializedMySQL: + + * Modify column type. Must be compatible with the original type, or replication will fail. For example, + you can modify a UInt32 column to UInt64, but you can not modify a String column to Array(String). + * Modify [column TTL](../table-engines/mergetree-family/mergetree/#mergetree-column-ttl). + * Modify [column compression codec](../../sql-reference/statements/create/table/#codecs). + * Add [ALIAS columns](../../sql-reference/statements/create/table/#alias). + * Add [skipping indexes](../table-engines/mergetree-family/mergetree/#table_engine-mergetree-data_skipping-indexes) + * Add [projections](../table-engines/mergetree-family/mergetree/#projections). Note that projection optimizations are + disabled when using `SELECT ... FINAL` (which MaterializedMySQL does by default), so their utility is limited here. + `INDEX ... TYPE hypothesis` as [described in the v21.12 blog post]](https://clickhouse.com/blog/en/2021/clickhouse-v21.12-released/) + may be more useful in this case. + * Modify [PARTITION BY](../table-engines/mergetree-family/custom-partitioning-key/) + * Modify [ORDER BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Modify [PRIMARY KEY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Add [SAMPLE BY](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + * Add [table TTL](../table-engines/mergetree-family/mergetree/#mergetree-query-clauses) + +```sql +CREATE DATABASE db_name ENGINE = MaterializedMySQL(...) +[SETTINGS ...] 
+[TABLE OVERRIDE table_name ( + [COLUMNS ( + [col_name [datatype] [ALIAS expr] [CODEC(...)] [TTL expr], ...] + [INDEX index_name expr TYPE indextype[(...)] GRANULARITY val, ...] + [PROJECTION projection_name (SELECT [GROUP BY] [ORDER BY]), ...] + )] + [ORDER BY expr] + [PRIMARY KEY expr] + [PARTITION BY expr] + [SAMPLE BY expr] + [TTL expr] +), ...] +``` + +Example: + +```sql +CREATE DATABASE db_name ENGINE = MaterializedMySQL(...) +TABLE OVERRIDE table1 ( + COLUMNS ( + userid UUID, + category LowCardinality(String), + timestamp DateTime CODEC(Delta, Default) + ) + PARTITION BY toYear(timestamp) +), +TABLE OVERRIDE table2 ( + COLUMNS ( + client_ip String TTL created + INTERVAL 72 HOUR + ) + SAMPLE BY ip_hash +) +``` + +The `COLUMNS` list is sparse; existing columns are modified as specified, extra ALIAS columns are added. It is not +possible to add ordinary or MATERIALIZED columns. Modified columns with a different type must be assignable from the +original type. There is currently no validation of this or similar issues when the `CREATE DATABASE` query executes, so +extra care needs to be taken. + +You may specify overrides for tables that do not exist yet. + +:::warning +It is easy to break replication with table overrides if not used with care. For example: + +* If an ALIAS column is added with a table override, and a column with the same name is later added to the source + MySQL table, the converted ALTER TABLE query in ClickHouse will fail and replication stops. +* It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in + `ORDER BY` or `PARTITION BY`. This will cause CREATE TABLE queries that will fail, also causing replication to stop. +::: + +## Examples of Use {#examples-of-use} + +Queries in MySQL: + +``` sql +mysql> CREATE DATABASE db; +mysql> CREATE TABLE db.test (a INT PRIMARY KEY, b INT); +mysql> INSERT INTO db.test VALUES (1, 11), (2, 22); +mysql> DELETE FROM db.test WHERE a=1; +mysql> ALTER TABLE db.test ADD COLUMN c VARCHAR(16); +mysql> UPDATE db.test SET c='Wow!', b=222; +mysql> SELECT * FROM test; +``` + +```text +┌─a─┬───b─┬─c────┐ +│ 2 │ 222 │ Wow! │ +└───┴─────┴──────┘ +``` + +Database in ClickHouse, exchanging data with the MySQL server: + +The database and the table created: + +``` sql +CREATE DATABASE mysql ENGINE = MaterializedMySQL('localhost:3306', 'db', 'user', '***'); +SHOW TABLES FROM mysql; +``` + +``` text +┌─name─┐ +│ test │ +└──────┘ +``` + +After inserting data: + +``` sql +SELECT * FROM mysql.test; +``` + +``` text +┌─a─┬──b─┐ +│ 1 │ 11 │ +│ 2 │ 22 │ +└───┴────┘ +``` + +After deleting data, adding the column and updating: + +``` sql +SELECT * FROM mysql.test; +``` + +``` text +┌─a─┬───b─┬─c────┐ +│ 2 │ 222 │ Wow! │ +└───┴─────┴──────┘ +``` + +[Original article](https://clickhouse.com/docs/en/engines/database-engines/materialized-mysql/) diff --git a/docs/en/reference/engines/database-engines/materialized-postgresql.md b/docs/en/reference/engines/database-engines/materialized-postgresql.md new file mode 100644 index 00000000000..ff8f7b192e0 --- /dev/null +++ b/docs/en/reference/engines/database-engines/materialized-postgresql.md @@ -0,0 +1,279 @@ +--- +sidebar_label: MaterializedPostgreSQL +sidebar_position: 60 +--- + +# [experimental] MaterializedPostgreSQL {#materialize-postgresql} + +Creates a ClickHouse database with tables from PostgreSQL database. Firstly, database with engine `MaterializedPostgreSQL` creates a snapshot of PostgreSQL database and loads required tables. 
Required tables can include any subset of tables from any subset of schemas from specified database. Along with the snapshot database engine acquires LSN and once initial dump of tables is performed - it starts pulling updates from WAL. After database is created, newly added tables to PostgreSQL database are not automatically added to replication. They have to be added manually with `ATTACH TABLE db.table` query. + +Replication is implemented with PostgreSQL Logical Replication Protocol, which does not allow to replicate DDL, but allows to know whether replication breaking changes happened (column type changes, adding/removing columns). Such changes are detected and according tables stop receiving updates. Such tables can be automatically reloaded in the background in case required setting is turned on (can be used starting from 22.1). Safest way for now is to use `ATTACH`/ `DETACH` queries to reload table completely. If DDL does not break replication (for example, renaming a column) table will still receive updates (insertion is done by position). + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] +ENGINE = MaterializedPostgreSQL('host:port', 'database', 'user', 'password') [SETTINGS ...] +``` + +**Engine Parameters** + +- `host:port` — PostgreSQL server endpoint. +- `database` — PostgreSQL database name. +- `user` — PostgreSQL user. +- `password` — User password. + +## Example of Use {#example-of-use} + +``` sql +CREATE DATABASE postgres_db +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password'); + +SHOW TABLES FROM postgres_db; + +┌─name───┐ +│ table1 │ +└────────┘ + +SELECT * FROM postgresql_db.postgres_table; +``` + +## Dynamically adding new tables to replication {#dynamically-adding-table-to-replication} + +After `MaterializedPostgreSQL` database is created, it does not automatically detect new tables in according PostgreSQL database. Such tables can be added manually: + +``` sql +ATTACH TABLE postgres_database.new_table; +``` + +:::warning +Before version 22.1, adding a table to replication left an unremoved temporary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in ClickHouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. This issue is fixed in 22.1. +::: + +## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} + +It is possible to remove specific tables from replication: + +``` sql +DETACH TABLE postgres_database.table_to_remove; +``` + +## PostgreSQL schema {#schema} + +PostgreSQL [schema](https://www.postgresql.org/docs/9.1/ddl-schemas.html) can be configured in 3 ways (starting from version 21.12). + +1. One schema for one `MaterializedPostgreSQL` database engine. Requires to use setting `materialized_postgresql_schema`. +Tables are accessed via table name only: + +``` sql +CREATE DATABASE postgres_database +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS materialized_postgresql_schema = 'postgres_schema'; + +SELECT * FROM postgres_database.table1; +``` + +2. Any number of schemas with specified set of tables for one `MaterializedPostgreSQL` database engine. Requires to use setting `materialized_postgresql_tables_list`. Each table is written along with its schema. 
+Tables are accessed via schema name and table name at the same time: + +``` sql +CREATE DATABASE database1 +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS materialized_postgresql_tables_list = 'schema1.table1,schema2.table2,schema1.table3', + materialized_postgresql_tables_list_with_schema = 1; + +SELECT * FROM database1.`schema1.table1`; +SELECT * FROM database1.`schema2.table2`; +``` + +But in this case all tables in `materialized_postgresql_tables_list` must be written with its schema name. +Requires `materialized_postgresql_tables_list_with_schema = 1`. + +Warning: for this case dots in table name are not allowed. + +3. Any number of schemas with full set of tables for one `MaterializedPostgreSQL` database engine. Requires to use setting `materialized_postgresql_schema_list`. + +``` sql +CREATE DATABASE database1 +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS materialized_postgresql_schema_list = 'schema1,schema2,schema3'; + +SELECT * FROM database1.`schema1.table1`; +SELECT * FROM database1.`schema1.table2`; +SELECT * FROM database1.`schema2.table2`; +``` + +Warning: for this case dots in table name are not allowed. + + +## Requirements {#requirements} + +1. The [wal_level](https://www.postgresql.org/docs/current/runtime-config-wal.html) setting must have a value `logical` and `max_replication_slots` parameter must have a value at least `2` in the PostgreSQL config file. + +2. Each replicated table must have one of the following [replica identity](https://www.postgresql.org/docs/10/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY): + +- primary key (by default) + +- index + +``` bash +postgres# CREATE TABLE postgres_table (a Integer NOT NULL, b Integer, c Integer NOT NULL, d Integer, e Integer NOT NULL); +postgres# CREATE unique INDEX postgres_table_index on postgres_table(a, c, e); +postgres# ALTER TABLE postgres_table REPLICA IDENTITY USING INDEX postgres_table_index; +``` + +The primary key is always checked first. If it is absent, then the index, defined as replica identity index, is checked. +If the index is used as a replica identity, there has to be only one such index in a table. +You can check what type is used for a specific table with the following command: + +``` bash +postgres# SELECT CASE relreplident + WHEN 'd' THEN 'default' + WHEN 'n' THEN 'nothing' + WHEN 'f' THEN 'full' + WHEN 'i' THEN 'index' + END AS replica_identity +FROM pg_class +WHERE oid = 'postgres_table'::regclass; +``` + +:::warning +Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. +::: + +## Settings {#settings} + +1. `materialized_postgresql_tables_list` {#materialized-postgresql-tables-list} + + Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. + + Default value: empty list — means whole PostgreSQL database will be replicated. + +2. `materialized_postgresql_schema` {#materialized-postgresql-schema} + + Default value: empty string. (Default schema is used) + +3. `materialized_postgresql_schema_list` {#materialized-postgresql-schema-list} + + Default value: empty list. (Default schema is used) + +4. 
`materialized_postgresql_allow_automatic_update` {#materialized-postgresql-allow-automatic-update} + + Do not use this setting before 22.1 version. + + Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. + + Possible values: + + - 0 — The table is not automatically updated in the background, when schema changes are detected. + - 1 — The table is automatically updated in the background, when schema changes are detected. + + Default value: `0`. + +5. `materialized_postgresql_max_block_size` {#materialized-postgresql-max-block-size} + + Sets the number of rows collected in memory before flushing data into PostgreSQL database table. + + Possible values: + + - Positive integer. + + Default value: `65536`. + +6. `materialized_postgresql_replication_slot` {#materialized-postgresql-replication-slot} + + A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. + +7. `materialized_postgresql_snapshot` {#materialized-postgresql-snapshot} + + A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. + + ``` sql + CREATE DATABASE database1 + ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') + SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; + + SELECT * FROM database1.table1; + ``` + + The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. + + ``` sql + ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; + ``` + +## Notes {#notes} + +### Failover of the logical replication slot {#logical-replication-slot-failover} + +Logical Replication Slots which exist on the primary are not available on standby replicas. +So if there is a failover, new primary (the old physical standby) won’t be aware of any slots which were existing with old primary. This will lead to a broken replication from PostgreSQL. +A solution to this is to manage replication slots yourself and define a permanent replication slot (some information can be found [here](https://patroni.readthedocs.io/en/latest/SETTINGS.html)). You'll need to pass slot name via `materialized_postgresql_replication_slot` setting, and it has to be exported with `EXPORT SNAPSHOT` option. The snapshot identifier needs to be passed via `materialized_postgresql_snapshot` setting. + +Please note that this should be used only if it is actually needed. If there is no real need for that or full understanding why, then it is better to allow the table engine to create and manage its own replication slot. 
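+
+For reference, one possible manual approach (a sketch, not the only option) is to create the slot over a PostgreSQL replication connection, for example `psql "dbname=postgres_database replication=database"`, so that a snapshot can be exported together with the slot. The slot name below is hypothetical, and the exact option syntax of the replication-protocol command depends on the PostgreSQL version; the Patroni-based example that follows is an alternative:
+
+```sql
+-- Run on a replication connection. The command returns the slot name,
+-- the consistent LSN and an exported snapshot identifier.
+CREATE_REPLICATION_SLOT clickhouse_sync LOGICAL pgoutput EXPORT_SNAPSHOT;
+
+-- The returned values are then passed to ClickHouse, e.g.:
+--   SETTINGS materialized_postgresql_replication_slot = 'clickhouse_sync',
+--            materialized_postgresql_snapshot = '<exported snapshot id>'
+```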
+ +**Example (from [@bchrobot](https://github.com/bchrobot))** + +1. Configure replication slot in PostgreSQL. + + ```yaml + apiVersion: "acid.zalan.do/v1" + kind: postgresql + metadata: + name: acid-demo-cluster + spec: + numberOfInstances: 2 + postgresql: + parameters: + wal_level: logical + patroni: + slots: + clickhouse_sync: + type: logical + database: demodb + plugin: pgoutput + ``` + +2. Wait for replication slot to be ready, then begin a transaction and export the transaction snapshot identifier: + + ```sql + BEGIN; + SELECT pg_export_snapshot(); + ``` + +3. In ClickHouse create database: + + ```sql + CREATE DATABASE demodb + ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') + SETTINGS + materialized_postgresql_replication_slot = 'clickhouse_sync', + materialized_postgresql_snapshot = '0000000A-0000023F-3', + materialized_postgresql_tables_list = 'table1,table2,table3'; + ``` + +4. End the PostgreSQL transaction once replication to ClickHouse DB is confirmed. Verify that replication continues after failover: + + ```bash + kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' + ``` + +### Required permissions + +1. [CREATE PUBLICATION](https://postgrespro.ru/docs/postgresql/14/sql-createpublication) -- create query privilege. + +2. [CREATE_REPLICATION_SLOT](https://postgrespro.ru/docs/postgrespro/10/protocol-replication#PROTOCOL-REPLICATION-CREATE-SLOT) -- replication privelege. + +3. [pg_drop_replication_slot](https://postgrespro.ru/docs/postgrespro/9.5/functions-admin#functions-replication) -- replication privilege or superuser. + +4. [DROP PUBLICATION](https://postgrespro.ru/docs/postgresql/10/sql-droppublication) -- owner of publication (`username` in MaterializedPostgreSQL engine itself). + +It is possible to avoid executing `2` and `3` commands and having those permissions. Use settings `materialized_postgresql_replication_slot` and `materialized_postgresql_snapshot`. But with much care. + +Access to tables: + +1. pg_publication + +2. pg_replication_slots + +3. pg_publication_tables diff --git a/docs/en/reference/engines/database-engines/mysql.md b/docs/en/reference/engines/database-engines/mysql.md new file mode 100644 index 00000000000..89a0786a9ec --- /dev/null +++ b/docs/en/reference/engines/database-engines/mysql.md @@ -0,0 +1,151 @@ +--- +sidebar_position: 50 +sidebar_label: MySQL +--- + +# MySQL + +Allows to connect to databases on a remote MySQL server and perform `INSERT` and `SELECT` queries to exchange data between ClickHouse and MySQL. + +The `MySQL` database engine translate queries to the MySQL server so you can perform operations such as `SHOW TABLES` or `SHOW CREATE TABLE`. + +You cannot perform the following queries: + +- `RENAME` +- `CREATE TABLE` +- `ALTER` + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] +ENGINE = MySQL('host:port', ['database' | database], 'user', 'password') +``` + +**Engine Parameters** + +- `host:port` — MySQL server address. +- `database` — Remote database name. +- `user` — MySQL user. +- `password` — User password. 
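+
+As the syntax above indicates, the database argument may be given either as a string literal or as an identifier. A brief sketch with hypothetical host and credentials:
+
+```sql
+-- string literal form
+CREATE DATABASE mysql_db1 ENGINE = MySQL('mysql-host:3306', 'test', 'my_user', 'user_password');
+
+-- identifier form for the database name, as allowed by the syntax above
+CREATE DATABASE mysql_db2 ENGINE = MySQL('mysql-host:3306', test, 'my_user', 'user_password');
+```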
+ +## Data Types Support {#data_types-support} + +| MySQL | ClickHouse | +|----------------------------------|--------------------------------------------------------------| +| UNSIGNED TINYINT | [UInt8](../../sql-reference/data-types/int-uint.md) | +| TINYINT | [Int8](../../sql-reference/data-types/int-uint.md) | +| UNSIGNED SMALLINT | [UInt16](../../sql-reference/data-types/int-uint.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../../sql-reference/data-types/int-uint.md) | +| INT, MEDIUMINT | [Int32](../../sql-reference/data-types/int-uint.md) | +| UNSIGNED BIGINT | [UInt64](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| FLOAT | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DATE | [Date](../../sql-reference/data-types/date.md) | +| DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| BINARY | [FixedString](../../sql-reference/data-types/fixedstring.md) | + +All other MySQL data types are converted into [String](../../sql-reference/data-types/string.md). + +[Nullable](../../sql-reference/data-types/nullable.md) is supported. + +## Global Variables Support {#global-variables-support} + +For better compatibility you may address global variables in MySQL style, as `@@identifier`. + +These variables are supported: +- `version` +- `max_allowed_packet` + +:::warning +By now these variables are stubs and don't correspond to anything. +::: + +Example: + +``` sql +SELECT @@version; +``` + +## Examples of Use {#examples-of-use} + +Table in MySQL: + +``` text +mysql> USE test; +Database changed + +mysql> CREATE TABLE `mysql_table` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `float` FLOAT NOT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from mysql_table; ++------+-----+ +| int_id | value | ++------+-----+ +| 1 | 2 | ++------+-----+ +1 row in set (0,00 sec) +``` + +Database in ClickHouse, exchanging data with the MySQL server: + +``` sql +CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password') +``` + +``` sql +SHOW DATABASES +``` + +``` text +┌─name─────┐ +│ default │ +│ mysql_db │ +│ system │ +└──────────┘ +``` + +``` sql +SHOW TABLES FROM mysql_db +``` + +``` text +┌─name─────────┐ +│ mysql_table │ +└──────────────┘ +``` + +``` sql +SELECT * FROM mysql_db.mysql_table +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +└────────┴───────┘ +``` + +``` sql +INSERT INTO mysql_db.mysql_table VALUES (3,4) +``` + +``` sql +SELECT * FROM mysql_db.mysql_table +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +[Original article](https://clickhouse.com/docs/en/database_engines/mysql/) diff --git a/docs/en/reference/engines/database-engines/postgresql.md b/docs/en/reference/engines/database-engines/postgresql.md new file mode 100644 index 00000000000..bc5e93d0923 --- /dev/null +++ b/docs/en/reference/engines/database-engines/postgresql.md @@ -0,0 +1,139 @@ +--- +sidebar_position: 40 +sidebar_label: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.org) server. 
Supports read and write operations (`SELECT` and `INSERT` queries) to exchange data between ClickHouse and PostgreSQL. + +Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries. + +Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) it set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries. + +## Creating a Database {#creating-a-database} + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('host:port', 'database', 'user', 'password'[, `schema`, `use_table_cache`]); +``` + +**Engine Parameters** + +- `host:port` — PostgreSQL server address. +- `database` — Remote database name. +- `user` — PostgreSQL user. +- `password` — User password. +- `schema` — PostgreSQL schema. +- `use_table_cache` — Defines if the database table structure is cached or not. Optional. Default value: `0`. + +## Data Types Support {#data_types-support} + +| PostgerSQL | ClickHouse | +|------------------|--------------------------------------------------------------| +| DATE | [Date](../../sql-reference/data-types/date.md) | +| TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| DOUBLE | [Float64](../../sql-reference/data-types/float.md) | +| DECIMAL, NUMERIC | [Decimal](../../sql-reference/data-types/decimal.md) | +| SMALLINT | [Int16](../../sql-reference/data-types/int-uint.md) | +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| BIGINT | [Int64](../../sql-reference/data-types/int-uint.md) | +| SERIAL | [UInt32](../../sql-reference/data-types/int-uint.md) | +| BIGSERIAL | [UInt64](../../sql-reference/data-types/int-uint.md) | +| TEXT, CHAR | [String](../../sql-reference/data-types/string.md) | +| INTEGER | Nullable([Int32](../../sql-reference/data-types/int-uint.md))| +| ARRAY | [Array](../../sql-reference/data-types/array.md) | + + +## Examples of Use {#examples-of-use} + +Database in ClickHouse, exchanging data with the PostgreSQL server: + +``` sql +CREATE DATABASE test_database +ENGINE = PostgreSQL('postgres1:5432', 'test_database', 'postgres', 'mysecretpassword', 1); +``` + +``` sql +SHOW DATABASES; +``` + +``` text +┌─name──────────┐ +│ default │ +│ test_database │ +│ system │ +└───────────────┘ +``` + +``` sql +SHOW TABLES FROM test_database; +``` + +``` text +┌─name───────┐ +│ test_table │ +└────────────┘ +``` + +Reading data from the PostgreSQL table: + +``` sql +SELECT * FROM test_database.test_table; +``` + +``` text +┌─id─┬─value─┐ +│ 1 │ 2 │ +└────┴───────┘ +``` + +Writing data to the PostgreSQL table: + +``` sql +INSERT INTO test_database.test_table VALUES (3,4); +SELECT * FROM test_database.test_table; +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +Consider the table structure was modified in PostgreSQL: + +``` sql +postgre> ALTER TABLE test_table ADD COLUMN data Text +``` + +As the `use_table_cache` parameter was set to `1` when the database was created, the table structure in ClickHouse was cached and therefore not modified: + +``` sql +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +└────────┴───────────────────┘ +``` + +After detaching the table and attaching it again, the structure was updated: + 
+``` sql +DETACH TABLE test_database.test_table; +ATTACH TABLE test_database.test_table; +DESCRIBE TABLE test_database.test_table; +``` +``` text +┌─name───┬─type──────────────┐ +│ id │ Nullable(Integer) │ +│ value │ Nullable(Integer) │ +│ data │ Nullable(String) │ +└────────┴───────────────────┘ +``` + +[Original article](https://clickhouse.com/docs/en/database-engines/postgresql/) diff --git a/docs/en/reference/engines/database-engines/replicated.md b/docs/en/reference/engines/database-engines/replicated.md new file mode 100644 index 00000000000..63d955dc889 --- /dev/null +++ b/docs/en/reference/engines/database-engines/replicated.md @@ -0,0 +1,123 @@ +--- +sidebar_position: 30 +sidebar_label: Replicated +--- + +# [experimental] Replicated {#replicated} + +The engine is based on the [Atomic](../../engines/database-engines/atomic.md) engine. It supports replication of metadata via DDL log being written to ZooKeeper and executed on all of the replicas for a given database. + +One ClickHouse server can have multiple replicated databases running and updating at the same time. But there can't be multiple replicas of the same replicated database. + +## Creating a Database {#creating-a-database} +``` sql + CREATE DATABASE testdb ENGINE = Replicated('zoo_path', 'shard_name', 'replica_name') [SETTINGS ...] +``` + +**Engine Parameters** + +- `zoo_path` — ZooKeeper path. The same ZooKeeper path corresponds to the same database. +- `shard_name` — Shard name. Database replicas are grouped into shards by `shard_name`. +- `replica_name` — Replica name. Replica names must be different for all replicas of the same shard. + +:::warning +For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables if no arguments provided, then default arguments are used: `/clickhouse/tables/{uuid}/{shard}` and `{replica}`. These can be changed in the server settings [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Macro `{uuid}` is unfolded to table's uuid, `{shard}` and `{replica}` are unfolded to values from server config, not from database engine arguments. But in the future, it will be possible to use `shard_name` and `replica_name` of Replicated database. +::: + +## Specifics and Recommendations {#specifics-and-recommendations} + +DDL queries with `Replicated` database work in a similar way to [ON CLUSTER](../../sql-reference/distributed-ddl.md) queries, but with minor differences. + +First, the DDL request tries to execute on the initiator (the host that originally received the request from the user). If the request is not fulfilled, then the user immediately receives an error, other hosts do not try to fulfill it. If the request has been successfully completed on the initiator, then all other hosts will automatically retry until they complete it. The initiator will try to wait for the query to be completed on other hosts (no longer than [distributed_ddl_task_timeout](../../operations/settings/settings.md#distributed_ddl_task_timeout)) and will return a table with the query execution statuses on each host. + +The behavior in case of errors is regulated by the [distributed_ddl_output_mode](../../operations/settings/settings.md#distributed_ddl_output_mode) setting, for a `Replicated` database it is better to set it to `null_status_on_timeout` — i.e. 
if some hosts did not have time to execute the request for [distributed_ddl_task_timeout](../../operations/settings/settings.md#distributed_ddl_task_timeout), then do not throw an exception, but show the `NULL` status for them in the table. + +The [system.clusters](../../operations/system-tables/clusters.md) system table contains a cluster named like the replicated database, which consists of all replicas of the database. This cluster is updated automatically when creating/deleting replicas, and it can be used for [Distributed](../../engines/table-engines/special/distributed.md#distributed) tables. + +When creating a new replica of the database, this replica creates tables by itself. If the replica has been unavailable for a long time and has lagged behind the replication log — it checks its local metadata with the current metadata in ZooKeeper, moves the extra tables with data to a separate non-replicated database (so as not to accidentally delete anything superfluous), creates the missing tables, updates the table names if they have been renamed. The data is replicated at the `ReplicatedMergeTree` level, i.e. if the table is not replicated, the data will not be replicated (the database is responsible only for metadata). + +[`ALTER TABLE ATTACH|FETCH|DROP|DROP DETACHED|DETACH PARTITION|PART`](../../sql-reference/statements/alter/partition.md) queries are allowed but not replicated. The database engine will only add/fetch/remove the partition/part to the current replica. However, if the table itself uses a Replicated table engine, then the data will be replicated after using `ATTACH`. + +## Usage Example {#usage-example} + +Creating a cluster with three hosts: + +``` sql +node1 :) CREATE DATABASE r ENGINE=Replicated('some/path/r','shard1','replica1'); +node2 :) CREATE DATABASE r ENGINE=Replicated('some/path/r','shard1','other_replica'); +node3 :) CREATE DATABASE r ENGINE=Replicated('some/path/r','other_shard','{replica}'); +``` + +Running the DDL-query: + +``` sql +CREATE TABLE r.rmt (n UInt64) ENGINE=ReplicatedMergeTree ORDER BY n; +``` + +``` text +┌─────hosts────────────┬──status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐ +│ shard1|replica1 │ 0 │ │ 2 │ 0 │ +│ shard1|other_replica │ 0 │ │ 1 │ 0 │ +│ other_shard|r1 │ 0 │ │ 0 │ 0 │ +└──────────────────────┴─────────┴───────┴─────────────────────┴──────────────────┘ +``` + +Showing the system table: + +``` sql +SELECT cluster, shard_num, replica_num, host_name, host_address, port, is_local +FROM system.clusters WHERE cluster='r'; +``` + +``` text +┌─cluster─┬─shard_num─┬─replica_num─┬─host_name─┬─host_address─┬─port─┬─is_local─┐ +│ r │ 1 │ 1 │ node3 │ 127.0.0.1 │ 9002 │ 0 │ +│ r │ 2 │ 1 │ node2 │ 127.0.0.1 │ 9001 │ 0 │ +│ r │ 2 │ 2 │ node1 │ 127.0.0.1 │ 9000 │ 1 │ +└─────────┴───────────┴─────────────┴───────────┴──────────────┴──────┴──────────┘ +``` + +Creating a distributed table and inserting the data: + +``` sql +node2 :) CREATE TABLE r.d (n UInt64) ENGINE=Distributed('r','r','rmt', n % 2); +node3 :) INSERT INTO r SELECT * FROM numbers(10); +node1 :) SELECT materialize(hostName()) AS host, groupArray(n) FROM r.d GROUP BY host; +``` + +``` text +┌─hosts─┬─groupArray(n)─┐ +│ node1 │ [1,3,5,7,9] │ +│ node2 │ [0,2,4,6,8] │ +└───────┴───────────────┘ +``` + +Adding replica on the one more host: + +``` sql +node4 :) CREATE DATABASE r ENGINE=Replicated('some/path/r','other_shard','r2'); +``` + +The cluster configuration will look like this: + +``` text +┌─cluster─┬─shard_num─┬─replica_num─┬─host_name─┬─host_address─┬─port─┬─is_local─┐ 
+│ r │ 1 │ 1 │ node3 │ 127.0.0.1 │ 9002 │ 0 │ +│ r │ 1 │ 2 │ node4 │ 127.0.0.1 │ 9003 │ 0 │ +│ r │ 2 │ 1 │ node2 │ 127.0.0.1 │ 9001 │ 0 │ +│ r │ 2 │ 2 │ node1 │ 127.0.0.1 │ 9000 │ 1 │ +└─────────┴───────────┴─────────────┴───────────┴──────────────┴──────┴──────────┘ +``` + +The distributed table also will get data from the new host: + +```sql +node2 :) SELECT materialize(hostName()) AS host, groupArray(n) FROM r.d GROUP BY host; +``` + +```text +┌─hosts─┬─groupArray(n)─┐ +│ node2 │ [1,3,5,7,9] │ +│ node4 │ [0,2,4,6,8] │ +└───────┴───────────────┘ +``` \ No newline at end of file diff --git a/docs/en/reference/engines/database-engines/sqlite.md b/docs/en/reference/engines/database-engines/sqlite.md new file mode 100644 index 00000000000..2f8b44c9a09 --- /dev/null +++ b/docs/en/reference/engines/database-engines/sqlite.md @@ -0,0 +1,80 @@ +--- +sidebar_position: 55 +sidebar_label: SQLite +--- + +# SQLite {#sqlite} + +Allows to connect to [SQLite](https://www.sqlite.org/index.html) database and perform `INSERT` and `SELECT` queries to exchange data between ClickHouse and SQLite. + +## Creating a Database {#creating-a-database} + +``` sql + CREATE DATABASE sqlite_database + ENGINE = SQLite('db_path') +``` + +**Engine Parameters** + +- `db_path` — Path to a file with SQLite database. + +## Data Types Support {#data_types-support} + +| SQLite | ClickHouse | +|---------------|---------------------------------------------------------| +| INTEGER | [Int32](../../sql-reference/data-types/int-uint.md) | +| REAL | [Float32](../../sql-reference/data-types/float.md) | +| TEXT | [String](../../sql-reference/data-types/string.md) | +| BLOB | [String](../../sql-reference/data-types/string.md) | + +## Specifics and Recommendations {#specifics-and-recommendations} + +SQLite stores the entire database (definitions, tables, indices, and the data itself) as a single cross-platform file on a host machine. During writing SQLite locks the entire database file, therefore write operations are performed sequentially. Read operations can be multitasked. +SQLite does not require service management (such as startup scripts) or access control based on `GRANT` and passwords. Access control is handled by means of file-system permissions given to the database file itself. 
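+
+The type mapping in the table above can be checked directly from ClickHouse once the database is attached; a minimal sketch, using the `sqlite_db` database and `table1` table from the usage example below:
+
+```sql
+-- Column types reported here follow the SQLite-to-ClickHouse mapping above;
+-- because SQLite columns are loosely typed, they may be reported as Nullable.
+DESCRIBE TABLE sqlite_db.table1;
+```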
+ +## Usage Example {#usage-example} + +Database in ClickHouse, connected to the SQLite: + +``` sql +CREATE DATABASE sqlite_db ENGINE = SQLite('sqlite.db'); +SHOW TABLES FROM sqlite_db; +``` + +``` text +┌──name───┐ +│ table1 │ +│ table2 │ +└─────────┘ +``` + +Shows the tables: + +``` sql +SELECT * FROM sqlite_db.table1; +``` + +``` text +┌─col1──┬─col2─┐ +│ line1 │ 1 │ +│ line2 │ 2 │ +│ line3 │ 3 │ +└───────┴──────┘ +``` +Inserting data into SQLite table from ClickHouse table: + +``` sql +CREATE TABLE clickhouse_table(`col1` String,`col2` Int16) ENGINE = MergeTree() ORDER BY col2; +INSERT INTO clickhouse_table VALUES ('text',10); +INSERT INTO sqlite_db.table1 SELECT * FROM clickhouse_table; +SELECT * FROM sqlite_db.table1; +``` + +``` text +┌─col1──┬─col2─┐ +│ line1 │ 1 │ +│ line2 │ 2 │ +│ line3 │ 3 │ +│ text │ 10 │ +└───────┴──────┘ +``` diff --git a/docs/en/reference/engines/table-engines/index.md b/docs/en/reference/engines/table-engines/index.md new file mode 100644 index 00000000000..09e0147bbf7 --- /dev/null +++ b/docs/en/reference/engines/table-engines/index.md @@ -0,0 +1,89 @@ +--- +toc_folder_title: Table Engines +toc_priority: 26 +toc_title: Introduction +--- + +# Table Engines {#table_engines} + +The table engine (type of table) determines: + +- How and where data is stored, where to write it to, and where to read it from. +- Which queries are supported, and how. +- Concurrent data access. +- Use of indexes, if present. +- Whether multithreaded request execution is possible. +- Data replication parameters. + +## Engine Families {#engine-families} + +### MergeTree {#mergetree} + +The most universal and functional table engines for high-load tasks. The property shared by these engines is quick data insertion with subsequent background data processing. `MergeTree` family engines support data replication (with [Replicated\*](../../engines/table-engines/mergetree-family/replication.md#table_engines-replication) versions of engines), partitioning, secondary data-skipping indexes, and other features not supported in other engines. + +Engines in the family: + +- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#mergetree) +- [ReplacingMergeTree](../../engines/table-engines/mergetree-family/replacingmergetree.md#replacingmergetree) +- [SummingMergeTree](../../engines/table-engines/mergetree-family/summingmergetree.md#summingmergetree) +- [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md#aggregatingmergetree) +- [CollapsingMergeTree](../../engines/table-engines/mergetree-family/collapsingmergetree.md#table_engine-collapsingmergetree) +- [VersionedCollapsingMergeTree](../../engines/table-engines/mergetree-family/versionedcollapsingmergetree.md#versionedcollapsingmergetree) +- [GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md#graphitemergetree) + +### Log {#log} + +Lightweight [engines](../../engines/table-engines/log-family/index.md) with minimum functionality. They’re the most effective when you need to quickly write many small tables (up to approximately 1 million rows) and read them later as a whole. + +Engines in the family: + +- [TinyLog](../../engines/table-engines/log-family/tinylog.md#tinylog) +- [StripeLog](../../engines/table-engines/log-family/stripelog.md#stripelog) +- [Log](../../engines/table-engines/log-family/log.md#log) + +### Integration Engines {#integration-engines} + +Engines for communicating with other data storage and processing systems. 
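+
+Integration engines are declared like ordinary tables, with the connection details passed as engine arguments; the data itself stays in the external system and queries are pushed through the connection. A minimal sketch using the MySQL table engine, with hypothetical host, credentials and table names:
+
+```sql
+-- The table is a proxy: SELECT and INSERT are forwarded to the remote MySQL table.
+CREATE TABLE mysql_orders
+(
+    id UInt64,
+    amount Float64
+)
+ENGINE = MySQL('mysql-host:3306', 'shop', 'orders', 'reader', 'secret');
+
+SELECT count() FROM mysql_orders;
+```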
+ +Engines in the family: + + +- [ODBC](../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../engines/table-engines/integrations/hdfs.md) +- [S3](../../engines/table-engines/integrations/s3.md) +- [Kafka](../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) + +### Special Engines {#special-engines} + +Engines in the family: + +- [Distributed](../../engines/table-engines/special/distributed.md#distributed) +- [MaterializedView](../../engines/table-engines/special/materializedview.md#materializedview) +- [Dictionary](../../engines/table-engines/special/dictionary.md#dictionary) +- [Merge](../../engines/table-engines/special/merge.md#merge) +- [File](../../engines/table-engines/special/file.md#file) +- [Null](../../engines/table-engines/special/null.md#null) +- [Set](../../engines/table-engines/special/set.md#set) +- [Join](../../engines/table-engines/special/join.md#join) +- [URL](../../engines/table-engines/special/url.md#table_engines-url) +- [View](../../engines/table-engines/special/view.md#table_engines-view) +- [Memory](../../engines/table-engines/special/memory.md#memory) +- [Buffer](../../engines/table-engines/special/buffer.md#buffer) + +## Virtual Columns {#table_engines-virtual_columns} + +Virtual column is an integral table engine attribute that is defined in the engine source code. + +You shouldn’t specify virtual columns in the `CREATE TABLE` query and you can’t see them in `SHOW CREATE TABLE` and `DESCRIBE TABLE` query results. Virtual columns are also read-only, so you can’t insert data into virtual columns. + +To select data from a virtual column, you must specify its name in the `SELECT` query. `SELECT *` does not return values from virtual columns. + +If you create a table with a column that has the same name as one of the table virtual columns, the virtual column becomes inaccessible. We do not recommend doing this. To help avoid conflicts, virtual column names are usually prefixed with an underscore. + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/) diff --git a/docs/en/reference/engines/table-engines/integrations/ExternalDistributed.md b/docs/en/reference/engines/table-engines/integrations/ExternalDistributed.md new file mode 100644 index 00000000000..c9aae1934db --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/ExternalDistributed.md @@ -0,0 +1,56 @@ +--- +sidebar_position: 12 +sidebar_label: ExternalDistributed +--- + +# ExternalDistributed {#externaldistributed} + +The `ExternalDistributed` engine allows to perform `SELECT` queries on data that is stored on a remote servers MySQL or PostgreSQL. Accepts [MySQL](../../../engines/table-engines/integrations/mysql.md) or [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) engines as an argument so sharding is possible. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... 
+) ENGINE = ExternalDistributed('engine', 'host:port', 'database', 'table', 'user', 'password'); +``` + +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +The table structure can differ from the original table structure: + +- Column names should be the same as in the original table, but you can use just some of these columns and in any order. +- Column types may differ from those in the original table. ClickHouse tries to [cast](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) values to the ClickHouse data types. + +**Engine Parameters** + +- `engine` — The table engine `MySQL` or `PostgreSQL`. +- `host:port` — MySQL or PostgreSQL server address. +- `database` — Remote database name. +- `table` — Remote table name. +- `user` — User name. +- `password` — User password. + +## Implementation Details {#implementation-details} + +Supports multiple replicas that must be listed by `|` and shards must be listed by `,`. For example: + +```sql +CREATE TABLE test_shards (id UInt32, name String, age UInt32, money UInt32) ENGINE = ExternalDistributed('MySQL', `mysql{1|2}:3306,mysql{3|4}:3306`, 'clickhouse', 'test_replicas', 'root', 'clickhouse'); +``` + +When specifying replicas, one of the available replicas is selected for each of the shards when reading. If the connection fails, the next replica is selected, and so on for all the replicas. If the connection attempt fails for all the replicas, the attempt is repeated the same way several times. + +You can specify any number of shards and any number of replicas for each shard. + +**See Also** + +- [MySQL table engine](../../../engines/table-engines/integrations/mysql.md) +- [PostgreSQL table engine](../../../engines/table-engines/integrations/postgresql.md) +- [Distributed table engine](../../../engines/table-engines/special/distributed.md) + + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/ExternalDistributed/) diff --git a/docs/en/reference/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/reference/engines/table-engines/integrations/embedded-rocksdb.md new file mode 100644 index 00000000000..701d190f022 --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/embedded-rocksdb.md @@ -0,0 +1,84 @@ +--- +sidebar_position: 9 +sidebar_label: EmbeddedRocksDB +--- + +# EmbeddedRocksDB Engine {#EmbeddedRocksDB-engine} + +This engine allows integrating ClickHouse with [rocksdb](http://rocksdb.org/). + +## Creating a Table {#table_engine-EmbeddedRocksDB-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = EmbeddedRocksDB PRIMARY KEY(primary_key_name) +``` + +Required parameters: + +- `primary_key_name` – any column name in the column list. +- `primary key` must be specified, it supports only one column in the primary key. The primary key will be serialized in binary as a `rocksdb key`. +- columns other than the primary key will be serialized in binary as `rocksdb` value in corresponding order. +- queries with key `equals` or `in` filtering will be optimized to multi keys lookup from `rocksdb`. 
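+
+To illustrate the last point above: equality and `IN` filters on the primary key column can be answered by direct RocksDB key lookups instead of a full scan. A brief sketch with a hypothetical key-value table:
+
+```sql
+CREATE TABLE kv_store
+(
+    `key` String,
+    `value` String
+)
+ENGINE = EmbeddedRocksDB
+PRIMARY KEY key;
+
+INSERT INTO kv_store VALUES ('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3');
+
+-- Both queries filter on the primary key, so they can be served
+-- by multi-key lookups in RocksDB rather than a full scan.
+SELECT * FROM kv_store WHERE key = 'k2';
+SELECT * FROM kv_store WHERE key IN ('k1', 'k3');
+```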
+
+Example:
+
+``` sql
+CREATE TABLE test
+(
+    `key` String,
+    `v1` UInt32,
+    `v2` String,
+    `v3` Float32
+)
+ENGINE = EmbeddedRocksDB
+PRIMARY KEY key
+```
+
+## Metrics
+
+There is also a `system.rocksdb` table that exposes RocksDB statistics:
+
+```sql
+SELECT
+    name,
+    value
+FROM system.rocksdb
+
+┌─name──────────────────────┬─value─┐
+│ no.file.opens             │     1 │
+│ number.block.decompressed │     1 │
+└───────────────────────────┴───────┘
+```
+
+## Configuration
+
+You can also change any [rocksdb options](https://github.com/facebook/rocksdb/wiki/Option-String-and-Option-Map) using config:
+
+```xml
+<rocksdb>
+    <options>
+        <max_background_jobs>8</max_background_jobs>
+    </options>
+    <column_family_options>
+        <num_levels>2</num_levels>
+    </column_family_options>
+    <tables>
+        <table>
+            <name>TABLE</name>
+            <options>
+                <max_background_jobs>8</max_background_jobs>
+            </options>
+            <column_family_options>
+                <num_levels>2</num_levels>
+            </column_family_options>
+        </table>
+    </tables>
+</rocksdb>
+``` + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/embedded-rocksdb/) diff --git a/docs/en/reference/engines/table-engines/integrations/hdfs.md b/docs/en/reference/engines/table-engines/integrations/hdfs.md new file mode 100644 index 00000000000..503bd779abf --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/hdfs.md @@ -0,0 +1,230 @@ +--- +sidebar_position: 6 +sidebar_label: HDFS +--- + +# HDFS {#table_engines-hdfs} + +This engine provides integration with the [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) via ClickHouse. This engine is similar to the [File](../../../engines/table-engines/special/file.md#table_engines-file) and [URL](../../../engines/table-engines/special/url.md#table_engines-url) engines, but provides Hadoop-specific features. + +## Usage {#usage} + +``` sql +ENGINE = HDFS(URI, format) +``` + +**Engine Parameters** + +- `URI` - whole file URI in HDFS. The path part of `URI` may contain globs. In this case the table would be readonly. +- `format` - specifies one of the available file formats. To perform +`SELECT` queries, the format must be supported for input, and to perform +`INSERT` queries – for output. The available formats are listed in the +[Formats](../../../interfaces/formats.md#formats) section. + +**Example:** + +**1.** Set up the `hdfs_engine_table` table: + +``` sql +CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') +``` + +**2.** Fill file: + +``` sql +INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) +``` + +**3.** Query the data: + +``` sql +SELECT * FROM hdfs_engine_table LIMIT 2 +``` + +``` text +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` + +## Implementation Details {#implementation-details} + +- Reads and writes can be parallel. +- [Zero-copy](../../../operations/storing-data.md#zero-copy) replication is supported. +- Not supported: + - `ALTER` and `SELECT...SAMPLE` operations. + - Indexes. + +**Globs in path** + +Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. Listing of files determines during `SELECT` (not at `CREATE` moment). + +- `*` — Substitutes any number of any characters except `/` including empty string. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. + +Constructions with `{}` are similar to the [remote](../../../sql-reference/table-functions/remote.md) table function. + +**Example** + +1. Suppose we have several files in TSV format with the following URIs on HDFS: + + - 'hdfs://hdfs1:9000/some_dir/some_file_1' + - 'hdfs://hdfs1:9000/some_dir/some_file_2' + - 'hdfs://hdfs1:9000/some_dir/some_file_3' + - 'hdfs://hdfs1:9000/another_dir/some_file_1' + - 'hdfs://hdfs1:9000/another_dir/some_file_2' + - 'hdfs://hdfs1:9000/another_dir/some_file_3' + +1. 
There are several ways to make a table consisting of all six files: + + + +``` sql +CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') +``` + +Another way: + +``` sql +CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') +``` + +Table consists of all the files in both directories (all files should satisfy format and schema described in query): + +``` sql +CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') +``` + +:::warning +If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: + +**Example** + +Create table with files named `file000`, `file001`, … , `file999`: + +``` sql +CREATE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') +``` +## Configuration {#configuration} + +Similar to GraphiteMergeTree, the HDFS engine supports extended configuration using the ClickHouse config file. There are two configuration keys that you can use: global (`hdfs`) and user-level (`hdfs_*`). The global configuration is applied first, and then the user-level configuration is applied (if it exists). + +``` xml + + + /tmp/keytab/clickhouse.keytab + clickuser@TEST.CLICKHOUSE.TECH + kerberos + + + + + root@TEST.CLICKHOUSE.TECH + +``` + +### Configuration Options {#configuration-options} + +#### Supported by libhdfs3 {#supported-by-libhdfs3} + + +| **parameter** | **default value** | +| - | - | +| rpc\_client\_connect\_tcpnodelay | true | +| dfs\_client\_read\_shortcircuit | true | +| output\_replace-datanode-on-failure | true | +| input\_notretry-another-node | false | +| input\_localread\_mappedfile | true | +| dfs\_client\_use\_legacy\_blockreader\_local | false | +| rpc\_client\_ping\_interval | 10 * 1000 | +| rpc\_client\_connect\_timeout | 600 * 1000 | +| rpc\_client\_read\_timeout | 3600 * 1000 | +| rpc\_client\_write\_timeout | 3600 * 1000 | +| rpc\_client\_socekt\_linger\_timeout | -1 | +| rpc\_client\_connect\_retry | 10 | +| rpc\_client\_timeout | 3600 * 1000 | +| dfs\_default\_replica | 3 | +| input\_connect\_timeout | 600 * 1000 | +| input\_read\_timeout | 3600 * 1000 | +| input\_write\_timeout | 3600 * 1000 | +| input\_localread\_default\_buffersize | 1 * 1024 * 1024 | +| dfs\_prefetchsize | 10 | +| input\_read\_getblockinfo\_retry | 3 | +| input\_localread\_blockinfo\_cachesize | 1000 | +| input\_read\_max\_retry | 60 | +| output\_default\_chunksize | 512 | +| output\_default\_packetsize | 64 * 1024 | +| output\_default\_write\_retry | 10 | +| output\_connect\_timeout | 600 * 1000 | +| output\_read\_timeout | 3600 * 1000 | +| output\_write\_timeout | 3600 * 1000 | +| output\_close\_timeout | 3600 * 1000 | +| output\_packetpool\_size | 1024 | +| output\_heeartbeat\_interval | 10 * 1000 | +| dfs\_client\_failover\_max\_attempts | 15 | +| dfs\_client\_read\_shortcircuit\_streams\_cache\_size | 256 | +| dfs\_client\_socketcache\_expiryMsec | 3000 | +| dfs\_client\_socketcache\_capacity | 16 | +| dfs\_default\_blocksize | 64 * 1024 * 1024 | +| dfs\_default\_uri | "hdfs://localhost:9000" | +| hadoop\_security\_authentication | "simple" | +| hadoop\_security\_kerberos\_ticket\_cache\_path | "" | +| dfs\_client\_log\_severity | "INFO" | +| dfs\_domain\_socket\_path | "" | + + +[HDFS Configuration 
Reference](https://hawq.apache.org/docs/userguide/2.3.0.0-incubating/reference/HDFSConfigurationParameterReference.html) might explain some parameters. + + +#### ClickHouse extras {#clickhouse-extras} + +| **parameter** | **default value** | +| - | - | +|hadoop\_kerberos\_keytab | "" | +|hadoop\_kerberos\_principal | "" | +|hadoop\_kerberos\_kinit\_command | kinit | +|libhdfs3\_conf | "" | + +### Limitations {#limitations} +* `hadoop_security_kerberos_ticket_cache_path` and `libhdfs3_conf` can be global only, not user specific + +## Kerberos support {#kerberos-support} + +If the `hadoop_security_authentication` parameter has the value `kerberos`, ClickHouse authenticates via Kerberos. +Parameters are [here](#clickhouse-extras) and `hadoop_security_kerberos_ticket_cache_path` may be of help. +Note that due to libhdfs3 limitations only old-fashioned approach is supported, +datanode communications are not secured by SASL (`HADOOP_SECURE_DN_USER` is a reliable indicator of such +security approach). Use `tests/integration/test_storage_kerberized_hdfs/hdfs_configs/bootstrap.sh` for reference. + +If `hadoop_kerberos_keytab`, `hadoop_kerberos_principal` or `hadoop_kerberos_kinit_command` is specified, `kinit` will be invoked. `hadoop_kerberos_keytab` and `hadoop_kerberos_principal` are mandatory in this case. `kinit` tool and krb5 configuration files are required. + +## HDFS Namenode HA support {#namenode-ha} + +libhdfs3 support HDFS namenode HA. + +- Copy `hdfs-site.xml` from an HDFS node to `/etc/clickhouse-server/`. +- Add following piece to ClickHouse config file: + +``` xml + + /etc/clickhouse-server/hdfs-site.xml + +``` + +- Then use `dfs.nameservices` tag value of `hdfs-site.xml` as the namenode address in the HDFS URI. For example, replace `hdfs://appadmin@192.168.101.11:8020/abc/` with `hdfs://appadmin@my_nameservice/abc/`. + + +## Virtual Columns {#virtual-columns} + +- `_path` — Path to the file. +- `_file` — Name of the file. + +**See Also** + +- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/hdfs/) diff --git a/docs/en/reference/engines/table-engines/integrations/hive.md b/docs/en/reference/engines/table-engines/integrations/hive.md new file mode 100644 index 00000000000..6731f0e7559 --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/hive.md @@ -0,0 +1,410 @@ +--- +sidebar_position: 4 +sidebar_label: Hive +--- + +# Hive {#hive} + +The Hive engine allows you to perform `SELECT` quries on HDFS Hive table. Currently it supports input formats as below: + +- Text: only supports simple scalar column types except `binary` + +- ORC: support simple scalar columns types except `char`; only support complex types like `array` + +- Parquet: support all simple scalar columns types; only support complex types like `array` + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [ALIAS expr1], + name2 [type2] [ALIAS expr2], + ... +) ENGINE = Hive('thrift://host:port', 'database', 'table'); +PARTITION BY expr +``` +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. 
+ +The table structure can differ from the original Hive table structure: +- Column names should be the same as in the original Hive table, but you can use just some of these columns and in any order, also you can use some alias columns calculated from other columns. +- Column types should be the same from those in the original Hive table. +- Partition by expression should be consistent with the original Hive table, and columns in partition by expression should be in the table structure. + +**Engine Parameters** + +- `thrift://host:port` — Hive Metastore address + +- `database` — Remote database name. + +- `table` — Remote table name. + +## Usage Example {#usage-example} + +### How to Use Local Cache for HDFS Filesystem +We strongly advice you to enable local cache for remote filesystems. Benchmark shows that its almost 2x faster with cache. + +Before using cache, add it to `config.xml` +``` xml + + true + local_cache + 559096952 + 1048576 + +``` + +- enable: ClickHouse will maintain local cache for remote filesystem(HDFS) after startup if true. +- root_dir: Required. The root directory to store local cache files for remote filesystem. +- limit_size: Required. The maximum size(in bytes) of local cache files. +- bytes_read_before_flush: Control bytes before flush to local filesystem when downloading file from remote filesystem. The default value is 1MB. + +When ClickHouse is started up with local cache for remote filesystem enabled, users can still choose not to use cache with `settings use_local_cache_for_remote_fs = 0` in their query. `use_local_cache_for_remote_fs` is `false` in default. + +### Query Hive Table with ORC Input Format + +#### Create Table in Hive +``` text +hive > CREATE TABLE `test`.`test_orc`( + `f_tinyint` tinyint, + `f_smallint` smallint, + `f_int` int, + `f_integer` int, + `f_bigint` bigint, + `f_float` float, + `f_double` double, + `f_decimal` decimal(10,0), + `f_timestamp` timestamp, + `f_date` date, + `f_string` string, + `f_varchar` varchar(100), + `f_bool` boolean, + `f_binary` binary, + `f_array_int` array, + `f_array_string` array, + `f_array_float` array, + `f_array_array_int` array>, + `f_array_array_string` array>, + `f_array_array_float` array>) +PARTITIONED BY ( + `day` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' +LOCATION + 'hdfs://testcluster/data/hive/test.db/test_orc' + +OK +Time taken: 0.51 seconds + +hive > insert into test.test_orc partition(day='2021-09-18') select 1, 2, 3, 4, 5, 6.11, 7.22, 8.333, current_timestamp(), current_date(), 'hello world', 'hello world', 'hello world', true, 'hello world', array(1, 2, 3), array('hello world', 'hello world'), array(float(1.1), float(1.2)), array(array(1, 2), array(3, 4)), array(array('a', 'b'), array('c', 'd')), array(array(float(1.11), float(2.22)), array(float(3.33), float(4.44))); +OK +Time taken: 36.025 seconds + +hive > select * from test.test_orc; +OK +1 2 3 4 5 6.11 7.22 8 2021-11-05 12:38:16.314 2021-11-05 hello world hello world hello world true hello world [1,2,3] ["hello world","hello world"] [1.1,1.2] [[1,2],[3,4]] [["a","b"],["c","d"]] [[1.11,2.22],[3.33,4.44]] 2021-09-18 +Time taken: 0.295 seconds, Fetched: 1 row(s) +``` + +#### Create Table in ClickHouse +Table in ClickHouse, retrieving data from the Hive table created above: +``` sql +CREATE TABLE test.test_orc +( + `f_tinyint` Int8, + `f_smallint` Int16, + `f_int` Int32, + 
`f_integer` Int32, + `f_bigint` Int64, + `f_float` Float32, + `f_double` Float64, + `f_decimal` Float64, + `f_timestamp` DateTime, + `f_date` Date, + `f_string` String, + `f_varchar` String, + `f_bool` Bool, + `f_binary` String, + `f_array_int` Array(Int32), + `f_array_string` Array(String), + `f_array_float` Array(Float32), + `f_array_array_int` Array(Array(Int32)), + `f_array_array_string` Array(Array(String)), + `f_array_array_float` Array(Array(Float32)), + `day` String +) +ENGINE = Hive('thrift://202.168.117.26:9083', 'test', 'test_orc') +PARTITION BY day + +``` + +``` sql +SELECT * FROM test.test_orc settings input_format_orc_allow_missing_columns = 1\G +``` + +``` text +SELECT * +FROM test.test_orc +SETTINGS input_format_orc_allow_missing_columns = 1 + +Query id: c3eaffdc-78ab-43cd-96a4-4acc5b480658 + +Row 1: +────── +f_tinyint: 1 +f_smallint: 2 +f_int: 3 +f_integer: 4 +f_bigint: 5 +f_float: 6.11 +f_double: 7.22 +f_decimal: 8 +f_timestamp: 2021-12-04 04:00:44 +f_date: 2021-12-03 +f_string: hello world +f_varchar: hello world +f_bool: true +f_binary: hello world +f_array_int: [1,2,3] +f_array_string: ['hello world','hello world'] +f_array_float: [1.1,1.2] +f_array_array_int: [[1,2],[3,4]] +f_array_array_string: [['a','b'],['c','d']] +f_array_array_float: [[1.11,2.22],[3.33,4.44]] +day: 2021-09-18 + + +1 rows in set. Elapsed: 0.078 sec. +``` + +### Query Hive Table with Parquet Input Format + +#### Create Table in Hive +``` text +hive > +CREATE TABLE `test`.`test_parquet`( + `f_tinyint` tinyint, + `f_smallint` smallint, + `f_int` int, + `f_integer` int, + `f_bigint` bigint, + `f_float` float, + `f_double` double, + `f_decimal` decimal(10,0), + `f_timestamp` timestamp, + `f_date` date, + `f_string` string, + `f_varchar` varchar(100), + `f_char` char(100), + `f_bool` boolean, + `f_binary` binary, + `f_array_int` array, + `f_array_string` array, + `f_array_float` array, + `f_array_array_int` array>, + `f_array_array_string` array>, + `f_array_array_float` array>) +PARTITIONED BY ( + `day` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' +LOCATION + 'hdfs://testcluster/data/hive/test.db/test_parquet' +OK +Time taken: 0.51 seconds + +hive > insert into test.test_parquet partition(day='2021-09-18') select 1, 2, 3, 4, 5, 6.11, 7.22, 8.333, current_timestamp(), current_date(), 'hello world', 'hello world', 'hello world', true, 'hello world', array(1, 2, 3), array('hello world', 'hello world'), array(float(1.1), float(1.2)), array(array(1, 2), array(3, 4)), array(array('a', 'b'), array('c', 'd')), array(array(float(1.11), float(2.22)), array(float(3.33), float(4.44))); +OK +Time taken: 36.025 seconds + +hive > select * from test.test_parquet; +OK +1 2 3 4 5 6.11 7.22 8 2021-12-14 17:54:56.743 2021-12-14 hello world hello world hello world true hello world [1,2,3] ["hello world","hello world"] [1.1,1.2] [[1,2],[3,4]] [["a","b"],["c","d"]] [[1.11,2.22],[3.33,4.44]] 2021-09-18 +Time taken: 0.766 seconds, Fetched: 1 row(s) +``` + +#### Create Table in ClickHouse +Table in ClickHouse, retrieving data from the Hive table created above: +``` sql +CREATE TABLE test.test_parquet +( + `f_tinyint` Int8, + `f_smallint` Int16, + `f_int` Int32, + `f_integer` Int32, + `f_bigint` Int64, + `f_float` Float32, + `f_double` Float64, + `f_decimal` Float64, + `f_timestamp` DateTime, + `f_date` Date, + 
`f_string` String, + `f_varchar` String, + `f_char` String, + `f_bool` Bool, + `f_binary` String, + `f_array_int` Array(Int32), + `f_array_string` Array(String), + `f_array_float` Array(Float32), + `f_array_array_int` Array(Array(Int32)), + `f_array_array_string` Array(Array(String)), + `f_array_array_float` Array(Array(Float32)), + `day` String +) +ENGINE = Hive('thrift://localhost:9083', 'test', 'test_parquet') +PARTITION BY day +``` + +``` sql +SELECT * FROM test.test_parquet settings input_format_parquet_allow_missing_columns = 1\G +``` + +``` text +SELECT * +FROM test_parquet +SETTINGS input_format_parquet_allow_missing_columns = 1 + +Query id: 4e35cf02-c7b2-430d-9b81-16f438e5fca9 + +Row 1: +────── +f_tinyint: 1 +f_smallint: 2 +f_int: 3 +f_integer: 4 +f_bigint: 5 +f_float: 6.11 +f_double: 7.22 +f_decimal: 8 +f_timestamp: 2021-12-14 17:54:56 +f_date: 2021-12-14 +f_string: hello world +f_varchar: hello world +f_char: hello world +f_bool: true +f_binary: hello world +f_array_int: [1,2,3] +f_array_string: ['hello world','hello world'] +f_array_float: [1.1,1.2] +f_array_array_int: [[1,2],[3,4]] +f_array_array_string: [['a','b'],['c','d']] +f_array_array_float: [[1.11,2.22],[3.33,4.44]] +day: 2021-09-18 + +1 rows in set. Elapsed: 0.357 sec. +``` + +### Query Hive Table with Text Input Format +#### Create Table in Hive +``` text +hive > +CREATE TABLE `test`.`test_text`( + `f_tinyint` tinyint, + `f_smallint` smallint, + `f_int` int, + `f_integer` int, + `f_bigint` bigint, + `f_float` float, + `f_double` double, + `f_decimal` decimal(10,0), + `f_timestamp` timestamp, + `f_date` date, + `f_string` string, + `f_varchar` varchar(100), + `f_char` char(100), + `f_bool` boolean, + `f_binary` binary, + `f_array_int` array, + `f_array_string` array, + `f_array_float` array, + `f_array_array_int` array>, + `f_array_array_string` array>, + `f_array_array_float` array>) +PARTITIONED BY ( + `day` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' +STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.TextInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' +LOCATION + 'hdfs://testcluster/data/hive/test.db/test_text' +Time taken: 0.1 seconds, Fetched: 34 row(s) + + +hive > insert into test.test_text partition(day='2021-09-18') select 1, 2, 3, 4, 5, 6.11, 7.22, 8.333, current_timestamp(), current_date(), 'hello world', 'hello world', 'hello world', true, 'hello world', array(1, 2, 3), array('hello world', 'hello world'), array(float(1.1), float(1.2)), array(array(1, 2), array(3, 4)), array(array('a', 'b'), array('c', 'd')), array(array(float(1.11), float(2.22)), array(float(3.33), float(4.44))); +OK +Time taken: 36.025 seconds + +hive > select * from test.test_text; +OK +1 2 3 4 5 6.11 7.22 8 2021-12-14 18:11:17.239 2021-12-14 hello world hello world hello world true hello world [1,2,3] ["hello world","hello world"] [1.1,1.2] [[1,2],[3,4]] [["a","b"],["c","d"]] [[1.11,2.22],[3.33,4.44]] 2021-09-18 +Time taken: 0.624 seconds, Fetched: 1 row(s) +``` + +#### Create Table in ClickHouse + +Table in ClickHouse, retrieving data from the Hive table created above: +``` sql +CREATE TABLE test.test_text +( + `f_tinyint` Int8, + `f_smallint` Int16, + `f_int` Int32, + `f_integer` Int32, + `f_bigint` Int64, + `f_float` Float32, + `f_double` Float64, + `f_decimal` Float64, + `f_timestamp` DateTime, + `f_date` Date, + `f_string` String, + `f_varchar` String, + `f_char` String, + `f_bool` Bool, + `day` String +) +ENGINE = Hive('thrift://localhost:9083', 'test', 
'test_text') +PARTITION BY day +``` + +``` sql +SELECT * FROM test.test_text settings input_format_skip_unknown_fields = 1, input_format_with_names_use_header = 1, date_time_input_format = 'best_effort'\G +``` + +``` text +SELECT * +FROM test.test_text +SETTINGS input_format_skip_unknown_fields = 1, input_format_with_names_use_header = 1, date_time_input_format = 'best_effort' + +Query id: 55b79d35-56de-45b9-8be6-57282fbf1f44 + +Row 1: +────── +f_tinyint: 1 +f_smallint: 2 +f_int: 3 +f_integer: 4 +f_bigint: 5 +f_float: 6.11 +f_double: 7.22 +f_decimal: 8 +f_timestamp: 2021-12-14 18:11:17 +f_date: 2021-12-14 +f_string: hello world +f_varchar: hello world +f_char: hello world +f_bool: true +day: 2021-09-18 +``` + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/hive/) diff --git a/docs/en/reference/engines/table-engines/integrations/index.md b/docs/en/reference/engines/table-engines/integrations/index.md new file mode 100644 index 00000000000..9230ad624ba --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/index.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 40 +sidebar_label: Integrations +--- + +# Table Engines for Integrations {#table-engines-for-integrations} + +ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. This transparent querying is one of the key advantages of this approach over alternative integration methods, like external dictionaries or table functions, which require to use custom query methods on each use. + +List of supported integrations: + +- [ODBC](../../../engines/table-engines/integrations/odbc.md) +- [JDBC](../../../engines/table-engines/integrations/jdbc.md) +- [MySQL](../../../engines/table-engines/integrations/mysql.md) +- [MongoDB](../../../engines/table-engines/integrations/mongodb.md) +- [HDFS](../../../engines/table-engines/integrations/hdfs.md) +- [S3](../../../engines/table-engines/integrations/s3.md) +- [Kafka](../../../engines/table-engines/integrations/kafka.md) +- [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md) +- [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) +- [PostgreSQL](../../../engines/table-engines/integrations/postgresql.md) +- [SQLite](../../../engines/table-engines/integrations/sqlite.md) +- [Hive](../../../engines/table-engines/integrations/hive.md) diff --git a/docs/en/reference/engines/table-engines/integrations/jdbc.md b/docs/en/reference/engines/table-engines/integrations/jdbc.md new file mode 100644 index 00000000000..0ce31f36070 --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/jdbc.md @@ -0,0 +1,95 @@ +--- +sidebar_position: 3 +sidebar_label: JDBC +--- + +# JDBC {#table-engine-jdbc} + +Allows ClickHouse to connect to external databases via [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity). + +To implement the JDBC connection, ClickHouse uses the separate program [clickhouse-jdbc-bridge](https://github.com/ClickHouse/clickhouse-jdbc-bridge) that should run as a daemon. + +This engine supports the [Nullable](../../../sql-reference/data-types/nullable.md) data type. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name +( + columns list... 
+) +ENGINE = JDBC(datasource_uri, external_database, external_table) +``` + +**Engine Parameters** + + +- `datasource_uri` — URI or name of an external DBMS. + + URI Format: `jdbc:://:/?user=&password=`. + Example for MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`. + +- `external_database` — Database in an external DBMS. + +- `external_table` — Name of the table in `external_database` or a select query like `select * from table1 where column1=1`. + +## Usage Example {#usage-example} + +Creating a table in MySQL server by connecting directly with it’s console client: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++------+----------+-----+----------+ +| int_id | int_nullable | float | float_nullable | ++------+----------+-----+----------+ +| 1 | NULL | 2 | NULL | ++------+----------+-----+----------+ +1 row in set (0,00 sec) +``` + +Creating a table in ClickHouse server and selecting data from it: + +``` sql +CREATE TABLE jdbc_table +( + `int_id` Int32, + `int_nullable` Nullable(Int32), + `float` Float32, + `float_nullable` Nullable(Float32) +) +ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') +``` + +``` sql +SELECT * +FROM jdbc_table +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴────────────────┘ +``` + +``` sql +INSERT INTO jdbc_table(`int_id`, `float`) +SELECT toInt32(number), toFloat32(number * 1.0) +FROM system.numbers +``` + +## See Also {#see-also} + +- [JDBC table function](../../../sql-reference/table-functions/jdbc.md). + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/jdbc/) diff --git a/docs/en/reference/engines/table-engines/integrations/kafka.md b/docs/en/reference/engines/table-engines/integrations/kafka.md new file mode 100644 index 00000000000..3a8d98e1ca9 --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/kafka.md @@ -0,0 +1,198 @@ +--- +sidebar_position: 8 +sidebar_label: Kafka +--- + +# Kafka {#kafka} + +This engine works with [Apache Kafka](http://kafka.apache.org/). + +Kafka lets you: + +- Publish or subscribe to data flows. +- Organize fault-tolerant storage. +- Process streams as they become available. + +## Creating a Table {#table_engine-kafka-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = Kafka() +SETTINGS + kafka_broker_list = 'host:port', + kafka_topic_list = 'topic1,topic2,...', + kafka_group_name = 'group_name', + kafka_format = 'data_format'[,] + [kafka_row_delimiter = 'delimiter_symbol',] + [kafka_schema = '',] + [kafka_num_consumers = N,] + [kafka_max_block_size = 0,] + [kafka_skip_broken_messages = N,] + [kafka_commit_every_batch = 0,] + [kafka_thread_per_consumer = 0] +``` + +Required parameters: + +- `kafka_broker_list` — A comma-separated list of brokers (for example, `localhost:9092`). +- `kafka_topic_list` — A list of Kafka topics. +- `kafka_group_name` — A group of Kafka consumers. Reading margins are tracked for each group separately. 
If you do not want messages to be duplicated in the cluster, use the same group name everywhere. +- `kafka_format` — Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section. + +Optional parameters: + +- `kafka_row_delimiter` — Delimiter character, which ends the message. +- `kafka_schema` — Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. +- `kafka_num_consumers` — The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. +- `kafka_max_block_size` — The maximum batch size (in messages) for poll (default: `max_block_size`). +- `kafka_skip_broken_messages` — Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). +- `kafka_commit_every_batch` — Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`). +- `kafka_thread_per_consumer` — Provide independent thread for each consumer (default: `0`). When enabled, every consumer flush the data independently, in parallel (otherwise — rows from several consumers squashed to form one block). + +Examples: + +``` sql + CREATE TABLE queue ( + timestamp UInt64, + level String, + message String + ) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow'); + + SELECT * FROM queue LIMIT 5; + + CREATE TABLE queue2 ( + timestamp UInt64, + level String, + message String + ) ENGINE = Kafka SETTINGS kafka_broker_list = 'localhost:9092', + kafka_topic_list = 'topic', + kafka_group_name = 'group1', + kafka_format = 'JSONEachRow', + kafka_num_consumers = 4; + + CREATE TABLE queue3 ( + timestamp UInt64, + level String, + message String + ) ENGINE = Kafka('localhost:9092', 'topic', 'group1') + SETTINGS kafka_format = 'JSONEachRow', + kafka_num_consumers = 4; +``` + +
+
+**Deprecated Method for Creating a Table**
+
+:::warning
+Do not use this method in new projects. If possible, switch old projects to the method described above.
+:::
+
+``` sql
+Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format
+      [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages])
+```
+
+ +## Description {#description} + +The delivered messages are tracked automatically, so each message in a group is only counted once. If you want to get the data twice, then create a copy of the table with another group name. + +Groups are flexible and synced on the cluster. For instance, if you have 10 topics and 5 copies of a table in a cluster, then each copy gets 2 topics. If the number of copies changes, the topics are redistributed across the copies automatically. Read more about this at http://kafka.apache.org/intro. + +`SELECT` is not particularly useful for reading messages (except for debugging), because each message can be read only once. It is more practical to create real-time threads using materialized views. To do this: + +1. Use the engine to create a Kafka consumer and consider it a data stream. +2. Create a table with the desired structure. +3. Create a materialized view that converts data from the engine and puts it into a previously created table. + +When the `MATERIALIZED VIEW` joins the engine, it starts collecting data in the background. This allows you to continually receive messages from Kafka and convert them to the required format using `SELECT`. +One kafka table can have as many materialized views as you like, they do not read data from the kafka table directly, but receive new records (in blocks), this way you can write to several tables with different detail level (with grouping - aggregation and without). + +Example: + +``` sql + CREATE TABLE queue ( + timestamp UInt64, + level String, + message String + ) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow'); + + CREATE TABLE daily ( + day Date, + level String, + total UInt64 + ) ENGINE = SummingMergeTree(day, (day, level), 8192); + + CREATE MATERIALIZED VIEW consumer TO daily + AS SELECT toDate(toDateTime(timestamp)) AS day, level, count() as total + FROM queue GROUP BY day, level; + + SELECT level, sum(total) FROM daily GROUP BY level; +``` +To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../../../operations/settings/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream_flush_interval_ms](../../../operations/settings/settings.md/#stream-flush-interval-ms) milliseconds, the data will be flushed to the table regardless of the completeness of the block. + +To stop receiving topic data or to change the conversion logic, detach the materialized view: + +``` sql + DETACH TABLE consumer; + ATTACH TABLE consumer; +``` + +If you want to change the target table by using `ALTER`, we recommend disabling the material view to avoid discrepancies between the target table and the data from the view. + +## Configuration {#configuration} + +Similar to GraphiteMergeTree, the Kafka engine supports extended configuration using the ClickHouse config file. There are two configuration keys that you can use: global (`kafka`) and topic-level (`kafka_*`). The global configuration is applied first, and then the topic-level configuration is applied (if it exists). + +``` xml + + + cgrp + smallest + + + + + 250 + 100000 + +``` + +For a list of possible configuration options, see the [librdkafka configuration reference](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md). Use the underscore (`_`) instead of a dot in the ClickHouse configuration. For example, `check.crcs=true` will be `true`. 
+ +### Kerberos support {#kafka-kerberos-support} + +To deal with Kerberos-aware Kafka, add `security_protocol` child element with `sasl_plaintext` value. It is enough if Kerberos ticket-granting ticket is obtained and cached by OS facilities. +ClickHouse is able to maintain Kerberos credentials using a keytab file. Consider `sasl_kerberos_service_name`, `sasl_kerberos_keytab`, `sasl_kerberos_principal` and `sasl.kerberos.kinit.cmd` child elements. + +Example: + +``` xml + + + SASL_PLAINTEXT + /home/kafkauser/kafkauser.keytab + kafkauser/kafkahost@EXAMPLE.COM + +``` + +## Virtual Columns {#virtual-columns} + +- `_topic` — Kafka topic. +- `_key` — Key of the message. +- `_offset` — Offset of the message. +- `_timestamp` — Timestamp of the message. +- `_timestamp_ms` — Timestamp in milliseconds of the message. +- `_partition` — Partition of Kafka topic. + +**See Also** + +- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) +- [background_message_broker_schedule_pool_size](../../../operations/settings/settings.md#background_message_broker_schedule_pool_size) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/kafka/) diff --git a/docs/en/reference/engines/table-engines/integrations/materialized-postgresql.md b/docs/en/reference/engines/table-engines/integrations/materialized-postgresql.md new file mode 100644 index 00000000000..61f97961ddb --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/materialized-postgresql.md @@ -0,0 +1,59 @@ +--- +sidebar_position: 12 +sidebar_label: MaterializedPostgreSQL +--- + +# MaterializedPostgreSQL {#materialize-postgresql} + +Creates ClickHouse table with an initial data dump of PostgreSQL table and starts replication process, i.e. executes background job to apply new changes as they happen on PostgreSQL table in the remote PostgreSQL database. + +If more than one table is required, it is highly recommended to use the [MaterializedPostgreSQL](../../../engines/database-engines/materialized-postgresql.md) database engine instead of the table engine and use the `materialized_postgresql_tables_list` setting, which specifies the tables to be replicated (will also be possible to add database `schema`). It will be much better in terms of CPU, fewer connections and fewer replication slots inside the remote PostgreSQL database. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64) +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password') +PRIMARY KEY key; +``` + +**Engine Parameters** + +- `host:port` — PostgreSQL server address. +- `database` — Remote database name. +- `table` — Remote table name. +- `user` — PostgreSQL user. +- `password` — User password. + +## Requirements {#requirements} + +1. The [wal_level](https://www.postgresql.org/docs/current/runtime-config-wal.html) setting must have a value `logical` and `max_replication_slots` parameter must have a value at least `2` in the PostgreSQL config file. + +2. A table with `MaterializedPostgreSQL` engine must have a primary key — the same as a replica identity index (by default: primary key) of a PostgreSQL table (see [details on replica identity index](../../../engines/database-engines/materialized-postgresql.md#requirements)). + +3. Only database [Atomic](https://en.wikipedia.org/wiki/Atomicity_(database_systems)) is allowed. 
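+
+As a quick sanity check for the first requirement, the relevant settings can be inspected on the PostgreSQL side (a sketch; run it in `psql` on the source server):
+
+``` sql
+-- Both settings live in postgresql.conf and require a server restart to change.
+SHOW wal_level;             -- must return 'logical'
+SHOW max_replication_slots; -- must be at least 2
+```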
+ +## Virtual columns {#virtual-columns} + +- `_version` — Transaction counter. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). + +- `_sign` — Deletion mark. Type: [Int8](../../../sql-reference/data-types/int-uint.md). Possible values: + - `1` — Row is not deleted, + - `-1` — Row is deleted. + +These columns do not need to be added when a table is created. They are always accessible in `SELECT` query. +`_version` column equals `LSN` position in `WAL`, so it might be used to check how up-to-date replication is. + +``` sql +CREATE TABLE postgresql_db.postgresql_replica (key UInt64, value UInt64) +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgresql_replica', 'postgres_user', 'postgres_password') +PRIMARY KEY key; + +SELECT key, value, _version FROM postgresql_db.postgresql_replica; +``` + +:::warning +Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. +::: + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/materialized-postgresql) diff --git a/docs/en/reference/engines/table-engines/integrations/mongodb.md b/docs/en/reference/engines/table-engines/integrations/mongodb.md new file mode 100644 index 00000000000..d212ab4720f --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/mongodb.md @@ -0,0 +1,79 @@ +--- +sidebar_position: 5 +sidebar_label: MongoDB +--- + +# MongoDB {#mongodb} + +MongoDB engine is read-only table engine which allows to read data (`SELECT` queries) from remote MongoDB collection. Engine supports only non-nested data types. `INSERT` queries are not supported. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name +( + name1 [type1], + name2 [type2], + ... +) ENGINE = MongoDB(host:port, database, collection, user, password [, options]); +``` + +**Engine Parameters** + +- `host:port` — MongoDB server address. + +- `database` — Remote database name. + +- `collection` — Remote collection name. + +- `user` — MongoDB user. + +- `password` — User password. + +- `options` — MongoDB connection string options (optional parameter). 
+ +## Usage Example {#usage-example} + +Create a table in ClickHouse which allows to read data from MongoDB collection: + +``` sql +CREATE TABLE mongo_table +( + key UInt64, + data String +) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'testuser', 'clickhouse'); +``` + +To read from an SSL secured MongoDB server: + +``` sql +CREATE TABLE mongo_table_ssl +( + key UInt64, + data String +) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table', 'testuser', 'clickhouse', 'ssl=true'); +``` + +Query: + +``` sql +SELECT COUNT() FROM mongo_table; +``` + +``` text +┌─count()─┐ +│ 4 │ +└─────────┘ +``` + +You can also adjust connection timeout: + +``` sql +CREATE TABLE mongo_table +( + key UInt64, + data String +) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table', 'testuser', 'clickhouse', 'connectTimeoutMS=100000'); +``` + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mongodb/) diff --git a/docs/en/reference/engines/table-engines/integrations/mysql.md b/docs/en/reference/engines/table-engines/integrations/mysql.md new file mode 100644 index 00000000000..e962db58873 --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/mysql.md @@ -0,0 +1,152 @@ +--- +sidebar_position: 4 +sidebar_label: MySQL +--- + +# MySQL {#mysql} + +The MySQL engine allows you to perform `SELECT` and `INSERT` queries on data that is stored on a remote MySQL server. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... +) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']) +SETTINGS + [connection_pool_size=16, ] + [connection_max_tries=3, ] + [connection_wait_timeout=5, ] /* 0 -- do not wait */ + [connection_auto_close=true ] +; +``` + +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +The table structure can differ from the original MySQL table structure: + +- Column names should be the same as in the original MySQL table, but you can use just some of these columns and in any order. +- Column types may differ from those in the original MySQL table. ClickHouse tries to [cast](../../../engines/database-engines/mysql.md#data_types-support) values to the ClickHouse data types. +- The [external_table_functions_use_nulls](../../../operations/settings/settings.md#external-table-functions-use-nulls) setting defines how to handle Nullable columns. Default value: 1. If 0, the table function does not make Nullable columns and inserts default values instead of nulls. This is also applicable for NULL values inside arrays. + +**Engine Parameters** + +- `host:port` — MySQL server address. + +- `database` — Remote database name. + +- `table` — Remote table name. + +- `user` — MySQL user. + +- `password` — User password. + +- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. If `replace_query=1`, the query is substituted. + +- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query. + + Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. 
See the [MySQL documentation](https://dev.mysql.com/doc/refman/8.0/en/insert-on-duplicate.html) to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause. + + To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception. + +Simple `WHERE` clauses such as `=, !=, >, >=, <, <=` are executed on the MySQL server. + +The rest of the conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to MySQL finishes. + +Supports multiple replicas that must be listed by `|`. For example: + +```sql +CREATE TABLE test_replicas (id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL(`mysql{2|3|4}:3306`, 'clickhouse', 'test_replicas', 'root', 'clickhouse'); +``` + +## Usage Example {#usage-example} + +Table in MySQL: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++------+----------+-----+----------+ +| int_id | int_nullable | float | float_nullable | ++------+----------+-----+----------+ +| 1 | NULL | 2 | NULL | ++------+----------+-----+----------+ +1 row in set (0,00 sec) +``` + +Table in ClickHouse, retrieving data from the MySQL table created above: + +``` sql +CREATE TABLE mysql_table +( + `float_nullable` Nullable(Float32), + `int_id` Int32 +) +ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123') +``` + +``` sql +SELECT * FROM mysql_table +``` + +``` text +┌─float_nullable─┬─int_id─┐ +│ ᴺᵁᴸᴸ │ 1 │ +└────────────────┴────────┘ +``` + +## Settings {#mysql-settings} + +Default settings are not very efficient, since they do not even reuse connections. These settings allow you to increase the number of queries run by the server per second. + +### connection_auto_close {#connection-auto-close} + +Allows to automatically close the connection after query execution, i.e. disable connection reuse. + +Possible values: + +- 1 — Auto-close connection is allowed, so the connection reuse is disabled +- 0 — Auto-close connection is not allowed, so the connection reuse is enabled + +Default value: `1`. + +### connection_max_tries {#connection-max-tries} + +Sets the number of retries for pool with failover. + +Possible values: + +- Positive integer. +- 0 — There are no retries for pool with failover. + +Default value: `3`. + +### connection_pool_size {#connection-pool-size} + +Size of connection pool (if all connections are in use, the query will wait until some connection will be freed). + +Possible values: + +- Positive integer. + +Default value: `16`. 
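+
+As an illustration (a sketch reusing the `test.test` MySQL table from the example above; the ClickHouse table name is arbitrary), these settings are passed in the `SETTINGS` clause of `CREATE TABLE`:
+
+``` sql
+CREATE TABLE mysql_table_tuned
+(
+    `int_id` Int32,
+    `float` Float32
+)
+ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123')
+SETTINGS
+    connection_pool_size = 32,  -- keep more connections ready for concurrent queries
+    connection_max_tries = 5,   -- retry a failed connection a few more times
+    connection_auto_close = 0;  -- reuse connections instead of closing them after each query
+```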
+ +## See Also {#see-also} + +- [The mysql table function](../../../sql-reference/table-functions/mysql.md) +- [Using MySQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mysql/) diff --git a/docs/en/reference/engines/table-engines/integrations/odbc.md b/docs/en/reference/engines/table-engines/integrations/odbc.md new file mode 100644 index 00000000000..ed2b77d7ca3 --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/odbc.md @@ -0,0 +1,131 @@ +--- +sidebar_position: 2 +sidebar_label: ODBC +--- + +# ODBC {#table-engine-odbc} + +Allows ClickHouse to connect to external databases via [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). + +To safely implement ODBC connections, ClickHouse uses a separate program `clickhouse-odbc-bridge`. If the ODBC driver is loaded directly from `clickhouse-server`, driver problems can crash the ClickHouse server. ClickHouse automatically starts `clickhouse-odbc-bridge` when it is required. The ODBC bridge program is installed from the same package as the `clickhouse-server`. + +This engine supports the [Nullable](../../../sql-reference/data-types/nullable.md) data type. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1], + name2 [type2], + ... +) +ENGINE = ODBC(connection_settings, external_database, external_table) +``` + +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +The table structure can differ from the source table structure: + +- Column names should be the same as in the source table, but you can use just some of these columns and in any order. +- Column types may differ from those in the source table. ClickHouse tries to [cast](../../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- The [external_table_functions_use_nulls](../../../operations/settings/settings.md#external-table-functions-use-nulls) setting defines how to handle Nullable columns. Default value: 1. If 0, the table function does not make Nullable columns and inserts default values instead of nulls. This is also applicable for NULL values inside arrays. + +**Engine Parameters** + +- `connection_settings` — Name of the section with connection settings in the `odbc.ini` file. +- `external_database` — Name of a database in an external DBMS. +- `external_table` — Name of a table in the `external_database`. + +## Usage Example {#usage-example} + +**Retrieving data from the local MySQL installation via ODBC** + +This example is checked for Ubuntu Linux 18.04 and MySQL server 5.7. + +Ensure that unixODBC and MySQL Connector are installed. + +By default (if installed from packages), ClickHouse starts as user `clickhouse`. Thus, you need to create and configure this user in the MySQL server. + +``` bash +$ sudo mysql +``` + +``` sql +mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; +mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; +``` + +Then configure the connection in `/etc/odbc.ini`. 
+ +``` bash +$ cat /etc/odbc.ini +[mysqlconn] +DRIVER = /usr/local/lib/libmyodbc5w.so +SERVER = 127.0.0.1 +PORT = 3306 +DATABASE = test +USERNAME = clickhouse +PASSWORD = clickhouse +``` + +You can check the connection using the `isql` utility from the unixODBC installation. + +``` bash +$ isql -v mysqlconn ++-------------------------+ +| Connected! | +| | +... +``` + +Table in MySQL: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++------+----------+-----+----------+ +| int_id | int_nullable | float | float_nullable | ++------+----------+-----+----------+ +| 1 | NULL | 2 | NULL | ++------+----------+-----+----------+ +1 row in set (0,00 sec) +``` + +Table in ClickHouse, retrieving data from the MySQL table: + +``` sql +CREATE TABLE odbc_t +( + `int_id` Int32, + `float_nullable` Nullable(Float32) +) +ENGINE = ODBC('DSN=mysqlconn', 'test', 'test') +``` + +``` sql +SELECT * FROM odbc_t +``` + +``` text +┌─int_id─┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ +└────────┴────────────────┘ +``` + +## See Also {#see-also} + +- [ODBC external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC table function](../../../sql-reference/table-functions/odbc.md) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/odbc/) diff --git a/docs/en/reference/engines/table-engines/integrations/postgresql.md b/docs/en/reference/engines/table-engines/integrations/postgresql.md new file mode 100644 index 00000000000..d6826000a1a --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/postgresql.md @@ -0,0 +1,178 @@ +--- +sidebar_position: 11 +sidebar_label: PostgreSQL +--- + +# PostgreSQL {#postgresql} + +The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... +) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); +``` + +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +The table structure can differ from the original PostgreSQL table structure: + +- Column names should be the same as in the original PostgreSQL table, but you can use just some of these columns and in any order. +- Column types may differ from those in the original PostgreSQL table. ClickHouse tries to [cast](../../../engines/database-engines/postgresql.md#data_types-support) values to the ClickHouse data types. +- The [external_table_functions_use_nulls](../../../operations/settings/settings.md#external-table-functions-use-nulls) setting defines how to handle Nullable columns. Default value: 1. If 0, the table function does not make Nullable columns and inserts default values instead of nulls. This is also applicable for NULL values inside arrays. + +**Engine Parameters** + +- `host:port` — PostgreSQL server address. 
+
+- `database` — Remote database name.
+- `table` — Remote table name.
+- `user` — PostgreSQL user.
+- `password` — User password.
+- `schema` — Non-default table schema. Optional.
+- `on conflict ...` — Example: `ON CONFLICT DO NOTHING`. Optional. Note: adding this option will make insertion less efficient.
+
+or via config (since version 21.11):
+
+``` xml
+<named_collections>
+    <postgres1>
+        <host>localhost</host>
+        <port>5432</port>
+        <user>postgres_user</user>
+        <password>postgres_password</password>
+        <database>postgres_database</database>
+        <table>postgres_table</table>
+    </postgres1>
+</named_collections>
+``` + +Some parameters can be overriden by key value arguments: +``` sql +SELECT * FROM postgresql(postgres1, schema='schema1', table='table1'); +``` + +## Implementation Details {#implementation-details} + +`SELECT` queries on PostgreSQL side run as `COPY (SELECT ...) TO STDOUT` inside read-only PostgreSQL transaction with commit after each `SELECT` query. + +Simple `WHERE` clauses such as `=`, `!=`, `>`, `>=`, `<`, `<=`, and `IN` are executed on the PostgreSQL server. + +All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to PostgreSQL finishes. + +`INSERT` queries on PostgreSQL side run as `COPY "table_name" (field1, field2, ... fieldN) FROM STDIN` inside PostgreSQL transaction with auto-commit after each `INSERT` statement. + +PostgreSQL `Array` types are converted into ClickHouse arrays. + +:::warning +Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. +::: + +Supports multiple replicas that must be listed by `|`. For example: + +```sql +CREATE TABLE test_replicas (id UInt32, name String) ENGINE = PostgreSQL(`postgres{2|3|4}:5432`, 'clickhouse', 'test_replicas', 'postgres', 'mysecretpassword'); +``` + +Replicas priority for PostgreSQL dictionary source is supported. The bigger the number in map, the less the priority. The highest priority is `0`. + +In the example below replica `example01-1` has the highest priority: + +```xml + + 5432 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 2 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` + +## Usage Example {#usage-example} + +Table in PostgreSQL: + +``` text +postgres=# CREATE TABLE "public"."test" ( +"int_id" SERIAL, +"int_nullable" INT NULL DEFAULT NULL, +"float" FLOAT NOT NULL, +"str" VARCHAR(100) NOT NULL DEFAULT '', +"float_nullable" FLOAT NULL DEFAULT NULL, +PRIMARY KEY (int_id)); + +CREATE TABLE + +postgres=# INSERT INTO test (int_id, str, "float") VALUES (1,'test',2); +INSERT 0 1 + +postgresql> SELECT * FROM test; + int_id | int_nullable | float | str | float_nullable + --------+--------------+-------+------+---------------- + 1 | | 2 | test | + (1 row) +``` + +Table in ClickHouse, retrieving data from the PostgreSQL table created above: + +``` sql +CREATE TABLE default.postgresql_table +( + `float_nullable` Nullable(Float32), + `str` String, + `int_id` Int32 +) +ENGINE = PostgreSQL('localhost:5432', 'public', 'test', 'postges_user', 'postgres_password'); +``` + +``` sql +SELECT * FROM postgresql_table WHERE str IN ('test'); +``` + +``` text +┌─float_nullable─┬─str──┬─int_id─┐ +│ ᴺᵁᴸᴸ │ test │ 1 │ +└────────────────┴──────┴────────┘ +``` + +Using Non-default Schema: + +```text +postgres=# CREATE SCHEMA "nice.schema"; + +postgres=# CREATE TABLE "nice.schema"."nice.table" (a integer); + +postgres=# INSERT INTO "nice.schema"."nice.table" SELECT i FROM generate_series(0, 99) as t(i) +``` + +```sql +CREATE TABLE pg_table_schema_with_dots (a UInt32) + ENGINE PostgreSQL('localhost:5432', 'clickhouse', 'nice.table', 'postgrsql_user', 'password', 'nice.schema'); +``` + +**See Also** + +- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) +- [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/postgresql/) diff --git a/docs/en/reference/engines/table-engines/integrations/rabbitmq.md b/docs/en/reference/engines/table-engines/integrations/rabbitmq.md new file mode 100644 index 00000000000..6653b76594a --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/rabbitmq.md @@ -0,0 +1,175 @@ +--- +sidebar_position: 10 +sidebar_label: RabbitMQ +--- + +# RabbitMQ Engine {#rabbitmq-engine} + +This engine allows integrating ClickHouse with [RabbitMQ](https://www.rabbitmq.com). + +`RabbitMQ` lets you: + +- Publish or subscribe to data flows. +- Process streams as they become available. + +## Creating a Table {#table_engine-rabbitmq-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... 
+) ENGINE = RabbitMQ SETTINGS + rabbitmq_host_port = 'host:port' [or rabbitmq_address = 'amqp(s)://guest:guest@localhost/vhost'], + rabbitmq_exchange_name = 'exchange_name', + rabbitmq_format = 'data_format'[,] + [rabbitmq_exchange_type = 'exchange_type',] + [rabbitmq_routing_key_list = 'key1,key2,...',] + [rabbitmq_secure = 0,] + [rabbitmq_row_delimiter = 'delimiter_symbol',] + [rabbitmq_schema = '',] + [rabbitmq_num_consumers = N,] + [rabbitmq_num_queues = N,] + [rabbitmq_queue_base = 'queue',] + [rabbitmq_deadletter_exchange = 'dl-exchange',] + [rabbitmq_persistent = 0,] + [rabbitmq_skip_broken_messages = N,] + [rabbitmq_max_block_size = N,] + [rabbitmq_flush_interval_ms = N] + [rabbitmq_queue_settings_list = 'x-dead-letter-exchange=my-dlx,x-max-length=10,x-overflow=reject-publish'] +``` + +Required parameters: + +- `rabbitmq_host_port` – host:port (for example, `localhost:5672`). +- `rabbitmq_exchange_name` – RabbitMQ exchange name. +- `rabbitmq_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section. + +Optional parameters: + +- `rabbitmq_exchange_type` – The type of RabbitMQ exchange: `direct`, `fanout`, `topic`, `headers`, `consistent_hash`. Default: `fanout`. +- `rabbitmq_routing_key_list` – A comma-separated list of routing keys. +- `rabbitmq_row_delimiter` – Delimiter character, which ends the message. +- `rabbitmq_schema` – Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. +- `rabbitmq_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. +- `rabbitmq_num_queues` – Total number of queues. Default: `1`. Increasing this number can significantly improve performance. +- `rabbitmq_queue_base` - Specify a hint for queue names. Use cases of this setting are described below. +- `rabbitmq_deadletter_exchange` - Specify name for a [dead letter exchange](https://www.rabbitmq.com/dlx.html). You can create another table with this exchange name and collect messages in cases when they are republished to dead letter exchange. By default dead letter exchange is not specified. +- `rabbitmq_persistent` - If set to 1 (true), in insert query delivery mode will be set to 2 (marks messages as 'persistent'). Default: `0`. +- `rabbitmq_skip_broken_messages` – RabbitMQ message parser tolerance to schema-incompatible messages per block. Default: `0`. If `rabbitmq_skip_broken_messages = N` then the engine skips *N* RabbitMQ messages that cannot be parsed (a message equals a row of data). +- `rabbitmq_max_block_size` +- `rabbitmq_flush_interval_ms` +- `rabbitmq_queue_settings_list` - allows to set RabbitMQ settings when creating a queue. Available settings: `x-max-length`, `x-max-length-bytes`, `x-message-ttl`, `x-expires`, `x-priority`, `x-max-priority`, `x-overflow`, `x-dead-letter-exchange`, `x-queue-type`. The `durable` setting is enabled automatically for the queue. + +SSL connection: + +Use either `rabbitmq_secure = 1` or `amqps` in connection address: `rabbitmq_address = 'amqps://guest:guest@localhost/vhost'`. +The default behaviour of the used library is not to check if the created TLS connection is sufficiently secure. Whether the certificate is expired, self-signed, missing or invalid: the connection is simply permitted. 
More strict checking of certificates can possibly be implemented in the future. + +Also format settings can be added along with rabbitmq-related settings. + +Example: + +``` sql + CREATE TABLE queue ( + key UInt64, + value UInt64, + date DateTime + ) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672', + rabbitmq_exchange_name = 'exchange1', + rabbitmq_format = 'JSONEachRow', + rabbitmq_num_consumers = 5, + date_time_input_format = 'best_effort'; +``` + +The RabbitMQ server configuration should be added using the ClickHouse config file. + +Required configuration: + +``` xml + + root + clickhouse + +``` + +Additional configuration: + +``` xml + + clickhouse + +``` + +## Description {#description} + +`SELECT` is not particularly useful for reading messages (except for debugging), because each message can be read only once. It is more practical to create real-time threads using [materialized views](../../../sql-reference/statements/create/view.md). To do this: + +1. Use the engine to create a RabbitMQ consumer and consider it a data stream. +2. Create a table with the desired structure. +3. Create a materialized view that converts data from the engine and puts it into a previously created table. + +When the `MATERIALIZED VIEW` joins the engine, it starts collecting data in the background. This allows you to continually receive messages from RabbitMQ and convert them to the required format using `SELECT`. +One RabbitMQ table can have as many materialized views as you like. + +Data can be channeled based on `rabbitmq_exchange_type` and the specified `rabbitmq_routing_key_list`. +There can be no more than one exchange per table. One exchange can be shared between multiple tables - it enables routing into multiple tables at the same time. + +Exchange type options: + +- `direct` - Routing is based on the exact matching of keys. Example table key list: `key1,key2,key3,key4,key5`, message key can equal any of them. +- `fanout` - Routing to all tables (where exchange name is the same) regardless of the keys. +- `topic` - Routing is based on patterns with dot-separated keys. Examples: `*.logs`, `records.*.*.2020`, `*.2018,*.2019,*.2020`. +- `headers` - Routing is based on `key=value` matches with a setting `x-match=all` or `x-match=any`. Example table key list: `x-match=all,format=logs,type=report,year=2020`. +- `consistent_hash` - Data is evenly distributed between all bound tables (where the exchange name is the same). Note that this exchange type must be enabled with RabbitMQ plugin: `rabbitmq-plugins enable rabbitmq_consistent_hash_exchange`. + +Setting `rabbitmq_queue_base` may be used for the following cases: + +- to let different tables share queues, so that multiple consumers could be registered for the same queues, which makes a better performance. If using `rabbitmq_num_consumers` and/or `rabbitmq_num_queues` settings, the exact match of queues is achieved in case these parameters are the same. +- to be able to restore reading from certain durable queues when not all messages were successfully consumed. To resume consumption from one specific queue - set its name in `rabbitmq_queue_base` setting and do not specify `rabbitmq_num_consumers` and `rabbitmq_num_queues` (defaults to 1). To resume consumption from all queues, which were declared for a specific table - just specify the same settings: `rabbitmq_queue_base`, `rabbitmq_num_consumers`, `rabbitmq_num_queues`. By default, queue names will be unique to tables. 
+- to reuse queues as they are declared durable and not auto-deleted. (Can be deleted via any of RabbitMQ CLI tools.) + +To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../../../operations/server-configuration-parameters/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream_flush_interval_ms](../../../operations/server-configuration-parameters/settings.md) milliseconds, the data will be flushed to the table regardless of the completeness of the block. + +If `rabbitmq_num_consumers` and/or `rabbitmq_num_queues` settings are specified along with `rabbitmq_exchange_type`, then: + +- `rabbitmq-consistent-hash-exchange` plugin must be enabled. +- `message_id` property of the published messages must be specified (unique for each message/batch). + +For insert query there is message metadata, which is added for each published message: `messageID` and `republished` flag (true, if published more than once) - can be accessed via message headers. + +Do not use the same table for inserts and materialized views. + +Example: + +``` sql + CREATE TABLE queue ( + key UInt64, + value UInt64 + ) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672', + rabbitmq_exchange_name = 'exchange1', + rabbitmq_exchange_type = 'headers', + rabbitmq_routing_key_list = 'format=logs,type=report,year=2020', + rabbitmq_format = 'JSONEachRow', + rabbitmq_num_consumers = 5; + + CREATE TABLE daily (key UInt64, value UInt64) + ENGINE = MergeTree() ORDER BY key; + + CREATE MATERIALIZED VIEW consumer TO daily + AS SELECT key, value FROM queue; + + SELECT key, value FROM daily ORDER BY key; +``` + +## Virtual Columns {#virtual-columns} + +- `_exchange_name` - RabbitMQ exchange name. +- `_channel_id` - ChannelID, on which consumer, who received the message, was declared. +- `_delivery_tag` - DeliveryTag of the received message. Scoped per channel. +- `_redelivered` - `redelivered` flag of the message. +- `_message_id` - messageID of the received message; non-empty if was set, when message was published. +- `_timestamp` - timestamp of the received message; non-empty if was set, when message was published. + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/rabbitmq/) diff --git a/docs/en/reference/engines/table-engines/integrations/s3.md b/docs/en/reference/engines/table-engines/integrations/s3.md new file mode 100644 index 00000000000..42abc2a0b1e --- /dev/null +++ b/docs/en/reference/engines/table-engines/integrations/s3.md @@ -0,0 +1,163 @@ +--- +sidebar_position: 7 +sidebar_label: S3 +--- + +# S3 Table Engine {#table-engine-s3} + +This engine provides integration with [Amazon S3](https://aws.amazon.com/s3/) ecosystem. This engine is similar to the [HDFS](../../../engines/table-engines/special/file.md#table_engines-hdfs) engine, but provides S3-specific features. + +## Create Table {#creating-a-table} + +``` sql +CREATE TABLE s3_engine_table (name String, value UInt32) + ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, [compression]) + [SETTINGS ...] +``` + +**Engine parameters** + +- `path` — Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [below](#wildcards-in-path). +- `format` — The [format](../../../interfaces/formats.md#formats) of the file. 
+- `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. Parameter is optional. If credentials are not specified, they are used from the configuration file. For more information see [Using S3 for Data Storage](../mergetree-family/mergetree.md#table_engine-mergetree-s3). +- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will autodetect compression by file extension. + +**Example** + +``` sql +CREATE TABLE s3_engine_table (name String, value UInt32) + ENGINE=S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/test-data.csv.gz', 'CSV', 'gzip') + SETTINGS input_format_with_names_use_header = 0; + +INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3); + +SELECT * FROM s3_engine_table LIMIT 2; +``` + +```text +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` +## Virtual columns {#virtual-columns} + +- `_path` — Path to the file. +- `_file` — Name of the file. + +For more information about virtual columns see [here](../../../engines/table-engines/index.md#table_engines-virtual_columns). + +## Implementation Details {#implementation-details} + +- Reads and writes can be parallel +- [Zero-copy](../../../operations/storing-data.md#zero-copy) replication is supported. +- Not supported: + - `ALTER` and `SELECT...SAMPLE` operations. + - Indexes. + +## Wildcards In Path {#wildcards-in-path} + +`path` argument can specify multiple files using bash-like wildcards. For being processed file should exist and match to the whole path pattern. Listing of files is determined during `SELECT` (not at `CREATE` moment). + +- `*` — Substitutes any number of any characters except `/` including empty string. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. N and M can have leading zeroes e.g. `000..078`. + +Constructions with `{}` are similar to the [remote](../../../sql-reference/table-functions/remote.md) table function. + +:::warning +If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: + +**Example with wildcards 1** + +Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: + +``` sql +CREATE TABLE big_table (name String, value UInt32) + ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/my_folder/file-{000..999}.csv', 'CSV'); +``` + +**Example with wildcards 2** + +Suppose we have several files in CSV format with the following URIs on S3: + +- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/some_folder/some_file_1.csv' +- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/some_folder/some_file_2.csv' +- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/some_folder/some_file_3.csv' +- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/another_folder/some_file_1.csv' +- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/another_folder/some_file_2.csv' +- 'https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/another_folder/some_file_3.csv' + + +There are several ways to make a table consisting of all six files: + +1. 
Specify the range of file postfixes: + +``` sql +CREATE TABLE table_with_range (name String, value UInt32) + ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/some_file_{1..3}', 'CSV'); +``` + +2. Take all files with `some_file_` prefix (there should be no extra files with such prefix in both folders): + +``` sql +CREATE TABLE table_with_question_mark (name String, value UInt32) + ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/some_file_?', 'CSV'); +``` + +3. Take all the files in both folders (all files should satisfy format and schema described in query): + +``` sql +CREATE TABLE table_with_asterisk (name String, value UInt32) + ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/{some,another}_folder/*', 'CSV'); +``` + +## S3-related Settings {#settings} + +The following settings can be set before query execution or placed into configuration file. + +- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`. +- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`. +- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`. +- `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. + +Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. + +## Endpoint-based Settings {#endpoint-settings} + +The following settings can be specified in configuration file for given endpoint (which will be matched by exact prefix of a URL): + +- `endpoint` — Specifies prefix of an endpoint. Mandatory. +- `access_key_id` and `secret_access_key` — Specifies credentials to use with given endpoint. Optional. +- `use_environment_credentials` — If set to `true`, S3 client will try to obtain credentials from environment variables and [Amazon EC2](https://en.wikipedia.org/wiki/Amazon_Elastic_Compute_Cloud) metadata for given endpoint. Optional, default value is `false`. +- `region` — Specifies S3 region name. Optional. +- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Optional, default value is `false`. +- `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be speficied multiple times. +- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. +- `max_single_read_retries` — The maximum number of attempts during single read. Default value is `4`. Optional. 
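+
+With an endpoint entry like the one in the example below, a table whose URL matches the configured prefix picks up credentials and other options from the configuration, so they can be omitted from the engine arguments (a sketch; the table name is illustrative and the bucket is the public test bucket used earlier):
+
+``` sql
+CREATE TABLE s3_endpoint_table (name String, value UInt32)
+    ENGINE = S3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/test-data.csv.gz', 'CSV', 'gzip');
+```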
+
+**Example:**
+
+``` xml
+<s3>
+    <endpoint-name>
+        <endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/</endpoint>
+        <!-- <access_key_id>ACCESS_KEY_ID</access_key_id> -->
+        <!-- <secret_access_key>SECRET_ACCESS_KEY</secret_access_key> -->
+        <!-- <region>us-west-1</region> -->
+        <!-- <use_environment_credentials>false</use_environment_credentials> -->
+        <!-- <use_insecure_imds_request>false</use_insecure_imds_request> -->
+        <!-- <header>Authorization: Bearer SOME-TOKEN</header> -->
+        <!-- <server_side_encryption_customer_key_base64>BASE64-ENCODED-KEY</server_side_encryption_customer_key_base64> -->
+        <!-- <max_single_read_retries>4</max_single_read_retries> -->
+    </endpoint-name>
+</s3>
+```
+
+## See also
+
+- [s3 table function](../../../sql-reference/table-functions/s3.md)
+
+[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/s3/) 
diff --git a/docs/en/reference/engines/table-engines/integrations/sqlite.md b/docs/en/reference/engines/table-engines/integrations/sqlite.md
new file mode 100644
index 00000000000..45cc1cfc28a
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/integrations/sqlite.md
@@ -0,0 +1,62 @@
+---
+sidebar_position: 7
+sidebar_label: SQLite
+---
+
+# SQLite {#sqlite}
+
+The engine allows importing and exporting data to SQLite and supports queries to SQLite tables directly from ClickHouse.
+
+## Creating a Table {#creating-a-table}
+
+``` sql
+    CREATE TABLE [IF NOT EXISTS] [db.]table_name
+    (
+        name1 [type1],
+        name2 [type2], ...
+    ) ENGINE = SQLite('db_path', 'table')
+```
+
+**Engine Parameters**
+
+- `db_path` — Path to the SQLite file with a database.
+- `table` — Name of a table in the SQLite database.
+
+## Usage Example {#usage-example}
+
+Shows the query that created the SQLite table:
+
+```sql
+SHOW CREATE TABLE sqlite_db.table2;
+```
+
+``` text
+CREATE TABLE SQLite.table2
+(
+    `col1` Nullable(Int32),
+    `col2` Nullable(String)
+)
+ENGINE = SQLite('sqlite.db','table2');
+```
+
+Returns the data from the table:
+
+``` sql
+SELECT * FROM sqlite_db.table2 ORDER BY col1;
+```
+
+```text
+┌─col1─┬─col2──┐
+│    1 │ text1 │
+│    2 │ text2 │
+│    3 │ text3 │
+└──────┴───────┘
+```
+
+**See Also**
+
+- [SQLite](../../../engines/database-engines/sqlite.md) engine
+- [sqlite](../../../sql-reference/table-functions/sqlite.md) table function
+
+
+[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/sqlite/) 
diff --git a/docs/en/reference/engines/table-engines/log-family/index.md b/docs/en/reference/engines/table-engines/log-family/index.md
new file mode 100644
index 00000000000..89eb08ad7b9
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/log-family/index.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 20
+sidebar_label: Log Family
+---
+
+# Log Engine Family {#log-engine-family}
+
+These engines were developed for scenarios when you need to quickly write many small tables (up to about 1 million rows) and read them later as a whole.
+
+Engines of the family:
+
+- [StripeLog](../../../engines/table-engines/log-family/stripelog.md)
+- [Log](../../../engines/table-engines/log-family/log.md)
+- [TinyLog](../../../engines/table-engines/log-family/tinylog.md)
+
+`Log` family table engines can store data in [HDFS](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-hdfs) or [S3](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3) distributed file systems.
+
+## Common Properties {#common-properties}
+
+Engines:
+
+- Store data on a disk.
+
+- Append data to the end of the file when writing.
+
+- Support locks for concurrent data access.
+
+    During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently.
+
+- Do not support [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations).
+
+- Do not support indexes.
+
+    This means that `SELECT` queries for ranges of data are not efficient.
+
+- Do not write data atomically.
+
+    You can get a table with corrupted data if something breaks the write operation, for example, an abnormal server shutdown.
+
+## Differences {#differences}
+
+The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. The `TinyLog` engine does not support parallel data reading by several threads in a single query. It reads data more slowly than the other engines in the family that support parallel reading from a single query, and it uses almost as many file descriptors as the `Log` engine because it stores each column in a separate file. Use it only in simple scenarios.
+
+The `Log` and `StripeLog` engines support parallel data reading. When reading data, ClickHouse uses multiple threads. Each thread processes a separate data block. The `Log` engine uses a separate file for each column of the table. `StripeLog` stores all the data in one file. As a result, the `StripeLog` engine uses fewer file descriptors, but the `Log` engine provides higher efficiency when reading data.
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/log_family/) 
diff --git a/docs/en/reference/engines/table-engines/log-family/log.md b/docs/en/reference/engines/table-engines/log-family/log.md
new file mode 100644
index 00000000000..8858699f045
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/log-family/log.md
@@ -0,0 +1,15 @@
+---
+toc_priority: 33
+toc_title: Log
+---
+
+# Log {#log}
+
+The engine belongs to the family of `Log` engines. See the common properties of `Log` engines and their differences in the [Log Engine Family](../../../engines/table-engines/log-family/index.md) article.
+
+`Log` differs from [TinyLog](../../../engines/table-engines/log-family/tinylog.md) in that a small file of "marks" resides with the column files. These marks are written on every data block and contain offsets that indicate where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads.
+For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other.
+The `Log` engine does not support indexes. Likewise, if writing to a table fails, the table is broken, and reading from it returns an error. The `Log` engine is appropriate for temporary data, write-once tables, and for testing or demonstration purposes.
+
+[Original article](https://clickhouse.com/docs/en/engines/table-engines/log-family/log/)
+
diff --git a/docs/en/reference/engines/table-engines/log-family/stripelog.md b/docs/en/reference/engines/table-engines/log-family/stripelog.md
new file mode 100644
index 00000000000..62703245062
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/log-family/stripelog.md
@@ -0,0 +1,93 @@
+---
+toc_priority: 32
+toc_title: StripeLog
+---
+
+# StripeLog {#stripelog}
+
+This engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](../../../engines/table-engines/log-family/index.md) article.
+
+Use this engine in scenarios when you need to write many tables with a small amount of data (less than 1 million rows).
+
+## Creating a Table {#table_engines-stripelog-creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    column2_name [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = StripeLog +``` + +See the detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +## Writing the Data {#table_engines-stripelog-writing-the-data} + +The `StripeLog` engine stores all the columns in one file. For each `INSERT` query, ClickHouse appends the data block to the end of a table file, writing columns one by one. + +For each table ClickHouse writes the files: + +- `data.bin` — Data file. +- `index.mrk` — File with marks. Marks contain offsets for each column of each data block inserted. + +The `StripeLog` engine does not support the `ALTER UPDATE` and `ALTER DELETE` operations. + +## Reading the Data {#table_engines-stripelog-reading-the-data} + +The file with marks allows ClickHouse to parallelize the reading of data. This means that a `SELECT` query returns rows in an unpredictable order. Use the `ORDER BY` clause to sort rows. + +## Example of Use {#table_engines-stripelog-example-of-use} + +Creating a table: + +``` sql +CREATE TABLE stripe_log_table +( + timestamp DateTime, + message_type String, + message String +) +ENGINE = StripeLog +``` + +Inserting data: + +``` sql +INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The first regular message') +INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message'),(now(),'WARNING','The first warning message') +``` + +We used two `INSERT` queries to create two data blocks inside the `data.bin` file. + +ClickHouse uses multiple threads when selecting data. Each thread reads a separate data block and returns resulting rows independently as it finishes. As a result, the order of blocks of rows in the output does not match the order of the same blocks in the input in most cases. For example: + +``` sql +SELECT * FROM stripe_log_table +``` + +``` text +┌───────────timestamp─┬─message_type─┬─message────────────────────┐ +│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ +│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ +└─────────────────────┴──────────────┴────────────────────────────┘ +┌───────────timestamp─┬─message_type─┬─message───────────────────┐ +│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ +└─────────────────────┴──────────────┴───────────────────────────┘ +``` + +Sorting the results (ascending order by default): + +``` sql +SELECT * FROM stripe_log_table ORDER BY timestamp +``` + +``` text +┌───────────timestamp─┬─message_type─┬─message────────────────────┐ +│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ +│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ +│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ +└─────────────────────┴──────────────┴────────────────────────────┘ +``` + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/stripelog/) diff --git a/docs/en/reference/engines/table-engines/log-family/tinylog.md b/docs/en/reference/engines/table-engines/log-family/tinylog.md new file mode 100644 index 00000000000..2407355a857 --- /dev/null +++ b/docs/en/reference/engines/table-engines/log-family/tinylog.md @@ -0,0 +1,14 @@ +--- +toc_priority: 34 +toc_title: TinyLog +--- + +# TinyLog {#tinylog} + +The engine belongs to the log engine family. See [Log Engine Family](../../../engines/table-engines/log-family/index.md) for common properties of log engines and their differences. + +This table engine is typically used with the write-once method: write data one time, then read it as many times as necessary. 
For example, you can use `TinyLog`-type tables for intermediate data that is processed in small batches. Note that storing data in a large number of small tables is inefficient.
+
+Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (up to about 1,000,000 rows). It makes sense to use this table engine if you have many small tables, since it’s simpler than the [Log](../../../engines/table-engines/log-family/log.md) engine (fewer files need to be opened).
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/tinylog/) 
diff --git a/docs/en/reference/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/aggregatingmergetree.md
new file mode 100644
index 00000000000..7be10cec2f5
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/mergetree-family/aggregatingmergetree.md
@@ -0,0 +1,104 @@
+---
+sidebar_position: 60
+sidebar_label: AggregatingMergeTree
+---
+
+# AggregatingMergeTree {#aggregatingmergetree}
+
+The engine inherits from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree), altering the logic for data parts merging. ClickHouse replaces all rows with the same primary key (or more accurately, with the same [sorting key](../../../engines/table-engines/mergetree-family/mergetree.md)) with a single row (within one data part) that stores a combination of states of aggregate functions.
+
+You can use `AggregatingMergeTree` tables for incremental data aggregation, including for aggregated materialized views.
+
+The engine processes all columns with the following types:
+
+- [AggregateFunction](../../../sql-reference/data-types/aggregatefunction.md)
+- [SimpleAggregateFunction](../../../sql-reference/data-types/simpleaggregatefunction.md)
+
+It is appropriate to use `AggregatingMergeTree` if it reduces the number of rows by orders of magnitude.
+
+## Creating a Table {#creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = AggregatingMergeTree()
+[PARTITION BY expr]
+[ORDER BY expr]
+[SAMPLE BY expr]
+[TTL expr]
+[SETTINGS name=value, ...]
+```
+
+For a description of query parameters, see the [query description](../../../sql-reference/statements/create/table.md).
+
+**Query clauses**
+
+When creating an `AggregatingMergeTree` table, the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md) are required as when creating a `MergeTree` table.
+
+<details markdown="1">
+ +Deprecated Method for Creating a Table + +:::warning +Do not use this method in new projects and, if possible, switch the old projects to the method described above. +::: + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] AggregatingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity) +``` + +All of the parameters have the same meaning as in `MergeTree`. +
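+
+As a concrete sketch of the current syntax, the following table stores aggregate function states explicitly; the table name, columns, and choice of aggregate functions are illustrative assumptions rather than part of the original example:
+
+``` sql
+-- Stores partial aggregation states that can later be finalized with -Merge functions
+CREATE TABLE agg_visits
+(
+    StartDate Date,
+    CounterID UInt64,
+    Visits AggregateFunction(sum, Int8),
+    Users  AggregateFunction(uniq, UInt64)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toYYYYMM(StartDate)
+ORDER BY (CounterID, StartDate);
+```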
+
+## SELECT and INSERT {#select-and-insert}
+
+To insert data, use an [INSERT SELECT](../../../sql-reference/statements/insert-into.md) query with aggregate `-State-` functions.
+When selecting data from an `AggregatingMergeTree` table, use a `GROUP BY` clause and the same aggregate functions as when inserting data, but with the `-Merge` suffix.
+
+In the results of a `SELECT` query, the values of the `AggregateFunction` type have an implementation-specific binary representation in all of the ClickHouse output formats. If you dump data into, for example, the `TabSeparated` format with a `SELECT` query, then this dump can be loaded back using an `INSERT` query.
+
+## Example of an Aggregated Materialized View {#example-of-an-aggregated-materialized-view}
+
+An `AggregatingMergeTree` materialized view that watches the `test.visits` table:
+
+``` sql
+CREATE MATERIALIZED VIEW test.basic
+ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate)
+AS SELECT
+    CounterID,
+    StartDate,
+    sumState(Sign)    AS Visits,
+    uniqState(UserID) AS Users
+FROM test.visits
+GROUP BY CounterID, StartDate;
+```
+
+Inserting data into the `test.visits` table:
+
+``` sql
+INSERT INTO test.visits ...
+```
+
+The data is inserted into both the table and the `test.basic` view, which will perform the aggregation.
+
+To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the view `test.basic`:
+
+``` sql
+SELECT
+    StartDate,
+    sumMerge(Visits) AS Visits,
+    uniqMerge(Users) AS Users
+FROM test.basic
+GROUP BY StartDate
+ORDER BY StartDate;
+```
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/aggregatingmergetree/) 
diff --git a/docs/en/reference/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/collapsingmergetree.md
new file mode 100644
index 00000000000..22863611e79
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/mergetree-family/collapsingmergetree.md
@@ -0,0 +1,307 @@
+---
+sidebar_position: 70
+sidebar_label: CollapsingMergeTree
+---
+
+# CollapsingMergeTree {#table_engine-collapsingmergetree}
+
+The engine inherits from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) and adds the logic of row collapsing to the data parts merge algorithm.
+
+`CollapsingMergeTree` asynchronously deletes (collapses) pairs of rows if all of the fields in a sorting key (`ORDER BY`) are equivalent except for the particular field `Sign`, which can have `1` and `-1` values. Rows without a pair are kept. For more details see the [Collapsing](#table_engine-collapsingmergetree-collapsing) section of the document.
+
+The engine may significantly reduce the volume of storage and, as a consequence, increase the efficiency of `SELECT` queries.
+
+## Creating a Table {#creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = CollapsingMergeTree(sign)
+[PARTITION BY expr]
+[ORDER BY expr]
+[SAMPLE BY expr]
+[SETTINGS name=value, ...]
+```
+
+For a description of query parameters, see [query description](../../../sql-reference/statements/create/table.md).
+
+**CollapsingMergeTree Parameters**
+
+- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row.
+
+    Column data type — `Int8`.
+ +**Query clauses** + +When creating a `CollapsingMergeTree` table, the same [query clauses](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) are required, as when creating a `MergeTree` table. + +
+ +Deprecated Method for Creating a Table + +:::warning +Do not use this method in new projects and, if possible, switch old projects to the method described above. +::: + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] CollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign) +``` + +All of the parameters excepting `sign` have the same meaning as in `MergeTree`. + +- `sign` — Name of the column with the type of row: `1` — “state” row, `-1` — “cancel” row. + + Column Data Type — `Int8`. + +
+
+## Collapsing {#table_engine-collapsingmergetree-collapsing}
+
+### Data {#data}
+
+Consider the situation where you need to save continually changing data for some object. It sounds logical to have one row for an object and update it at any change, but an update operation is expensive and slow for a DBMS because it requires rewriting the data in storage. If you need to write data quickly, an update is not acceptable, but you can write the changes of an object sequentially as follows.
+
+Use the particular column `Sign`. If `Sign = 1`, it means that the row is a state of an object; let’s call it the “state” row. If `Sign = -1`, it means the cancellation of the state of an object with the same attributes; let’s call it the “cancel” row.
+
+For example, we want to calculate how many pages users viewed on some site and how long they stayed there. At some moment we write the following row with the state of user activity:
+
+``` text
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Some time later we register the change of user activity and write it with the following two rows.
+
+``` text
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │   -1 │
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+The first row cancels the previous state of the object (user). It should copy the sorting key fields of the cancelled state except for `Sign`.
+
+The second row contains the current state.
+
+As we need only the last state of user activity, the rows
+
+``` text
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+│ 4324182021466249494 │         5 │      146 │   -1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+can be deleted, collapsing the invalid (old) state of the object. `CollapsingMergeTree` does this while merging the data parts.
+
+To learn why we need two rows for each change, see the [Algorithm](#table_engine-collapsingmergetree-collapsing-algorithm) paragraph.
+
+**Peculiar properties of this approach**
+
+1. The program that writes the data should remember the state of an object to be able to cancel it. The “cancel” row should contain copies of the sorting key fields of the “state” row and the opposite `Sign`. This increases the initial size of storage but allows you to write the data quickly.
+2. Long growing arrays in columns reduce the efficiency of the engine due to the load of writing. The more straightforward the data, the higher the efficiency.
+3. The `SELECT` results depend strongly on the consistency of the object change history. Be accurate when preparing data for insertion. You can get unpredictable results from inconsistent data, for example, negative values for non-negative metrics such as session depth.
+
+### Algorithm {#table_engine-collapsingmergetree-collapsing-algorithm}
+
+When ClickHouse merges data parts, each group of consecutive rows with the same sorting key (`ORDER BY`) is reduced to not more than two rows, one with `Sign = 1` (“state” row) and another with `Sign = -1` (“cancel” row). In other words, entries collapse.
+
+For each resulting data part, ClickHouse saves:
+
+1. The first “cancel” and the last “state” rows, if the number of “state” and “cancel” rows matches and the last row is a “state” row.
+2. The last “state” row, if there are more “state” rows than “cancel” rows.
+3. The first “cancel” row, if there are more “cancel” rows than “state” rows.
+4. 
None of the rows, in all other cases.
+
+Also, when there are at least 2 more “state” rows than “cancel” rows, or at least 2 more “cancel” rows than “state” rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once.
+
+Thus, collapsing should not change the results of calculating statistics.
+Changes are gradually collapsed so that in the end only the last state of almost every object is left.
+
+The `Sign` is required because the merging algorithm does not guarantee that all of the rows with the same sorting key will be in the same resulting data part or even on the same physical server. ClickHouse processes `SELECT` queries with multiple threads, and it cannot predict the order of rows in the result. Aggregation is required if there is a need to get completely “collapsed” data from a `CollapsingMergeTree` table.
+
+To finalize collapsing, write a query with a `GROUP BY` clause and aggregate functions that account for the sign. For example, to calculate quantity, use `sum(Sign)` instead of `count()`. To calculate the sum of something, use `sum(Sign * x)` instead of `sum(x)`, and so on, and also add `HAVING sum(Sign) > 0`.
+
+The aggregates `count`, `sum` and `avg` can be calculated this way. The aggregate `uniq` can be calculated if an object has at least one non-collapsed state. The aggregates `min` and `max` cannot be calculated because `CollapsingMergeTree` does not save the value history of the collapsed states.
+
+If you need to extract data without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is significantly less efficient.
+
+## Example of Use {#example-of-use}
+
+Example data:
+
+``` text
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+│ 4324182021466249494 │         5 │      146 │   -1 │
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Creation of the table:
+
+``` sql
+CREATE TABLE UAct
+(
+    UserID UInt64,
+    PageViews UInt8,
+    Duration UInt8,
+    Sign Int8
+)
+ENGINE = CollapsingMergeTree(Sign)
+ORDER BY UserID
+```
+
+Insertion of the data:
+
+``` sql
+INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1)
+```
+
+``` sql
+INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1)
+```
+
+We use two `INSERT` queries to create two different data parts. If we insert the data with one query, ClickHouse creates one data part and will never perform any merge.
+
+Getting the data:
+
+``` sql
+SELECT * FROM UAct
+```
+
+``` text
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │   -1 │
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+What do we see and where is collapsing?
+
+With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows. Collapsing did not occur because there was no merge of the data parts yet. ClickHouse merges data parts at an unknown moment which we cannot predict.
+ +Thus we need aggregation: + +``` sql +SELECT + UserID, + sum(PageViews * Sign) AS PageViews, + sum(Duration * Sign) AS Duration +FROM UAct +GROUP BY UserID +HAVING sum(Sign) > 0 +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┐ +│ 4324182021466249494 │ 6 │ 185 │ +└─────────────────────┴───────────┴──────────┘ +``` + +If we do not need aggregation and want to force collapsing, we can use `FINAL` modifier for `FROM` clause. + +``` sql +SELECT * FROM UAct FINAL +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +This way of selecting the data is very inefficient. Don’t use it for big tables. + +## Example of Another Approach {#example-of-another-approach} + +Example data: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +│ 4324182021466249494 │ -5 │ -146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +The idea is that merges take into account only key fields. And in the “Cancel” line we can specify negative values that equalize the previous version of the row when summing without using the Sign column. For this approach, it is necessary to change the data type `PageViews`,`Duration` to store negative values of UInt8 -\> Int16. + +``` sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews Int16, + Duration Int16, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY UserID +``` + +Let’s test the approach: + +``` sql +insert into UAct values(4324182021466249494, 5, 146, 1); +insert into UAct values(4324182021466249494, -5, -146, -1); +insert into UAct values(4324182021466249494, 6, 185, 1); + +select * from UAct final; // avoid using final in production (just for a test or small tables) +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +``` sql +SELECT + UserID, + sum(PageViews) AS PageViews, + sum(Duration) AS Duration +FROM UAct +GROUP BY UserID +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┐ +│ 4324182021466249494 │ 6 │ 185 │ +└─────────────────────┴───────────┴──────────┘ +``` + +``` sql +select count() FROM UAct +``` + +``` text +┌─count()─┐ +│ 3 │ +└─────────┘ +``` + +``` sql +optimize table UAct final; + +select * FROM UAct +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/collapsingmergetree/) diff --git a/docs/en/reference/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/reference/engines/table-engines/mergetree-family/custom-partitioning-key.md new file mode 100644 index 00000000000..716528f8d77 --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -0,0 +1,136 @@ +--- +sidebar_position: 30 +sidebar_label: Custom Partitioning Key +--- + +# Custom Partitioning Key {#custom-partitioning-key} + +:::warning +In most cases you do not need a partition key, and in most other cases you do not need a partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). + +You should never use too granular of partitioning. 
Don't partition your data by client identifiers or names. Instead, make a client identifier or name the first column in the ORDER BY expression. +::: + +Partitioning is available for the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family tables (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). [Materialized views](../../../engines/table-engines/special/materializedview.md#materializedview) based on MergeTree tables support partitioning, as well. + +A partition is a logical combination of records in a table by a specified criterion. You can set a partition by an arbitrary criterion, such as by month, by day, or by event type. Each partition is stored separately to simplify manipulations of this data. When accessing the data, ClickHouse uses the smallest subset of partitions possible. + +The partition is specified in the `PARTITION BY expr` clause when [creating a table](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table). The partition key can be any expression from the table columns. For example, to specify partitioning by month, use the expression `toYYYYMM(date_column)`: + +``` sql +CREATE TABLE visits +( + VisitDate Date, + Hour UInt8, + ClientID UUID +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(VisitDate) +ORDER BY Hour; +``` + +The partition key can also be a tuple of expressions (similar to the [primary key](../../../engines/table-engines/mergetree-family/mergetree.md#primary-keys-and-indexes-in-queries)). For example: + +``` sql +ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) +PARTITION BY (toMonday(StartDate), EventType) +ORDER BY (CounterID, StartDate, intHash32(UserID)); +``` + +In this example, we set partitioning by the event types that occurred during the current week. + +By default, the floating-point partition key is not supported. To use it enable the setting [allow_floating_point_partition_key](../../../operations/settings/merge-tree-settings.md#allow_floating_point_partition_key). + +When inserting new data to a table, this data is stored as a separate part (chunk) sorted by the primary key. In 10-15 minutes after inserting, the parts of the same partition are merged into the entire part. + +:::info +A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. +::: + +Use the [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) table to view the table parts and partitions. For example, let’s assume that we have a `visits` table with partitioning by month. Let’s perform the `SELECT` query for the `system.parts` table: + +``` sql +SELECT + partition, + name, + active +FROM system.parts +WHERE table = 'visits' +``` + +``` text +┌─partition─┬─name──────────────┬─active─┐ +│ 201901 │ 201901_1_3_1 │ 0 │ +│ 201901 │ 201901_1_9_2_11 │ 1 │ +│ 201901 │ 201901_8_8_0 │ 0 │ +│ 201901 │ 201901_9_9_0 │ 0 │ +│ 201902 │ 201902_4_6_1_11 │ 1 │ +│ 201902 │ 201902_10_10_0_11 │ 1 │ +│ 201902 │ 201902_11_11_0_11 │ 1 │ +└───────────┴───────────────────┴────────┘ +``` + +The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. 
You can use this column value to specify the partition name in [ALTER … PARTITION](../../../sql-reference/statements/alter/partition.md) queries. + +The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query. + +Let’s break down the name of the part: `201901_1_9_2_11`: + +- `201901` is the partition name. +- `1` is the minimum number of the data block. +- `9` is the maximum number of the data block. +- `2` is the chunk level (the depth of the merge tree it is formed from). +- `11` is the mutation version (if a part mutated) + +:::info +The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level). +::: + +The `active` column shows the status of the part. `1` is active; `0` is inactive. The inactive parts are, for example, source parts remaining after merging to a larger part. The corrupted data parts are also indicated as inactive. + +As you can see in the example, there are several separated parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after inserting. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../../sql-reference/statements/optimize.md) query. Example: + +``` sql +OPTIMIZE TABLE visits PARTITION 201902; +``` + +``` text +┌─partition─┬─name─────────────┬─active─┐ +│ 201901 │ 201901_1_3_1 │ 0 │ +│ 201901 │ 201901_1_9_2_11 │ 1 │ +│ 201901 │ 201901_8_8_0 │ 0 │ +│ 201901 │ 201901_9_9_0 │ 0 │ +│ 201902 │ 201902_4_6_1 │ 0 │ +│ 201902 │ 201902_4_11_2_11 │ 1 │ +│ 201902 │ 201902_10_10_0 │ 0 │ +│ 201902 │ 201902_11_11_0 │ 0 │ +└───────────┴──────────────────┴────────┘ +``` + +Inactive parts will be deleted approximately 10 minutes after merging. + +Another way to view a set of parts and partitions is to go into the directory of the table: `/var/lib/clickhouse/data///`. For example: + +``` bash +/var/lib/clickhouse/data/default/visits$ ls -l +total 40 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2_11 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_8_8_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_9_9_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_10_10_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_11_11_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2_11 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached +``` + +The folders ‘201901_1_1_0’, ‘201901_1_7_1’ and so on are the directories of the parts. Each part relates to a corresponding partition and contains data just for a certain month (the table in this example has partitioning by month). + +The `detached` directory contains parts that were detached from the table using the [DETACH](../../../sql-reference/statements/alter/partition.md#alter_detach-partition) query. The corrupted parts are also moved to this directory, instead of being deleted. The server does not use the parts from the `detached` directory. 
You can add, delete, or modify the data in this directory at any time – the server will not know about this until you run the [ATTACH](../../../sql-reference/statements/alter/partition.md#alter_attach-partition) query. + +Note that on the operating server, you cannot manually change the set of parts or their data on the file system, since the server will not know about it. For non-replicated tables, you can do this when the server is stopped, but it isn’t recommended. For replicated tables, the set of parts cannot be changed in any case. + +ClickHouse allows you to perform operations with the partitions: delete them, copy from one table to another, or create a backup. See the list of all operations in the section [Manipulations With Partitions and Parts](../../../sql-reference/statements/alter/partition.md#alter_manipulations-with-partitions). + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/custom_partitioning_key/) diff --git a/docs/en/reference/engines/table-engines/mergetree-family/graphitemergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/graphitemergetree.md new file mode 100644 index 00000000000..35f3f99d5a9 --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/graphitemergetree.md @@ -0,0 +1,260 @@ +--- +sidebar_position: 90 +sidebar_label: GraphiteMergeTree +--- + +# GraphiteMergeTree {#graphitemergetree} + +This engine is designed for thinning and aggregating/averaging (rollup) [Graphite](http://graphite.readthedocs.io/en/latest/index.html) data. It may be helpful to developers who want to use ClickHouse as a data store for Graphite. + +You can use any ClickHouse table engine to store the Graphite data if you do not need rollup, but if you need a rollup use `GraphiteMergeTree`. The engine reduces the volume of storage and increases the efficiency of queries from Graphite. + +The engine inherits properties from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md). + +## Creating a Table {#creating-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + Path String, + Time DateTime, + Value , + Version + ... +) ENGINE = GraphiteMergeTree(config_section) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +See a detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +A table for the Graphite data should have the following columns for the following data: + +- Metric name (Graphite sensor). Data type: `String`. + +- Time of measuring the metric. Data type: `DateTime`. + +- Value of the metric. Data type: any numeric. + +- Version of the metric. Data type: any numeric (ClickHouse saves the rows with the highest version or the last written if versions are the same. Other rows are deleted during the merge of data parts). + +The names of these columns should be set in the rollup configuration. + +**GraphiteMergeTree parameters** + +- `config_section` — Name of the section in the configuration file, where are the rules of rollup set. + +**Query clauses** + +When creating a `GraphiteMergeTree` table, the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) are required, as when creating a `MergeTree` table. + +
+
+<summary>Deprecated Method for Creating a Table</summary>
+
+:::warning
+Do not use this method in new projects and, if possible, switch old projects to the method described above.
+:::
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    EventDate Date,
+    Path String,
+    Time DateTime,
+    Value <Numeric_type>,
+    Version <Numeric_type>
+    ...
+) ENGINE [=] GraphiteMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, config_section)
+```
+
+All of the parameters except `config_section` have the same meaning as in `MergeTree`.
+
+- `config_section` — Name of the section in the configuration file where the rollup rules are set.
+
+</details>
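+
+As a concrete sketch of the current syntax above; the table name, the numeric types, and the `graphite_rollup` section name are illustrative assumptions, and the named section must exist in the server configuration:
+
+``` sql
+-- A Graphite storage table: one row per (metric, timestamp) observation
+CREATE TABLE graphite_data
+(
+    Path String,
+    Time DateTime,
+    Value Float64,
+    Version UInt32
+)
+ENGINE = GraphiteMergeTree('graphite_rollup')
+PARTITION BY toYYYYMM(Time)
+ORDER BY (Path, Time);
+```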
+
+## Rollup Configuration {#rollup-configuration}
+
+The settings for rollup are defined by the [graphite_rollup](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-graphite) parameter in the server configuration. The name of the parameter can be anything. You can create several configurations and use them for different tables.
+
+Rollup configuration structure:
+
+      required-columns
+      patterns
+
+### Required Columns {#required-columns}
+
+- `path_column_name` — The name of the column storing the metric name (Graphite sensor). Default value: `Path`.
+- `time_column_name` — The name of the column storing the time of measuring the metric. Default value: `Time`.
+- `value_column_name` — The name of the column storing the value of the metric at the time set in `time_column_name`. Default value: `Value`.
+- `version_column_name` — The name of the column storing the version of the metric. Default value: `Timestamp`.
+
+### Patterns {#patterns}
+
+Structure of the `patterns` section:
+
+``` text
+pattern
+    rule_type
+    regexp
+    function
+pattern
+    rule_type
+    regexp
+    age
+    precision
+    ...
+pattern
+    rule_type
+    regexp
+    function
+    age
+    precision
+    ...
+pattern
+    ...
+default
+    function
+    age
+    precision
+    ...
+```
+
+:::warning
+Patterns must be strictly ordered:
+
+1. Patterns without `function` or `retention`.
+1. Patterns with both `function` and `retention`.
+1. Pattern `default`.
+:::
+
+When processing a row, ClickHouse checks the rules in the `pattern` sections. Each of the `pattern` (including `default`) sections can contain a `function` parameter for aggregation, `retention` parameters, or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used.
+
+Fields for `pattern` and `default` sections:
+
+- `rule_type` - a rule's type. It's applied only to particular metrics. The engine uses it to separate plain and tagged metrics. Optional parameter. Default value: `all`.
+It's unnecessary when performance is not critical, or when only one metric type is used, e.g. plain metrics. By default, only one set of rules is created. Otherwise, if any of the special types is defined, two different sets are created: one for plain metrics (root.branch.leaf) and one for tagged metrics (root.branch.leaf;tag1=value1).
+The default rules end up in both sets.
+Valid values:
+    - `all` (default) - a universal rule, used when `rule_type` is omitted.
+    - `plain` - a rule for plain metrics. The field `regexp` is processed as a regular expression.
+    - `tagged` - a rule for tagged metrics (metrics are stored in the DB in the format of `someName?tag1=value1&tag2=value2&tag3=value3`). The regular expression must be sorted by tags' names, and the first tag must be `__name__` if it exists. The field `regexp` is processed as a regular expression.
+    - `tag_list` - a rule for tagged metrics, a simple DSL for easier metric description in Graphite format: `someName;tag1=value1;tag2=value2`, `someName`, or `tag1=value1;tag2=value2`. The field `regexp` is translated into a `tagged` rule. Sorting by tags' names is unnecessary; it will be done automatically. A tag's value (but not a name) can be set as a regular expression, e.g. `env=(dev|staging)`.
+- `regexp` – A pattern for the metric name (a regular expression or DSL).
+- `age` – The minimum age of the data in seconds.
+- `precision` – How precisely to define the age of the data in seconds. Should be a divisor of 86400 (the number of seconds in a day).
+- `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`. Accepted functions: min / max / any / avg. The average is calculated imprecisely, like the average of the averages. + +### Configuration Example without rules types {#configuration-example} + +``` xml + + Version + + click_cost + any + + 0 + 5 + + + 86400 + 60 + + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + +``` + +### Configuration Example with rules types {#configuration-typed-example} + +``` xml + + Version + + plain + click_cost + any + + 0 + 5 + + + 86400 + 60 + + + + tagged + ^((.*)|.)min\? + min + + 0 + 5 + + + 86400 + 60 + + + + tagged + + min + + 0 + 5 + + + 86400 + 60 + + + + tag_list + someName;tag2=value2 + + 0 + 5 + + + 86400 + 60 + + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + +``` + + +:::warning +Data rollup is performed during merges. Usually, for old partitions, merges are not started, so for rollup it is necessary to trigger an unscheduled merge using [optimize](../../../sql-reference/statements/optimize.md). Or use additional tools, for example [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer). +::: diff --git a/docs/en/reference/engines/table-engines/mergetree-family/index.md b/docs/en/reference/engines/table-engines/mergetree-family/index.md new file mode 100644 index 00000000000..37e7bf5b589 --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/index.md @@ -0,0 +1,16 @@ +--- +sidebar_position: 10 +sidebar_label: MergeTree Family +--- + +# MergeTree Engine Family {#mergetree-engine-family} + +Table engines from the MergeTree family are the core of ClickHouse data storage capabilities. They provide most features for resilience and high-performance data retrieval: columnar storage, custom partitioning, sparse primary index, secondary data-skipping indexes, etc. + +Base [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) table engine can be considered the default table engine for single-node ClickHouse instances because it is versatile and practical for a wide range of use cases. + +For production usage [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) is the way to go, because it adds high-availability to all features of regular MergeTree engine. A bonus is automatic data deduplication on data ingestion, so the software can safely retry if there was some network issue during insert. + +All other engines of MergeTree family add extra functionality for some specific use cases. Usually, it’s implemented as additional data manipulation in background. + +The main downside of MergeTree engines is that they are rather heavy-weight. So the typical pattern is to have not so many of them. If you need many small tables, for example for temporary data, consider [Log engine family](../../../engines/table-engines/log-family/index.md). diff --git a/docs/en/reference/engines/table-engines/mergetree-family/mergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/mergetree.md new file mode 100644 index 00000000000..1195ee55dc7 --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/mergetree.md @@ -0,0 +1,953 @@ +--- +sidebar_position: 11 +sidebar_label: MergeTree +--- + +# MergeTree {#table_engines-mergetree} + +The `MergeTree` engine and other engines of this family (`*MergeTree`) are the most robust ClickHouse table engines. 
+ +Engines in the `MergeTree` family are designed for inserting a very large amount of data into a table. The data is quickly written to the table part by part, then rules are applied for merging the parts in the background. This method is much more efficient than continually rewriting the data in storage during insert. + +Main features: + +- Stores data sorted by primary key. + + This allows you to create a small sparse index that helps find data faster. + +- Partitions can be used if the [partitioning key](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md) is specified. + + ClickHouse supports certain operations with partitions that are more efficient than general operations on the same data with the same result. ClickHouse also automatically cuts off the partition data where the partitioning key is specified in the query. + +- Data replication support. + + The family of `ReplicatedMergeTree` tables provides data replication. For more information, see [Data replication](../../../engines/table-engines/mergetree-family/replication.md). + +- Data sampling support. + + If necessary, you can set the data sampling method in the table. + +:::info +The [Merge](../../../engines/table-engines/special/merge.md#merge) engine does not belong to the `*MergeTree` family. +::: + +## Creating a Table {#table_engine-mergetree-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... + INDEX index_name1 expr1 TYPE type1(...) GRANULARITY value1, + INDEX index_name2 expr2 TYPE type2(...) GRANULARITY value2, + ... + PROJECTION projection_name_1 (SELECT [GROUP BY] [ORDER BY]), + PROJECTION projection_name_2 (SELECT [GROUP BY] [ORDER BY]) +) ENGINE = MergeTree() +ORDER BY expr +[PARTITION BY expr] +[PRIMARY KEY expr] +[SAMPLE BY expr] +[TTL expr + [DELETE|TO DISK 'xxx'|TO VOLUME 'xxx' [, ...] ] + [WHERE conditions] + [GROUP BY key_expr [SET v1 = aggr_func(v1) [, v2 = aggr_func(v2) ...]] ] ] +[SETTINGS name=value, ...] +``` + +For a description of parameters, see the [CREATE query description](../../../sql-reference/statements/create/table.md). + +### Query Clauses {#mergetree-query-clauses} + +- `ENGINE` — Name and parameters of the engine. `ENGINE = MergeTree()`. The `MergeTree` engine does not have parameters. + +- `ORDER BY` — The sorting key. + + A tuple of column names or arbitrary expressions. Example: `ORDER BY (CounterID, EventDate)`. + + ClickHouse uses the sorting key as a primary key if the primary key is not defined obviously by the `PRIMARY KEY` clause. + + Use the `ORDER BY tuple()` syntax, if you do not need sorting. See [Selecting the Primary Key](#selecting-the-primary-key). + +- `PARTITION BY` — The [partitioning key](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md). Optional. In most cases you don't need partition key, and in most other cases you don't need partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead make client identifier or name the first column in the ORDER BY expression). + + For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](../../../sql-reference/data-types/date.md). 
The partition names here have the `"YYYYMM"` format. + +- `PRIMARY KEY` — The primary key if it [differs from the sorting key](#choosing-a-primary-key-that-differs-from-the-sorting-key). Optional. + + By default the primary key is the same as the sorting key (which is specified by the `ORDER BY` clause). Thus in most cases it is unnecessary to specify a separate `PRIMARY KEY` clause. + +- `SAMPLE BY` — An expression for sampling. Optional. + + If a sampling expression is used, the primary key must contain it. The result of a sampling expression must be an unsigned integer. Example: `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`. + +- `TTL` — A list of rules specifying storage duration of rows and defining logic of automatic parts movement [between disks and volumes](#table_engine-mergetree-multiple-volumes). Optional. + + Expression must have one `Date` or `DateTime` column as a result. Example: + `TTL date + INTERVAL 1 DAY` + + Type of the rule `DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'|GROUP BY` specifies an action to be done with the part if the expression is satisfied (reaches current time): removal of expired rows, moving a part (if expression is satisfied for all rows in a part) to specified disk (`TO DISK 'xxx'`) or to volume (`TO VOLUME 'xxx'`), or aggregating values in expired rows. Default type of the rule is removal (`DELETE`). List of multiple rules can be specified, but there should be no more than one `DELETE` rule. + + For more details, see [TTL for columns and tables](#table_engine-mergetree-ttl) + +- `SETTINGS` — Additional parameters that control the behavior of the `MergeTree` (optional): + + - `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Data Storage](#mergetree-data-storage). + - `index_granularity_bytes` — Maximum size of data granules in bytes. Default value: 10Mb. To restrict the granule size only by number of rows, set to 0 (not recommended). See [Data Storage](#mergetree-data-storage). + - `min_index_granularity_bytes` — Min allowed size of data granules in bytes. Default value: 1024b. To provide a safeguard against accidentally creating tables with very low index_granularity_bytes. See [Data Storage](#mergetree-data-storage). + - `enable_mixed_granularity_parts` — Enables or disables transitioning to control the granule size with the `index_granularity_bytes` setting. Before version 19.11, there was only the `index_granularity` setting for restricting granule size. The `index_granularity_bytes` setting improves ClickHouse performance when selecting data from tables with big rows (tens and hundreds of megabytes). If you have tables with big rows, you can enable this setting for the tables to improve the efficiency of `SELECT` queries. + - `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../../../operations/server-configuration-parameters/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”. + - `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. 
If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes. + + - `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with delete TTL. Default value: `14400` seconds (4 hours). + - `merge_with_recompression_ttl_timeout` — Minimum delay in seconds before repeating a merge with recompression TTL. Default value: `14400` seconds (4 hours). + - `try_fetch_recompressed_part_timeout` — Timeout (in seconds) before starting merge with recompression. During this time ClickHouse tries to fetch recompressed part from replica which assigned this merge with recompression. Default value: `7200` seconds (2 hours). + - `write_final_mark` — Enables or disables writing the final index mark at the end of data part (after the last byte). Default value: 1. Don’t turn it off. + - `merge_max_block_size` — Maximum number of rows in block for merge operations. Default value: 8192. + - `storage_policy` — Storage policy. See [Using Multiple Block Devices for Data Storage](#table_engine-mergetree-multiple-volumes). + - `min_bytes_for_wide_part`, `min_rows_for_wide_part` — Minimum number of bytes/rows in a data part that can be stored in `Wide` format. You can set one, both or none of these settings. See [Data Storage](#mergetree-data-storage). + - `max_parts_in_total` — Maximum number of parts in all partitions. + - `max_compress_block_size` — Maximum size of blocks of uncompressed data before compressing for writing to a table. You can also specify this setting in the global settings (see [max_compress_block_size](../../../operations/settings/settings.md#max-compress-block-size) setting). The value specified when table is created overrides the global value for this setting. + - `min_compress_block_size` — Minimum size of blocks of uncompressed data required for compression when writing the next mark. You can also specify this setting in the global settings (see [min_compress_block_size](../../../operations/settings/settings.md#min-compress-block-size) setting). The value specified when table is created overrides the global value for this setting. + - `max_partitions_to_read` — Limits the maximum number of partitions that can be accessed in one query. You can also specify setting [max_partitions_to_read](../../../operations/settings/merge-tree-settings.md#max-partitions-to-read) in the global setting. + +**Example of Sections Setting** + +``` sql +ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192 +``` + +In the example, we set partitioning by month. + +We also set an expression for sampling as a hash by the user ID. This allows you to pseudorandomize the data in the table for each `CounterID` and `EventDate`. If you define a [SAMPLE](../../../sql-reference/statements/select/sample.md#select-sample-clause) clause when selecting the data, ClickHouse will return an evenly pseudorandom data sample for a subset of users. + +The `index_granularity` setting can be omitted because 8192 is the default value. + +
+ +Deprecated Method for Creating a Table + +:::warning +Do not use this method in new projects. If possible, switch old projects to the method described above. +::: + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] MergeTree(date-column [, sampling_expression], (primary, key), index_granularity) +``` + +**MergeTree() Parameters** + +- `date-column` — The name of a column of the [Date](../../../sql-reference/data-types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format. +- `sampling_expression` — An expression for sampling. +- `(primary, key)` — Primary key. Type: [Tuple()](../../../sql-reference/data-types/tuple.md) +- `index_granularity` — The granularity of an index. The number of data rows between the “marks” of an index. The value 8192 is appropriate for most tasks. + +**Example** + +``` sql +MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) +``` + +The `MergeTree` engine is configured in the same way as in the example above for the main engine configuration method. +
## Data Storage {#mergetree-data-storage}

A table consists of data parts sorted by primary key.

When data is inserted in a table, separate data parts are created and each of them is lexicographically sorted by primary key. For example, if the primary key is `(CounterID, Date)`, the data in the part is sorted by `CounterID`, and within each `CounterID`, it is ordered by `Date`.

Data belonging to different partitions are separated into different parts. In the background, ClickHouse merges data parts for more efficient storage. Parts belonging to different partitions are not merged. The merge mechanism does not guarantee that all rows with the same primary key will be in the same data part.

Data parts can be stored in `Wide` or `Compact` format. In `Wide` format each column is stored in a separate file in the filesystem; in `Compact` format all columns are stored in one file. `Compact` format can be used to increase the performance of small and frequent inserts.

The data storage format is controlled by the `min_bytes_for_wide_part` and `min_rows_for_wide_part` settings of the table engine. If the number of bytes or rows in a data part is less than the corresponding setting's value, the part is stored in `Compact` format. Otherwise it is stored in `Wide` format. If none of these settings is set, data parts are stored in `Wide` format.

Each data part is logically divided into granules. A granule is the smallest indivisible data set that ClickHouse reads when selecting data. ClickHouse does not split rows or values, so each granule always contains an integer number of rows. The first row of a granule is marked with the value of the primary key for the row. For each data part, ClickHouse creates an index file that stores the marks. For each column, whether it’s in the primary key or not, ClickHouse also stores the same marks. These marks let you find data directly in column files.

The granule size is restricted by the `index_granularity` and `index_granularity_bytes` settings of the table engine. The number of rows in a granule lies in the `[1, index_granularity]` range, depending on the size of the rows. The size of a granule can exceed `index_granularity_bytes` if the size of a single row is greater than the value of the setting. In this case, the size of the granule equals the size of the row.

## Primary Keys and Indexes in Queries {#primary-keys-and-indexes-in-queries}

Take the `(CounterID, Date)` primary key as an example. In this case, the sorting and index can be illustrated as follows:

    Whole data:     [---------------------------------------------]
    CounterID:      [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll]
    Date:           [1111111222222233331233211111222222333211111112122222223111112223311122333]
    Marks:           |      |      |      |      |      |      |      |      |      |      |
                      a,1    a,2    a,3    b,3    e,2    e,3    g,1    h,2    i,1    i,3    l,3
    Marks numbers:   0      1      2      3      4      5      6      7      8      9      10

If the data query specifies:

- `CounterID in ('a', 'h')`, the server reads the data in the ranges of marks `[0, 3)` and `[6, 8)`.
- `CounterID IN ('a', 'h') AND Date = 3`, the server reads the data in the ranges of marks `[1, 3)` and `[7, 8)`.
- `Date = 3`, the server reads the data in the range of marks `[1, 10]`.

The examples above show that it is always more effective to use an index than a full scan.

A sparse index allows extra data to be read. When reading a single range of the primary key, up to `index_granularity * 2` extra rows in each data block can be read.
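In recent ClickHouse releases you can inspect how the primary index prunes granules for a particular query with `EXPLAIN`. A minimal sketch, assuming a table such as the hypothetical `visits` table shown earlier:

``` sql
-- Show which parts and primary-key mark ranges would be read for this query.
EXPLAIN indexes = 1
SELECT count()
FROM visits
WHERE CounterID = 34 AND EventDate = toDate('2014-03-22');
```

The output lists the selected parts and granules for each index, which makes it easy to confirm that a condition actually uses the primary key.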
+ +Sparse indexes allow you to work with a very large number of table rows, because in most cases, such indexes fit in the computer’s RAM. + +ClickHouse does not require a unique primary key. You can insert multiple rows with the same primary key. + +You can use `Nullable`-typed expressions in the `PRIMARY KEY` and `ORDER BY` clauses but it is strongly discouraged. To allow this feature, turn on the [allow_nullable_key](../../../operations/settings/settings.md#allow-nullable-key) setting. The [NULLS_LAST](../../../sql-reference/statements/select/order-by.md#sorting-of-special-values) principle applies for `NULL` values in the `ORDER BY` clause. + +### Selecting the Primary Key {#selecting-the-primary-key} + +The number of columns in the primary key is not explicitly limited. Depending on the data structure, you can include more or fewer columns in the primary key. This may: + +- Improve the performance of an index. + + If the primary key is `(a, b)`, then adding another column `c` will improve the performance if the following conditions are met: + + - There are queries with a condition on column `c`. + - Long data ranges (several times longer than the `index_granularity`) with identical values for `(a, b)` are common. In other words, when adding another column allows you to skip quite long data ranges. + +- Improve data compression. + + ClickHouse sorts data by primary key, so the higher the consistency, the better the compression. + +- Provide additional logic when merging data parts in the [CollapsingMergeTree](../../../engines/table-engines/mergetree-family/collapsingmergetree.md#table_engine-collapsingmergetree) and [SummingMergeTree](../../../engines/table-engines/mergetree-family/summingmergetree.md) engines. + + In this case it makes sense to specify the *sorting key* that is different from the primary key. + +A long primary key will negatively affect the insert performance and memory consumption, but extra columns in the primary key do not affect ClickHouse performance during `SELECT` queries. + +You can create a table without a primary key using the `ORDER BY tuple()` syntax. In this case, ClickHouse stores data in the order of inserting. If you want to save data order when inserting data by `INSERT ... SELECT` queries, set [max_insert_threads = 1](../../../operations/settings/settings.md#settings-max-insert-threads). + +To select data in the initial order, use [single-threaded](../../../operations/settings/settings.md#settings-max_threads) `SELECT` queries. + +### Choosing a Primary Key that Differs from the Sorting Key {#choosing-a-primary-key-that-differs-from-the-sorting-key} + +It is possible to specify a primary key (an expression with values that are written in the index file for each mark) that is different from the sorting key (an expression for sorting the rows in data parts). In this case the primary key expression tuple must be a prefix of the sorting key expression tuple. + +This feature is helpful when using the [SummingMergeTree](../../../engines/table-engines/mergetree-family/summingmergetree.md) and +[AggregatingMergeTree](../../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engines. In a common case when using these engines, the table has two types of columns: *dimensions* and *measures*. Typical queries aggregate values of measure columns with arbitrary `GROUP BY` and filtering by dimensions. 
Because SummingMergeTree and AggregatingMergeTree aggregate rows with the same value of the sorting key, it is natural to add all dimensions to it. As a result, the key expression consists of a long list of columns and this list must be frequently updated with newly added dimensions. + +In this case it makes sense to leave only a few columns in the primary key that will provide efficient range scans and add the remaining dimension columns to the sorting key tuple. + +[ALTER](../../../sql-reference/statements/alter/index.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts do not need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification. + +### Use of Indexes and Partitions in Queries {#use-of-indexes-and-partitions-in-queries} + +For `SELECT` queries, ClickHouse analyzes whether an index can be used. An index can be used if the `WHERE/PREWHERE` clause has an expression (as one of the conjunction elements, or entirely) that represents an equality or inequality comparison operation, or if it has `IN` or `LIKE` with a fixed prefix on columns or expressions that are in the primary key or partitioning key, or on certain partially repetitive functions of these columns, or logical relationships of these expressions. + +Thus, it is possible to quickly run queries on one or many ranges of the primary key. In this example, queries will be fast when run for a specific tracking tag, for a specific tag and date range, for a specific tag and date, for multiple tags with a date range, and so on. + +Let’s look at the engine configured as follows: + + ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 + +In this case, in queries: + +``` sql +SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34 +SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42) +SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01')) +``` + +ClickHouse will use the primary key index to trim improper data and the monthly partitioning key to trim partitions that are in improper date ranges. + +The queries above show that the index is used even for complex expressions. Reading from the table is organized so that using the index can’t be slower than a full scan. + +In the example below, the index can’t be used. + +``` sql +SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' +``` + +To check whether ClickHouse can use the index when running a query, use the settings [force_index_by_date](../../../operations/settings/settings.md#settings-force_index_by_date) and [force_primary_key](../../../operations/settings/settings.md#force-primary-key). + +The key for partitioning by month allows reading only those data blocks which contain dates from the proper range. In this case, the data block may contain data for many dates (up to an entire month). Within a block, data is sorted by primary key, which might not contain the date as the first column. 
Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date. + +### Use of Index for Partially-monotonic Primary Keys {#use-of-index-for-partially-monotonic-primary-keys} + +Consider, for example, the days of the month. They form a [monotonic sequence](https://en.wikipedia.org/wiki/Monotonic_function) for one month, but not monotonic for more extended periods. This is a partially-monotonic sequence. If a user creates the table with partially-monotonic primary key, ClickHouse creates a sparse index as usual. When a user selects data from this kind of table, ClickHouse analyzes the query conditions. If the user wants to get data between two marks of the index and both these marks fall within one month, ClickHouse can use the index in this particular case because it can calculate the distance between the parameters of a query and index marks. + +ClickHouse cannot use an index if the values of the primary key in the query parameter range do not represent a monotonic sequence. In this case, ClickHouse uses the full scan method. + +ClickHouse uses this logic not only for days of the month sequences, but for any primary key that represents a partially-monotonic sequence. + +### Data Skipping Indexes {#table_engine-mergetree-data_skipping-indexes} + +The index declaration is in the columns section of the `CREATE` query. + +``` sql +INDEX index_name expr TYPE type(...) GRANULARITY granularity_value +``` + +For tables from the `*MergeTree` family, data skipping indices can be specified. + +These indices aggregate some information about the specified expression on blocks, which consist of `granularity_value` granules (the size of the granule is specified using the `index_granularity` setting in the table engine). Then these aggregates are used in `SELECT` queries for reducing the amount of data to read from the disk by skipping big blocks of data where the `where` query cannot be satisfied. + +**Example** + +``` sql +CREATE TABLE table_name +( + u64 UInt64, + i32 Int32, + s String, + ... + INDEX a (u64 * i32, s) TYPE minmax GRANULARITY 3, + INDEX b (u64 * length(s)) TYPE set(1000) GRANULARITY 4 +) ENGINE = MergeTree() +... +``` + +Indices from the example can be used by ClickHouse to reduce the amount of data to read from disk in the following queries: + +``` sql +SELECT count() FROM table WHERE s < 'z' +SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 +``` + +#### Available Types of Indices {#available-types-of-indices} + +- `minmax` + + Stores extremes of the specified expression (if the expression is `tuple`, then it stores extremes for each element of `tuple`), uses stored info for skipping blocks of data like the primary key. + +- `set(max_rows)` + + Stores unique values of the specified expression (no more than `max_rows` rows, `max_rows=0` means “no limits”). Uses the values to check if the `WHERE` expression is not satisfiable on a block of data. + +- `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` + + Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with datatypes: [String](../../../sql-reference/data-types/string.md), [FixedString](../../../sql-reference/data-types/fixedstring.md) and [Map](../../../sql-reference/data-types/map.md). Can be used for optimization of `EQUALS`, `LIKE` and `IN` expressions. 
    - `n` — ngram size.
    - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well).
    - `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
    - `random_seed` — The seed for the Bloom filter hash functions.

- `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`

    The same as `ngrambf_v1`, but stores tokens instead of ngrams. Tokens are sequences separated by non-alphanumeric characters.

- `bloom_filter([false_positive])` — Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns.

    The optional `false_positive` parameter is the probability of receiving a false positive response from the filter. Possible values: (0, 1). Default value: 0.025.

    Supported data types: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`, `UUID`, `Map`.

    For the `Map` data type, the client can specify whether the index should be created for keys or for values using the [mapKeys](../../../sql-reference/functions/tuple-map-functions.md#mapkeys) or [mapValues](../../../sql-reference/functions/tuple-map-functions.md#mapvalues) function.

    The following functions can use the filter: [equals](../../../sql-reference/functions/comparison-functions.md), [notEquals](../../../sql-reference/functions/comparison-functions.md), [in](../../../sql-reference/functions/in-functions.md), [notIn](../../../sql-reference/functions/in-functions.md), [has](../../../sql-reference/functions/array-functions.md#hasarr-elem), [hasAny](../../../sql-reference/functions/array-functions.md#hasany), [hasAll](../../../sql-reference/functions/array-functions.md#hasall).

    Example of index creation for the `Map` data type:

``` sql
INDEX map_key_index mapKeys(map_column) TYPE bloom_filter GRANULARITY 1
INDEX map_value_index mapValues(map_column) TYPE bloom_filter GRANULARITY 1
```

More examples of data skipping index declarations:

``` sql
INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4
INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4
INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4
```

#### Functions Support {#functions-support}

Conditions in the `WHERE` clause contain calls of functions that operate on columns. If the column is a part of an index, ClickHouse tries to use this index when performing the functions. ClickHouse supports different subsets of functions for using indexes.

The `set` index can be used with all functions. Function subsets for other indexes are shown in the table below.
+ +| Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter | +|------------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| +| [equals (=, ==)](../../../sql-reference/functions/comparison-functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [like](../../../sql-reference/functions/string-search-functions.md#function-like) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [notLike](../../../sql-reference/functions/string-search-functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [endsWith](../../../sql-reference/functions/string-functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | +| [multiSearchAny](../../../sql-reference/functions/string-search-functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | +| [in](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notIn](../../../sql-reference/functions/in-functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [less (<)](../../../sql-reference/functions/comparison-functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greater (>)](../../../sql-reference/functions/comparison-functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [lessOrEquals (<=)](../../../sql-reference/functions/comparison-functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greaterOrEquals (>=)](../../../sql-reference/functions/comparison-functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [empty](../../../sql-reference/functions/array-functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | + +Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization. + +:::note +Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can not be used for optimizing queries where the result of a function is expected to be false. + +For example: + +- Can be optimized: + - `s LIKE '%test%'` + - `NOT s NOT LIKE '%test%'` + - `s = 1` + - `NOT s != 1` + - `startsWith(s, 'test')` +- Can not be optimized: + - `NOT s LIKE '%test%'` + - `s NOT LIKE '%test%'` + - `NOT s = 1` + - `s != 1` + - `NOT startsWith(s, 'test')` +::: + +## Projections {#projections} +Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries. + +Projections are an experimental feature. To enable them you must set the [allow_experimental_projection_optimization](../../../operations/settings/settings.md#allow-experimental-projection-optimization) to `1`. See also the [force_optimize_projection](../../../operations/settings/settings.md#force-optimize-projection) setting. + +Projections are not supported in the `SELECT` statements with the [FINAL](../../../sql-reference/statements/select/from.md#select-from-final) modifier. + +### Projection Query {#projection-query} +A projection query is what defines a projection. It implicitly selects data from the parent table. 
**Syntax**

```sql
SELECT <column list expr> [GROUP BY] <group keys expr> [ORDER BY] <expr>
```

Projections can be modified or dropped with the [ALTER](../../../sql-reference/statements/alter/projection.md) statement.

### Projection Storage {#projection-storage}
Projections are stored inside the part directory. It's similar to an index but contains a subdirectory that stores an anonymous `MergeTree` table's part. The table is induced by the definition query of the projection. If there is a `GROUP BY` clause, the underlying storage engine becomes [AggregatingMergeTree](aggregatingmergetree.md), and all aggregate functions are converted to `AggregateFunction`. If there is an `ORDER BY` clause, the `MergeTree` table uses it as its primary key expression. During the merge process the projection part is merged via its storage's merge routine. The checksum of the parent table's part is combined with the projection's part. Other maintenance jobs are similar to skip indices.

### Query Analysis {#projection-query-analysis}
1. Check if the projection can be used to answer the given query, that is, it generates the same answer as querying the base table.
2. Select the best feasible match, which contains the least granules to read.
3. The query pipeline which uses projections will be different from the one that uses the original parts. If the projection is absent in some parts, we can add the pipeline to "project" it on the fly.

## Concurrent Data Access {#concurrent-data-access}

For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations.

Reading from a table is automatically parallelized.

## TTL for Columns and Tables {#table_engine-mergetree-ttl}

Determines the lifetime of values.

The `TTL` clause can be set for the whole table and for each individual column. Table-level `TTL` can also specify the logic of automatic moving data between disks and volumes, or recompressing parts where all the data has been expired.

Expressions must evaluate to [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md) data type.

**Syntax**

Setting time-to-live for a column:

``` sql
TTL time_column
TTL time_column + interval
```

To define `interval`, use [time interval](../../../sql-reference/operators/index.md#operators-datetime) operators, for example:

``` sql
TTL date_time + INTERVAL 1 MONTH
TTL date_time + INTERVAL 15 HOUR
```

### Column TTL {#mergetree-column-ttl}

When the values in the column expire, ClickHouse replaces them with the default values for the column data type. If all the column values in the data part expire, ClickHouse deletes this column from the data part in a filesystem.

The `TTL` clause can’t be used for key columns.
**Examples**

Creating a table with `TTL`:

``` sql
CREATE TABLE example_table
(
    d DateTime,
    a Int TTL d + INTERVAL 1 MONTH,
    b Int TTL d + INTERVAL 1 MONTH,
    c String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(d)
ORDER BY d;
```

Adding TTL to a column of an existing table:

``` sql
ALTER TABLE example_table
    MODIFY COLUMN
    c String TTL d + INTERVAL 1 DAY;
```

Altering TTL of the column:

``` sql
ALTER TABLE example_table
    MODIFY COLUMN
    c String TTL d + INTERVAL 1 MONTH;
```

### Table TTL {#mergetree-table-ttl}

A table can have an expression for removal of expired rows, and multiple expressions for automatic movement of parts between [disks or volumes](#table_engine-mergetree-multiple-volumes). When rows in the table expire, ClickHouse deletes all corresponding rows. For moving or recompressing parts, all rows of a part must satisfy the `TTL` expression criteria.

``` sql
TTL expr
    [DELETE|RECOMPRESS codec_name1|TO DISK 'xxx'|TO VOLUME 'xxx'][, DELETE|RECOMPRESS codec_name2|TO DISK 'aaa'|TO VOLUME 'bbb'] ...
    [WHERE conditions]
    [GROUP BY key_expr [SET v1 = aggr_func(v1) [, v2 = aggr_func(v2) ...]] ]
```

A type of TTL rule may follow each TTL expression. It determines the action to be performed once the expression is satisfied (reaches the current time):

- `DELETE` - delete expired rows (default action);
- `RECOMPRESS codec_name` - recompress the data part with the `codec_name`;
- `TO DISK 'aaa'` - move the part to the disk `aaa`;
- `TO VOLUME 'bbb'` - move the part to the volume `bbb`;
- `GROUP BY` - aggregate expired rows.

With the `WHERE` clause you may specify which of the expired rows to delete or aggregate (it cannot be applied to moves or recompression).

The `GROUP BY` expression must be a prefix of the table primary key.

If a column is not part of the `GROUP BY` expression and is not set explicitly in the `SET` clause, the result row contains an occasional value from the grouped rows (as if the aggregate function `any` were applied to it).

**Examples**

Creating a table with `TTL`:

``` sql
CREATE TABLE example_table
(
    d DateTime,
    a Int
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(d)
ORDER BY d
TTL d + INTERVAL 1 MONTH [DELETE],
    d + INTERVAL 1 WEEK TO VOLUME 'aaa',
    d + INTERVAL 2 WEEK TO DISK 'bbb';
```

Altering `TTL` of the table:

``` sql
ALTER TABLE example_table
    MODIFY TTL d + INTERVAL 1 DAY;
```

Creating a table where rows expire after one month. The expired rows where the dates are Mondays are deleted:

``` sql
CREATE TABLE table_with_where
(
    d DateTime,
    a Int
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(d)
ORDER BY d
TTL d + INTERVAL 1 MONTH DELETE WHERE toDayOfWeek(d) = 1;
```

Creating a table where expired rows are recompressed:

```sql
CREATE TABLE table_for_recompression
(
    d DateTime,
    key UInt64,
    value String
) ENGINE MergeTree()
ORDER BY tuple()
PARTITION BY key
TTL d + INTERVAL 1 MONTH RECOMPRESS CODEC(ZSTD(17)), d + INTERVAL 1 YEAR RECOMPRESS CODEC(LZ4HC(10))
SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0;
```

Creating a table where expired rows are aggregated. In the resulting rows `x` contains the maximum value across the grouped rows, `y` — the minimum value, and `d` — any occasional value from the grouped rows.
``` sql
CREATE TABLE table_for_aggregation
(
    d DateTime,
    k1 Int,
    k2 Int,
    x Int,
    y Int
)
ENGINE = MergeTree
ORDER BY (k1, k2)
TTL d + INTERVAL 1 MONTH GROUP BY k1, k2 SET x = max(x), y = min(y);
```

### Removing Expired Data {#mergetree-removing-expired-data}

Data with an expired `TTL` is removed when ClickHouse merges data parts.

When ClickHouse detects that data is expired, it performs an off-schedule merge. To control the frequency of such merges, you can set `merge_with_ttl_timeout`. If the value is too low, it will perform many off-schedule merges that may consume a lot of resources.

If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../../sql-reference/statements/optimize.md) query before `SELECT`.

**See Also**

- [ttl_only_drop_parts](../../../operations/settings/settings.md#ttl_only_drop_parts) setting


## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes}

### Introduction {#introduction}

`MergeTree` family table engines can store data on multiple block devices. For example, it can be useful when the data of a certain table are implicitly split into “hot” and “cold”. The most recent data is regularly requested but requires only a small amount of space. On the contrary, the fat-tailed historical data is requested rarely. If several disks are available, the “hot” data may be located on fast disks (for example, NVMe SSDs or in memory), while the “cold” data - on relatively slow ones (for example, HDD).

A data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../../sql-reference/statements/alter/partition.md#alter_move-partition) queries.

### Terms {#terms}

- Disk — Block device mounted to the filesystem.
- Default disk — Disk that stores the path specified in the [path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-path) server setting.
- Volume — Ordered set of equal disks (similar to [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)).
- Storage policy — Set of volumes and the rules for moving data between them.

The names given to the described entities can be found in the system tables, [system.storage_policies](../../../operations/system-tables/storage_policies.md#system_tables-storage_policies) and [system.disks](../../../operations/system-tables/disks.md#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables.

### Configuration {#table_engine-mergetree-multiple-volumes_configure}

Disks, volumes and storage policies should be declared inside the `<storage_configuration>` tag either in the main file `config.xml` or in a distinct file in the `config.d` directory.

Configuration structure:

``` xml
<storage_configuration>
    <disks>
        <disk_name_1> <!-- disk name -->
            <path>/mnt/fast_ssd/clickhouse/</path>
        </disk_name_1>
        <disk_name_2>
            <path>/mnt/hdd1/clickhouse/</path>
            <keep_free_space_bytes>10485760</keep_free_space_bytes>
        </disk_name_2>
        <disk_name_3>
            <path>/mnt/hdd2/clickhouse/</path>
            <keep_free_space_bytes>10485760</keep_free_space_bytes>
        </disk_name_3>

        ...
    </disks>

    ...
</storage_configuration>
```

Tags:

- `<disk_name_N>` — Disk name. Names must be different for all disks.
- `path` — path under which a server will store data (`data` and `shadow` folders), should be terminated with ‘/’.
- `keep_free_space_bytes` — the amount of free disk space to be reserved.

The order of the disk definition is not important.
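Once disks are declared, one way to confirm that the server picked them up is to query the `system.disks` table mentioned above, for example:

``` sql
-- List the disks the server currently knows about, with their free and total space.
SELECT
    name,
    path,
    formatReadableSize(free_space)  AS free,
    formatReadableSize(total_space) AS total
FROM system.disks;
```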
Storage policies configuration markup:

``` xml
<storage_configuration>
    ...
    <policies>
        <policy_name_1>
            <volumes>
                <volume_name_1>
                    <disk>disk_name_from_disks_configuration</disk>
                    <max_data_part_size_bytes>1073741824</max_data_part_size_bytes>
                </volume_name_1>
                <volume_name_2>
                    <!-- configuration -->
                </volume_name_2>
                <!-- more volumes -->
            </volumes>
            <move_factor>0.2</move_factor>
        </policy_name_1>
        <policy_name_2>
            <!-- configuration -->
        </policy_name_2>

        <!-- more policies -->
    </policies>
    ...
</storage_configuration>
```

Tags:

- `policy_name_N` — Policy name. Policy names must be unique.
- `volume_name_N` — Volume name. Volume names must be unique.
- `disk` — a disk within a volume.
- `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume’s disks. If the estimated size of a merged part is bigger than `max_data_part_size_bytes`, then this part will be written to the next volume. Basically this feature allows you to keep new/small parts on a hot (SSD) volume and move them to a cold (HDD) volume when they reach a large size. Do not use this setting if your policy has only one volume.
- `move_factor` — when the amount of available space gets lower than this factor, data automatically starts to move to the next volume, if any (by default, 0.1). ClickHouse sorts existing parts by size from largest to smallest (in descending order) and selects parts with the total size that is sufficient to meet the `move_factor` condition. If the total size of all parts is insufficient, all parts will be moved.
- `prefer_not_to_merge` — Disables merging of data parts on this volume. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks.

Configuration examples:

``` xml
<storage_configuration>
    ...
    <policies>
        <hdd_in_order> <!-- policy name -->
            <volumes>
                <single> <!-- volume name -->
                    <disk>disk1</disk>
                    <disk>disk2</disk>
                </single>
            </volumes>
        </hdd_in_order>

        <moving_from_ssd_to_hdd>
            <volumes>
                <hot>
                    <disk>fast_ssd</disk>
                    <max_data_part_size_bytes>1073741824</max_data_part_size_bytes>
                </hot>
                <cold>
                    <disk>disk1</disk>
                </cold>
            </volumes>
            <move_factor>0.2</move_factor>
        </moving_from_ssd_to_hdd>

        <small_jbod_with_external_no_merges>
            <volumes>
                <main>
                    <disk>jbod1</disk>
                </main>
                <external>
                    <disk>external</disk>
                    <prefer_not_to_merge>true</prefer_not_to_merge>
                </external>
            </volumes>
        </small_jbod_with_external_no_merges>
    </policies>
    ...
</storage_configuration>
```

In the given example, the `hdd_in_order` policy implements the [round-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling) approach. This policy defines only one volume (`single`), and the data parts are stored on all its disks in circular order. Such a policy can be quite useful if several similar disks are mounted to the system, but RAID is not configured. Keep in mind that each individual disk drive is not reliable and you might want to compensate for it with a replication factor of 3 or more.

If there are different kinds of disks available in the system, the `moving_from_ssd_to_hdd` policy can be used instead. The volume `hot` consists of an SSD disk (`fast_ssd`), and the maximum size of a part that can be stored on this volume is 1GB. All the parts with a size larger than 1GB will be stored directly on the `cold` volume, which contains an HDD disk `disk1`.
Also, once the disk `fast_ssd` gets filled by more than 80%, data will be transferred to `disk1` by a background process.

The order of volume enumeration within a storage policy is important. Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns.

When creating a table, one can apply one of the configured storage policies to it:

``` sql
CREATE TABLE table_with_non_default_policy (
    EventDate Date,
    OrderID UInt64,
    BannerID UInt64,
    SearchPhrase String
) ENGINE = MergeTree
ORDER BY (OrderID, BannerID)
PARTITION BY toYYYYMM(EventDate)
SETTINGS storage_policy = 'moving_from_ssd_to_hdd'
```

The `default` storage policy implies using only one volume, which consists of only one disk given in `<path>`.
You can change the storage policy after table creation with the [ALTER TABLE ... MODIFY SETTING] query; the new policy should include all the old disks and volumes with the same names.

The number of threads performing background moves of data parts can be changed by the [background_move_pool_size](../../../operations/settings/settings.md#background_move_pool_size) setting.

### Details {#details}

In the case of `MergeTree` tables, data gets to disk in different ways:

- As a result of an insert (`INSERT` query).
- During background merges and [mutations](../../../sql-reference/statements/alter/index.md#alter-mutations).
- When downloading from another replica.
- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../../sql-reference/statements/alter/partition.md#alter_freeze-partition).

In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy:

1.  The first volume (in the order of definition) that has enough disk space for storing a part (`unreserved_space > current_part_size`) and allows for storing parts of a given size (`max_data_part_size_bytes > current_part_size`) is chosen.
2.  Within this volume, that disk is chosen which follows the one that was used for storing the previous chunk of data, and that has free space more than the part size (`unreserved_space - keep_free_space_bytes > current_part_size`).

Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported, therefore in such cases the resulting parts are stored on the same disks as the initial ones.
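To check where parts of a table actually land under a policy, one option is to look at the `system.parts` table. A sketch, assuming the `table_with_non_default_policy` table from the example above:

``` sql
-- See which disk each active part of the table ended up on.
SELECT
    name,
    disk_name,
    formatReadableSize(bytes_on_disk) AS size
FROM system.parts
WHERE database = currentDatabase()
  AND table = 'table_with_non_default_policy'
  AND active;
```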
In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file.
Data is never transferred from the last volume or into the first one. One may use the system tables [system.part_log](../../../operations/system-tables/part_log.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, detailed information can be found in the server logs.

A user can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../../sql-reference/statements/alter/partition.md#alter_move-partition); all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. The user will get an error message if not enough free space is available or if any of the required conditions are not met.

Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas.

After the completion of background merges and mutations, old parts are removed only after a certain amount of time (`old_parts_lifetime`).
During this time, they are not moved to other volumes or disks. Therefore, until the parts are finally removed, they are still taken into account for evaluation of the occupied disk space.

A user can assign new big parts to different disks of a [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures) volume in a balanced way using the [min_bytes_to_rebalance_partition_over_jbod](../../../operations/settings/merge-tree-settings.md#min-bytes-to-rebalance-partition-over-jbod) setting.

## Using S3 for Data Storage {#table_engine-mergetree-s3}

`MergeTree` family table engines can store data to [S3](https://aws.amazon.com/s3/) using a disk with type `s3`.

This feature is under development and not ready for production. There are known drawbacks such as very low performance.

Configuration markup:
``` xml
<storage_configuration>
    ...
    <disks>
        <s3>
            <type>s3</type>
            <endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/</endpoint>
            <access_key_id>your_access_key_id</access_key_id>
            <secret_access_key>your_secret_access_key</secret_access_key>
            <region></region>
            <server_side_encryption_customer_key_base64>your_base64_encoded_customer_key</server_side_encryption_customer_key_base64>
            <proxy>
                <uri>http://proxy1</uri>
                <uri>http://proxy2</uri>
            </proxy>
            <connect_timeout_ms>10000</connect_timeout_ms>
            <request_timeout_ms>5000</request_timeout_ms>
            <retry_attempts>10</retry_attempts>
            <single_read_retries>4</single_read_retries>
            <min_bytes_for_seek>1000</min_bytes_for_seek>
            <metadata_path>/var/lib/clickhouse/disks/s3/</metadata_path>
            <cache_enabled>true</cache_enabled>
            <cache_path>/var/lib/clickhouse/disks/s3/cache/</cache_path>
            <skip_access_check>false</skip_access_check>
        </s3>
    </disks>
    ...
</storage_configuration>
```

Required parameters:

- `endpoint` — S3 endpoint URL in `path` or `virtual hosted` [styles](https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). Endpoint URL should contain a bucket and root path to store data.
- `access_key_id` — S3 access key id.
- `secret_access_key` — S3 secret access key.

Optional parameters:

- `region` — S3 region name.
- `use_environment_credentials` — Reads AWS credentials from the environment variables AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`.
- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`.
- `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL.
- `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`.
- `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`.
- `retry_attempts` — Number of retry attempts in case of a failed request. Default value is `10`.
- `single_read_retries` — Number of retry attempts in case of a connection drop during read. Default value is `4`.
- `min_bytes_for_seek` — Minimal number of bytes to use a seek operation instead of a sequential read. Default value is `1 Mb`.
- `metadata_path` — Path on the local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
- `cache_enabled` — Allows caching mark and index files on the local FS. Default value is `true`.
- `cache_path` — Path on the local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks/<disk_name>/cache/`.
- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.

S3 disk can be configured as `main` or `cold` storage:
``` xml
<storage_configuration>
    ...
    <disks>
        <s3>
            <type>s3</type>
            <endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/</endpoint>
            <access_key_id>your_access_key_id</access_key_id>
            <secret_access_key>your_secret_access_key</secret_access_key>
        </s3>
    </disks>
    <policies>
        <s3_main>
            <volumes>
                <main>
                    <disk>s3</disk>
                </main>
            </volumes>
        </s3_main>
        <s3_cold>
            <volumes>
                <main>
                    <disk>default</disk>
                </main>
                <external>
                    <disk>s3</disk>
                </external>
            </volumes>
            <move_factor>0.2</move_factor>
        </s3_cold>
    </policies>
    ...
</storage_configuration>
```

With the `cold` option, data can be moved to S3 if the free space on the local disk becomes smaller than `move_factor * disk_size`, or by a TTL move rule.

## Using Azure Blob Storage for Data Storage {#table_engine-mergetree-azure-blob-storage}

`MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`.

As of February 2022, this feature is still a fresh addition, so expect that some Azure Blob Storage functionalities might be unimplemented.

Configuration markup:
``` xml
<storage_configuration>
    ...
    <disks>
        <blob_storage_disk>
            <type>azure_blob_storage</type>
            <storage_account_url>http://account.blob.core.windows.net</storage_account_url>
            <container_name>container</container_name>
            <account_name>account</account_name>
            <account_key>pass123</account_key>
            <metadata_path>/var/lib/clickhouse/disks/blob_storage_disk/</metadata_path>
            <cache_enabled>true</cache_enabled>
            <cache_path>/var/lib/clickhouse/disks/blob_storage_disk/cache/</cache_path>
            <skip_access_check>false</skip_access_check>
        </blob_storage_disk>
    </disks>
    ...
</storage_configuration>
```

Connection parameters:
* `storage_account_url` - **Required**, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`.
* `container_name` - Target container name, defaults to `default-container`.
* `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account; if set to `true`, the disk connects to the container directly; and if left unset, the disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet.

Authentication parameters (the disk will try all available methods **and** Managed Identity Credential):
* `connection_string` - For authentication using a connection string.
* `account_name` and `account_key` - For authentication using Shared Key.

Limit parameters (mainly for internal usage):
* `max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.

Other parameters:
* `metadata_path` - Path on the local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
* `cache_enabled` - Allows caching mark and index files on the local FS. Default value is `true`.
* `cache_path` - Path on the local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks/<disk_name>/cache/`.
* `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`.

Examples of working configurations can be found in the integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).

## Virtual Columns {#virtual-columns}

- `_part` — Name of a part.
- `_part_index` — Sequential index of the part in the query result.
- `_partition_id` — Name of a partition.
- `_part_uuid` — Unique part identifier (if enabled MergeTree setting `assign_part_uuids`).
- `_partition_value` — Values (a tuple) of a `partition by` expression.
+- `_sample_factor` — Sample factor (from the query). diff --git a/docs/en/reference/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/replacingmergetree.md new file mode 100644 index 00000000000..47651527f99 --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/replacingmergetree.md @@ -0,0 +1,70 @@ +--- +sidebar_position: 40 +sidebar_label: ReplacingMergeTree +--- + +# ReplacingMergeTree {#replacingmergetree} + +The engine differs from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) in that it removes duplicate entries with the same [sorting key](../../../engines/table-engines/mergetree-family/mergetree.md) value (`ORDER BY` table section, not `PRIMARY KEY`). + +Data deduplication occurs only during a merge. Merging occurs in the background at an unknown time, so you can’t plan for it. Some of the data may remain unprocessed. Although you can run an unscheduled merge using the `OPTIMIZE` query, do not count on using it, because the `OPTIMIZE` query will read and write a large amount of data. + +Thus, `ReplacingMergeTree` is suitable for clearing out duplicate data in the background in order to save space, but it does not guarantee the absence of duplicates. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = ReplacingMergeTree([ver]) +[PARTITION BY expr] +[ORDER BY expr] +[PRIMARY KEY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +For a description of request parameters, see [statement description](../../../sql-reference/statements/create/table.md). + +:::warning +Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY KEY`. +::: + +**ReplacingMergeTree Parameters** + +- `ver` — column with the version number. Type `UInt*`, `Date`, `DateTime` or `DateTime64`. Optional parameter. + + When merging, `ReplacingMergeTree` from all the rows with the same sorting key leaves only one: + + - The last in the selection, if `ver` not set. A selection is a set of rows in a set of parts participating in the merge. The most recently created part (the last insert) will be the last one in the selection. Thus, after deduplication, the very last row from the most recent insert will remain for each unique sorting key. + - With the maximum version, if `ver` specified. + +**Query clauses** + +When creating a `ReplacingMergeTree` table the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md) are required, as when creating a `MergeTree` table. + +
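As a quick illustration of the behaviour described above, the following sketch (table, column and value names are illustrative, not taken from this document) creates a versioned table, inserts two rows with the same sorting key, and reads the deduplicated result:

``` sql
-- Deduplication by the sorting key, keeping the row with the highest version.
CREATE TABLE hits_last_state
(
    UserID  UInt64,
    URL     String,
    Version UInt32
)
ENGINE = ReplacingMergeTree(Version)
ORDER BY (UserID, URL);

INSERT INTO hits_last_state VALUES (1, '/home', 1), (1, '/home', 2);

-- Until a merge happens, both rows may still be present;
-- FINAL forces deduplication at query time.
SELECT * FROM hits_last_state FINAL;
```

Remember that, as stated above, relying on background merges alone does not guarantee the absence of duplicates at any given moment.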
+ +Deprecated Method for Creating a Table + +:::warning +Do not use this method in new projects and, if possible, switch old projects to the method described above. +::: + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] ReplacingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [ver]) +``` + +All of the parameters excepting `ver` have the same meaning as in `MergeTree`. + +- `ver` - column with the version. Optional parameter. For a description, see the text above. + +
diff --git a/docs/en/reference/engines/table-engines/mergetree-family/replication.md b/docs/en/reference/engines/table-engines/mergetree-family/replication.md
new file mode 100644
index 00000000000..67c503854a9
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/mergetree-family/replication.md
@@ -0,0 +1,295 @@
---
sidebar_position: 20
sidebar_label: Data Replication
---

# Data Replication {#table_engines-replication}

Replication is only supported for tables in the MergeTree family:

- ReplicatedMergeTree
- ReplicatedSummingMergeTree
- ReplicatedReplacingMergeTree
- ReplicatedAggregatingMergeTree
- ReplicatedCollapsingMergeTree
- ReplicatedVersionedCollapsingMergeTree
- ReplicatedGraphiteMergeTree

Replication works at the level of an individual table, not the entire server. A server can store both replicated and non-replicated tables at the same time.

Replication does not depend on sharding. Each shard has its own independent replication.

Compressed data for `INSERT` and `ALTER` queries is replicated (for more information, see the documentation for [ALTER](../../../sql-reference/statements/alter/index.md#query_language_queries_alter)).

`CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated:

- The `CREATE TABLE` query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica.
- The `DROP TABLE` query deletes the replica located on the server where the query is run.
- The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas.

ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing replica metadata. Use ZooKeeper version 3.4.5 or newer.

To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section.

:::warning
Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem.
:::

Example of setting the addresses of the ZooKeeper cluster:

``` xml
<zookeeper>
    <node>
        <host>example1</host>
        <port>2181</port>
    </node>
    <node>
        <host>example2</host>
        <port>2181</port>
    </node>
    <node>
        <host>example3</host>
        <port>2181</port>
    </node>
</zookeeper>
```

ClickHouse also supports storing replica metadata in an auxiliary ZooKeeper cluster by providing the ZooKeeper cluster name and path as engine arguments.
In other words, it supports storing the metadata of different tables in different ZooKeeper clusters.

Example of setting the addresses of the auxiliary ZooKeeper cluster:

``` xml
<auxiliary_zookeepers>
    <zookeeper2>
        <node>
            <host>example_2_1</host>
            <port>2181</port>
        </node>
        <node>
            <host>example_2_2</host>
            <port>2181</port>
        </node>
        <node>
            <host>example_2_3</host>
            <port>2181</port>
        </node>
    </zookeeper2>
    <zookeeper3>
        <node>
            <host>example_3_1</host>
            <port>2181</port>
        </node>
    </zookeeper3>
</auxiliary_zookeepers>
```

To store table metadata in an auxiliary ZooKeeper cluster instead of the default ZooKeeper cluster, create the table with the
ReplicatedMergeTree engine as follows:

```
CREATE TABLE table_name ( ... ) ENGINE = ReplicatedMergeTree('zookeeper_name_configured_in_auxiliary_zookeepers:path', 'replica_name') ...
```
You can specify any existing ZooKeeper cluster and the system will use a directory on it for its own data (the directory is specified when creating a replicatable table).

If ZooKeeper isn’t set in the config file, you can’t create replicated tables, and any existing replicated tables will be read-only.
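A more concrete sketch of the syntax above, using the auxiliary cluster `zookeeper2` from the configuration example (the path and replica name are illustrative):

``` sql
-- Keep this table's metadata in the auxiliary ZooKeeper cluster `zookeeper2`.
CREATE TABLE table_name
(
    x UInt32
)
ENGINE = ReplicatedMergeTree('zookeeper2:/clickhouse/tables/01/table_name', 'replica1')
ORDER BY x;
```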
+ +ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries). + +For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it does not create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data. + +For very large clusters, you can use different ZooKeeper clusters for different shards. However, from our experience this has not proven necessary based on production clusters with approximately 300 servers. + +Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. The number of threads performing background tasks for replicated tables can be set by [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) setting. + +`ReplicatedMergeTree` engine uses a separate thread pool for replicated fetches. Size of the pool is limited by the [background_fetches_pool_size](../../../operations/settings/settings.md#background_fetches_pool_size) setting which can be tuned with a server restart. + +By default, an INSERT query waits for confirmation of writing the data from only one replica. If the data was successfully written to only one replica and the server with this replica ceases to exist, the stored data will be lost. To enable getting confirmation of data writes from multiple replicas, use the `insert_quorum` option. + +Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has less than 1048576 rows, it is made atomically. + +Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application does not know if the data was written to the DB, so the `INSERT` query can simply be repeated. It does not matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. 
Deduplication parameters are controlled by [merge_tree](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-merge_tree) server settings. + +During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.) + +You can have any number of replicas of the same data. Based on our experiences, a relatively reliable and convenient solution could use double replication in production, with each server using RAID-5 or RAID-6 (and RAID-10 in some cases). + +The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error). + +## Creating Replicated Tables {#creating-replicated-tables} + +The `Replicated` prefix is added to the table engine name. For example:`ReplicatedMergeTree`. + +**Replicated\*MergeTree parameters** + +- `zoo_path` — The path to the table in ZooKeeper. +- `replica_name` — The replica name in ZooKeeper. +- `other_parameters` — Parameters of an engine which is used for creating the replicated version, for example, version in `ReplacingMergeTree`. + +Example: + +``` sql +CREATE TABLE table_name +( + EventDate DateTime, + CounterID UInt32, + UserID UInt32, + ver UInt16 +) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', ver) +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID)) +SAMPLE BY intHash32(UserID); +``` + +
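The `insert_quorum` option mentioned above is applied per query session. A minimal sketch against the example table (the values are illustrative):

``` sql
-- Require acknowledgement from two replicas before an INSERT is considered successful.
SET insert_quorum = 2;

INSERT INTO table_name (EventDate, CounterID, UserID, ver)
VALUES (now(), 34, 42, 1);
```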
+ +Example in deprecated syntax + +``` sql +CREATE TABLE table_name +( + EventDate DateTime, + CounterID UInt32, + UserID UInt32 +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192); +``` + +
+
+As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the [macros](../../../operations/server-configuration-parameters/settings.md#macros) section of the configuration file.
+
+Example:
+
+``` xml
+<macros>
+    <layer>05</layer>
+    <shard>02</shard>
+    <replica>example05-02-1</replica>
+</macros>
+```
+
+The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths.
+In this case, the path consists of the following parts:
+
+`/clickhouse/tables/` is the common prefix. We recommend using exactly this one.
+
+`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the example cluster uses bi-level sharding. For most tasks, you can leave just the `{shard}` substitution, which will be expanded to the shard identifier.
+
+`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a `RENAME` query.
+*HINT*: you could add a database name in front of `table_name` as well, e.g. `db_name.table_name`.
+
+The two built-in substitutions `{database}` and `{table}` can be used; they expand into the database name and the table name respectively (unless these macros are defined in the `macros` section). So the ZooKeeper path can be specified as `'/clickhouse/tables/{layer}-{shard}/{database}/{table}'`.
+Be careful with table renames when using these built-in substitutions. The path in ZooKeeper cannot be changed, so when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in ZooKeeper, and it will go into read-only mode.
+
+The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard.
+
+You can define the parameters explicitly instead of using substitutions. This might be convenient for testing and for configuring small clusters. However, you can’t use distributed DDL queries (`ON CLUSTER`) in this case.
+
+When working with large clusters, we recommend using substitutions because they reduce the probability of error.
+
+You can specify default arguments for the `Replicated` table engine in the server configuration file. For instance:
+
+```xml
+<default_replica_path>/clickhouse/tables/{shard}/{database}/{table}</default_replica_path>
+<default_replica_name>{replica}</default_replica_name>
+```
+
+In this case, you can omit arguments when creating tables:
+
+``` sql
+CREATE TABLE table_name (
+    x UInt32
+) ENGINE = ReplicatedMergeTree
+ORDER BY x;
+```
+
+It is equivalent to:
+
+``` sql
+CREATE TABLE table_name (
+    x UInt32
+) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/table_name', '{replica}')
+ORDER BY x;
+```
+
+Run the `CREATE TABLE` query on each replica. This query creates a new replicated table, or adds a new replica to an existing one.
+
+If you add a new replica after the table already contains some data on other replicas, the data will be copied from the other replicas to the new one after running the query. In other words, the new replica syncs itself with the others.
+
+To delete a replica, run `DROP TABLE`. However, only one replica is deleted – the one that resides on the server where you run the query.
+
+## Recovery After Failures {#recovery-after-failures}
+
+If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper.
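+
+One way to spot tables that are stuck in read-only mode is to query `system.replicas`; this is a hedged sketch, not part of the original text.
+
+``` sql
+-- Replicated tables that are currently read-only, e.g. because the
+-- ZooKeeper session is lost or the startup metadata check failed.
+SELECT database, table, is_readonly, is_session_expired
+FROM system.replicas
+WHERE is_readonly;
+```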
+ +If ZooKeeper is unavailable during an `INSERT`, or an error occurs when interacting with ZooKeeper, an exception is thrown. + +After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. + +If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas. + +Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data. + +When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. + +If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by “pushing a button”. + +To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ZooKeeper with any content, or run the command to restore all replicated tables: + +``` bash +sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data +``` + +Then restart the server. On start, the server deletes these flags and starts recovery. + +## Recovery After Complete Data Loss {#recovery-after-complete-data-loss} + +If all data and metadata disappeared from one of the servers, follow these steps for recovery: + +1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. +2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`). +3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.) +4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` + +Then start the server (restart, if it is already running). Data will be downloaded from replicas. 
+ +An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”. + +There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once. + +## Converting from MergeTree to ReplicatedMergeTree {#converting-from-mergetree-to-replicatedmergetree} + +We use the term `MergeTree` to refer to all table engines in the `MergeTree family`, the same as for `ReplicatedMergeTree`. + +If you had a `MergeTree` table that was manually replicated, you can convert it to a replicated table. You might need to do this if you have already collected a large amount of data in a `MergeTree` table and now you want to enable replication. + +If the data differs on various replicas, first sync it, or delete this data on all the replicas except one. + +Rename the existing MergeTree table, then create a `ReplicatedMergeTree` table with the old name. +Move the data from the old table to the `detached` subdirectory inside the directory with the new table data (`/var/lib/clickhouse/data/db_name/table_name/`). +Then run `ALTER TABLE ATTACH PARTITION` on one of the replicas to add these data parts to the working set. + +## Converting from ReplicatedMergeTree to MergeTree {#converting-from-replicatedmergetree-to-mergetree} + +Create a MergeTree table with a different name. Move all the data from the directory with the `ReplicatedMergeTree` table data to the new table’s data directory. Then delete the `ReplicatedMergeTree` table and restart the server. + +If you want to get rid of a `ReplicatedMergeTree` table without launching the server: + +- Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`). +- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`). + +After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server. + +## Recovery When Metadata in the Zookeeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} + +If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above. 
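+
+A hedged sketch of the `MergeTree` → `ReplicatedMergeTree` conversion described above (all names, keys and the partition ID are hypothetical; moving the parts into the new table’s `detached` directory happens on the filesystem, outside of SQL):
+
+``` sql
+-- 1. Rename the existing non-replicated table out of the way.
+RENAME TABLE db.hits TO db.hits_old;
+
+-- 2. Create a replicated table with the old name and the same structure and keys.
+CREATE TABLE db.hits
+(
+    EventDate Date,
+    CounterID UInt32,
+    UserID UInt32
+)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/db/hits', '{replica}')
+PARTITION BY toYYYYMM(EventDate)
+ORDER BY (CounterID, EventDate);
+
+-- 3. After moving the old parts into /var/lib/clickhouse/data/db/hits/detached/,
+--    attach them partition by partition on one of the replicas.
+ALTER TABLE db.hits ATTACH PARTITION 202204;
+```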
+ +**See Also** + +- [background_schedule_pool_size](../../../operations/settings/settings.md#background_schedule_pool_size) +- [background_fetches_pool_size](../../../operations/settings/settings.md#background_fetches_pool_size) +- [execute_merges_on_single_replica_time_threshold](../../../operations/settings/settings.md#execute-merges-on-single-replica-time-threshold) +- [max_replicated_fetches_network_bandwidth](../../../operations/settings/merge-tree-settings.md#max_replicated_fetches_network_bandwidth) +- [max_replicated_sends_network_bandwidth](../../../operations/settings/merge-tree-settings.md#max_replicated_sends_network_bandwidth) + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/replication/) diff --git a/docs/en/reference/engines/table-engines/mergetree-family/summingmergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/summingmergetree.md new file mode 100644 index 00000000000..5d180782ed3 --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/summingmergetree.md @@ -0,0 +1,140 @@ +--- +sidebar_position: 50 +sidebar_label: SummingMergeTree +--- + +# SummingMergeTree {#summingmergetree} + +The engine inherits from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree). The difference is that when merging data parts for `SummingMergeTree` tables ClickHouse replaces all the rows with the same primary key (or more accurately, with the same [sorting key](../../../engines/table-engines/mergetree-family/mergetree.md)) with one row which contains summarized values for the columns with the numeric data type. If the sorting key is composed in a way that a single key value corresponds to large number of rows, this significantly reduces storage volume and speeds up data selection. + +We recommend using the engine together with `MergeTree`. Store complete data in `MergeTree` table, and use `SummingMergeTree` for aggregated data storing, for example, when preparing reports. Such an approach will prevent you from losing valuable data due to an incorrectly composed primary key. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = SummingMergeTree([columns]) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +For a description of request parameters, see [request description](../../../sql-reference/statements/create/table.md). + +**Parameters of SummingMergeTree** + +- `columns` - a tuple with the names of columns where values will be summarized. Optional parameter. + The columns must be of a numeric type and must not be in the primary key. + + If `columns` not specified, ClickHouse summarizes the values in all columns with a numeric data type that are not in the primary key. + +**Query clauses** + +When creating a `SummingMergeTree` table the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md) are required, as when creating a `MergeTree` table. + +
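+
+As a purely illustrative sketch (not from the original text), a table that sums only the listed columns might be declared like this; the table and column names are made up:
+
+``` sql
+CREATE TABLE campaign_totals
+(
+    CampaignID UInt32,
+    EventDate Date,
+    Impressions UInt64,
+    Cost Float64,
+    LastPrice Float64
+)
+-- Only Impressions and Cost are summed on merge; LastPrice is neither summed
+-- nor part of the key, so an arbitrary value is kept from the merged rows.
+ENGINE = SummingMergeTree((Impressions, Cost))
+PARTITION BY toYYYYMM(EventDate)
+ORDER BY (CampaignID, EventDate);
+```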
+
+Deprecated Method for Creating a Table
+
+:::warning
+Do not use this method in new projects and, if possible, switch old projects to the method described above.
+:::
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE [=] SummingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [columns])
+```
+
+All of the parameters except `columns` have the same meaning as in `MergeTree`.
+
+- `columns` — a tuple with the names of the columns whose values will be summed. Optional parameter. For a description, see the text above.
+
+
+## Usage Example {#usage-example}
+
+Consider the following table:
+
+``` sql
+CREATE TABLE summtt
+(
+    key UInt32,
+    value UInt32
+)
+ENGINE = SummingMergeTree()
+ORDER BY key
+```
+
+Insert data into it:
+
+``` sql
+INSERT INTO summtt Values(1,1),(1,2),(2,1)
+```
+
+ClickHouse may not sum all the rows completely ([see below](#data-processing)), so we use the aggregate function `sum` and a `GROUP BY` clause in the query.
+
+``` sql
+SELECT key, sum(value) FROM summtt GROUP BY key
+```
+
+``` text
+┌─key─┬─sum(value)─┐
+│   2 │          1 │
+│   1 │          3 │
+└─────┴────────────┘
+```
+
+## Data Processing {#data-processing}
+
+When data is inserted into a table, it is saved as-is. ClickHouse merges the inserted parts of data periodically, and this is when rows with the same primary key are summed and replaced with a single row in each resulting data part.
+
+ClickHouse can merge the data parts in such a way that different resulting parts consist of rows with the same primary key, i.e. the summation will be incomplete. Therefore, the aggregate function [sum()](../../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum) and a `GROUP BY` clause should be used in `SELECT` queries, as described in the example above.
+
+### Common Rules for Summation {#common-rules-for-summation}
+
+The values in the columns with the numeric data type are summarized. The set of columns is defined by the parameter `columns`.
+
+If the values were 0 in all of the columns for summation, the row is deleted.
+
+If a column is not in the primary key and is not summarized, an arbitrary value is selected from the existing ones.
+
+The values are not summarized for columns in the primary key.
+
+### The Summation in the AggregateFunction Columns {#the-summation-in-the-aggregatefunction-columns}
+
+For columns of [AggregateFunction type](../../../sql-reference/data-types/aggregatefunction.md), ClickHouse behaves as the [AggregatingMergeTree](../../../engines/table-engines/mergetree-family/aggregatingmergetree.md) engine, aggregating according to the function.
+
+### Nested Structures {#nested-structures}
+
+A table can have nested data structures that are processed in a special way.
+
+If the name of a nested table ends with `Map` and it contains at least two columns that meet the following criteria:
+
+- the first column is numeric `(*Int*, Date, DateTime)` or a string `(String, FixedString)`, let’s call it `key`,
+- the other columns are arithmetic `(*Int*, Float32/64)`, let’s call them `(values...)`,
+
+then this nested table is interpreted as a mapping of `key => (values...)`, and when merging its rows, the elements of two data sets are merged by `key` with a summation of the corresponding `(values...)`.
+
+Examples:
+
+``` text
+[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)]
+[(1, 100)] + [(1, 150)] -> [(1, 250)]
+[(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)]
+[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)]
+```
+
+When requesting data, use the [sumMap(key, value)](../../../sql-reference/aggregate-functions/reference/summap.md) function for aggregation of `Map`.
+
+For a nested data structure, you do not need to specify its columns in the tuple of columns for summation.
+ +[Original article](https://clickhouse.com/docs/en/operations/table_engines/summingmergetree/) diff --git a/docs/en/reference/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/en/reference/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md new file mode 100644 index 00000000000..77cf192dcda --- /dev/null +++ b/docs/en/reference/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -0,0 +1,237 @@ +--- +sidebar_position: 80 +sidebar_label: VersionedCollapsingMergeTree +--- + +# VersionedCollapsingMergeTree {#versionedcollapsingmergetree} + +This engine: + +- Allows quick writing of object states that are continually changing. +- Deletes old object states in the background. This significantly reduces the volume of storage. + +See the section [Collapsing](#table_engines_versionedcollapsingmergetree) for details. + +The engine inherits from [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) and adds the logic for collapsing rows to the algorithm for merging data parts. `VersionedCollapsingMergeTree` serves the same purpose as [CollapsingMergeTree](../../../engines/table-engines/mergetree-family/collapsingmergetree.md) but uses a different collapsing algorithm that allows inserting the data in any order with multiple threads. In particular, the `Version` column helps to collapse the rows properly even if they are inserted in the wrong order. In contrast, `CollapsingMergeTree` allows only strictly consecutive insertion. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = VersionedCollapsingMergeTree(sign, version) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +For a description of query parameters, see the [query description](../../../sql-reference/statements/create/table.md). + +**Engine Parameters** + +``` sql +VersionedCollapsingMergeTree(sign, version) +``` + +- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row. + + The column data type should be `Int8`. + +- `version` — Name of the column with the version of the object state. + + The column data type should be `UInt*`. + +**Query Clauses** + +When creating a `VersionedCollapsingMergeTree` table, the same [clauses](../../../engines/table-engines/mergetree-family/mergetree.md) are required as when creating a `MergeTree` table. + +
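+
+For orientation only, a hedged sketch of what a definition with the usual `MergeTree` clauses might look like; the names are made up, and a runnable end-to-end example follows later on this page.
+
+``` sql
+CREATE TABLE user_state
+(
+    StartDate Date,
+    UserID UInt64,
+    PageViews UInt8,
+    Duration UInt8,
+    Sign Int8,     -- 1 = "state" row, -1 = "cancel" row
+    Version UInt8  -- version of the object state
+)
+ENGINE = VersionedCollapsingMergeTree(Sign, Version)
+PARTITION BY toYYYYMM(StartDate)
+ORDER BY (UserID, StartDate);
+```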
+ +Deprecated Method for Creating a Table + +:::warning +Do not use this method in new projects. If possible, switch old projects to the method described above. +::: + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] VersionedCollapsingMergeTree(date-column [, samp#table_engines_versionedcollapsingmergetreeling_expression], (primary, key), index_granularity, sign, version) +``` + +All of the parameters except `sign` and `version` have the same meaning as in `MergeTree`. + +- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row. + + Column Data Type — `Int8`. + +- `version` — Name of the column with the version of the object state. + + The column data type should be `UInt*`. + +
+ +## Collapsing {#table_engines_versionedcollapsingmergetree} + +### Data {#data} + +Consider a situation where you need to save continually changing data for some object. It is reasonable to have one row for an object and update the row whenever there are changes. However, the update operation is expensive and slow for a DBMS because it requires rewriting the data in the storage. Update is not acceptable if you need to write data quickly, but you can write the changes to an object sequentially as follows. + +Use the `Sign` column when writing the row. If `Sign = 1` it means that the row is a state of an object (let’s call it the “state” row). If `Sign = -1` it indicates the cancellation of the state of an object with the same attributes (let’s call it the “cancel” row). Also use the `Version` column, which should identify each state of an object with a separate number. + +For example, we want to calculate how many pages users visited on some site and how long they were there. At some point in time we write the following row with the state of user activity: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +At some point later we register the change of user activity and write it with the following two rows. + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +The first row cancels the previous state of the object (user). It should copy all of the fields of the canceled state except `Sign`. + +The second row contains the current state. + +Because we need only the last state of user activity, the rows + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +can be deleted, collapsing the invalid (old) state of the object. `VersionedCollapsingMergeTree` does this while merging the data parts. + +To find out why we need two rows for each change, see [Algorithm](#table_engines-versionedcollapsingmergetree-algorithm). + +**Notes on Usage** + +1. The program that writes the data should remember the state of an object to be able to cancel it. “Cancel” string should contain copies of the primary key fields and the version of the “state” string and the opposite `Sign`. It increases the initial size of storage but allows to write the data quickly. +2. Long growing arrays in columns reduce the efficiency of the engine due to the load for writing. The more straightforward the data, the better the efficiency. +3. `SELECT` results depend strongly on the consistency of the history of object changes. Be accurate when preparing data for inserting. You can get unpredictable results with inconsistent data, such as negative values for non-negative metrics like session depth. + +### Algorithm {#table_engines-versionedcollapsingmergetree-algorithm} + +When ClickHouse merges data parts, it deletes each pair of rows that have the same primary key and version and different `Sign`. The order of rows does not matter. + +When ClickHouse inserts data, it orders rows by the primary key. 
If the `Version` column is not in the primary key, ClickHouse adds it to the primary key implicitly as the last field and uses it for ordering. + +## Selecting Data {#selecting-data} + +ClickHouse does not guarantee that all of the rows with the same primary key will be in the same resulting data part or even on the same physical server. This is true both for writing the data and for subsequent merging of the data parts. In addition, ClickHouse processes `SELECT` queries with multiple threads, and it cannot predict the order of rows in the result. This means that aggregation is required if there is a need to get completely “collapsed” data from a `VersionedCollapsingMergeTree` table. + +To finalize collapsing, write a query with a `GROUP BY` clause and aggregate functions that account for the sign. For example, to calculate quantity, use `sum(Sign)` instead of `count()`. To calculate the sum of something, use `sum(Sign * x)` instead of `sum(x)`, and add `HAVING sum(Sign) > 0`. + +The aggregates `count`, `sum` and `avg` can be calculated this way. The aggregate `uniq` can be calculated if an object has at least one non-collapsed state. The aggregates `min` and `max` can’t be calculated because `VersionedCollapsingMergeTree` does not save the history of values of collapsed states. + +If you need to extract the data with “collapsing” but without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is inefficient and should not be used with large tables. + +## Example of Use {#example-of-use} + +Example data: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +Creating the table: + +``` sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews UInt8, + Duration UInt8, + Sign Int8, + Version UInt8 +) +ENGINE = VersionedCollapsingMergeTree(Sign, Version) +ORDER BY UserID +``` + +Inserting the data: + +``` sql +INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1, 1) +``` + +``` sql +INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(4324182021466249494, 6, 185, 1, 2) +``` + +We use two `INSERT` queries to create two different data parts. If we insert the data with a single query, ClickHouse creates one data part and will never perform any merge. + +Getting the data: + +``` sql +SELECT * FROM UAct +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +What do we see here and where are the collapsed parts? +We created two data parts using two `INSERT` queries. The `SELECT` query was performed in two threads, and the result is a random order of rows. +Collapsing did not occur because the data parts have not been merged yet. ClickHouse merges data parts at an unknown point in time which we cannot predict. 
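+
+For a small test table like this, you can force an unscheduled merge by hand; this is a hedged aside rather than part of the original walkthrough, and forcing merges is not something to rely on for large tables.
+
+``` sql
+-- Trigger a merge of the parts of UAct so that collapsing happens now.
+OPTIMIZE TABLE UAct FINAL;
+```
+
+In the general case, though, the timing of merges cannot be relied upon.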
+ +This is why we need aggregation: + +``` sql +SELECT + UserID, + sum(PageViews * Sign) AS PageViews, + sum(Duration * Sign) AS Duration, + Version +FROM UAct +GROUP BY UserID, Version +HAVING sum(Sign) > 0 +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ +│ 4324182021466249494 │ 6 │ 185 │ 2 │ +└─────────────────────┴───────────┴──────────┴─────────┘ +``` + +If we do not need aggregation and want to force collapsing, we can use the `FINAL` modifier for the `FROM` clause. + +``` sql +SELECT * FROM UAct FINAL +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +This is a very inefficient way to select data. Don’t use it for large tables. + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/versionedcollapsingmergetree/) diff --git a/docs/en/reference/engines/table-engines/special/buffer.md b/docs/en/reference/engines/table-engines/special/buffer.md new file mode 100644 index 00000000000..a0aff2ec813 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/buffer.md @@ -0,0 +1,77 @@ +--- +sidebar_position: 120 +sidebar_label: Buffer +--- + +# Buffer Table Engine {#buffer} + +Buffers the data to write in RAM, periodically flushing it to another table. During the read operation, data is read from the buffer and the other table simultaneously. + +``` sql +Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) +``` + +Engine parameters: + +- `database` – Database name. Instead of the database name, you can use a constant expression that returns a string. +- `table` – Table to flush data to. +- `num_layers` – Parallelism layer. Physically, the table will be represented as `num_layers` of independent buffers. Recommended value: 16. +- `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, and `max_bytes` – Conditions for flushing data from the buffer. + +Optional engine parameters: + +- `flush_time`, `flush_rows`, `flush_bytes` – Conditions for flushing data from the buffer, that will happen only in background (omitted or zero means no `flush*` parameters). + +Data is flushed from the buffer and written to the destination table if all the `min*` conditions or at least one `max*` condition are met. + +Also, if at least one `flush*` condition are met flush initiated in background, this is different from `max*`, since `flush*` allows you to configure background flushes separately to avoid adding latency for `INSERT` (into `Buffer`) queries. + +- `min_time`, `max_time`, `flush_time` – Condition for the time in seconds from the moment of the first write to the buffer. +- `min_rows`, `max_rows`, `flush_rows` – Condition for the number of rows in the buffer. +- `min_bytes`, `max_bytes`, `flush_bytes` – Condition for the number of bytes in the buffer. + +During the write operation, data is inserted to a `num_layers` number of random buffers. Or, if the data part to insert is large enough (greater than `max_rows` or `max_bytes`), it is written directly to the destination table, omitting the buffer. + +The conditions for flushing the data are calculated separately for each of the `num_layers` buffers. For example, if `num_layers = 16` and `max_bytes = 100000000`, the maximum RAM consumption is 1.6 GB. 
+
+Example:
+
+``` sql
+CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000)
+```
+
+Creating a `merge.hits_buffer` table with the same structure as `merge.hits` and using the Buffer engine. When writing to this table, data is buffered in RAM and later written to the `merge.hits` table. 16 buffers are created. The data in each of them is flushed if either 100 seconds have passed, or one million rows have been written, or 100 MB of data have been written; or if simultaneously 10 seconds have passed and 10,000 rows and 10 MB of data have been written. For example, if just one row has been written, after 100 seconds it will be flushed, no matter what. But if many rows have been written, the data will be flushed sooner.
+
+When the server is stopped, with `DROP TABLE` or `DETACH TABLE`, buffer data is also flushed to the destination table.
+
+You can set empty strings in single quotation marks for the database and table name. This indicates the absence of a destination table. In this case, when the data flush conditions are reached, the buffer is simply cleared. This may be useful for keeping a window of data in memory.
+
+When reading from a Buffer table, data is processed both from the buffer and from the destination table (if there is one).
+Note that Buffer tables do not support an index. In other words, data in the buffer is fully scanned, which might be slow for large buffers. (For data in a subordinate table, the index that it supports will be used.)
+
+If the set of columns in the Buffer table does not match the set of columns in a subordinate table, a subset of columns that exist in both tables is inserted.
+
+If the types do not match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log, and the buffer is cleared.
+The same thing happens if the subordinate table does not exist when the buffer is flushed.
+
+:::warning
+Running ALTER on the Buffer table in releases made before 26 Oct 2021 will cause a `Block structure mismatch` error (see [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117) and [#30565](https://github.com/ClickHouse/ClickHouse/pull/30565)), so deleting the Buffer table and then recreating it is the only option. It is advisable to check that this error is fixed in your release before trying to run ALTER on the Buffer table.
+:::
+
+If the server is restarted abnormally, the data in the buffer is lost.
+
+`FINAL` and `SAMPLE` do not work correctly for Buffer tables. These conditions are passed to the destination table, but are not used for processing data in the buffer. If these features are required, we recommend only using the Buffer table for writing, while reading from the destination table.
+
+When adding data to a Buffer, one of the buffers is locked. This causes delays if a read operation is simultaneously being performed from the table.
+
+Data that is inserted into a Buffer table may end up in the subordinate table in a different order and in different blocks. Because of this, a Buffer table is difficult to use for writing to a CollapsingMergeTree correctly. To avoid problems, you can set `num_layers` to 1.
+
+If the destination table is replicated, some expected characteristics of replicated tables are lost when writing to a Buffer table.
The random changes to the order of rows and sizes of data parts cause data deduplication to stop working, which means it is not possible to have a reliable ‘exactly once’ write to replicated tables.
+
+Due to these disadvantages, we can only recommend using a Buffer table in rare cases.
+
+A Buffer table is used when too many INSERTs are received from a large number of servers over a unit of time and data can’t be buffered before insertion, which means the INSERTs can’t run fast enough.
+
+Note that it does not make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section “Performance”).
+
+[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/buffer/)
diff --git a/docs/en/reference/engines/table-engines/special/dictionary.md b/docs/en/reference/engines/table-engines/special/dictionary.md
new file mode 100644
index 00000000000..67b97e37d44
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/dictionary.md
@@ -0,0 +1,101 @@
+---
+sidebar_position: 20
+sidebar_label: Dictionary
+---
+
+# Dictionary Table Engine {#dictionary}
+
+The `Dictionary` engine displays the [dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) data as a ClickHouse table.
+
+## Example {#example}
+
+As an example, consider a dictionary of `products` with the following configuration:
+
+``` xml
+<dictionaries>
+    <dictionary>
+        <name>products</name>
+        <source>
+            <odbc>
+                <table>products</table>
+                <connection_string>DSN=some-db-server</connection_string>
+            </odbc>
+        </source>
+
+        <lifetime>
+            <min>300</min>
+            <max>360</max>
+        </lifetime>
+
+        <layout>
+            <flat/>
+        </layout>
+
+        <structure>
+            <id>
+                <name>product_id</name>
+            </id>
+            <attribute>
+                <name>title</name>
+                <type>String</type>
+                <null_value></null_value>
+            </attribute>
+        </structure>
+    </dictionary>
+</dictionaries>
+```
+
+Query the dictionary data:
+
+``` sql
+SELECT
+    name,
+    type,
+    key,
+    attribute.names,
+    attribute.types,
+    bytes_allocated,
+    element_count,
+    source
+FROM system.dictionaries
+WHERE name = 'products'
+```
+
+``` text
+┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐
+│ products │ Flat │ UInt64 │ ['title']       │ ['String']      │        23065376 │        175032 │ ODBC: .products │
+└──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘
+```
+
+You can use the [dictGet\*](../../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions) function to get the dictionary data in this format.
+
+This view isn’t helpful when you need to get raw data, or when performing a `JOIN` operation. For these cases, you can use the `Dictionary` engine, which displays the dictionary data in a table.
+
+Syntax:
+
+``` sql
+CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)
+```
+
+Usage example:
+
+``` sql
+create table products (product_id UInt64, title String) Engine = Dictionary(products);
+```
+
+    Ok
+
+Take a look at what’s in the table.
+
+``` sql
+select * from products limit 1;
+```
+
+``` text
+┌────product_id─┬─title───────────┐
+│        152689 │ Some item       │
+└───────────────┴─────────────────┘
+```
+
+**See Also**
+
+- [Dictionary function](../../../sql-reference/table-functions/dictionary.md#dictionary-function)
+
+[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/dictionary/)
diff --git a/docs/en/reference/engines/table-engines/special/distributed.md b/docs/en/reference/engines/table-engines/special/distributed.md
new file mode 100644
index 00000000000..db89175e4d9
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/distributed.md
@@ -0,0 +1,229 @@
+---
+sidebar_position: 10
+sidebar_label: Distributed
+---
+
+# Distributed Table Engine {#distributed}
+
+Tables with Distributed engine do not store any data of their own, but allow distributed query processing on multiple servers.
+Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any.
+
+## Creating a Table {#distributed-creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = Distributed(cluster, database, table[, sharding_key[, policy_name]])
+[SETTINGS name=value, ...]
+```
+
+### From a Table {#distributed-from-a-table}
+When the `Distributed` table is pointing to a table on the current server you can adopt that table's schema:
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2 ENGINE = Distributed(cluster, database, table[, sharding_key[, policy_name]]) [SETTINGS name=value, ...]
+``` + +**Distributed Parameters** + +- `cluster` - the cluster name in the server’s config file + +- `database` - the name of a remote database + +- `table` - the name of a remote table + +- `sharding_key` - (optionally) sharding key + +- `policy_name` - (optionally) policy name, it will be used to store temporary files for async send + +**See Also** + + - [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting + - [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) for the examples + +**Distributed Settings** + +- `fsync_after_insert` - do the `fsync` for the file data after asynchronous insert to Distributed. Guarantees that the OS flushed the whole inserted data to a file **on the initiator node** disk. + +- `fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to asynchronous inserts on Distributed table (after insert, after sending the data to shard, etc). + +- `bytes_to_throw_insert` - if more than this number of compressed bytes will be pending for async INSERT, an exception will be thrown. 0 - do not throw. Default 0. + +- `bytes_to_delay_insert` - if more than this number of compressed bytes will be pending for async INSERT, the query will be delayed. 0 - do not delay. Default 0. + +- `max_delay_to_insert` - max delay of inserting data into Distributed table in seconds, if there are a lot of pending bytes for async send. Default 60. + +- `monitor_batch_inserts` - same as [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) + +- `monitor_split_batch_on_failure` - same as [distributed_directory_monitor_split_batch_on_failure](../../../operations/settings/settings.md#distributed_directory_monitor_split_batch_on_failure) + +- `monitor_sleep_time_ms` - same as [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) + +- `monitor_max_sleep_time_ms` - same as [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) + +:::note +**Durability settings** (`fsync_...`): + +- Affect only asynchronous INSERTs (i.e. `insert_distributed_sync=false`) when data first stored on the initiator node disk and later asynchronously send to shards. +- May significantly decrease the inserts' performance +- Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings` + +For **Insert limit settings** (`..._insert`) see also: + +- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting +- [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting +- `bytes_to_throw_insert` handled before `bytes_to_delay_insert`, so you should not set it to the value less then `bytes_to_delay_insert` +::: + +**Example** + +``` sql +CREATE TABLE hits_all AS hits +ENGINE = Distributed(logs, default, hits[, sharding_key[, policy_name]]) +SETTINGS + fsync_after_insert=0, + fsync_directories=0; +``` + +Data will be read from all servers in the `logs` cluster, from the `default.hits` table located on every server in the cluster. 
+Data is not only read but is partially processed on the remote servers (to the extent that this is possible).
+For example, for a query with `GROUP BY`, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated.
+
+Instead of the database name, you can use a constant expression that returns a string. For example: `currentDatabase()`.
+
+## Clusters {#distributed-clusters}
+
+Clusters are configured in the [server configuration file](../../../operations/configuration-files.md):
+
+``` xml
+<remote_servers>
+    <logs>
+        <shard>
+            <!-- Optional. Shard weight when writing data. Default: 1. -->
+            <weight>1</weight>
+            <!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
+            <internal_replication>false</internal_replication>
+            <replica>
+                <!-- Optional. Priority of the replica for load balancing (see also load_balancing setting). Default: 1 (less value has more priority). -->
+                <priority>1</priority>
+                <host>example01-01-1</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>example01-01-2</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+        <shard>
+            <weight>2</weight>
+            <internal_replication>false</internal_replication>
+            <replica>
+                <host>example01-02-1</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>example01-02-2</host>
+                <secure>1</secure>
+                <port>9440</port>
+            </replica>
+        </shard>
+    </logs>
+</remote_servers>
+```
+
+Here a cluster is defined with the name `logs` that consists of two shards, each of which contains two replicas.
+Shards refer to the servers that contain different parts of the data (in order to read all the data, you must access all the shards).
+Replicas are duplicating servers (in order to read all the data, you can access the data on any one of the replicas).
+
+Cluster names must not contain dots.
+
+The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `compression` are specified for each server:
+
+- `host` – The address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server does not start. If you change the DNS record, restart the server.
+- `port` – The TCP port for messenger activity (`tcp_port` in the config, usually set to 9000). Not to be confused with `http_port`.
+- `user` – Name of the user for connecting to a remote server. Default value is the `default` user. This user must have access to connect to the specified server. Access is configured in the `users.xml` file. For more information, see the section [Access rights](../../../operations/access-rights.md).
+- `password` – The password for connecting to a remote server (not masked). Default value: empty string.
+- `secure` - Whether to use a secure SSL/TLS connection. Usually also requires specifying the port (the default secure port is `9440`). The server should listen on `9440` and be configured with correct certificates.
+- `compression` - Use data compression. Default value: `true`.
+
+When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load_balancing](../../../operations/settings/settings.md#settings-load_balancing) setting.
+If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times.
+This works in favour of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly.
+
+You can specify just one of the shards (in this case, query processing should be called remote, rather than distributed) or up to any number of shards. In each shard, you can specify from one to any number of replicas.
You can specify a different number of replicas for each shard. + +You can specify as many clusters as you wish in the configuration. + +To view your clusters, use the `system.clusters` table. + +The `Distributed` engine allows working with a cluster like a local server. However, the cluster's configuration cannot be specified dynamically, it has to be configured in the server config file. Usually, all servers in a cluster will have the same cluster config (though this is not required). Clusters from the config file are updated on the fly, without restarting the server. + +If you need to send a query to an unknown set of shards and replicas each time, you do not need to create a `Distributed` table – use the `remote` table function instead. See the section [Table functions](../../../sql-reference/table-functions/index.md). + +## Writing data {#distributed-writing-data} + +There are two methods for writing data to a cluster: + +First, you can define which servers to write which data to and perform the write directly on each shard. In other words, perform direct `INSERT` statements on the remote tables in the cluster that the `Distributed` table is pointing to. This is the most flexible solution as you can use any sharding scheme, even one that is non-trivial due to the requirements of the subject area. This is also the most optimal solution since data can be written to different shards completely independently. + +Second, you can perform `INSERT` statements on a `Distributed` table. In this case, the table will distribute the inserted data across the servers itself. In order to write to a `Distributed` table, it must have the `sharding_key` parameter configured (except if there is only one shard). + +Each shard can have a `` defined in the config file. By default, the weight is `1`. Data is distributed across shards in the amount proportional to the shard weight. All shard weights are summed up, then each shard's weight is divided by the total to determine each shard's proportion. For example, if there are two shards and the first has a weight of 1 while the second has a weight of 2, the first will be sent one third (1 / 3) of inserted rows and the second will be sent two thirds (2 / 3). + +Each shard can have the `internal_replication` parameter defined in the config file. If this parameter is set to `true`, the write operation selects the first healthy replica and writes data to it. Use this if the tables underlying the `Distributed` table are replicated tables (e.g. any of the `Replicated*MergeTree` table engines). One of the table replicas will receive the write and it will be replicated to the other replicas automatically. + +If `internal_replication` is set to `false` (the default), data is written to all replicas. In this case, the `Distributed` table replicates data itself. This is worse than using replicated tables because the consistency of replicas is not checked and, over time, they will contain slightly different data. + +To select the shard that a row of data is sent to, the sharding expression is analyzed, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of the remainders from `prev_weights` to `prev_weights + weight`, where `prev_weights` is the total weight of the shards with the smallest number, and `weight` is the weight of this shard. 
For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for the remainders from the range \[0, 9), and to the second for the remainders from the range \[9, 19). + +The sharding expression can be any expression from constants and table columns that returns an integer. For example, you can use the expression `rand()` for random distribution of data, or `UserID` for distribution by the remainder from dividing the user’s ID (then the data of a single user will reside on a single shard, which simplifies running `IN` and `JOIN` by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function e.g. `intHash64(UserID)`. + +A simple remainder from the division is a limited solution for sharding and isn’t always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area rather than using entries in `Distributed` tables. + +You should be concerned about the sharding scheme in the following cases: + +- Queries are used that require joining data (`IN` or `JOIN`) by a specific key. If data is sharded by this key, you can use local `IN` or `JOIN` instead of `GLOBAL IN` or `GLOBAL JOIN`, which is much more efficient. +- A large number of servers is used (hundreds or more) with a large number of small queries, for example, queries for data of individual clients (e.g. websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. `Distributed` tables are created for each layer, and a single shared distributed table is created for global queries. + +Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The periodicity for sending data is managed by the [distributed_directory_monitor_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed_directory_monitor_batch_inserts](../../../operations/settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`. The number of threads performing background tasks can be set by [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting. 
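+
+If you do not want to wait for the background send, you can flush the queued blocks explicitly; this is a hedged sketch that reuses the `hits_all` table name from the example above:
+
+``` sql
+-- Force sending of all data that is queued for asynchronous delivery to the shards.
+SYSTEM FLUSH DISTRIBUTED hits_all;
+```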
+
+If the server ceased to exist or had a rough restart (for example, due to a hardware failure) after an `INSERT` to a `Distributed` table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the `broken` subdirectory and no longer used.
+
+## Reading data {#distributed-reading-data}
+
+When querying a `Distributed` table, `SELECT` queries are sent to all shards and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you do not have to transfer old data into it. Instead, you can write new data to it by using a heavier weight – the data will be distributed slightly unevenly, but queries will work correctly and efficiently.
+
+When the `max_parallel_replicas` option is enabled, query processing is parallelized across all replicas within a single shard. For more information, see the section [max_parallel_replicas](../../../operations/settings/settings.md#settings-max_parallel_replicas).
+
+To learn more about how distributed `IN` and `GLOBAL IN` queries are processed, refer to [this](../../../sql-reference/operators/in.md#select-distributed-subqueries) documentation.
+
+## Virtual Columns {#virtual-columns}
+
+- `_shard_num` — Contains the `shard_num` value from the table `system.clusters`. Type: [UInt32](../../../sql-reference/data-types/int-uint.md).
+
+:::note
+Since the [remote](../../../sql-reference/table-functions/remote.md) and [cluster](../../../sql-reference/table-functions/cluster.md) table functions internally create a temporary Distributed table, `_shard_num` is available there too.
+:::
+
+**See Also**
+
+- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) description
+- [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting
+- [shardNum()](../../../sql-reference/functions/other-functions.md#shard-num) and [shardCount()](../../../sql-reference/functions/other-functions.md#shard-count) functions
+
+
+[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/distributed/)
diff --git a/docs/en/reference/engines/table-engines/special/external-data.md b/docs/en/reference/engines/table-engines/special/external-data.md
new file mode 100644
index 00000000000..1f4336c74fe
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/external-data.md
@@ -0,0 +1,65 @@
+---
+sidebar_position: 130
+sidebar_label: External Data
+---
+
+# External Data for Query Processing {#external-data-for-query-processing}
+
+ClickHouse allows sending a server the data that is needed for processing a query, together with a `SELECT` query. This data is put in a temporary table (see the section “Temporary tables”) and can be used in the query (for example, in `IN` operators).
+
+For example, if you have a text file with important user identifiers, you can upload it to the server along with a query that filters by this list.
+
+If you need to run more than one query with a large volume of external data, do not use this feature. It is better to upload the data to the DB ahead of time.
+
+External data can be uploaded using the command-line client (in non-interactive mode), or using the HTTP interface.
+
+In the command-line client, you can specify a parameters section in the format
+
+``` bash
+--external --file=... [--name=...] [--format=...] [--types=...|--structure=...]
+``` + +You may have multiple sections like this, for the number of tables being transmitted. + +**–external** – Marks the beginning of a clause. +**–file** – Path to the file with the table dump, or -, which refers to stdin. +Only a single table can be retrieved from stdin. + +The following parameters are optional: **–name**– Name of the table. If omitted, _data is used. +**–format** – Data format in the file. If omitted, TabSeparated is used. + +One of the following parameters is required:**–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, … +**–structure**– The table structure in the format`UserID UInt64`, `URL String`. Defines the column names and types. + +The files specified in ‘file’ will be parsed by the format specified in ‘format’, using the data types specified in ‘types’ or ‘structure’. The table will be uploaded to the server and accessible there as a temporary table with the name in ‘name’. + +Examples: + +``` bash +$ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 +849897 +$ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' +/bin/sh 20 +/bin/false 5 +/bin/bash 4 +/usr/sbin/nologin 1 +/bin/sync 1 +``` + +When using the HTTP interface, external data is passed in the multipart/form-data format. Each table is transmitted as a separate file. The table name is taken from the file name. The `query_string` is passed the parameters `name_format`, `name_types`, and `name_structure`, where `name` is the name of the table that these parameters correspond to. The meaning of the parameters is the same as when using the command-line client. + +Example: + +``` bash +$ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv + +$ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' +/bin/sh 20 +/bin/false 5 +/bin/bash 4 +/usr/sbin/nologin 1 +/bin/sync 1 +``` + +For distributed query processing, the temporary tables are sent to all the remote servers. + diff --git a/docs/en/reference/engines/table-engines/special/file.md b/docs/en/reference/engines/table-engines/special/file.md new file mode 100644 index 00000000000..6e4449bf1a9 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/file.md @@ -0,0 +1,89 @@ +--- +sidebar_position: 40 +sidebar_label: File +--- + +# File Table Engine {#table_engines-file} + +The File table engine keeps the data in a file in one of the supported [file formats](../../../interfaces/formats.md#formats) (`TabSeparated`, `Native`, etc.). + +Usage scenarios: + +- Data export from ClickHouse to file. +- Convert data from one format to another. +- Updating data in ClickHouse via editing a file on a disk. + +## Usage in ClickHouse Server {#usage-in-clickhouse-server} + +``` sql +File(Format) +``` + +The `Format` parameter specifies one of the available file formats. To perform +`SELECT` queries, the format must be supported for input, and to perform +`INSERT` queries – for output. The available formats are listed in the +[Formats](../../../interfaces/formats.md#formats) section. 
+
+ClickHouse does not allow specifying a filesystem path for `File`. It uses the folder defined by the [path](../../../operations/server-configuration-parameters/settings.md) setting in the server configuration.
+
+When you create a table using `File(Format)`, ClickHouse creates an empty subdirectory in that folder. When data is written to that table, it is put into a `data.Format` file in that subdirectory.
+
+You may manually create this subfolder and file in the server filesystem and then [ATTACH](../../../sql-reference/statements/attach.md) it to table information with a matching name, so you can query data from that file.
+
+:::warning
+Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined.
+:::
+
+## Example {#example}
+
+**1.** Set up the `file_engine_table` table:
+
+``` sql
+CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated)
+```
+
+By default ClickHouse will create the folder `/var/lib/clickhouse/data/default/file_engine_table`.
+
+**2.** Manually create `/var/lib/clickhouse/data/default/file_engine_table/data.TabSeparated` containing:
+
+``` bash
+$ cat data.TabSeparated
+one 1
+two 2
+```
+
+**3.** Query the data:
+
+``` sql
+SELECT * FROM file_engine_table
+```
+
+``` text
+┌─name─┬─value─┐
+│ one  │     1 │
+│ two  │     2 │
+└──────┴───────┘
+```
+
+## Usage in ClickHouse-local {#usage-in-clickhouse-local}
+
+In [clickhouse-local](../../../operations/utilities/clickhouse-local.md), the File engine accepts a file path in addition to `Format`. Default input/output streams can be specified using numeric or human-readable names like `0` or `stdin`, `1` or `stdout`. It is possible to read and write compressed files based on an additional engine parameter or the file extension (`gz`, `br` or `xz`).
+
+**Example:**
+
+``` bash
+$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table"
+```
+
+## Details of Implementation {#details-of-implementation}
+
+- Multiple `SELECT` queries can be performed concurrently, but `INSERT` queries will wait for each other.
+- Creating a new file with an `INSERT` query is supported.
+- If the file exists, `INSERT` appends new values to it.
+- Not supported:
+    - `ALTER`
+    - `SELECT ... SAMPLE`
+    - Indices
+    - Replication
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/file/)
diff --git a/docs/en/reference/engines/table-engines/special/generate.md b/docs/en/reference/engines/table-engines/special/generate.md
new file mode 100644
index 00000000000..453f3b5db0b
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/generate.md
@@ -0,0 +1,59 @@
+---
+sidebar_position: 140
+sidebar_label: GenerateRandom
+---
+
+# GenerateRandom Table Engine {#table_engines-generate}
+
+The GenerateRandom table engine produces random data for a given table schema.
+
+Usage examples:
+
+- Use in tests to populate a reproducible large table.
+- Generate random input for fuzzing tests.
+
+## Usage in ClickHouse Server {#usage-in-clickhouse-server}
+
+``` sql
+ENGINE = GenerateRandom(random_seed, max_string_length, max_array_length)
+```
+
+The `max_array_length` and `max_string_length` parameters specify the maximum length of all array columns and strings, respectively, in the generated data.
+
+The GenerateRandom table engine supports only `SELECT` queries.
+ +It supports all [DataTypes](../../../sql-reference/data-types/index.md) that can be stored in a table except `LowCardinality` and `AggregateFunction`. + +## Example {#example} + +**1.** Set up the `generate_engine_table` table: + +``` sql +CREATE TABLE generate_engine_table (name String, value UInt32) ENGINE = GenerateRandom(1, 5, 3) +``` + +**2.** Query the data: + +``` sql +SELECT * FROM generate_engine_table LIMIT 3 +``` + +``` text +┌─name─┬──────value─┐ +│ c4xJ │ 1412771199 │ +│ r │ 1791099446 │ +│ 7#$ │ 124312908 │ +└──────┴────────────┘ +``` + +## Details of Implementation {#details-of-implementation} + +- Not supported: + - `ALTER` + - `SELECT ... SAMPLE` + - `INSERT` + - Indices + - Replication + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/generate/) diff --git a/docs/en/reference/engines/table-engines/special/index.md b/docs/en/reference/engines/table-engines/special/index.md new file mode 100644 index 00000000000..f87cd86c891 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/index.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 50 +sidebar_label: Special +--- + +# Special Table Engines {#special-table-engines} + +There are three main categories of table engines: + +- [MergeTree engine family](../../../engines/table-engines/mergetree-family/index.md) for main production use. +- [Log engine family](../../../engines/table-engines/log-family/index.md) for small temporary data. +- [Table engines for integrations](../../../engines/table-engines/integrations/index.md). + +The remaining engines are unique in their purpose and are not grouped into families yet, thus they are placed in this “special” category. diff --git a/docs/en/reference/engines/table-engines/special/join.md b/docs/en/reference/engines/table-engines/special/join.md new file mode 100644 index 00000000000..7d6f6e99b9f --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/join.md @@ -0,0 +1,130 @@ +--- +sidebar_position: 70 +sidebar_label: Join +--- + +# Join Table Engine {#join} + +Optional prepared data structure for usage in [JOIN](../../../sql-reference/statements/select/join.md#select-join) operations. + +:::note +This is not an article about the [JOIN clause](../../../sql-reference/statements/select/join.md#select-join) itself. +::: + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], +) ENGINE = Join(join_strictness, join_type, k1[, k2, ...]) +``` + +See the detailed description of the [CREATE TABLE](../../../sql-reference/statements/create/table.md#create-table-query) query. + +**Engine Parameters** + +- `join_strictness` – [JOIN strictness](../../../sql-reference/statements/select/join.md#select-join-types). +- `join_type` – [JOIN type](../../../sql-reference/statements/select/join.md#select-join-types). +- `k1[, k2, ...]` – Key columns from the `USING` clause that the `JOIN` operation is made with. + +Enter `join_strictness` and `join_type` parameters without quotes, for example, `Join(ANY, LEFT, col1)`. They must match the `JOIN` operation that the table will be used for. If the parameters do not match, ClickHouse does not throw an exception and may return incorrect data. + +## Specifics and Recommendations {#specifics-and-recommendations} + +### Data Storage {#data-storage} + +`Join` table data is always located in the RAM. 
When inserting rows into a table, ClickHouse writes data blocks to the directory on the disk so that they can be restored when the server restarts.
+
+If the server restarts incorrectly, the data block on the disk might get lost or damaged. In this case, you may need to manually delete the file with the damaged data.
+
+### Selecting and Inserting Data {#selecting-and-inserting-data}
+
+You can use `INSERT` queries to add data to `Join`-engine tables. If the table was created with the `ANY` strictness, data for duplicate keys is ignored. With the `ALL` strictness, all rows are added.
+
+The main use cases for `Join`-engine tables are the following:
+
+- Place the table on the right side of a `JOIN` clause.
+- Call the [joinGet](../../../sql-reference/functions/other-functions.md#joinget) function, which lets you extract data from the table the same way as from a dictionary.
+
+### Deleting Data {#deleting-data}
+
+`ALTER DELETE` queries for `Join`-engine tables are implemented as [mutations](../../../sql-reference/statements/alter/index.md#mutations). A `DELETE` mutation reads the filtered data and overwrites the data both in memory and on disk.
+
+### Limitations and Settings {#join-limitations-and-settings}
+
+When creating a table, the following settings are applied:
+
+- [join_use_nulls](../../../operations/settings/settings.md#join_use_nulls)
+- [max_rows_in_join](../../../operations/settings/query-complexity.md#settings-max_rows_in_join)
+- [max_bytes_in_join](../../../operations/settings/query-complexity.md#settings-max_bytes_in_join)
+- [join_overflow_mode](../../../operations/settings/query-complexity.md#settings-join_overflow_mode)
+- [join_any_take_last_row](../../../operations/settings/settings.md#settings-join_any_take_last_row)
+- [persistent](../../../operations/settings/settings.md#persistent)
+
+The `Join`-engine tables can’t be used in `GLOBAL JOIN` operations.
+
+The `Join` engine allows specifying the [join_use_nulls](../../../operations/settings/settings.md#join_use_nulls) setting in the `CREATE TABLE` statement. The [SELECT](../../../sql-reference/statements/select/index.md) query should use the same `join_use_nulls` value, as shown in the sketch below.
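+
+For example, a minimal sketch of keeping the setting consistent between the table definition and the query (the `nullable_join` and `some_table` names here are purely illustrative):
+
+``` sql
+-- Hypothetical sketch: declare join_use_nulls on the Join table ...
+CREATE TABLE nullable_join (id UInt32, val UInt32)
+ENGINE = Join(ANY, LEFT, id)
+SETTINGS join_use_nulls = 1;
+
+-- ... and use the same value in the SELECT query that joins against it
+SELECT *
+FROM some_table
+ANY LEFT JOIN nullable_join USING (id)
+SETTINGS join_use_nulls = 1;
+```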
+ +## Usage Examples {#example} + +Creating the left-side table: + +``` sql +CREATE TABLE id_val(`id` UInt32, `val` UInt32) ENGINE = TinyLog; +``` + +``` sql +INSERT INTO id_val VALUES (1,11)(2,12)(3,13); +``` + +Creating the right-side `Join` table: + +``` sql +CREATE TABLE id_val_join(`id` UInt32, `val` UInt8) ENGINE = Join(ANY, LEFT, id); +``` + +``` sql +INSERT INTO id_val_join VALUES (1,21)(1,22)(3,23); +``` + +Joining the tables: + +``` sql +SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id); +``` + +``` text +┌─id─┬─val─┬─id_val_join.val─┐ +│ 1 │ 11 │ 21 │ +│ 2 │ 12 │ 0 │ +│ 3 │ 13 │ 23 │ +└────┴─────┴─────────────────┘ +``` + +As an alternative, you can retrieve data from the `Join` table, specifying the join key value: + +``` sql +SELECT joinGet('id_val_join', 'val', toUInt32(1)); +``` + +``` text +┌─joinGet('id_val_join', 'val', toUInt32(1))─┐ +│ 21 │ +└────────────────────────────────────────────┘ +``` + +Deleting a row from the `Join` table: + +```sql +ALTER TABLE id_val_join DELETE WHERE id = 3; +``` + +```text +┌─id─┬─val─┐ +│ 1 │ 21 │ +└────┴─────┘ +``` + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/join/) diff --git a/docs/en/reference/engines/table-engines/special/materializedview.md b/docs/en/reference/engines/table-engines/special/materializedview.md new file mode 100644 index 00000000000..6c9a5e84f60 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/materializedview.md @@ -0,0 +1,10 @@ +--- +sidebar_position: 100 +sidebar_label: MaterializedView +--- + +# MaterializedView Table Engine {#materializedview} + +Used for implementing materialized views (for more information, see [CREATE VIEW](../../../sql-reference/statements/create/view.md#materialized)). For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses that engine. + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/materializedview/) diff --git a/docs/en/reference/engines/table-engines/special/memory.md b/docs/en/reference/engines/table-engines/special/memory.md new file mode 100644 index 00000000000..1e154a323d1 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/memory.md @@ -0,0 +1,18 @@ +--- +sidebar_position: 110 +sidebar_label: Memory +--- + +# Memory Table Engine {#memory} + +The Memory engine stores data in RAM, in uncompressed form. Data is stored in exactly the same form as it is received when read. In other words, reading from this table is completely free. +Concurrent data access is synchronized. Locks are short: read and write operations do not block each other. +Indexes are not supported. Reading is parallelized. + +Maximal productivity (over 10 GB/sec) is reached on simple queries, because there is no reading from the disk, decompressing, or deserializing data. (We should note that in many cases, the productivity of the MergeTree engine is almost as high.) +When restarting a server, data disappears from the table and the table becomes empty. +Normally, using this table engine is not justified. However, it can be used for tests, and for tasks where maximum speed is required on a relatively small number of rows (up to approximately 100,000,000). + +The Memory engine is used by the system for temporary tables with external query data (see the section “External data for processing a query”), and for implementing `GLOBAL IN` (see the section “IN operators”). 
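+
+As a minimal sketch (the table and column names are illustrative), a Memory table is created and queried like any other table; the only difference is where the data lives:
+
+``` sql
+-- Hypothetical example: a small in-memory lookup table
+CREATE TABLE memory_lookup (id UInt32, label String) ENGINE = Memory;
+
+INSERT INTO memory_lookup VALUES (1, 'first'), (2, 'second');
+
+-- The data is kept only in RAM and disappears after a server restart
+SELECT * FROM memory_lookup WHERE id = 1;
+```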
+ +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/memory/) diff --git a/docs/en/reference/engines/table-engines/special/merge.md b/docs/en/reference/engines/table-engines/special/merge.md new file mode 100644 index 00000000000..bcad7a0c1f6 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/merge.md @@ -0,0 +1,85 @@ +--- +sidebar_position: 30 +sidebar_label: Merge +--- + +# Merge Table Engine {#merge} + +The `Merge` engine (not to be confused with `MergeTree`) does not store data itself, but allows reading from any number of other tables simultaneously. + +Reading is automatically parallelized. Writing to a table is not supported. When reading, the indexes of tables that are actually being read are used, if they exist. + +## Creating a Table {#creating-a-table} + +``` sql +CREATE TABLE ... Engine=Merge(db_name, tables_regexp) +``` + +**Engine Parameters** + +- `db_name` — Possible values: + - database name, + - constant expression that returns a string with a database name, for example, `currentDatabase()`, + - `REGEXP(expression)`, where `expression` is a regular expression to match the DB names. + +- `tables_regexp` — A regular expression to match the table names in the specified DB or DBs. + +Regular expressions — [re2](https://github.com/google/re2) (supports a subset of PCRE), case-sensitive. +See the notes about escaping symbols in regular expressions in the "match" section. + +## Usage {#usage} + +When selecting tables to read, the `Merge` table itself is not selected, even if it matches the regex. This is to avoid loops. +It is possible to create two `Merge` tables that will endlessly try to read each others' data, but this is not a good idea. + +The typical way to use the `Merge` engine is for working with a large number of `TinyLog` tables as if with a single table. + +## Examples {#examples} + +**Example 1** + +Consider two databases `ABC_corporate_site` and `ABC_store`. The `all_visitors` table will contain IDs from the tables `visitors` in both databases. + +``` sql +CREATE TABLE all_visitors (id UInt32) ENGINE=Merge(REGEXP('ABC_*'), 'visitors'); +``` + +**Example 2** + +Let's say you have an old table `WatchLog_old` and decided to change partitioning without moving data to a new table `WatchLog_new`, and you need to see data from both tables. + +``` sql +CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) + ENGINE=MergeTree(date, (UserId, EventType), 8192); +INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); + +CREATE TABLE WatchLog_new(date Date, UserId Int64, EventType String, Cnt UInt64) + ENGINE=MergeTree PARTITION BY date ORDER BY (UserId, EventType) SETTINGS index_granularity=8192; +INSERT INTO WatchLog_new VALUES ('2018-01-02', 2, 'hit', 3); + +CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog'); + +SELECT * FROM WatchLog; +``` + +``` text +┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ +│ 2018-01-01 │ 1 │ hit │ 3 │ +└────────────┴────────┴───────────┴─────┘ +┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ +│ 2018-01-02 │ 2 │ hit │ 3 │ +└────────────┴────────┴───────────┴─────┘ +``` + +## Virtual Columns {#virtual-columns} + +- `_table` — Contains the name of the table from which data was read. Type: [String](../../../sql-reference/data-types/string.md). + + You can set the constant conditions on `_table` in the `WHERE/PREWHERE` clause (for example, `WHERE _table='xyz'`). 
In this case, the read operation is performed only for those tables where the condition on `_table` is satisfied, so the `_table` column acts as an index.
+
+**See Also**
+
+- [Virtual columns](../../../engines/table-engines/special/index.md#table_engines-virtual_columns)
+- [merge](../../../sql-reference/table-functions/merge.md) table function
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/merge/)
diff --git a/docs/en/reference/engines/table-engines/special/null.md b/docs/en/reference/engines/table-engines/special/null.md
new file mode 100644
index 00000000000..309b09ba779
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/null.md
@@ -0,0 +1,15 @@
+---
+sidebar_position: 50
+sidebar_label: 'Null'
+---
+
+# Null Table Engine {#null}
+
+When writing to a `Null` table, data is ignored. When reading from a `Null` table, the response is empty.
+
+:::note
+If you are wondering why this is useful, note that you can create a materialized view on a `Null` table. The data written to the table will end up affecting the view, but the original raw data will still be discarded.
+:::
+
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/null/)
diff --git a/docs/en/reference/engines/table-engines/special/set.md b/docs/en/reference/engines/table-engines/special/set.md
new file mode 100644
index 00000000000..5fd80ba55fe
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/set.md
@@ -0,0 +1,23 @@
+---
+sidebar_position: 60
+sidebar_label: Set
+---
+
+# Set Table Engine {#set}
+
+A data set that is always in RAM. It is intended for use on the right side of the `IN` operator (see the section “IN operators”).
+
+You can use `INSERT` to insert data into the table. New elements will be added to the data set, while duplicates will be ignored.
+But you can’t perform `SELECT` from the table. The only way to retrieve data is by using it in the right half of the `IN` operator.
+
+Data is always located in RAM. For `INSERT`, the blocks of inserted data are also written to the directory of tables on the disk. When starting the server, this data is loaded to RAM. In other words, after restarting, the data remains in place.
+
+If the server is restarted abruptly, the block of data on the disk might be lost or damaged. In the latter case, you may need to manually delete the file with the damaged data.
+
+### Limitations and Settings {#join-limitations-and-settings}
+
+When creating a table, the following settings are applied:
+
+- [persistent](../../../operations/settings/settings.md#persistent)
+
+[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/set/)
diff --git a/docs/en/reference/engines/table-engines/special/url.md b/docs/en/reference/engines/table-engines/special/url.md
new file mode 100644
index 00000000000..64642623f88
--- /dev/null
+++ b/docs/en/reference/engines/table-engines/special/url.md
@@ -0,0 +1,92 @@
+---
+sidebar_position: 80
+sidebar_label: URL
+---
+
+# URL Table Engine {#table_engines-url}
+
+Reads data from and writes data to a remote HTTP/HTTPS server. This engine is similar to the [File](../../../engines/table-engines/special/file.md) engine.
+
+Syntax: `URL(URL [,Format] [,CompressionMethod])`
+
+- The `URL` parameter must conform to the structure of a Uniform Resource Locator. The specified URL must point to a server that uses HTTP or HTTPS. No additional headers are required to get a response from the server.
+ +- The `Format` must be one that ClickHouse can use in `SELECT` queries and, if necessary, in `INSERTs`. For the full list of supported formats, see [Formats](../../../interfaces/formats.md#formats). + +- `CompressionMethod` indicates that whether the HTTP body should be compressed. If the compression is enabled, the HTTP packets sent by the URL engine contain 'Content-Encoding' header to indicate which compression method is used. + +To enable compression, please first make sure the remote HTTP endpoint indicated by the `URL` parameter supports corresponding compression algorithm. + +The supported `CompressionMethod` should be one of following: +- gzip or gz +- deflate +- brotli or br +- lzma or xz +- zstd or zst +- lz4 +- bz2 +- snappy +- none + +## Usage {#using-the-engine-in-the-clickhouse-server} + +`INSERT` and `SELECT` queries are transformed to `POST` and `GET` requests, +respectively. For processing `POST` requests, the remote server must support +[Chunked transfer encoding](https://en.wikipedia.org/wiki/Chunked_transfer_encoding). + +You can limit the maximum number of HTTP GET redirect hops using the [max_http_get_redirects](../../../operations/settings/settings.md#setting-max_http_get_redirects) setting. + +## Example {#example} + +**1.** Create a `url_engine_table` table on the server : + +``` sql +CREATE TABLE url_engine_table (word String, value UInt64) +ENGINE=URL('http://127.0.0.1:12345/', CSV) +``` + +**2.** Create a basic HTTP server using the standard Python 3 tools and +start it: + +``` python3 +from http.server import BaseHTTPRequestHandler, HTTPServer + +class CSVHTTPServer(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.send_header('Content-type', 'text/csv') + self.end_headers() + + self.wfile.write(bytes('Hello,1\nWorld,2\n', "utf-8")) + +if __name__ == "__main__": + server_address = ('127.0.0.1', 12345) + HTTPServer(server_address, CSVHTTPServer).serve_forever() +``` + +``` bash +$ python3 server.py +``` + +**3.** Request data: + +``` sql +SELECT * FROM url_engine_table +``` + +``` text +┌─word──┬─value─┐ +│ Hello │ 1 │ +│ World │ 2 │ +└───────┴───────┘ +``` + +## Details of Implementation {#details-of-implementation} + +- Reads and writes can be parallel +- Not supported: + - `ALTER` and `SELECT...SAMPLE` operations. + - Indexes. + - Replication. + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/url/) diff --git a/docs/en/reference/engines/table-engines/special/view.md b/docs/en/reference/engines/table-engines/special/view.md new file mode 100644 index 00000000000..455c301fb01 --- /dev/null +++ b/docs/en/reference/engines/table-engines/special/view.md @@ -0,0 +1,10 @@ +--- +sidebar_position: 90 +sidebar_label: View +--- + +# View Table Engine {#table_engines-view} + +Used for implementing views (for more information, see the `CREATE VIEW query`). It does not store data, but only stores the specified `SELECT` query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query). 
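+
+For illustration only (the `daily_totals` view and the `events_raw` table are made-up names), a view created like this is backed by the View table engine:
+
+``` sql
+-- The view stores only the SELECT query below, not its result
+CREATE VIEW daily_totals AS
+    SELECT toDate(ts) AS day, count() AS events
+    FROM events_raw
+    GROUP BY day;
+
+-- Reading from the view re-runs the stored query against events_raw
+SELECT * FROM daily_totals ORDER BY day;
+```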
+ +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/view/) diff --git a/docs/en/reference/getting-started/_category_.yml b/docs/en/reference/getting-started/_category_.yml new file mode 100644 index 00000000000..4bac4e1ffae --- /dev/null +++ b/docs/en/reference/getting-started/_category_.yml @@ -0,0 +1,8 @@ +position: 1 +label: 'Getting Started' +collapsible: true +collapsed: true +link: + type: generated-index + title: Getting Started + slug: /en/getting-started \ No newline at end of file diff --git a/docs/en/reference/getting-started/example-datasets/_category_.yml b/docs/en/reference/getting-started/example-datasets/_category_.yml new file mode 100644 index 00000000000..060a977eb57 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/_category_.yml @@ -0,0 +1,8 @@ +position: 10 +label: 'Example Datasets' +collapsible: true +collapsed: true +link: + type: generated-index + title: Example Datasets + slug: /en/example-datasets \ No newline at end of file diff --git a/docs/en/reference/getting-started/example-datasets/amplab-benchmark.md b/docs/en/reference/getting-started/example-datasets/amplab-benchmark.md new file mode 100644 index 00000000000..a87ac53e2e3 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/amplab-benchmark.md @@ -0,0 +1,127 @@ +--- +sidebar_label: AMPLab Big Data Benchmark +description: A benchmark dataset used for comparing the performance of data warehousing solutions. +--- + +# AMPLab Big Data Benchmark {#amplab-big-data-benchmark} + +See https://amplab.cs.berkeley.edu/benchmark/ + +Sign up for a free account at https://aws.amazon.com. It requires a credit card, email, and phone number. Get a new access key at https://console.aws.amazon.com/iam/home?nc2=h_m_sc#security_credential + +Run the following in the console: + +``` bash +$ sudo apt-get install s3cmd +$ mkdir tiny; cd tiny; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . +$ cd .. +$ mkdir 1node; cd 1node; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . +$ cd .. +$ mkdir 5nodes; cd 5nodes; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . +$ cd .. 
+``` + +Run the following ClickHouse queries: + +``` sql +CREATE TABLE rankings_tiny +( + pageURL String, + pageRank UInt32, + avgDuration UInt32 +) ENGINE = Log; + +CREATE TABLE uservisits_tiny +( + sourceIP String, + destinationURL String, + visitDate Date, + adRevenue Float32, + UserAgent String, + cCode FixedString(3), + lCode FixedString(6), + searchWord String, + duration UInt32 +) ENGINE = MergeTree(visitDate, visitDate, 8192); + +CREATE TABLE rankings_1node +( + pageURL String, + pageRank UInt32, + avgDuration UInt32 +) ENGINE = Log; + +CREATE TABLE uservisits_1node +( + sourceIP String, + destinationURL String, + visitDate Date, + adRevenue Float32, + UserAgent String, + cCode FixedString(3), + lCode FixedString(6), + searchWord String, + duration UInt32 +) ENGINE = MergeTree(visitDate, visitDate, 8192); + +CREATE TABLE rankings_5nodes_on_single +( + pageURL String, + pageRank UInt32, + avgDuration UInt32 +) ENGINE = Log; + +CREATE TABLE uservisits_5nodes_on_single +( + sourceIP String, + destinationURL String, + visitDate Date, + adRevenue Float32, + UserAgent String, + cCode FixedString(3), + lCode FixedString(6), + searchWord String, + duration UInt32 +) ENGINE = MergeTree(visitDate, visitDate, 8192); +``` + +Go back to the console: + +``` bash +$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done +$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done +$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done +$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done +$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done +$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done +``` + +Queries for obtaining data samples: + +``` sql +SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000 + +SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8) + +SELECT + sourceIP, + sum(adRevenue) AS totalRevenue, + avg(pageRank) AS pageRank +FROM rankings_1node ALL INNER JOIN +( + SELECT + sourceIP, + destinationURL AS pageURL, + adRevenue + FROM uservisits_1node + WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01') +) USING pageURL +GROUP BY sourceIP +ORDER BY totalRevenue DESC +LIMIT 1 +``` + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/amplab_benchmark/) diff --git a/docs/en/reference/getting-started/example-datasets/brown-benchmark.md b/docs/en/reference/getting-started/example-datasets/brown-benchmark.md new file mode 100644 index 00000000000..0960756dbe9 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/brown-benchmark.md @@ -0,0 +1,416 @@ +--- +sidebar_label: Brown University Benchmark +description: A new analytical benchmark for machine-generated log data +--- + +# Brown University Benchmark + 
+`MgBench` is a new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). + +Download the data: +``` +wget https://datasets.clickhouse.com/mgbench{1..3}.csv.xz +``` + +Unpack the data: +``` +xz -v -d mgbench{1..3}.csv.xz +``` + +Create tables: +``` +CREATE DATABASE mgbench; + + +CREATE TABLE mgbench.logs1 ( + log_time DateTime, + machine_name LowCardinality(String), + machine_group LowCardinality(String), + cpu_idle Nullable(Float32), + cpu_nice Nullable(Float32), + cpu_system Nullable(Float32), + cpu_user Nullable(Float32), + cpu_wio Nullable(Float32), + disk_free Nullable(Float32), + disk_total Nullable(Float32), + part_max_used Nullable(Float32), + load_fifteen Nullable(Float32), + load_five Nullable(Float32), + load_one Nullable(Float32), + mem_buffers Nullable(Float32), + mem_cached Nullable(Float32), + mem_free Nullable(Float32), + mem_shared Nullable(Float32), + swap_free Nullable(Float32), + bytes_in Nullable(Float32), + bytes_out Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (machine_group, machine_name, log_time); + + +CREATE TABLE mgbench.logs2 ( + log_time DateTime, + client_ip IPv4, + request String, + status_code UInt16, + object_size UInt64 +) +ENGINE = MergeTree() +ORDER BY log_time; + + +CREATE TABLE mgbench.logs3 ( + log_time DateTime64, + device_id FixedString(15), + device_name LowCardinality(String), + device_type LowCardinality(String), + device_floor UInt8, + event_type LowCardinality(String), + event_unit FixedString(1), + event_value Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (event_type, log_time); +``` + +Insert data: + +``` +clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv +clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv +clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv +``` + +Run benchmark queries: +``` +-- Q1.1: What is the CPU/network utilization for each web server since midnight? + +SELECT machine_name, + MIN(cpu) AS cpu_min, + MAX(cpu) AS cpu_max, + AVG(cpu) AS cpu_avg, + MIN(net_in) AS net_in_min, + MAX(net_in) AS net_in_max, + AVG(net_in) AS net_in_avg, + MIN(net_out) AS net_out_min, + MAX(net_out) AS net_out_max, + AVG(net_out) AS net_out_avg +FROM ( + SELECT machine_name, + COALESCE(cpu_user, 0.0) AS cpu, + COALESCE(bytes_in, 0.0) AS net_in, + COALESCE(bytes_out, 0.0) AS net_out + FROM logs1 + WHERE machine_name IN ('anansi','aragog','urd') + AND log_time >= TIMESTAMP '2017-01-11 00:00:00' +) AS r +GROUP BY machine_name; + + +-- Q1.2: Which computer lab machines have been offline in the past day? + +SELECT machine_name, + log_time +FROM logs1 +WHERE (machine_name LIKE 'cslab%' OR + machine_name LIKE 'mslab%') + AND load_one IS NULL + AND log_time >= TIMESTAMP '2017-01-10 00:00:00' +ORDER BY machine_name, + log_time; + + +-- Q1.3: What are the hourly average metrics during the past 10 days for a specific workstation? 
+ +SELECT dt, + hr, + AVG(load_fifteen) AS load_fifteen_avg, + AVG(load_five) AS load_five_avg, + AVG(load_one) AS load_one_avg, + AVG(mem_free) AS mem_free_avg, + AVG(swap_free) AS swap_free_avg +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + load_fifteen, + load_five, + load_one, + mem_free, + swap_free + FROM logs1 + WHERE machine_name = 'babbage' + AND load_fifteen IS NOT NULL + AND load_five IS NOT NULL + AND load_one IS NOT NULL + AND mem_free IS NOT NULL + AND swap_free IS NOT NULL + AND log_time >= TIMESTAMP '2017-01-01 00:00:00' +) AS r +GROUP BY dt, + hr +ORDER BY dt, + hr; + + +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? + +SELECT machine_name, + COUNT(*) AS spikes +FROM logs1 +WHERE machine_group = 'Servers' + AND cpu_wio > 0.99 + AND log_time >= TIMESTAMP '2016-12-01 00:00:00' + AND log_time < TIMESTAMP '2017-01-01 00:00:00' +GROUP BY machine_name +ORDER BY spikes DESC +LIMIT 10; + + +-- Q1.5: Which externally reachable VMs have run low on memory? + +SELECT machine_name, + dt, + MIN(mem_free) AS mem_free_min +FROM ( + SELECT machine_name, + CAST(log_time AS DATE) AS dt, + mem_free + FROM logs1 + WHERE machine_group = 'DMZ' + AND mem_free IS NOT NULL +) AS r +GROUP BY machine_name, + dt +HAVING MIN(mem_free) < 10000 +ORDER BY machine_name, + dt; + + +-- Q1.6: What is the total hourly network traffic across all file servers? + +SELECT dt, + hr, + SUM(net_in) AS net_in_sum, + SUM(net_out) AS net_out_sum, + SUM(net_in) + SUM(net_out) AS both_sum +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in, + COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out + FROM logs1 + WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon', + 'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey', + 'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps', + 'poprocks','razzles','runts','smarties','smuggler','spree','stride', + 'tootsie','trident','wrigley','york') +) AS r +GROUP BY dt, + hr +ORDER BY both_sum DESC +LIMIT 10; + + +-- Q2.1: Which requests have caused server errors within the past 2 weeks? + +SELECT * +FROM logs2 +WHERE status_code >= 500 + AND log_time >= TIMESTAMP '2012-12-18 00:00:00' +ORDER BY log_time; + + +-- Q2.2: During a specific 2-week period, was the user password file leaked? + +SELECT * +FROM logs2 +WHERE status_code >= 200 + AND status_code < 300 + AND request LIKE '%/etc/passwd%' + AND log_time >= TIMESTAMP '2012-05-06 00:00:00' + AND log_time < TIMESTAMP '2012-05-20 00:00:00'; + + +-- Q2.3: What was the average path depth for top-level requests in the past month? + +SELECT top_level, + AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg +FROM ( + SELECT SUBSTRING(request FROM 1 FOR len) AS top_level, + request + FROM ( + SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len, + request + FROM logs2 + WHERE status_code >= 200 + AND status_code < 300 + AND log_time >= TIMESTAMP '2012-12-01 00:00:00' + ) AS r + WHERE len > 0 +) AS s +WHERE top_level IN ('/about','/courses','/degrees','/events', + '/grad','/industry','/news','/people', + '/publications','/research','/teaching','/ugrad') +GROUP BY top_level +ORDER BY top_level; + + +-- Q2.4: During the last 3 months, which clients have made an excessive number of requests? 
+ +SELECT client_ip, + COUNT(*) AS num_requests +FROM logs2 +WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00' +GROUP BY client_ip +HAVING COUNT(*) >= 100000 +ORDER BY num_requests DESC; + + +-- Q2.5: What are the daily unique visitors? + +SELECT dt, + COUNT(DISTINCT client_ip) +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + client_ip + FROM logs2 +) AS r +GROUP BY dt +ORDER BY dt; + + +-- Q2.6: What are the average and maximum data transfer rates (Gbps)? + +SELECT AVG(transfer) / 125000000.0 AS transfer_avg, + MAX(transfer) / 125000000.0 AS transfer_max +FROM ( + SELECT log_time, + SUM(object_size) AS transfer + FROM logs2 + GROUP BY log_time +) AS r; + + +-- Q3.1: Did the indoor temperature reach freezing over the weekend? + +SELECT * +FROM logs3 +WHERE event_type = 'temperature' + AND event_value <= 32.0 + AND log_time >= '2019-11-29 17:00:00.000'; + + +-- Q3.4: Over the past 6 months, how frequently were each door opened? + +SELECT device_name, + device_floor, + COUNT(*) AS ct +FROM logs3 +WHERE event_type = 'door_open' + AND log_time >= '2019-06-01 00:00:00.000' +GROUP BY device_name, + device_floor +ORDER BY ct DESC; + + +-- Q3.5: Where in the building do large temperature variations occur in winter and summer? + +WITH temperature AS ( + SELECT dt, + device_name, + device_type, + device_floor + FROM ( + SELECT dt, + hr, + device_name, + device_type, + device_floor, + AVG(event_value) AS temperature_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + device_name, + device_type, + device_floor, + event_value + FROM logs3 + WHERE event_type = 'temperature' + ) AS r + GROUP BY dt, + hr, + device_name, + device_type, + device_floor + ) AS s + GROUP BY dt, + device_name, + device_type, + device_floor + HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0 +) +SELECT DISTINCT device_name, + device_type, + device_floor, + 'WINTER' +FROM temperature +WHERE dt >= DATE '2018-12-01' + AND dt < DATE '2019-03-01' +UNION +SELECT DISTINCT device_name, + device_type, + device_floor, + 'SUMMER' +FROM temperature +WHERE dt >= DATE '2019-06-01' + AND dt < DATE '2019-09-01'; + + +-- Q3.6: For each device category, what are the monthly power consumption metrics? 
+ +SELECT yr, + mo, + SUM(coffee_hourly_avg) AS coffee_monthly_sum, + AVG(coffee_hourly_avg) AS coffee_monthly_avg, + SUM(printer_hourly_avg) AS printer_monthly_sum, + AVG(printer_hourly_avg) AS printer_monthly_avg, + SUM(projector_hourly_avg) AS projector_monthly_sum, + AVG(projector_hourly_avg) AS projector_monthly_avg, + SUM(vending_hourly_avg) AS vending_monthly_sum, + AVG(vending_hourly_avg) AS vending_monthly_avg +FROM ( + SELECT dt, + yr, + mo, + hr, + AVG(coffee) AS coffee_hourly_avg, + AVG(printer) AS printer_hourly_avg, + AVG(projector) AS projector_hourly_avg, + AVG(vending) AS vending_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(YEAR FROM log_time) AS yr, + EXTRACT(MONTH FROM log_time) AS mo, + EXTRACT(HOUR FROM log_time) AS hr, + CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee, + CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer, + CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector, + CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending + FROM logs3 + WHERE device_type = 'meter' + ) AS r + GROUP BY dt, + yr, + mo, + hr +) AS s +GROUP BY yr, + mo +ORDER BY yr, + mo; +``` + +The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.com/play?user=play), [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/brown-benchmark/) diff --git a/docs/en/reference/getting-started/example-datasets/cell-towers.md b/docs/en/reference/getting-started/example-datasets/cell-towers.md new file mode 100644 index 00000000000..6c3201ff2b2 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/cell-towers.md @@ -0,0 +1,131 @@ +--- +sidebar_label: Cell Towers +--- + +# Cell Towers + +This dataset is from [OpenCellid](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. + +As of 2021, it contains more than 40 million records about cell towers (GSM, LTE, UMTS, etc.) around the world with their geographical coordinates and metadata (country code, network, etc). + +OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License, and we redistribute a snapshot of this dataset under the terms of the same license. The up-to-date version of the dataset is available to download after sign in. + + +## Get the Dataset {#get-the-dataset} + +1. Download the snapshot of the dataset from February 2021: [https://datasets.clickhouse.com/cell_towers.csv.xz] (729 MB). + +2. 
Validate the integrity (optional step): +``` +md5sum cell_towers.csv.xz +8cf986f4a0d9f12c6f384a0e9192c908 cell_towers.csv.xz +``` + +3. Decompress it with the following command: +``` +xz -d cell_towers.csv.xz +``` + +4. Create a table: + +``` +CREATE TABLE cell_towers +( + radio Enum8('' = 0, 'CDMA' = 1, 'GSM' = 2, 'LTE' = 3, 'NR' = 4, 'UMTS' = 5), + mcc UInt16, + net UInt16, + area UInt16, + cell UInt64, + unit Int16, + lon Float64, + lat Float64, + range UInt32, + samples UInt32, + changeable UInt8, + created DateTime, + updated DateTime, + averageSignal UInt8 +) +ENGINE = MergeTree ORDER BY (radio, mcc, net, created); +``` + +5. Insert the dataset: +``` +clickhouse-client --query "INSERT INTO cell_towers FORMAT CSVWithNames" < cell_towers.csv +``` + +## Examples {#examples} + +1. A number of cell towers by type: + +``` +SELECT radio, count() AS c FROM cell_towers GROUP BY radio ORDER BY c DESC + +┌─radio─┬────────c─┐ +│ UMTS │ 20686487 │ +│ LTE │ 12101148 │ +│ GSM │ 9931312 │ +│ CDMA │ 556344 │ +│ NR │ 867 │ +└───────┴──────────┘ + +5 rows in set. Elapsed: 0.011 sec. Processed 43.28 million rows, 43.28 MB (3.83 billion rows/s., 3.83 GB/s.) +``` + +2. Cell towers by [mobile country code (MCC)](https://en.wikipedia.org/wiki/Mobile_country_code): + +``` +SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 + +┌─mcc─┬─count()─┐ +│ 310 │ 5024650 │ +│ 262 │ 2622423 │ +│ 250 │ 1953176 │ +│ 208 │ 1891187 │ +│ 724 │ 1836150 │ +│ 404 │ 1729151 │ +│ 234 │ 1618924 │ +│ 510 │ 1353998 │ +│ 440 │ 1343355 │ +│ 311 │ 1332798 │ +└─────┴─────────┘ + +10 rows in set. Elapsed: 0.019 sec. Processed 43.28 million rows, 86.55 MB (2.33 billion rows/s., 4.65 GB/s.) +``` + +So, the top countries are: the USA, Germany, and Russia. + +You may want to create an [External Dictionary](../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. + + +## Use case {#use-case} + +Using `pointInPolygon` function. + +1. Create a table where we will store polygons: + +``` +CREATE TEMPORARY TABLE moscow (polygon Array(Tuple(Float64, Float64))); +``` + +2. 
This is a rough shape of Moscow (without "new Moscow"): + +``` +INSERT INTO moscow VALUES ([(37.84172564285271, 55.78000432402266), (37.8381207618713, 55.775874525970494), (37.83979446823122, 55.775626746008065), (37.84243326983639, 55.77446586811748), (37.84262672750849, 55.771974101091104), (37.84153238623039, 55.77114545193181), (37.841124690460184, 55.76722010265554), (37.84239076983644, 55.76654891107098), (37.842283558197025, 55.76258709833121), (37.8421759312134, 55.758073999993734), (37.84198330422974, 55.75381499999371), (37.8416827275085, 55.749277102484484), (37.84157576190186, 55.74794544108413), (37.83897929098507, 55.74525257875241), (37.83739676451868, 55.74404373042019), (37.838732481460525, 55.74298009816793), (37.841183997352545, 55.743060321833575), (37.84097476190185, 55.73938799999373), (37.84048155819702, 55.73570799999372), (37.840095812164286, 55.73228210777237), (37.83983814285274, 55.73080491981639), (37.83846476321406, 55.729799917464675), (37.83835745269769, 55.72919751082619), (37.838636380279524, 55.72859509486539), (37.8395161005249, 55.727705075632784), (37.83897964285276, 55.722727886185154), (37.83862557539366, 55.72034817326636), (37.83559735744853, 55.71944437307499), (37.835370708803126, 55.71831419154461), (37.83738169402022, 55.71765218986692), (37.83823396494291, 55.71691750159089), (37.838056931213345, 55.71547311301385), (37.836812846557606, 55.71221445615604), (37.83522525396725, 55.709331054395555), (37.83269301586908, 55.70953687463627), (37.829667367706236, 55.70903403789297), (37.83311126588435, 55.70552351822608), (37.83058993121339, 55.70041317726053), (37.82983872750851, 55.69883771404813), (37.82934501586913, 55.69718947487017), (37.828926414016685, 55.69504441658371), (37.82876530422971, 55.69287499999378), (37.82894754100031, 55.690759754047335), (37.827697554878185, 55.68951421135665), (37.82447346292115, 55.68965045405069), (37.83136543914793, 55.68322046195302), (37.833554015869154, 55.67814012759211), (37.83544184655761, 55.67295011628339), (37.837480388885474, 55.6672498719639), (37.838960677246064, 55.66316274139358), (37.83926093121332, 55.66046999999383), (37.839025050262435, 55.65869897264431), (37.83670784390257, 55.65794084879904), (37.835656529083245, 55.65694309303843), (37.83704060449217, 55.65689306460552), (37.83696819873806, 55.65550363526252), (37.83760389616388, 55.65487847246661), (37.83687972750851, 55.65356745541324), (37.83515216004943, 55.65155951234079), (37.83312418518067, 55.64979413590619), (37.82801726983639, 55.64640836412121), (37.820614174591, 55.64164525405531), (37.818908190475426, 55.6421883258084), (37.81717543386075, 55.64112490388471), (37.81690987037274, 55.63916106913107), (37.815099354492155, 55.637925371757085), (37.808769150787356, 55.633798276884455), (37.80100123544311, 55.62873670012244), (37.79598013491824, 55.62554336109055), (37.78634567724606, 55.62033499605651), (37.78334147619623, 55.618768681480326), (37.77746201055901, 55.619855533402706), (37.77527329626457, 55.61909966711279), (37.77801986242668, 55.618770300976294), (37.778212973541216, 55.617257701952106), (37.77784818518065, 55.61574504433011), (37.77016867724609, 55.61148576294007), (37.760191219573976, 55.60599579539028), (37.75338926983641, 55.60227892751446), (37.746329965606634, 55.59920577639331), (37.73939925396728, 55.59631430313617), (37.73273665739439, 55.5935318803559), (37.7299954450912, 55.59350760316188), (37.7268679946899, 55.59469840523759), (37.72626726983634, 55.59229549697373), (37.7262673598022, 
55.59081598950582), (37.71897193121335, 55.5877595845419), (37.70871550793456, 55.58393177431724), (37.700497489410374, 55.580917323756644), (37.69204305026244, 55.57778089778455), (37.68544477378839, 55.57815154690915), (37.68391050793454, 55.57472945079756), (37.678803592590306, 55.57328235936491), (37.6743402539673, 55.57255251445782), (37.66813862698363, 55.57216388774464), (37.617927457672096, 55.57505691895805), (37.60443099999999, 55.5757737568051), (37.599683515869145, 55.57749105910326), (37.59754177842709, 55.57796291823627), (37.59625834786988, 55.57906686095235), (37.59501783265684, 55.57746616444403), (37.593090671936025, 55.57671634534502), (37.587018007904, 55.577944600233785), (37.578692203704804, 55.57982895000019), (37.57327546607398, 55.58116294118248), (37.57385012109279, 55.581550362779), (37.57399562266922, 55.5820107079112), (37.5735356072979, 55.58226289171689), (37.57290393054962, 55.582393529795155), (37.57037722355653, 55.581919415056234), (37.5592298306885, 55.584471614867844), (37.54189249206543, 55.58867650795186), (37.5297256269836, 55.59158133551745), (37.517837865081766, 55.59443656218868), (37.51200186508174, 55.59635625174229), (37.506808949737554, 55.59907823904434), (37.49820432275389, 55.6062944994944), (37.494406071441674, 55.60967103463367), (37.494760001358024, 55.61066689753365), (37.49397137107085, 55.61220931698269), (37.49016528606031, 55.613417718449064), (37.48773249206542, 55.61530616333343), (37.47921386508177, 55.622640129112334), (37.470652153442394, 55.62993723476164), (37.46273446298218, 55.6368075123157), (37.46350692265317, 55.64068225239439), (37.46050283203121, 55.640794546982576), (37.457627470916734, 55.64118904154646), (37.450718034393326, 55.64690488145138), (37.44239252645875, 55.65397824729769), (37.434587576721185, 55.66053543155961), (37.43582144975277, 55.661693766520735), (37.43576786245721, 55.662755031737014), (37.430982915344174, 55.664610641628116), (37.428547447097685, 55.66778515273695), (37.42945134592044, 55.668633314343566), (37.42859571562949, 55.66948145750025), (37.4262836402282, 55.670813882451405), (37.418709037048295, 55.6811141674414), (37.41922139651101, 55.68235377885389), (37.419218771842885, 55.68359335082235), (37.417196501327446, 55.684375235224735), (37.41607020370478, 55.68540557585352), (37.415640857147146, 55.68686637150793), (37.414632153442334, 55.68903015131686), (37.413344899475064, 55.690896881757396), (37.41171432275391, 55.69264232162232), (37.40948282275393, 55.69455101638112), (37.40703674603271, 55.69638690385348), (37.39607169577025, 55.70451821283731), (37.38952706878662, 55.70942491932811), (37.387778313491815, 55.71149057784176), (37.39049275399779, 55.71419814298992), (37.385557272491454, 55.7155489617061), (37.38388335714726, 55.71849856042102), (37.378368238098155, 55.7292763261685), (37.37763597123337, 55.730845879211614), (37.37890062088197, 55.73167906388319), (37.37750451918789, 55.734703664681774), (37.375610832015965, 55.734851959522246), (37.3723813571472, 55.74105626086403), (37.37014935714723, 55.746115620904355), (37.36944173016362, 55.750883999993725), (37.36975304365541, 55.76335905525834), (37.37244070571134, 55.76432079697595), (37.3724259757175, 55.76636979670426), (37.369922155757884, 55.76735417953104), (37.369892695770275, 55.76823419316575), (37.370214730163575, 55.782312184391266), (37.370493611114505, 55.78436801120489), (37.37120164550783, 55.78596427165359), (37.37284851456452, 55.7874378183096), (37.37608325135799, 55.7886695054807), (37.3764587460632, 
55.78947647305964), (37.37530000265506, 55.79146512926804), (37.38235915344241, 55.79899647809345), (37.384344043655396, 55.80113596939471), (37.38594269577028, 55.80322699999366), (37.38711208598329, 55.804919036911976), (37.3880239841309, 55.806610999993666), (37.38928977249147, 55.81001864976979), (37.39038389947512, 55.81348641242801), (37.39235781481933, 55.81983538336746), (37.393709457672124, 55.82417822811877), (37.394685720901464, 55.82792275755836), (37.39557615344238, 55.830447148154136), (37.39844478226658, 55.83167107969975), (37.40019761214057, 55.83151823557964), (37.400398790382326, 55.83264967594742), (37.39659544313046, 55.83322180909622), (37.39667059524539, 55.83402792148566), (37.39682089947515, 55.83638877400216), (37.39643489154053, 55.83861656112751), (37.3955338994751, 55.84072348043264), (37.392680272491454, 55.84502158126453), (37.39241188227847, 55.84659117913199), (37.392529730163616, 55.84816071336481), (37.39486835714723, 55.85288092980303), (37.39873052645878, 55.859893456073635), (37.40272161111449, 55.86441833633205), (37.40697072750854, 55.867579567544375), (37.410007082016016, 55.868369880337), (37.4120992989502, 55.86920843741314), (37.412668021163924, 55.87055369615854), (37.41482461111453, 55.87170587948249), (37.41862266137694, 55.873183961039565), (37.42413732540892, 55.874879126654704), (37.4312182698669, 55.875614937236705), (37.43111093783558, 55.8762723478417), (37.43332105622856, 55.87706546369396), (37.43385747619623, 55.87790681284802), (37.441303050262405, 55.88027084462084), (37.44747234260555, 55.87942070143253), (37.44716141796871, 55.88072960917233), (37.44769797085568, 55.88121221323979), (37.45204320500181, 55.882080694420715), (37.45673176190186, 55.882346110794586), (37.463383999999984, 55.88252729504517), (37.46682797486874, 55.88294937719063), (37.470014457672086, 55.88361266759345), (37.47751410450743, 55.88546991372396), (37.47860317658232, 55.88534929207307), (37.48165826025772, 55.882563306475106), (37.48316434442331, 55.8815803226785), (37.483831555817645, 55.882427612793315), (37.483182967125686, 55.88372791409729), (37.483092277908824, 55.88495581062434), (37.4855716508179, 55.8875561994203), (37.486440636245746, 55.887827444039566), (37.49014203439328, 55.88897899871799), (37.493210285705544, 55.890208937135604), (37.497512451065035, 55.891342397444696), (37.49780744510645, 55.89174030252967), (37.49940333499519, 55.89239745507079), (37.50018383334346, 55.89339220941865), (37.52421672750851, 55.903869074155224), (37.52977457672118, 55.90564076517974), (37.53503220370484, 55.90661661218259), (37.54042858064267, 55.90714113744566), (37.54320461007303, 55.905645048442985), (37.545686966066306, 55.906608607018505), (37.54743976120755, 55.90788552162358), (37.55796999999999, 55.90901557907218), (37.572711542327866, 55.91059395704873), (37.57942799999998, 55.91073854155573), (37.58502865872187, 55.91009969268444), (37.58739968913264, 55.90794809960554), (37.59131567193598, 55.908713267595054), (37.612687423278814, 55.902866854295375), (37.62348079629517, 55.90041967242986), (37.635797880950896, 55.898141151686396), (37.649487626983664, 55.89639275532968), (37.65619302513125, 55.89572360207488), (37.66294133862307, 55.895295577183965), (37.66874564418033, 55.89505457604897), (37.67375601586915, 55.89254677027454), (37.67744661901856, 55.8947775867987), (37.688347, 55.89450045676125), (37.69480554232789, 55.89422926332761), (37.70107096560668, 55.89322256101114), (37.705962965606716, 55.891763491662616), (37.711885134918205, 
55.889110234998974), (37.71682005026245, 55.886577568759876), (37.7199315476074, 55.88458159806678), (37.72234560316464, 55.882281005794134), (37.72364385977171, 55.8809452036196), (37.725371142837474, 55.8809722706006), (37.727870902099546, 55.88037213862385), (37.73394330422971, 55.877941504088696), (37.745339592590376, 55.87208120378722), (37.75525267724611, 55.86703807949492), (37.76919976190188, 55.859821640197474), (37.827835219574, 55.82962968399116), (37.83341438888553, 55.82575289922351), (37.83652584655761, 55.82188784027888), (37.83809213491821, 55.81612575504693), (37.83605359521481, 55.81460347077685), (37.83632178569025, 55.81276696067908), (37.838623105812026, 55.811486181656385), (37.83912198147584, 55.807329380532785), (37.839079078033414, 55.80510270463816), (37.83965844708251, 55.79940712529036), (37.840581150787344, 55.79131399999368), (37.84172564285271, 55.78000432402266)]); +``` + +3. Check how many cell towers are in Moscow: + +``` +SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM moscow)) + +┌─count()─┐ +│ 310463 │ +└─────────┘ + +1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.) +``` + +The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.com/play?user=play), [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). + +Although you cannot create temporary tables there. \ No newline at end of file diff --git a/docs/en/reference/getting-started/example-datasets/criteo.md b/docs/en/reference/getting-started/example-datasets/criteo.md new file mode 100644 index 00000000000..2d1c700d15c --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/criteo.md @@ -0,0 +1,78 @@ +--- +sidebar_label: Terabyte Click Logs from Criteo +--- + +# Terabyte of Click Logs from Criteo + +Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/ + +Create a table to import the log to: + +``` sql +CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log +``` + +Download the data: + +``` bash +$ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done +``` + +Create a table for the converted data: + +``` sql +CREATE TABLE criteo +( + date Date, + clicked UInt8, + int1 Int32, + int2 Int32, + int3 Int32, + int4 Int32, + int5 Int32, + int6 Int32, + int7 Int32, + int8 Int32, + int9 Int32, + int10 Int32, + int11 Int32, + int12 Int32, + int13 Int32, + icat1 UInt32, + icat2 UInt32, + icat3 UInt32, + icat4 UInt32, + icat5 UInt32, + icat6 UInt32, + icat7 UInt32, + icat8 UInt32, + icat9 UInt32, + icat10 UInt32, + icat11 UInt32, + icat12 UInt32, + icat13 UInt32, + icat14 UInt32, + icat15 UInt32, + icat16 UInt32, + icat17 UInt32, + icat18 UInt32, + icat19 UInt32, + 
icat20 UInt32, + icat21 UInt32, + icat22 UInt32, + icat23 UInt32, + icat24 UInt32, + icat25 UInt32, + icat26 UInt32 +) ENGINE = MergeTree(date, intHash32(icat1), (date, intHash32(icat1)), 8192) +``` + +Transform data from the raw log and put it in the second table: + +``` sql +INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; + +DROP TABLE criteo_log; +``` + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/criteo/) diff --git a/docs/en/reference/getting-started/example-datasets/github-events.md b/docs/en/reference/getting-started/example-datasets/github-events.md new file mode 100644 index 00000000000..3a0cbc3324d --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/github-events.md @@ -0,0 +1,10 @@ +--- +sidebar_label: GitHub Events +--- + +# GitHub Events Dataset + +Dataset contains all events on GitHub from 2011 to Dec 6 2020, the size is 3.1 billion records. Download size is 75 GB and it will require up to 200 GB space on disk if stored in a table with lz4 compression. + +Full dataset description, insights, download instruction and interactive queries are posted [here](https://ghe.clickhouse.tech/). + diff --git a/docs/en/reference/getting-started/example-datasets/menus.md b/docs/en/reference/getting-started/example-datasets/menus.md new file mode 100644 index 00000000000..c41195223a2 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/menus.md @@ -0,0 +1,354 @@ +--- +sidebar_label: New York Public Library "What's on the Menu?" Dataset +--- + +# New York Public Library "What's on the Menu?" Dataset + +The dataset is created by the New York Public Library. It contains historical data on the menus of hotels, restaurants and cafes with the dishes along with their prices. + +Source: http://menus.nypl.org/data +The data is in public domain. + +The data is from library's archive and it may be incomplete and difficult for statistical analysis. Nevertheless it is also very yummy. +The size is just 1.3 million records about dishes in the menus — it's a very small data volume for ClickHouse, but it's still a good example. 
+ +## Download the Dataset {#download-dataset} + +Run the command: + +```bash +wget https://s3.amazonaws.com/menusdata.nypl.org/gzips/2021_08_01_07_01_17_data.tgz +``` + +If needed, replace this link with an up-to-date one from http://menus.nypl.org/data. +The download size is about 35 MB. + +## Unpack the Dataset {#unpack-dataset} + +```bash +tar xvf 2021_08_01_07_01_17_data.tgz +``` + +The uncompressed size is about 150 MB. + +The data is normalized and consists of four tables: +- `Menu` — Information about menus: the name of the restaurant, the date when the menu was seen, etc. +- `Dish` — Information about dishes: the name of the dish along with some of its characteristics. +- `MenuPage` — Information about the pages in the menus, because every page belongs to some menu. +- `MenuItem` — An item of the menu: a dish along with its price on some menu page, with links to the dish and the menu page. + +## Create the Tables {#create-tables} + +We use the [Decimal](../sql-reference/data-types/decimal.md) data type to store prices. + +```sql +CREATE TABLE dish +( + id UInt32, + name String, + description String, + menus_appeared UInt32, + times_appeared Int32, + first_appeared UInt16, + last_appeared UInt16, + lowest_price Decimal64(3), + highest_price Decimal64(3) +) ENGINE = MergeTree ORDER BY id; + +CREATE TABLE menu +( + id UInt32, + name String, + sponsor String, + event String, + venue String, + place String, + physical_description String, + occasion String, + notes String, + call_number String, + keywords String, + language String, + date String, + location String, + location_type String, + currency String, + currency_symbol String, + status String, + page_count UInt16, + dish_count UInt16 +) ENGINE = MergeTree ORDER BY id; + +CREATE TABLE menu_page +( + id UInt32, + menu_id UInt32, + page_number UInt16, + image_id String, + full_height UInt16, + full_width UInt16, + uuid UUID +) ENGINE = MergeTree ORDER BY id; + +CREATE TABLE menu_item +( + id UInt32, + menu_page_id UInt32, + price Decimal64(3), + high_price Decimal64(3), + dish_id UInt32, + created_at DateTime, + updated_at DateTime, + xpos Float64, + ypos Float64 +) ENGINE = MergeTree ORDER BY id; +``` + +## Import the Data {#import-data} + +To upload the data into ClickHouse, run: + +```bash +clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --query "INSERT INTO dish FORMAT CSVWithNames" < Dish.csv +clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --query "INSERT INTO menu FORMAT CSVWithNames" < Menu.csv +clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --query "INSERT INTO menu_page FORMAT CSVWithNames" < MenuPage.csv +clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --date_time_input_format best_effort --query "INSERT INTO menu_item FORMAT CSVWithNames" < MenuItem.csv +``` + +We use the [CSVWithNames](../interfaces/formats.md#csvwithnames) format because the data is CSV with a header row. + +We disable `format_csv_allow_single_quotes` because only double quotes are used to enclose data fields, while single quotes may appear inside values and should not confuse the CSV parser. + +We disable [input_format_null_as_default](../operations/settings/settings.md#settings-input-format-null-as-default) because our data does not contain [NULL](../sql-reference/syntax.md#null-literal). Otherwise, ClickHouse would try to parse `\N` sequences and could be confused by `\` characters in the data.
+ +The setting [date_time_input_format best_effort](../operations/settings/settings.md#settings-date_time_input_format) allows to parse [DateTime](../sql-reference/data-types/datetime.md) fields in wide variety of formats. For example, ISO-8601 without seconds like '2000-01-01 01:02' will be recognized. Without this setting only fixed DateTime format is allowed. + +## Denormalize the Data {#denormalize-data} + +Data is presented in multiple tables in [normalized form](https://en.wikipedia.org/wiki/Database_normalization#Normal_forms). It means you have to perform [JOIN](../sql-reference/statements/select/join.md#select-join) if you want to query, e.g. dish names from menu items. +For typical analytical tasks it is way more efficient to deal with pre-JOINed data to avoid doing `JOIN` every time. It is called "denormalized" data. + +We will create a table `menu_item_denorm` where will contain all the data JOINed together: + +```sql +CREATE TABLE menu_item_denorm +ENGINE = MergeTree ORDER BY (dish_name, created_at) +AS SELECT + price, + high_price, + created_at, + updated_at, + xpos, + ypos, + dish.id AS dish_id, + dish.name AS dish_name, + dish.description AS dish_description, + dish.menus_appeared AS dish_menus_appeared, + dish.times_appeared AS dish_times_appeared, + dish.first_appeared AS dish_first_appeared, + dish.last_appeared AS dish_last_appeared, + dish.lowest_price AS dish_lowest_price, + dish.highest_price AS dish_highest_price, + menu.id AS menu_id, + menu.name AS menu_name, + menu.sponsor AS menu_sponsor, + menu.event AS menu_event, + menu.venue AS menu_venue, + menu.place AS menu_place, + menu.physical_description AS menu_physical_description, + menu.occasion AS menu_occasion, + menu.notes AS menu_notes, + menu.call_number AS menu_call_number, + menu.keywords AS menu_keywords, + menu.language AS menu_language, + menu.date AS menu_date, + menu.location AS menu_location, + menu.location_type AS menu_location_type, + menu.currency AS menu_currency, + menu.currency_symbol AS menu_currency_symbol, + menu.status AS menu_status, + menu.page_count AS menu_page_count, + menu.dish_count AS menu_dish_count +FROM menu_item + JOIN dish ON menu_item.dish_id = dish.id + JOIN menu_page ON menu_item.menu_page_id = menu_page.id + JOIN menu ON menu_page.menu_id = menu.id; +``` + +## Validate the Data {#validate-data} + +Query: + +```sql +SELECT count() FROM menu_item_denorm; +``` + +Result: + +```text +┌─count()─┐ +│ 1329175 │ +└─────────┘ +``` + +## Run Some Queries {#run-queries} + +### Averaged historical prices of dishes {#query-averaged-historical-prices} + +Query: + +```sql +SELECT + round(toUInt32OrZero(extract(menu_date, '^\\d{4}')), -1) AS d, + count(), + round(avg(price), 2), + bar(avg(price), 0, 100, 100) +FROM menu_item_denorm +WHERE (menu_currency = 'Dollars') AND (d > 0) AND (d < 2022) +GROUP BY d +ORDER BY d ASC; +``` + +Result: + +```text +┌────d─┬─count()─┬─round(avg(price), 2)─┬─bar(avg(price), 0, 100, 100)─┐ +│ 1850 │ 618 │ 1.5 │ █▍ │ +│ 1860 │ 1634 │ 1.29 │ █▎ │ +│ 1870 │ 2215 │ 1.36 │ █▎ │ +│ 1880 │ 3909 │ 1.01 │ █ │ +│ 1890 │ 8837 │ 1.4 │ █▍ │ +│ 1900 │ 176292 │ 0.68 │ ▋ │ +│ 1910 │ 212196 │ 0.88 │ ▊ │ +│ 1920 │ 179590 │ 0.74 │ ▋ │ +│ 1930 │ 73707 │ 0.6 │ ▌ │ +│ 1940 │ 58795 │ 0.57 │ ▌ │ +│ 1950 │ 41407 │ 0.95 │ ▊ │ +│ 1960 │ 51179 │ 1.32 │ █▎ │ +│ 1970 │ 12914 │ 1.86 │ █▋ │ +│ 1980 │ 7268 │ 4.35 │ ████▎ │ +│ 1990 │ 11055 │ 6.03 │ ██████ │ +│ 2000 │ 2467 │ 11.85 │ ███████████▋ │ +│ 2010 │ 597 │ 25.66 │ █████████████████████████▋ │ 
+└──────┴─────────┴──────────────────────┴──────────────────────────────┘ +``` + +Take it with a grain of salt. + +### Burger Prices {#query-burger-prices} + +Query: + +```sql +SELECT + round(toUInt32OrZero(extract(menu_date, '^\\d{4}')), -1) AS d, + count(), + round(avg(price), 2), + bar(avg(price), 0, 50, 100) +FROM menu_item_denorm +WHERE (menu_currency = 'Dollars') AND (d > 0) AND (d < 2022) AND (dish_name ILIKE '%burger%') +GROUP BY d +ORDER BY d ASC; +``` + +Result: + +```text +┌────d─┬─count()─┬─round(avg(price), 2)─┬─bar(avg(price), 0, 50, 100)───────────┐ +│ 1880 │ 2 │ 0.42 │ ▋ │ +│ 1890 │ 7 │ 0.85 │ █▋ │ +│ 1900 │ 399 │ 0.49 │ ▊ │ +│ 1910 │ 589 │ 0.68 │ █▎ │ +│ 1920 │ 280 │ 0.56 │ █ │ +│ 1930 │ 74 │ 0.42 │ ▋ │ +│ 1940 │ 119 │ 0.59 │ █▏ │ +│ 1950 │ 134 │ 1.09 │ ██▏ │ +│ 1960 │ 272 │ 0.92 │ █▋ │ +│ 1970 │ 108 │ 1.18 │ ██▎ │ +│ 1980 │ 88 │ 2.82 │ █████▋ │ +│ 1990 │ 184 │ 3.68 │ ███████▎ │ +│ 2000 │ 21 │ 7.14 │ ██████████████▎ │ +│ 2010 │ 6 │ 18.42 │ ████████████████████████████████████▋ │ +└──────┴─────────┴──────────────────────┴───────────────────────────────────────┘ +``` + +### Vodka {#query-vodka} + +Query: + +```sql +SELECT + round(toUInt32OrZero(extract(menu_date, '^\\d{4}')), -1) AS d, + count(), + round(avg(price), 2), + bar(avg(price), 0, 50, 100) +FROM menu_item_denorm +WHERE (menu_currency IN ('Dollars', '')) AND (d > 0) AND (d < 2022) AND (dish_name ILIKE '%vodka%') +GROUP BY d +ORDER BY d ASC; +``` + +Result: + +```text +┌────d─┬─count()─┬─round(avg(price), 2)─┬─bar(avg(price), 0, 50, 100)─┐ +│ 1910 │ 2 │ 0 │ │ +│ 1920 │ 1 │ 0.3 │ ▌ │ +│ 1940 │ 21 │ 0.42 │ ▋ │ +│ 1950 │ 14 │ 0.59 │ █▏ │ +│ 1960 │ 113 │ 2.17 │ ████▎ │ +│ 1970 │ 37 │ 0.68 │ █▎ │ +│ 1980 │ 19 │ 2.55 │ █████ │ +│ 1990 │ 86 │ 3.6 │ ███████▏ │ +│ 2000 │ 2 │ 3.98 │ ███████▊ │ +└──────┴─────────┴──────────────────────┴─────────────────────────────┘ +``` + +To get vodka we have to write `ILIKE '%vodka%'` and this definitely makes a statement. + +### Caviar {#query-caviar} + +Let's print caviar prices. Also let's print a name of any dish with caviar. 
+ +Query: + +```sql +SELECT + round(toUInt32OrZero(extract(menu_date, '^\\d{4}')), -1) AS d, + count(), + round(avg(price), 2), + bar(avg(price), 0, 50, 100), + any(dish_name) +FROM menu_item_denorm +WHERE (menu_currency IN ('Dollars', '')) AND (d > 0) AND (d < 2022) AND (dish_name ILIKE '%caviar%') +GROUP BY d +ORDER BY d ASC; +``` + +Result: + +```text +┌────d─┬─count()─┬─round(avg(price), 2)─┬─bar(avg(price), 0, 50, 100)──────┬─any(dish_name)──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ 1090 │ 1 │ 0 │ │ Caviar │ +│ 1880 │ 3 │ 0 │ │ Caviar │ +│ 1890 │ 39 │ 0.59 │ █▏ │ Butter and caviar │ +│ 1900 │ 1014 │ 0.34 │ ▋ │ Anchovy Caviar on Toast │ +│ 1910 │ 1588 │ 1.35 │ ██▋ │ 1/1 Brötchen Caviar │ +│ 1920 │ 927 │ 1.37 │ ██▋ │ ASTRAKAN CAVIAR │ +│ 1930 │ 289 │ 1.91 │ ███▋ │ Astrachan caviar │ +│ 1940 │ 201 │ 0.83 │ █▋ │ (SPECIAL) Domestic Caviar Sandwich │ +│ 1950 │ 81 │ 2.27 │ ████▌ │ Beluga Caviar │ +│ 1960 │ 126 │ 2.21 │ ████▍ │ Beluga Caviar │ +│ 1970 │ 105 │ 0.95 │ █▊ │ BELUGA MALOSSOL CAVIAR AMERICAN DRESSING │ +│ 1980 │ 12 │ 7.22 │ ██████████████▍ │ Authentic Iranian Beluga Caviar the world's finest black caviar presented in ice garni and a sampling of chilled 100° Russian vodka │ +│ 1990 │ 74 │ 14.42 │ ████████████████████████████▋ │ Avocado Salad, Fresh cut avocado with caviare │ +│ 2000 │ 3 │ 7.82 │ ███████████████▋ │ Aufgeschlagenes Kartoffelsueppchen mit Forellencaviar │ +│ 2010 │ 6 │ 15.58 │ ███████████████████████████████▏ │ "OYSTERS AND PEARLS" "Sabayon" of Pearl Tapioca with Island Creek Oysters and Russian Sevruga Caviar │ +└──────┴─────────┴──────────────────────┴──────────────────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +At least they have caviar with vodka. Very nice. + +## Online Playground {#playground} + +The data is uploaded to ClickHouse Playground, [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==). diff --git a/docs/en/reference/getting-started/example-datasets/metrica.md b/docs/en/reference/getting-started/example-datasets/metrica.md new file mode 100644 index 00000000000..c5ef74750a6 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/metrica.md @@ -0,0 +1,78 @@ +--- +sidebar_label: Web Analytics Data +description: Dataset consists of two tables containing anonymized web analytics data with hits and visits +--- + +# Anonymized Web Analytics Data + +Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`). + +The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz and as prepared partitions at https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz. 
+ +## Obtaining Tables from Prepared Partitions {#obtaining-tables-from-prepared-partitions} + +Download and import hits table: + +``` bash +curl -O https://datasets.clickhouse.com/hits/partitions/hits_v1.tar +tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory +# check permissions on unpacked data, fix if required +sudo service clickhouse-server restart +clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +``` + +Download and import visits: + +``` bash +curl -O https://datasets.clickhouse.com/visits/partitions/visits_v1.tar +tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory +# check permissions on unpacked data, fix if required +sudo service clickhouse-server restart +clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +``` + +## Obtaining Tables from Compressed TSV File {#obtaining-tables-from-compressed-tsv-file} + +Download and import hits from compressed TSV file: + +``` bash +curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +# Validate the checksum +md5sum hits_v1.tsv +# Checksum should be equal to: f3631b6295bf06989c1437491f7592cb +# now create table +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +# for hits_v1 +clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), 
ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" +# for hits_100m_obfuscated +clickhouse-client --query="CREATE TABLE hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" + +# import data +cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 +# optionally you can optimize table +clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" +clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +``` + +Download and import visits from compressed tsv-file: + +``` bash +curl 
https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +# Validate the checksum +md5sum visits_v1.tsv +# Checksum should be equal to: 6dafe1a0f24e59e3fc2d0fed85601de6 +# now create table +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme 
UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" +# import data +cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 +# optionally you can optimize table +clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" +clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +``` + +## Example Queries {#example-queries} + +[The ClickHouse tutorial](../../tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial. + +Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there). diff --git a/docs/en/reference/getting-started/example-datasets/nyc-taxi.md b/docs/en/reference/getting-started/example-datasets/nyc-taxi.md new file mode 100644 index 00000000000..270aeb4929c --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/nyc-taxi.md @@ -0,0 +1,392 @@ +--- +sidebar_label: New York Taxi Data +description: Data for billions of taxi and for-hire vehicle (Uber, Lyft, etc.) trips originating in New York City since 2009 +--- + +# New York Taxi Data + +This dataset can be obtained in two ways: + +- import from raw data +- download of prepared partitions + +## How to Import the Raw Data {#how-to-import-the-raw-data} + +See https://github.com/toddwschneider/nyc-taxi-data and http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html for the description of a dataset and instructions for downloading. + +Downloading will result in about 227 GB of uncompressed data in CSV files. The download takes about an hour over a 1 Gbit connection (parallel downloading from s3.amazonaws.com recovers at least half of a 1 Gbit channel). +Some of the files might not download fully. Check the file sizes and re-download any that seem doubtful. + +Some of the files might contain invalid rows. You can fix them as follows: + +``` bash +sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ +sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ +mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv +mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv +``` + +Then the data must be pre-processed in PostgreSQL. This will create selections of points in the polygons (to match points on the map with the boroughs of New York City) and combine all the data into a single denormalized flat table by using a JOIN. To do this, you will need to install PostgreSQL with PostGIS support. 
+ +Be careful when running `initialize_database.sh` and manually re-check that all the tables were created correctly. + +It takes about 20-30 minutes to process each month’s worth of data in PostgreSQL, for a total of about 48 hours. + +You can check the number of downloaded rows as follows: + +``` bash +$ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" +## Count + 1298979494 +(1 row) + +real 7m9.164s +``` + +(This is slightly more than 1.1 billion rows reported by Mark Litwintschik in a series of blog posts.) + +The data in PostgreSQL uses 370 GB of space. + +Exporting the data from PostgreSQL: + +``` sql +COPY +( + SELECT trips.id, + trips.vendor_id, + trips.pickup_datetime, + trips.dropoff_datetime, + trips.store_and_fwd_flag, + trips.rate_code_id, + trips.pickup_longitude, + trips.pickup_latitude, + trips.dropoff_longitude, + trips.dropoff_latitude, + trips.passenger_count, + trips.trip_distance, + trips.fare_amount, + trips.extra, + trips.mta_tax, + trips.tip_amount, + trips.tolls_amount, + trips.ehail_fee, + trips.improvement_surcharge, + trips.total_amount, + trips.payment_type, + trips.trip_type, + trips.pickup, + trips.dropoff, + + cab_types.type cab_type, + + weather.precipitation_tenths_of_mm rain, + weather.snow_depth_mm, + weather.snowfall_mm, + weather.max_temperature_tenths_degrees_celsius max_temp, + weather.min_temperature_tenths_degrees_celsius min_temp, + weather.average_wind_speed_tenths_of_meters_per_second wind, + + pick_up.gid pickup_nyct2010_gid, + pick_up.ctlabel pickup_ctlabel, + pick_up.borocode pickup_borocode, + pick_up.boroname pickup_boroname, + pick_up.ct2010 pickup_ct2010, + pick_up.boroct2010 pickup_boroct2010, + pick_up.cdeligibil pickup_cdeligibil, + pick_up.ntacode pickup_ntacode, + pick_up.ntaname pickup_ntaname, + pick_up.puma pickup_puma, + + drop_off.gid dropoff_nyct2010_gid, + drop_off.ctlabel dropoff_ctlabel, + drop_off.borocode dropoff_borocode, + drop_off.boroname dropoff_boroname, + drop_off.ct2010 dropoff_ct2010, + drop_off.boroct2010 dropoff_boroct2010, + drop_off.cdeligibil dropoff_cdeligibil, + drop_off.ntacode dropoff_ntacode, + drop_off.ntaname dropoff_ntaname, + drop_off.puma dropoff_puma + FROM trips + LEFT JOIN cab_types + ON trips.cab_type_id = cab_types.id + LEFT JOIN central_park_weather_observations_raw weather + ON weather.date = trips.pickup_datetime::date + LEFT JOIN nyct2010 pick_up + ON pick_up.gid = trips.pickup_nyct2010_gid + LEFT JOIN nyct2010 drop_off + ON drop_off.gid = trips.dropoff_nyct2010_gid +) TO '/opt/milovidov/nyc-taxi-data/trips.tsv'; +``` + +The data snapshot is created at a speed of about 50 MB per second. While creating the snapshot, PostgreSQL reads from the disk at a speed of about 28 MB per second. +This takes about 5 hours. The resulting TSV file is 590612904969 bytes. 
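+
+Optionally, you can sanity-check the export before loading it (the line count should be close to the 1.3 billion trips counted above; counting lines in a file of this size takes a while):
+
+``` bash
+ls -l /opt/milovidov/nyc-taxi-data/trips.tsv   # the size should match the figure above
+wc -l /opt/milovidov/nyc-taxi-data/trips.tsv   # one line per trip, roughly 1.3 billion
+```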
+ +Create a temporary table in ClickHouse: + +``` sql +CREATE TABLE trips +( +trip_id UInt32, +vendor_id String, +pickup_datetime DateTime, +dropoff_datetime Nullable(DateTime), +store_and_fwd_flag Nullable(FixedString(1)), +rate_code_id Nullable(UInt8), +pickup_longitude Nullable(Float64), +pickup_latitude Nullable(Float64), +dropoff_longitude Nullable(Float64), +dropoff_latitude Nullable(Float64), +passenger_count Nullable(UInt8), +trip_distance Nullable(Float64), +fare_amount Nullable(Float32), +extra Nullable(Float32), +mta_tax Nullable(Float32), +tip_amount Nullable(Float32), +tolls_amount Nullable(Float32), +ehail_fee Nullable(Float32), +improvement_surcharge Nullable(Float32), +total_amount Nullable(Float32), +payment_type Nullable(String), +trip_type Nullable(UInt8), +pickup Nullable(String), +dropoff Nullable(String), +cab_type Nullable(String), +precipitation Nullable(UInt8), +snow_depth Nullable(UInt8), +snowfall Nullable(UInt8), +max_temperature Nullable(UInt8), +min_temperature Nullable(UInt8), +average_wind_speed Nullable(UInt8), +pickup_nyct2010_gid Nullable(UInt8), +pickup_ctlabel Nullable(String), +pickup_borocode Nullable(UInt8), +pickup_boroname Nullable(String), +pickup_ct2010 Nullable(String), +pickup_boroct2010 Nullable(String), +pickup_cdeligibil Nullable(FixedString(1)), +pickup_ntacode Nullable(String), +pickup_ntaname Nullable(String), +pickup_puma Nullable(String), +dropoff_nyct2010_gid Nullable(UInt8), +dropoff_ctlabel Nullable(String), +dropoff_borocode Nullable(UInt8), +dropoff_boroname Nullable(String), +dropoff_ct2010 Nullable(String), +dropoff_boroct2010 Nullable(String), +dropoff_cdeligibil Nullable(String), +dropoff_ntacode Nullable(String), +dropoff_ntaname Nullable(String), +dropoff_puma Nullable(String) +) ENGINE = Log; +``` + +It is needed for converting fields to more correct data types and, if possible, to eliminate NULLs. + +``` bash +$ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv + +real 75m56.214s +``` + +Data is read at a speed of 112-140 Mb/second. +Loading data into a Log type table in one stream took 76 minutes. +The data in this table uses 142 GB. + +(Importing data directly from Postgres is also possible using `COPY ... TO PROGRAM`.) + +Unfortunately, all the fields associated with the weather (precipitation…average_wind_speed) were filled with NULL. Because of this, we will remove them from the final data set. + +To start, we’ll create a table on a single server. Later we will make the table distributed. 
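+
+Before building that table, you can optionally confirm that the weather columns really contain only NULLs (the column names are taken from the table definition above):
+
+``` sql
+SELECT
+    count() AS total_rows,
+    countIf(precipitation IS NOT NULL) AS with_precipitation,
+    countIf(average_wind_speed IS NOT NULL) AS with_wind_speed
+FROM trips;
+```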
+ +Create and populate a summary table: + +``` sql +CREATE TABLE trips_mergetree +ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) +AS SELECT + +trip_id, +CAST(vendor_id AS Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14)) AS vendor_id, +toDate(pickup_datetime) AS pickup_date, +ifNull(pickup_datetime, toDateTime(0)) AS pickup_datetime, +toDate(dropoff_datetime) AS dropoff_date, +ifNull(dropoff_datetime, toDateTime(0)) AS dropoff_datetime, +assumeNotNull(store_and_fwd_flag) IN ('Y', '1', '2') AS store_and_fwd_flag, +assumeNotNull(rate_code_id) AS rate_code_id, +assumeNotNull(pickup_longitude) AS pickup_longitude, +assumeNotNull(pickup_latitude) AS pickup_latitude, +assumeNotNull(dropoff_longitude) AS dropoff_longitude, +assumeNotNull(dropoff_latitude) AS dropoff_latitude, +assumeNotNull(passenger_count) AS passenger_count, +assumeNotNull(trip_distance) AS trip_distance, +assumeNotNull(fare_amount) AS fare_amount, +assumeNotNull(extra) AS extra, +assumeNotNull(mta_tax) AS mta_tax, +assumeNotNull(tip_amount) AS tip_amount, +assumeNotNull(tolls_amount) AS tolls_amount, +assumeNotNull(ehail_fee) AS ehail_fee, +assumeNotNull(improvement_surcharge) AS improvement_surcharge, +assumeNotNull(total_amount) AS total_amount, +CAST((assumeNotNull(payment_type) AS pt) IN ('CSH', 'CASH', 'Cash', 'CAS', 'Cas', '1') ? 'CSH' : (pt IN ('CRD', 'Credit', 'Cre', 'CRE', 'CREDIT', '2') ? 'CRE' : (pt IN ('NOC', 'No Charge', 'No', '3') ? 'NOC' : (pt IN ('DIS', 'Dispute', 'Dis', '4') ? 'DIS' : 'UNK'))) AS Enum8('CSH' = 1, 'CRE' = 2, 'UNK' = 0, 'NOC' = 3, 'DIS' = 4)) AS payment_type_, +assumeNotNull(trip_type) AS trip_type, +ifNull(toFixedString(unhex(pickup), 25), toFixedString('', 25)) AS pickup, +ifNull(toFixedString(unhex(dropoff), 25), toFixedString('', 25)) AS dropoff, +CAST(assumeNotNull(cab_type) AS Enum8('yellow' = 1, 'green' = 2, 'uber' = 3)) AS cab_type, + +assumeNotNull(pickup_nyct2010_gid) AS pickup_nyct2010_gid, +toFloat32(ifNull(pickup_ctlabel, '0')) AS pickup_ctlabel, +assumeNotNull(pickup_borocode) AS pickup_borocode, +CAST(assumeNotNull(pickup_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS pickup_boroname, +toFixedString(ifNull(pickup_ct2010, '000000'), 6) AS pickup_ct2010, +toFixedString(ifNull(pickup_boroct2010, '0000000'), 7) AS pickup_boroct2010, +CAST(assumeNotNull(ifNull(pickup_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS pickup_cdeligibil, +toFixedString(ifNull(pickup_ntacode, '0000'), 4) AS pickup_ntacode, + +CAST(assumeNotNull(pickup_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 
'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 
157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS pickup_ntaname, + +toUInt16(ifNull(pickup_puma, '0')) AS pickup_puma, + +assumeNotNull(dropoff_nyct2010_gid) AS dropoff_nyct2010_gid, +toFloat32(ifNull(dropoff_ctlabel, '0')) AS dropoff_ctlabel, +assumeNotNull(dropoff_borocode) AS dropoff_borocode, +CAST(assumeNotNull(dropoff_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS dropoff_boroname, +toFixedString(ifNull(dropoff_ct2010, '000000'), 6) AS dropoff_ct2010, +toFixedString(ifNull(dropoff_boroct2010, '0000000'), 7) AS dropoff_boroct2010, +CAST(assumeNotNull(ifNull(dropoff_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS dropoff_cdeligibil, +toFixedString(ifNull(dropoff_ntacode, '0000'), 4) AS dropoff_ntacode, + +CAST(assumeNotNull(dropoff_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 
'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS dropoff_ntaname, + +toUInt16(ifNull(dropoff_puma, '0')) AS dropoff_puma + +FROM trips +``` + +This takes 3030 seconds at a speed of about 428,000 rows per second. +To load it faster, you can create the table with the `Log` engine instead of `MergeTree`. In this case, the download works faster than 200 seconds. + +The table uses 126 GB of disk space. + +``` sql +SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active +``` + +``` text +┌─formatReadableSize(sum(bytes))─┐ +│ 126.18 GiB │ +└────────────────────────────────┘ +``` + +Among other things, you can run the OPTIMIZE query on MergeTree. But it’s not required since everything will be fine without it. + +## Download of Prepared Partitions {#download-of-prepared-partitions} + +``` bash +$ curl -O https://datasets.clickhouse.com/trips_mergetree/partitions/trips_mergetree.tar +$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.trips_mergetree" +``` + +:::info +If you will run the queries described below, you have to use the full table name, `datasets.trips_mergetree`. +::: + +## Results on Single Server {#results-on-single-server} + +Q1: + +``` sql +SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type +``` + +0.490 seconds. + +Q2: + +``` sql +SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count +``` + +1.224 seconds. + +Q3: + +``` sql +SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year +``` + +2.104 seconds. + +Q4: + +``` sql +SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) +FROM trips_mergetree +GROUP BY passenger_count, year, distance +ORDER BY year, count(*) DESC +``` + +3.593 seconds. + +The following server was used: + +Two Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz, 16 physical cores total, 128 GiB RAM, 8x6 TB HD on hardware RAID-5 + +Execution time is the best of three runs. But starting from the second run, queries read data from the file system cache. No further caching occurs: the data is read out and processed in each run. 
+ +Creating a table on three servers: + +On each server: + +``` sql +CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) +``` + +On the source server: + +``` sql +CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) +``` + +The following query redistributes data: + +``` sql +INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree +``` + +This takes 2454 seconds. + +On three servers: + +Q1: 0.212 seconds. +Q2: 0.438 seconds. +Q3: 0.733 seconds. +Q4: 1.241 seconds. + +No surprises here, since the queries are scaled linearly. + +We also have the results from a cluster of 140 servers: + +Q1: 0.028 sec. +Q2: 0.043 sec. +Q3: 0.051 sec. +Q4: 0.072 sec. + +In this case, the query processing time is determined above all by network latency. +We ran queries using a client located in a different datacenter than where the cluster was located, which added about 20 ms of latency. + +## Summary {#summary} + +| servers | Q1 | Q2 | Q3 | Q4 | +|---------|-------|-------|-------|-------| +| 1, E5-2650v2 | 0.490 | 1.224 | 2.104 | 3.593 | +| 3, E5-2650v2 | 0.212 | 0.438 | 0.733 | 1.241 | +| 1, AWS c5n.4xlarge | 0.249 | 1.279 | 1.738 | 3.527 | +| 1, AWS c5n.9xlarge | 0.130 | 0.584 | 0.777 | 1.811 | +| 3, AWS c5n.9xlarge | 0.057 | 0.231 | 0.285 | 0.641 | +| 140, E5-2650v2 | 0.028 | 0.043 | 0.051 | 0.072 | + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/en/reference/getting-started/example-datasets/ontime.md b/docs/en/reference/getting-started/example-datasets/ontime.md new file mode 100644 index 00000000000..bb3c3644972 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/ontime.md @@ -0,0 +1,408 @@ +--- +sidebar_label: OnTime Airline Flight Data +description: Dataset containing the on-time performance of airline flights +--- + +# OnTime + +This dataset can be obtained in two ways: + +- import from raw data +- download of prepared partitions + +## Import from Raw Data {#import-from-raw-data} + +Downloading data: + +``` bash +wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip +``` + +Creating a table: + +``` sql +CREATE TABLE `ontime` +( + `Year` UInt16, + `Quarter` UInt8, + `Month` UInt8, + `DayofMonth` UInt8, + `DayOfWeek` UInt8, + `FlightDate` Date, + `Reporting_Airline` String, + `DOT_ID_Reporting_Airline` Int32, + `IATA_CODE_Reporting_Airline` String, + `Tail_Number` String, + `Flight_Number_Reporting_Airline` String, + `OriginAirportID` Int32, + `OriginAirportSeqID` Int32, + `OriginCityMarketID` Int32, + `Origin` FixedString(5), + `OriginCityName` String, + `OriginState` FixedString(2), + `OriginStateFips` String, + `OriginStateName` String, + `OriginWac` Int32, + `DestAirportID` Int32, + `DestAirportSeqID` Int32, + `DestCityMarketID` Int32, + `Dest` FixedString(5), + `DestCityName` String, + `DestState` FixedString(2), + `DestStateFips` String, + `DestStateName` String, + `DestWac` Int32, + `CRSDepTime` Int32, + `DepTime` Int32, + 
`DepDelay` Int32, + `DepDelayMinutes` Int32, + `DepDel15` Int32, + `DepartureDelayGroups` String, + `DepTimeBlk` String, + `TaxiOut` Int32, + `WheelsOff` Int32, + `WheelsOn` Int32, + `TaxiIn` Int32, + `CRSArrTime` Int32, + `ArrTime` Int32, + `ArrDelay` Int32, + `ArrDelayMinutes` Int32, + `ArrDel15` Int32, + `ArrivalDelayGroups` Int32, + `ArrTimeBlk` String, + `Cancelled` UInt8, + `CancellationCode` FixedString(1), + `Diverted` UInt8, + `CRSElapsedTime` Int32, + `ActualElapsedTime` Int32, + `AirTime` Nullable(Int32), + `Flights` Int32, + `Distance` Int32, + `DistanceGroup` UInt8, + `CarrierDelay` Int32, + `WeatherDelay` Int32, + `NASDelay` Int32, + `SecurityDelay` Int32, + `LateAircraftDelay` Int32, + `FirstDepTime` String, + `TotalAddGTime` String, + `LongestAddGTime` String, + `DivAirportLandings` String, + `DivReachedDest` String, + `DivActualElapsedTime` String, + `DivArrDelay` String, + `DivDistance` String, + `Div1Airport` String, + `Div1AirportID` Int32, + `Div1AirportSeqID` Int32, + `Div1WheelsOn` String, + `Div1TotalGTime` String, + `Div1LongestGTime` String, + `Div1WheelsOff` String, + `Div1TailNum` String, + `Div2Airport` String, + `Div2AirportID` Int32, + `Div2AirportSeqID` Int32, + `Div2WheelsOn` String, + `Div2TotalGTime` String, + `Div2LongestGTime` String, + `Div2WheelsOff` String, + `Div2TailNum` String, + `Div3Airport` String, + `Div3AirportID` Int32, + `Div3AirportSeqID` Int32, + `Div3WheelsOn` String, + `Div3TotalGTime` String, + `Div3LongestGTime` String, + `Div3WheelsOff` String, + `Div3TailNum` String, + `Div4Airport` String, + `Div4AirportID` Int32, + `Div4AirportSeqID` Int32, + `Div4WheelsOn` String, + `Div4TotalGTime` String, + `Div4LongestGTime` String, + `Div4WheelsOff` String, + `Div4TailNum` String, + `Div5Airport` String, + `Div5AirportID` Int32, + `Div5AirportSeqID` Int32, + `Div5WheelsOn` String, + `Div5TotalGTime` String, + `Div5LongestGTime` String, + `Div5WheelsOff` String, + `Div5TailNum` String +) ENGINE = MergeTree + PARTITION BY Year + ORDER BY (IATA_CODE_Reporting_Airline, FlightDate) + SETTINGS index_granularity = 8192; +``` + +Loading data with multiple threads: + +``` bash +ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'" +``` + +(if you will have memory shortage or other issues on your server, remove the `-P $(nproc)` part) + +## Download of Prepared Partitions {#download-of-prepared-partitions} + +``` bash +$ curl -O https://datasets.clickhouse.com/ontime/partitions/ontime.tar +$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.ontime" +``` + +:::note +If you will run the queries described below, you have to use the full table name, `datasets.ontime`. +::: + +## Queries {#queries} + +Q0. + +``` sql +SELECT avg(c1) +FROM +( + SELECT Year, Month, count(*) AS c1 + FROM ontime + GROUP BY Year, Month +); +``` + +Q1. The number of flights per day from the year 2000 to 2008 + +``` sql +SELECT DayOfWeek, count(*) AS c +FROM ontime +WHERE Year>=2000 AND Year<=2008 +GROUP BY DayOfWeek +ORDER BY c DESC; +``` + +Q2. 
The number of flights delayed by more than 10 minutes, grouped by the day of the week, for 2000-2008 + +``` sql +SELECT DayOfWeek, count(*) AS c +FROM ontime +WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 +GROUP BY DayOfWeek +ORDER BY c DESC; +``` + +Q3. The number of delays by the airport for 2000-2008 + +``` sql +SELECT Origin, count(*) AS c +FROM ontime +WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 +GROUP BY Origin +ORDER BY c DESC +LIMIT 10; +``` + +Q4. The number of delays by carrier for 2007 + +``` sql +SELECT IATA_CODE_Reporting_Airline AS Carrier, count(*) +FROM ontime +WHERE DepDelay>10 AND Year=2007 +GROUP BY Carrier +ORDER BY count(*) DESC; +``` + +Q5. The percentage of delays by carrier for 2007 + +``` sql +SELECT Carrier, c, c2, c*100/c2 as c3 +FROM +( + SELECT + IATA_CODE_Reporting_Airline AS Carrier, + count(*) AS c + FROM ontime + WHERE DepDelay>10 + AND Year=2007 + GROUP BY Carrier +) q +JOIN +( + SELECT + IATA_CODE_Reporting_Airline AS Carrier, + count(*) AS c2 + FROM ontime + WHERE Year=2007 + GROUP BY Carrier +) qq USING Carrier +ORDER BY c3 DESC; +``` + +Better version of the same query: + +``` sql +SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3 +FROM ontime +WHERE Year=2007 +GROUP BY Carrier +ORDER BY c3 DESC +``` + +Q6. The previous request for a broader range of years, 2000-2008 + +``` sql +SELECT Carrier, c, c2, c*100/c2 as c3 +FROM +( + SELECT + IATA_CODE_Reporting_Airline AS Carrier, + count(*) AS c + FROM ontime + WHERE DepDelay>10 + AND Year>=2000 AND Year<=2008 + GROUP BY Carrier +) q +JOIN +( + SELECT + IATA_CODE_Reporting_Airline AS Carrier, + count(*) AS c2 + FROM ontime + WHERE Year>=2000 AND Year<=2008 + GROUP BY Carrier +) qq USING Carrier +ORDER BY c3 DESC; +``` + +Better version of the same query: + +``` sql +SELECT IATA_CODE_Reporting_Airline AS Carrier, avg(DepDelay>10)*100 AS c3 +FROM ontime +WHERE Year>=2000 AND Year<=2008 +GROUP BY Carrier +ORDER BY c3 DESC; +``` + +Q7. Percentage of flights delayed for more than 10 minutes, by year + +``` sql +SELECT Year, c1/c2 +FROM +( + select + Year, + count(*)*100 as c1 + from ontime + WHERE DepDelay>10 + GROUP BY Year +) q +JOIN +( + select + Year, + count(*) as c2 + from ontime + GROUP BY Year +) qq USING (Year) +ORDER BY Year; +``` + +Better version of the same query: + +``` sql +SELECT Year, avg(DepDelay>10)*100 +FROM ontime +GROUP BY Year +ORDER BY Year; +``` + +Q8. The most popular destinations by the number of directly connected cities for various year ranges + +``` sql +SELECT DestCityName, uniqExact(OriginCityName) AS u +FROM ontime +WHERE Year >= 2000 and Year <= 2010 +GROUP BY DestCityName +ORDER BY u DESC LIMIT 10; +``` + +Q9. + +``` sql +SELECT Year, count(*) AS c1 +FROM ontime +GROUP BY Year; +``` + +Q10. 
+ +``` sql +SELECT + min(Year), max(Year), IATA_CODE_Reporting_Airline AS Carrier, count(*) AS cnt, + sum(ArrDelayMinutes>30) AS flights_delayed, + round(sum(ArrDelayMinutes>30)/count(*),2) AS rate +FROM ontime +WHERE + DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI') + AND DestState NOT IN ('AK', 'HI', 'PR', 'VI') + AND FlightDate < '2010-01-01' +GROUP by Carrier +HAVING cnt>100000 and max(Year)>1990 +ORDER by rate DESC +LIMIT 1000; +``` + +Bonus: + +``` sql +SELECT avg(cnt) +FROM +( + SELECT Year,Month,count(*) AS cnt + FROM ontime + WHERE DepDel15=1 + GROUP BY Year,Month +); + +SELECT avg(c1) FROM +( + SELECT Year,Month,count(*) AS c1 + FROM ontime + GROUP BY Year,Month +); + +SELECT DestCityName, uniqExact(OriginCityName) AS u +FROM ontime +GROUP BY DestCityName +ORDER BY u DESC +LIMIT 10; + +SELECT OriginCityName, DestCityName, count() AS c +FROM ontime +GROUP BY OriginCityName, DestCityName +ORDER BY c DESC +LIMIT 10; + +SELECT OriginCityName, count() AS c +FROM ontime +GROUP BY OriginCityName +ORDER BY c DESC +LIMIT 10; +``` + +You can also play with the data in Playground, [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIERheU9mV2VlaywgY291bnQoKikgQVMgYwpGUk9NIG9udGltZQpXSEVSRSBZZWFyPj0yMDAwIEFORCBZZWFyPD0yMDA4CkdST1VQIEJZIERheU9mV2VlawpPUkRFUiBCWSBjIERFU0M7Cg==). + +This performance test was created by Vadim Tkachenko. See: + +- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ +- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ +- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ +- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ +- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ +- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/ontime/) diff --git a/docs/en/reference/getting-started/example-datasets/opensky.md b/docs/en/reference/getting-started/example-datasets/opensky.md new file mode 100644 index 00000000000..719f32d7c3e --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/opensky.md @@ -0,0 +1,420 @@ +--- +sidebar_label: Air Traffic Data +description: The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. +--- + +# Crowdsourced air traffic data from The OpenSky Network 2020 + +The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. It spans all flights seen by the network's more than 2500 members since 1 January 2019. More data will be periodically included in the dataset until the end of the COVID-19 pandemic. + +Source: https://zenodo.org/record/5092942#.YRBCyTpRXYd + +Martin Strohmeier, Xavier Olive, Jannis Lübbe, Matthias Schäfer, and Vincent Lenders +"Crowdsourced air traffic data from the OpenSky Network 2019–2020" +Earth System Science Data 13(2), 2021 +https://doi.org/10.5194/essd-13-357-2021 + +## Download the Dataset {#download-dataset} + +Run the command: + +```bash +wget -O- https://zenodo.org/record/5092942 | grep -oP 'https://zenodo.org/record/5092942/files/flightlist_\d+_\d+\.csv\.gz' | xargs wget +``` + +Download will take about 2 minutes with good internet connection. 
There are 30 files with a total size of 4.3 GB.
+
+## Create the Table {#create-table}
+
+```sql
+CREATE TABLE opensky
+(
+    callsign String,
+    number String,
+    icao24 String,
+    registration String,
+    typecode String,
+    origin String,
+    destination String,
+    firstseen DateTime,
+    lastseen DateTime,
+    day DateTime,
+    latitude_1 Float64,
+    longitude_1 Float64,
+    altitude_1 Float64,
+    latitude_2 Float64,
+    longitude_2 Float64,
+    altitude_2 Float64
+) ENGINE = MergeTree ORDER BY (origin, destination, callsign);
+```
+
+## Import Data {#import-data}
+
+Upload the data into ClickHouse in parallel:
+
+```bash
+ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"'
+```
+
+- Here we pass the list of files (`ls -1 flightlist_*.csv.gz`) to `xargs` for parallel processing. `xargs -P100` allows up to 100 parallel workers, but since there are only 30 files, no more than 30 workers will be used.
+- For every file, `xargs` runs a script with `bash -c`. The script contains the placeholder `{}`, which `xargs` replaces with the filename (we requested this with `-I{}`).
+- The script decompresses the file (`gzip -c -d "{}"`) to standard output (the `-c` parameter) and the output is piped to `clickhouse-client`.
+- We also ask to parse [DateTime](../sql-reference/data-types/datetime.md) fields with the extended parser ([--date_time_input_format best_effort](../operations/settings/settings.md#settings-date_time_input_format)), so that the ISO-8601 format with timezone offsets is recognized.
+
+Finally, `clickhouse-client` performs the insertion. It reads the input data in [CSVWithNames](../interfaces/formats.md#csvwithnames) format.
+
+Parallel upload takes 24 seconds.
+
+If you prefer not to upload in parallel, here is a sequential variant:
+
+```bash
+for file in flightlist_*.csv.gz; do gzip -c -d "$file" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"; done
+```
+
+## Validate the Data {#validate-data}
+
+Query:
+
+```sql
+SELECT count() FROM opensky;
+```
+
+Result:
+
+```text
+┌──count()─┐
+│ 66010819 │
+└──────────┘
+```
+
+The size of the dataset in ClickHouse is just 2.66 GiB; you can check it.
+
+Query:
+
+```sql
+SELECT formatReadableSize(total_bytes) FROM system.tables WHERE name = 'opensky';
+```
+
+Result:
+
+```text
+┌─formatReadableSize(total_bytes)─┐
+│ 2.66 GiB                        │
+└─────────────────────────────────┘
+```
+
+## Run Some Queries {#run-queries}
+
+The total distance travelled is 68 billion kilometers.
+
+Query:
+
+```sql
+SELECT formatReadableQuantity(sum(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2)) / 1000) FROM opensky;
+```
+
+Result:
+
+```text
+┌─formatReadableQuantity(divide(sum(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2)), 1000))─┐
+│ 68.72 billion                                                                                             │
+└────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+The average flight distance is around 1000 km.
+ +Query: + +```sql +SELECT avg(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2)) FROM opensky; +``` + +Result: + +```text +┌─avg(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2))─┐ +│ 1041090.6465708319 │ +└────────────────────────────────────────────────────────────────────┘ +``` + +### Most busy origin airports and the average distance seen {#busy-airports-average-distance} + +Query: + +```sql +SELECT + origin, + count(), + round(avg(geoDistance(longitude_1, latitude_1, longitude_2, latitude_2))) AS distance, + bar(distance, 0, 10000000, 100) AS bar +FROM opensky +WHERE origin != '' +GROUP BY origin +ORDER BY count() DESC +LIMIT 100; +``` + +Result: + +```text + ┌─origin─┬─count()─┬─distance─┬─bar────────────────────────────────────┐ + 1. │ KORD │ 745007 │ 1546108 │ ███████████████▍ │ + 2. │ KDFW │ 696702 │ 1358721 │ █████████████▌ │ + 3. │ KATL │ 667286 │ 1169661 │ ███████████▋ │ + 4. │ KDEN │ 582709 │ 1287742 │ ████████████▊ │ + 5. │ KLAX │ 581952 │ 2628393 │ ██████████████████████████▎ │ + 6. │ KLAS │ 447789 │ 1336967 │ █████████████▎ │ + 7. │ KPHX │ 428558 │ 1345635 │ █████████████▍ │ + 8. │ KSEA │ 412592 │ 1757317 │ █████████████████▌ │ + 9. │ KCLT │ 404612 │ 880355 │ ████████▋ │ + 10. │ VIDP │ 363074 │ 1445052 │ ██████████████▍ │ + 11. │ EDDF │ 362643 │ 2263960 │ ██████████████████████▋ │ + 12. │ KSFO │ 361869 │ 2445732 │ ████████████████████████▍ │ + 13. │ KJFK │ 349232 │ 2996550 │ █████████████████████████████▊ │ + 14. │ KMSP │ 346010 │ 1287328 │ ████████████▋ │ + 15. │ LFPG │ 344748 │ 2206203 │ ██████████████████████ │ + 16. │ EGLL │ 341370 │ 3216593 │ ████████████████████████████████▏ │ + 17. │ EHAM │ 340272 │ 2116425 │ █████████████████████▏ │ + 18. │ KEWR │ 337696 │ 1826545 │ ██████████████████▎ │ + 19. │ KPHL │ 320762 │ 1291761 │ ████████████▊ │ + 20. │ OMDB │ 308855 │ 2855706 │ ████████████████████████████▌ │ + 21. │ UUEE │ 307098 │ 1555122 │ ███████████████▌ │ + 22. │ KBOS │ 304416 │ 1621675 │ ████████████████▏ │ + 23. │ LEMD │ 291787 │ 1695097 │ ████████████████▊ │ + 24. │ YSSY │ 272979 │ 1875298 │ ██████████████████▋ │ + 25. │ KMIA │ 265121 │ 1923542 │ ███████████████████▏ │ + 26. │ ZGSZ │ 263497 │ 745086 │ ███████▍ │ + 27. │ EDDM │ 256691 │ 1361453 │ █████████████▌ │ + 28. │ WMKK │ 254264 │ 1626688 │ ████████████████▎ │ + 29. │ CYYZ │ 251192 │ 2175026 │ █████████████████████▋ │ + 30. │ KLGA │ 248699 │ 1106935 │ ███████████ │ + 31. │ VHHH │ 248473 │ 3457658 │ ██████████████████████████████████▌ │ + 32. │ RJTT │ 243477 │ 1272744 │ ████████████▋ │ + 33. │ KBWI │ 241440 │ 1187060 │ ███████████▋ │ + 34. │ KIAD │ 239558 │ 1683485 │ ████████████████▋ │ + 35. │ KIAH │ 234202 │ 1538335 │ ███████████████▍ │ + 36. │ KFLL │ 223447 │ 1464410 │ ██████████████▋ │ + 37. │ KDAL │ 212055 │ 1082339 │ ██████████▋ │ + 38. │ KDCA │ 207883 │ 1013359 │ ██████████▏ │ + 39. │ LIRF │ 207047 │ 1427965 │ ██████████████▎ │ + 40. │ PANC │ 206007 │ 2525359 │ █████████████████████████▎ │ + 41. │ LTFJ │ 205415 │ 860470 │ ████████▌ │ + 42. │ KDTW │ 204020 │ 1106716 │ ███████████ │ + 43. │ VABB │ 201679 │ 1300865 │ █████████████ │ + 44. │ OTHH │ 200797 │ 3759544 │ █████████████████████████████████████▌ │ + 45. │ KMDW │ 200796 │ 1232551 │ ████████████▎ │ + 46. │ KSAN │ 198003 │ 1495195 │ ██████████████▊ │ + 47. │ KPDX │ 197760 │ 1269230 │ ████████████▋ │ + 48. │ SBGR │ 197624 │ 2041697 │ ████████████████████▍ │ + 49. │ VOBL │ 189011 │ 1040180 │ ██████████▍ │ + 50. │ LEBL │ 188956 │ 1283190 │ ████████████▋ │ + 51. │ YBBN │ 188011 │ 1253405 │ ████████████▌ │ + 52. 
│ LSZH │ 187934 │ 1572029 │ ███████████████▋ │ + 53. │ YMML │ 187643 │ 1870076 │ ██████████████████▋ │ + 54. │ RCTP │ 184466 │ 2773976 │ ███████████████████████████▋ │ + 55. │ KSNA │ 180045 │ 778484 │ ███████▋ │ + 56. │ EGKK │ 176420 │ 1694770 │ ████████████████▊ │ + 57. │ LOWW │ 176191 │ 1274833 │ ████████████▋ │ + 58. │ UUDD │ 176099 │ 1368226 │ █████████████▋ │ + 59. │ RKSI │ 173466 │ 3079026 │ ██████████████████████████████▋ │ + 60. │ EKCH │ 172128 │ 1229895 │ ████████████▎ │ + 61. │ KOAK │ 171119 │ 1114447 │ ███████████▏ │ + 62. │ RPLL │ 170122 │ 1440735 │ ██████████████▍ │ + 63. │ KRDU │ 167001 │ 830521 │ ████████▎ │ + 64. │ KAUS │ 164524 │ 1256198 │ ████████████▌ │ + 65. │ KBNA │ 163242 │ 1022726 │ ██████████▏ │ + 66. │ KSDF │ 162655 │ 1380867 │ █████████████▋ │ + 67. │ ENGM │ 160732 │ 910108 │ █████████ │ + 68. │ LIMC │ 160696 │ 1564620 │ ███████████████▋ │ + 69. │ KSJC │ 159278 │ 1081125 │ ██████████▋ │ + 70. │ KSTL │ 157984 │ 1026699 │ ██████████▎ │ + 71. │ UUWW │ 156811 │ 1261155 │ ████████████▌ │ + 72. │ KIND │ 153929 │ 987944 │ █████████▊ │ + 73. │ ESSA │ 153390 │ 1203439 │ ████████████ │ + 74. │ KMCO │ 153351 │ 1508657 │ ███████████████ │ + 75. │ KDVT │ 152895 │ 74048 │ ▋ │ + 76. │ VTBS │ 152645 │ 2255591 │ ██████████████████████▌ │ + 77. │ CYVR │ 149574 │ 2027413 │ ████████████████████▎ │ + 78. │ EIDW │ 148723 │ 1503985 │ ███████████████ │ + 79. │ LFPO │ 143277 │ 1152964 │ ███████████▌ │ + 80. │ EGSS │ 140830 │ 1348183 │ █████████████▍ │ + 81. │ KAPA │ 140776 │ 420441 │ ████▏ │ + 82. │ KHOU │ 138985 │ 1068806 │ ██████████▋ │ + 83. │ KTPA │ 138033 │ 1338223 │ █████████████▍ │ + 84. │ KFFZ │ 137333 │ 55397 │ ▌ │ + 85. │ NZAA │ 136092 │ 1581264 │ ███████████████▋ │ + 86. │ YPPH │ 133916 │ 1271550 │ ████████████▋ │ + 87. │ RJBB │ 133522 │ 1805623 │ ██████████████████ │ + 88. │ EDDL │ 133018 │ 1265919 │ ████████████▋ │ + 89. │ ULLI │ 130501 │ 1197108 │ ███████████▊ │ + 90. │ KIWA │ 127195 │ 250876 │ ██▌ │ + 91. │ KTEB │ 126969 │ 1189414 │ ███████████▊ │ + 92. │ VOMM │ 125616 │ 1127757 │ ███████████▎ │ + 93. │ LSGG │ 123998 │ 1049101 │ ██████████▍ │ + 94. │ LPPT │ 122733 │ 1779187 │ █████████████████▋ │ + 95. │ WSSS │ 120493 │ 3264122 │ ████████████████████████████████▋ │ + 96. │ EBBR │ 118539 │ 1579939 │ ███████████████▋ │ + 97. │ VTBD │ 118107 │ 661627 │ ██████▌ │ + 98. │ KVNY │ 116326 │ 692960 │ ██████▊ │ + 99. │ EDDT │ 115122 │ 941740 │ █████████▍ │ +100. │ EFHK │ 114860 │ 1629143 │ ████████████████▎ │ + └────────┴─────────┴──────────┴────────────────────────────────────────┘ +``` + +### Number of flights from three major Moscow airports, weekly {#flights-from-moscow} + +Query: + +```sql +SELECT + toMonday(day) AS k, + count() AS c, + bar(c, 0, 10000, 100) AS bar +FROM opensky +WHERE origin IN ('UUEE', 'UUDD', 'UUWW') +GROUP BY k +ORDER BY k ASC; +``` + +Result: + +```text + ┌──────────k─┬────c─┬─bar──────────────────────────────────────────────────────────────────────────┐ + 1. │ 2018-12-31 │ 5248 │ ████████████████████████████████████████████████████▍ │ + 2. │ 2019-01-07 │ 6302 │ ███████████████████████████████████████████████████████████████ │ + 3. │ 2019-01-14 │ 5701 │ █████████████████████████████████████████████████████████ │ + 4. │ 2019-01-21 │ 5638 │ ████████████████████████████████████████████████████████▍ │ + 5. │ 2019-01-28 │ 5731 │ █████████████████████████████████████████████████████████▎ │ + 6. │ 2019-02-04 │ 5683 │ ████████████████████████████████████████████████████████▋ │ + 7. 
│ 2019-02-11 │ 5759 │ █████████████████████████████████████████████████████████▌ │ + 8. │ 2019-02-18 │ 5736 │ █████████████████████████████████████████████████████████▎ │ + 9. │ 2019-02-25 │ 5873 │ ██████████████████████████████████████████████████████████▋ │ + 10. │ 2019-03-04 │ 5965 │ ███████████████████████████████████████████████████████████▋ │ + 11. │ 2019-03-11 │ 5900 │ ███████████████████████████████████████████████████████████ │ + 12. │ 2019-03-18 │ 5823 │ ██████████████████████████████████████████████████████████▏ │ + 13. │ 2019-03-25 │ 5899 │ ██████████████████████████████████████████████████████████▊ │ + 14. │ 2019-04-01 │ 6043 │ ████████████████████████████████████████████████████████████▍ │ + 15. │ 2019-04-08 │ 6098 │ ████████████████████████████████████████████████████████████▊ │ + 16. │ 2019-04-15 │ 6196 │ █████████████████████████████████████████████████████████████▊ │ + 17. │ 2019-04-22 │ 6486 │ ████████████████████████████████████████████████████████████████▋ │ + 18. │ 2019-04-29 │ 6682 │ ██████████████████████████████████████████████████████████████████▋ │ + 19. │ 2019-05-06 │ 6739 │ ███████████████████████████████████████████████████████████████████▍ │ + 20. │ 2019-05-13 │ 6600 │ ██████████████████████████████████████████████████████████████████ │ + 21. │ 2019-05-20 │ 6575 │ █████████████████████████████████████████████████████████████████▋ │ + 22. │ 2019-05-27 │ 6786 │ ███████████████████████████████████████████████████████████████████▋ │ + 23. │ 2019-06-03 │ 6872 │ ████████████████████████████████████████████████████████████████████▋ │ + 24. │ 2019-06-10 │ 7045 │ ██████████████████████████████████████████████████████████████████████▍ │ + 25. │ 2019-06-17 │ 7045 │ ██████████████████████████████████████████████████████████████████████▍ │ + 26. │ 2019-06-24 │ 6852 │ ████████████████████████████████████████████████████████████████████▌ │ + 27. │ 2019-07-01 │ 7248 │ ████████████████████████████████████████████████████████████████████████▍ │ + 28. │ 2019-07-08 │ 7284 │ ████████████████████████████████████████████████████████████████████████▋ │ + 29. │ 2019-07-15 │ 7142 │ ███████████████████████████████████████████████████████████████████████▍ │ + 30. │ 2019-07-22 │ 7108 │ ███████████████████████████████████████████████████████████████████████ │ + 31. │ 2019-07-29 │ 7251 │ ████████████████████████████████████████████████████████████████████████▌ │ + 32. │ 2019-08-05 │ 7403 │ ██████████████████████████████████████████████████████████████████████████ │ + 33. │ 2019-08-12 │ 7457 │ ██████████████████████████████████████████████████████████████████████████▌ │ + 34. │ 2019-08-19 │ 7502 │ ███████████████████████████████████████████████████████████████████████████ │ + 35. │ 2019-08-26 │ 7540 │ ███████████████████████████████████████████████████████████████████████████▍ │ + 36. │ 2019-09-02 │ 7237 │ ████████████████████████████████████████████████████████████████████████▎ │ + 37. │ 2019-09-09 │ 7328 │ █████████████████████████████████████████████████████████████████████████▎ │ + 38. │ 2019-09-16 │ 5566 │ ███████████████████████████████████████████████████████▋ │ + 39. │ 2019-09-23 │ 7049 │ ██████████████████████████████████████████████████████████████████████▍ │ + 40. │ 2019-09-30 │ 6880 │ ████████████████████████████████████████████████████████████████████▋ │ + 41. │ 2019-10-07 │ 6518 │ █████████████████████████████████████████████████████████████████▏ │ + 42. │ 2019-10-14 │ 6688 │ ██████████████████████████████████████████████████████████████████▊ │ + 43. 
│ 2019-10-21 │ 6667 │ ██████████████████████████████████████████████████████████████████▋ │ + 44. │ 2019-10-28 │ 6303 │ ███████████████████████████████████████████████████████████████ │ + 45. │ 2019-11-04 │ 6298 │ ██████████████████████████████████████████████████████████████▊ │ + 46. │ 2019-11-11 │ 6137 │ █████████████████████████████████████████████████████████████▎ │ + 47. │ 2019-11-18 │ 6051 │ ████████████████████████████████████████████████████████████▌ │ + 48. │ 2019-11-25 │ 5820 │ ██████████████████████████████████████████████████████████▏ │ + 49. │ 2019-12-02 │ 5942 │ ███████████████████████████████████████████████████████████▍ │ + 50. │ 2019-12-09 │ 4891 │ ████████████████████████████████████████████████▊ │ + 51. │ 2019-12-16 │ 5682 │ ████████████████████████████████████████████████████████▋ │ + 52. │ 2019-12-23 │ 6111 │ █████████████████████████████████████████████████████████████ │ + 53. │ 2019-12-30 │ 5870 │ ██████████████████████████████████████████████████████████▋ │ + 54. │ 2020-01-06 │ 5953 │ ███████████████████████████████████████████████████████████▌ │ + 55. │ 2020-01-13 │ 5698 │ ████████████████████████████████████████████████████████▊ │ + 56. │ 2020-01-20 │ 5339 │ █████████████████████████████████████████████████████▍ │ + 57. │ 2020-01-27 │ 5566 │ ███████████████████████████████████████████████████████▋ │ + 58. │ 2020-02-03 │ 5801 │ ██████████████████████████████████████████████████████████ │ + 59. │ 2020-02-10 │ 5692 │ ████████████████████████████████████████████████████████▊ │ + 60. │ 2020-02-17 │ 5912 │ ███████████████████████████████████████████████████████████ │ + 61. │ 2020-02-24 │ 6031 │ ████████████████████████████████████████████████████████████▎ │ + 62. │ 2020-03-02 │ 6105 │ █████████████████████████████████████████████████████████████ │ + 63. │ 2020-03-09 │ 5823 │ ██████████████████████████████████████████████████████████▏ │ + 64. │ 2020-03-16 │ 4659 │ ██████████████████████████████████████████████▌ │ + 65. │ 2020-03-23 │ 3720 │ █████████████████████████████████████▏ │ + 66. │ 2020-03-30 │ 1720 │ █████████████████▏ │ + 67. │ 2020-04-06 │ 849 │ ████████▍ │ + 68. │ 2020-04-13 │ 710 │ ███████ │ + 69. │ 2020-04-20 │ 725 │ ███████▏ │ + 70. │ 2020-04-27 │ 920 │ █████████▏ │ + 71. │ 2020-05-04 │ 859 │ ████████▌ │ + 72. │ 2020-05-11 │ 1047 │ ██████████▍ │ + 73. │ 2020-05-18 │ 1135 │ ███████████▎ │ + 74. │ 2020-05-25 │ 1266 │ ████████████▋ │ + 75. │ 2020-06-01 │ 1793 │ █████████████████▊ │ + 76. │ 2020-06-08 │ 1979 │ ███████████████████▋ │ + 77. │ 2020-06-15 │ 2297 │ ██████████████████████▊ │ + 78. │ 2020-06-22 │ 2788 │ ███████████████████████████▊ │ + 79. │ 2020-06-29 │ 3389 │ █████████████████████████████████▊ │ + 80. │ 2020-07-06 │ 3545 │ ███████████████████████████████████▍ │ + 81. │ 2020-07-13 │ 3569 │ ███████████████████████████████████▋ │ + 82. │ 2020-07-20 │ 3784 │ █████████████████████████████████████▋ │ + 83. │ 2020-07-27 │ 3960 │ ███████████████████████████████████████▌ │ + 84. │ 2020-08-03 │ 4323 │ ███████████████████████████████████████████▏ │ + 85. │ 2020-08-10 │ 4581 │ █████████████████████████████████████████████▋ │ + 86. │ 2020-08-17 │ 4791 │ ███████████████████████████████████████████████▊ │ + 87. │ 2020-08-24 │ 4928 │ █████████████████████████████████████████████████▎ │ + 88. │ 2020-08-31 │ 4687 │ ██████████████████████████████████████████████▋ │ + 89. │ 2020-09-07 │ 4643 │ ██████████████████████████████████████████████▍ │ + 90. │ 2020-09-14 │ 4594 │ █████████████████████████████████████████████▊ │ + 91. 
│ 2020-09-21 │ 4478 │ ████████████████████████████████████████████▋ │ + 92. │ 2020-09-28 │ 4382 │ ███████████████████████████████████████████▋ │ + 93. │ 2020-10-05 │ 4261 │ ██████████████████████████████████████████▌ │ + 94. │ 2020-10-12 │ 4243 │ ██████████████████████████████████████████▍ │ + 95. │ 2020-10-19 │ 3941 │ ███████████████████████████████████████▍ │ + 96. │ 2020-10-26 │ 3616 │ ████████████████████████████████████▏ │ + 97. │ 2020-11-02 │ 3586 │ ███████████████████████████████████▋ │ + 98. │ 2020-11-09 │ 3403 │ ██████████████████████████████████ │ + 99. │ 2020-11-16 │ 3336 │ █████████████████████████████████▎ │ +100. │ 2020-11-23 │ 3230 │ ████████████████████████████████▎ │ +101. │ 2020-11-30 │ 3183 │ ███████████████████████████████▋ │ +102. │ 2020-12-07 │ 3285 │ ████████████████████████████████▋ │ +103. │ 2020-12-14 │ 3367 │ █████████████████████████████████▋ │ +104. │ 2020-12-21 │ 3748 │ █████████████████████████████████████▍ │ +105. │ 2020-12-28 │ 3986 │ ███████████████████████████████████████▋ │ +106. │ 2021-01-04 │ 3906 │ ███████████████████████████████████████ │ +107. │ 2021-01-11 │ 3425 │ ██████████████████████████████████▎ │ +108. │ 2021-01-18 │ 3144 │ ███████████████████████████████▍ │ +109. │ 2021-01-25 │ 3115 │ ███████████████████████████████▏ │ +110. │ 2021-02-01 │ 3285 │ ████████████████████████████████▋ │ +111. │ 2021-02-08 │ 3321 │ █████████████████████████████████▏ │ +112. │ 2021-02-15 │ 3475 │ ██████████████████████████████████▋ │ +113. │ 2021-02-22 │ 3549 │ ███████████████████████████████████▍ │ +114. │ 2021-03-01 │ 3755 │ █████████████████████████████████████▌ │ +115. │ 2021-03-08 │ 3080 │ ██████████████████████████████▋ │ +116. │ 2021-03-15 │ 3789 │ █████████████████████████████████████▊ │ +117. │ 2021-03-22 │ 3804 │ ██████████████████████████████████████ │ +118. │ 2021-03-29 │ 4238 │ ██████████████████████████████████████████▍ │ +119. │ 2021-04-05 │ 4307 │ ███████████████████████████████████████████ │ +120. │ 2021-04-12 │ 4225 │ ██████████████████████████████████████████▎ │ +121. │ 2021-04-19 │ 4391 │ ███████████████████████████████████████████▊ │ +122. │ 2021-04-26 │ 4868 │ ████████████████████████████████████████████████▋ │ +123. │ 2021-05-03 │ 4977 │ █████████████████████████████████████████████████▋ │ +124. │ 2021-05-10 │ 5164 │ ███████████████████████████████████████████████████▋ │ +125. │ 2021-05-17 │ 4986 │ █████████████████████████████████████████████████▋ │ +126. │ 2021-05-24 │ 5024 │ ██████████████████████████████████████████████████▏ │ +127. │ 2021-05-31 │ 4824 │ ████████████████████████████████████████████████▏ │ +128. │ 2021-06-07 │ 5652 │ ████████████████████████████████████████████████████████▌ │ +129. │ 2021-06-14 │ 5613 │ ████████████████████████████████████████████████████████▏ │ +130. │ 2021-06-21 │ 6061 │ ████████████████████████████████████████████████████████████▌ │ +131. │ 2021-06-28 │ 2554 │ █████████████████████████▌ │ + └────────────┴──────┴──────────────────────────────────────────────────────────────────────────────┘ +``` + +### Online Playground {#playground} + +You can test other queries to this data set using the interactive resource [Online Playground](https://gh-api.clickhouse.com/play?user=play). 
For example, [like this](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). However, please note that you cannot create temporary tables there.
diff --git a/docs/en/reference/getting-started/example-datasets/recipes.md b/docs/en/reference/getting-started/example-datasets/recipes.md
new file mode 100644
index 00000000000..1ae2493befb
--- /dev/null
+++ b/docs/en/reference/getting-started/example-datasets/recipes.md
@@ -0,0 +1,339 @@
+---
+sidebar_label: Recipes Dataset
+---
+
+# Recipes Dataset
+
+The RecipeNLG dataset is available for download [here](https://recipenlg.cs.put.poznan.pl/dataset). It contains 2.2 million recipes. The size is slightly less than 1 GB.
+
+## Download and Unpack the Dataset
+
+1. Go to the download page [https://recipenlg.cs.put.poznan.pl/dataset](https://recipenlg.cs.put.poznan.pl/dataset).
+1. Accept the Terms and Conditions and download the zip file.
+1. Unpack the zip file with `unzip`. You will get the `full_dataset.csv` file.
+
+## Create a Table
+
+Run `clickhouse-client` and execute the following CREATE query:
+
+``` sql
+CREATE TABLE recipes
+(
+    title String,
+    ingredients Array(String),
+    directions Array(String),
+    link String,
+    source LowCardinality(String),
+    NER Array(String)
+) ENGINE = MergeTree ORDER BY title;
+```
+
+## Insert the Data
+
+Run the following command:
+
+``` bash
+clickhouse-client --query "
+    INSERT INTO recipes
+    SELECT
+        title,
+        JSONExtract(ingredients, 'Array(String)'),
+        JSONExtract(directions, 'Array(String)'),
+        link,
+        source,
+        JSONExtract(NER, 'Array(String)')
+    FROM input('num UInt32, title String, ingredients String, directions String, link String, source LowCardinality(String), NER String')
+    FORMAT CSVWithNames
+" --input_format_with_names_use_header 0 --format_csv_allow_single_quote 0 --input_format_allow_errors_num 10 < full_dataset.csv
+```
+
+This is a showcase of how to parse a custom CSV file, as it requires several adjustments.
+
+Explanation:
+- The dataset is in CSV format, but it requires some preprocessing on insertion; we use the table function [input](../sql-reference/table-functions/input.md) to perform the preprocessing;
+- The structure of the CSV file is specified in the argument of the `input` table function;
+- The field `num` (row number) is not needed; we parse it from the file and ignore it;
+- We use `FORMAT CSVWithNames`, but the header in the CSV is ignored (via the command line parameter `--input_format_with_names_use_header 0`), because the header does not contain the name of the first field;
+- The file uses only double quotes to enclose CSV strings; some strings are not enclosed in double quotes, and a single quote must not be treated as a string delimiter, which is why we also add the `--format_csv_allow_single_quote 0` parameter;
+- Some strings in the CSV cannot be parsed as-is, because they contain the `\M/` sequence at the beginning of the value; the only value that may start with a backslash in CSV is `\N`, which is parsed as SQL NULL.
We add `--input_format_allow_errors_num 10` parameter and up to ten malformed records can be skipped; +- There are arrays for ingredients, directions and NER fields; these arrays are represented in unusual form: they are serialized into string as JSON and then placed in CSV - we parse them as String and then use [JSONExtract](../sql-reference/functions/json-functions.md) function to transform it to Array. + +## Validate the Inserted Data + +By checking the row count: + +Query: + +``` sql +SELECT count() FROM recipes; +``` + +Result: + +``` text +┌─count()─┐ +│ 2231141 │ +└─────────┘ +``` + +## Example Queries + +### Top Components by the Number of Recipes: + +In this example we learn how to use [arrayJoin](../sql-reference/functions/array-join/) function to expand an array into a set of rows. + +Query: + +``` sql +SELECT + arrayJoin(NER) AS k, + count() AS c +FROM recipes +GROUP BY k +ORDER BY c DESC +LIMIT 50 +``` + +Result: + +``` text +┌─k────────────────────┬──────c─┐ +│ salt │ 890741 │ +│ sugar │ 620027 │ +│ butter │ 493823 │ +│ flour │ 466110 │ +│ eggs │ 401276 │ +│ onion │ 372469 │ +│ garlic │ 358364 │ +│ milk │ 346769 │ +│ water │ 326092 │ +│ vanilla │ 270381 │ +│ olive oil │ 197877 │ +│ pepper │ 179305 │ +│ brown sugar │ 174447 │ +│ tomatoes │ 163933 │ +│ egg │ 160507 │ +│ baking powder │ 148277 │ +│ lemon juice │ 146414 │ +│ Salt │ 122557 │ +│ cinnamon │ 117927 │ +│ sour cream │ 116682 │ +│ cream cheese │ 114423 │ +│ margarine │ 112742 │ +│ celery │ 112676 │ +│ baking soda │ 110690 │ +│ parsley │ 102151 │ +│ chicken │ 101505 │ +│ onions │ 98903 │ +│ vegetable oil │ 91395 │ +│ oil │ 85600 │ +│ mayonnaise │ 84822 │ +│ pecans │ 79741 │ +│ nuts │ 78471 │ +│ potatoes │ 75820 │ +│ carrots │ 75458 │ +│ pineapple │ 74345 │ +│ soy sauce │ 70355 │ +│ black pepper │ 69064 │ +│ thyme │ 68429 │ +│ mustard │ 65948 │ +│ chicken broth │ 65112 │ +│ bacon │ 64956 │ +│ honey │ 64626 │ +│ oregano │ 64077 │ +│ ground beef │ 64068 │ +│ unsalted butter │ 63848 │ +│ mushrooms │ 61465 │ +│ Worcestershire sauce │ 59328 │ +│ cornstarch │ 58476 │ +│ green pepper │ 58388 │ +│ Cheddar cheese │ 58354 │ +└──────────────────────┴────────┘ + +50 rows in set. Elapsed: 0.112 sec. Processed 2.23 million rows, 361.57 MB (19.99 million rows/s., 3.24 GB/s.) +``` + +### The Most Complex Recipes with Strawberry + +``` sql +SELECT + title, + length(NER), + length(directions) +FROM recipes +WHERE has(NER, 'strawberry') +ORDER BY length(directions) DESC +LIMIT 10 +``` + +Result: + +``` text +┌─title────────────────────────────────────────────────────────────┬─length(NER)─┬─length(directions)─┐ +│ Chocolate-Strawberry-Orange Wedding Cake │ 24 │ 126 │ +│ Strawberry Cream Cheese Crumble Tart │ 19 │ 47 │ +│ Charlotte-Style Ice Cream │ 11 │ 45 │ +│ Sinfully Good a Million Layers Chocolate Layer Cake, With Strawb │ 31 │ 45 │ +│ Sweetened Berries With Elderflower Sherbet │ 24 │ 44 │ +│ Chocolate-Strawberry Mousse Cake │ 15 │ 42 │ +│ Rhubarb Charlotte with Strawberries and Rum │ 20 │ 42 │ +│ Chef Joey's Strawberry Vanilla Tart │ 7 │ 37 │ +│ Old-Fashioned Ice Cream Sundae Cake │ 17 │ 37 │ +│ Watermelon Cake │ 16 │ 36 │ +└──────────────────────────────────────────────────────────────────┴─────────────┴────────────────────┘ + +10 rows in set. Elapsed: 0.215 sec. Processed 2.23 million rows, 1.48 GB (10.35 million rows/s., 6.86 GB/s.) +``` + +In this example, we involve [has](../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. 
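+
+As a small variation on the previous query (a sketch against the same `recipes` table; `'strawberry'` and `'cream'` are just illustrative entries of the `NER` array), you can require several ingredients at once with the `hasAll` array function:
+
+``` sql
+SELECT
+    title,
+    length(directions) AS steps
+FROM recipes
+WHERE hasAll(NER, ['strawberry', 'cream'])  -- keep only recipes whose NER array contains both values
+ORDER BY steps DESC
+LIMIT 10;
+```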
+ +There is a wedding cake that requires the whole 126 steps to produce! Show that directions: + +Query: + +``` sql +SELECT arrayJoin(directions) +FROM recipes +WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake' +``` + +Result: + +``` text +┌─arrayJoin(directions)───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Position 1 rack in center and 1 rack in bottom third of oven and preheat to 350F. │ +│ Butter one 5-inch-diameter cake pan with 2-inch-high sides, one 8-inch-diameter cake pan with 2-inch-high sides and one 12-inch-diameter cake pan with 2-inch-high sides. │ +│ Dust pans with flour; line bottoms with parchment. │ +│ Combine 1/3 cup orange juice and 2 ounces unsweetened chocolate in heavy small saucepan. │ +│ Stir mixture over medium-low heat until chocolate melts. │ +│ Remove from heat. │ +│ Gradually mix in 1 2/3 cups orange juice. │ +│ Sift 3 cups flour, 2/3 cup cocoa, 2 teaspoons baking soda, 1 teaspoon salt and 1/2 teaspoon baking powder into medium bowl. │ +│ using electric mixer, beat 1 cup (2 sticks) butter and 3 cups sugar in large bowl until blended (mixture will look grainy). │ +│ Add 4 eggs, 1 at a time, beating to blend after each. │ +│ Beat in 1 tablespoon orange peel and 1 tablespoon vanilla extract. │ +│ Add dry ingredients alternately with orange juice mixture in 3 additions each, beating well after each addition. │ +│ Mix in 1 cup chocolate chips. │ +│ Transfer 1 cup plus 2 tablespoons batter to prepared 5-inch pan, 3 cups batter to prepared 8-inch pan and remaining batter (about 6 cups) to 12-inch pan. │ +│ Place 5-inch and 8-inch pans on center rack of oven. │ +│ Place 12-inch pan on lower rack of oven. │ +│ Bake cakes until tester inserted into center comes out clean, about 35 minutes. │ +│ Transfer cakes in pans to racks and cool completely. │ +│ Mark 4-inch diameter circle on one 6-inch-diameter cardboard cake round. │ +│ Cut out marked circle. │ +│ Mark 7-inch-diameter circle on one 8-inch-diameter cardboard cake round. │ +│ Cut out marked circle. │ +│ Mark 11-inch-diameter circle on one 12-inch-diameter cardboard cake round. │ +│ Cut out marked circle. │ +│ Cut around sides of 5-inch-cake to loosen. │ +│ Place 4-inch cardboard over pan. │ +│ Hold cardboard and pan together; turn cake out onto cardboard. │ +│ Peel off parchment.Wrap cakes on its cardboard in foil. │ +│ Repeat turning out, peeling off parchment and wrapping cakes in foil, using 7-inch cardboard for 8-inch cake and 11-inch cardboard for 12-inch cake. │ +│ Using remaining ingredients, make 1 more batch of cake batter and bake 3 more cake layers as described above. │ +│ Cool cakes in pans. │ +│ Cover cakes in pans tightly with foil. │ +│ (Can be prepared ahead. │ +│ Let stand at room temperature up to 1 day or double-wrap all cake layers and freeze up to 1 week. │ +│ Bring cake layers to room temperature before using.) │ +│ Place first 12-inch cake on its cardboard on work surface. │ +│ Spread 2 3/4 cups ganache over top of cake and all the way to edge. │ +│ Spread 2/3 cup jam over ganache, leaving 1/2-inch chocolate border at edge. │ +│ Drop 1 3/4 cups white chocolate frosting by spoonfuls over jam. │ +│ Gently spread frosting over jam, leaving 1/2-inch chocolate border at edge. │ +│ Rub some cocoa powder over second 12-inch cardboard. │ +│ Cut around sides of second 12-inch cake to loosen. │ +│ Place cardboard, cocoa side down, over pan. │ +│ Turn cake out onto cardboard. 
│ +│ Peel off parchment. │ +│ Carefully slide cake off cardboard and onto filling on first 12-inch cake. │ +│ Refrigerate. │ +│ Place first 8-inch cake on its cardboard on work surface. │ +│ Spread 1 cup ganache over top all the way to edge. │ +│ Spread 1/4 cup jam over, leaving 1/2-inch chocolate border at edge. │ +│ Drop 1 cup white chocolate frosting by spoonfuls over jam. │ +│ Gently spread frosting over jam, leaving 1/2-inch chocolate border at edge. │ +│ Rub some cocoa over second 8-inch cardboard. │ +│ Cut around sides of second 8-inch cake to loosen. │ +│ Place cardboard, cocoa side down, over pan. │ +│ Turn cake out onto cardboard. │ +│ Peel off parchment. │ +│ Slide cake off cardboard and onto filling on first 8-inch cake. │ +│ Refrigerate. │ +│ Place first 5-inch cake on its cardboard on work surface. │ +│ Spread 1/2 cup ganache over top of cake and all the way to edge. │ +│ Spread 2 tablespoons jam over, leaving 1/2-inch chocolate border at edge. │ +│ Drop 1/3 cup white chocolate frosting by spoonfuls over jam. │ +│ Gently spread frosting over jam, leaving 1/2-inch chocolate border at edge. │ +│ Rub cocoa over second 6-inch cardboard. │ +│ Cut around sides of second 5-inch cake to loosen. │ +│ Place cardboard, cocoa side down, over pan. │ +│ Turn cake out onto cardboard. │ +│ Peel off parchment. │ +│ Slide cake off cardboard and onto filling on first 5-inch cake. │ +│ Chill all cakes 1 hour to set filling. │ +│ Place 12-inch tiered cake on its cardboard on revolving cake stand. │ +│ Spread 2 2/3 cups frosting over top and sides of cake as a first coat. │ +│ Refrigerate cake. │ +│ Place 8-inch tiered cake on its cardboard on cake stand. │ +│ Spread 1 1/4 cups frosting over top and sides of cake as a first coat. │ +│ Refrigerate cake. │ +│ Place 5-inch tiered cake on its cardboard on cake stand. │ +│ Spread 3/4 cup frosting over top and sides of cake as a first coat. │ +│ Refrigerate all cakes until first coats of frosting set, about 1 hour. │ +│ (Cakes can be made to this point up to 1 day ahead; cover and keep refrigerate.) │ +│ Prepare second batch of frosting, using remaining frosting ingredients and following directions for first batch. │ +│ Spoon 2 cups frosting into pastry bag fitted with small star tip. │ +│ Place 12-inch cake on its cardboard on large flat platter. │ +│ Place platter on cake stand. │ +│ Using icing spatula, spread 2 1/2 cups frosting over top and sides of cake; smooth top. │ +│ Using filled pastry bag, pipe decorative border around top edge of cake. │ +│ Refrigerate cake on platter. │ +│ Place 8-inch cake on its cardboard on cake stand. │ +│ Using icing spatula, spread 1 1/2 cups frosting over top and sides of cake; smooth top. │ +│ Using pastry bag, pipe decorative border around top edge of cake. │ +│ Refrigerate cake on its cardboard. │ +│ Place 5-inch cake on its cardboard on cake stand. │ +│ Using icing spatula, spread 3/4 cup frosting over top and sides of cake; smooth top. │ +│ Using pastry bag, pipe decorative border around top edge of cake, spooning more frosting into bag if necessary. │ +│ Refrigerate cake on its cardboard. │ +│ Keep all cakes refrigerated until frosting sets, about 2 hours. │ +│ (Can be prepared 2 days ahead. │ +│ Cover loosely; keep refrigerated.) │ +│ Place 12-inch cake on platter on work surface. │ +│ Press 1 wooden dowel straight down into and completely through center of cake. │ +│ Mark dowel 1/4 inch above top of frosting. │ +│ Remove dowel and cut with serrated knife at marked point. 
│ +│ Cut 4 more dowels to same length. │ +│ Press 1 cut dowel back into center of cake. │ +│ Press remaining 4 cut dowels into cake, positioning 3 1/2 inches inward from cake edges and spacing evenly. │ +│ Place 8-inch cake on its cardboard on work surface. │ +│ Press 1 dowel straight down into and completely through center of cake. │ +│ Mark dowel 1/4 inch above top of frosting. │ +│ Remove dowel and cut with serrated knife at marked point. │ +│ Cut 3 more dowels to same length. │ +│ Press 1 cut dowel back into center of cake. │ +│ Press remaining 3 cut dowels into cake, positioning 2 1/2 inches inward from edges and spacing evenly. │ +│ Using large metal spatula as aid, place 8-inch cake on its cardboard atop dowels in 12-inch cake, centering carefully. │ +│ Gently place 5-inch cake on its cardboard atop dowels in 8-inch cake, centering carefully. │ +│ Using citrus stripper, cut long strips of orange peel from oranges. │ +│ Cut strips into long segments. │ +│ To make orange peel coils, wrap peel segment around handle of wooden spoon; gently slide peel off handle so that peel keeps coiled shape. │ +│ Garnish cake with orange peel coils, ivy or mint sprigs, and some berries. │ +│ (Assembled cake can be made up to 8 hours ahead. │ +│ Let stand at cool room temperature.) │ +│ Remove top and middle cake tiers. │ +│ Remove dowels from cakes. │ +│ Cut top and middle cakes into slices. │ +│ To cut 12-inch cake: Starting 3 inches inward from edge and inserting knife straight down, cut through from top to bottom to make 6-inch-diameter circle in center of cake. │ +│ Cut outer portion of cake into slices; cut inner portion into slices and serve with strawberries. │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +126 rows in set. Elapsed: 0.011 sec. Processed 8.19 thousand rows, 5.34 MB (737.75 thousand rows/s., 480.59 MB/s.) +``` + +### Online Playground + +The dataset is also available in the [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==). + +[Original article](https://clickhouse.com/docs/en/getting-started/example-datasets/recipes/) diff --git a/docs/en/reference/getting-started/example-datasets/star-schema.md b/docs/en/reference/getting-started/example-datasets/star-schema.md new file mode 100644 index 00000000000..35ff492c360 --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/star-schema.md @@ -0,0 +1,371 @@ +--- +sidebar_label: Star Schema Benchmark +description: "Dataset based on the TPC-H dbgen source. The coding style and architecture +follows the TPCH dbgen." 
+--- + +# Star Schema Benchmark + + +Compiling dbgen: + +``` bash +$ git clone git@github.com:vadimtk/ssb-dbgen.git +$ cd ssb-dbgen +$ make +``` + +Generating data: + +:::warning +With `-s 100` dbgen generates 600 million rows (67 GB), while while `-s 1000` it generates 6 billion rows (which takes a lot of time) +::: + +``` bash +$ ./dbgen -s 1000 -T c +$ ./dbgen -s 1000 -T l +$ ./dbgen -s 1000 -T p +$ ./dbgen -s 1000 -T s +$ ./dbgen -s 1000 -T d +``` + +Creating tables in ClickHouse: + +``` sql +CREATE TABLE customer +( + C_CUSTKEY UInt32, + C_NAME String, + C_ADDRESS String, + C_CITY LowCardinality(String), + C_NATION LowCardinality(String), + C_REGION LowCardinality(String), + C_PHONE String, + C_MKTSEGMENT LowCardinality(String) +) +ENGINE = MergeTree ORDER BY (C_CUSTKEY); + +CREATE TABLE lineorder +( + LO_ORDERKEY UInt32, + LO_LINENUMBER UInt8, + LO_CUSTKEY UInt32, + LO_PARTKEY UInt32, + LO_SUPPKEY UInt32, + LO_ORDERDATE Date, + LO_ORDERPRIORITY LowCardinality(String), + LO_SHIPPRIORITY UInt8, + LO_QUANTITY UInt8, + LO_EXTENDEDPRICE UInt32, + LO_ORDTOTALPRICE UInt32, + LO_DISCOUNT UInt8, + LO_REVENUE UInt32, + LO_SUPPLYCOST UInt32, + LO_TAX UInt8, + LO_COMMITDATE Date, + LO_SHIPMODE LowCardinality(String) +) +ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); + +CREATE TABLE part +( + P_PARTKEY UInt32, + P_NAME String, + P_MFGR LowCardinality(String), + P_CATEGORY LowCardinality(String), + P_BRAND LowCardinality(String), + P_COLOR LowCardinality(String), + P_TYPE LowCardinality(String), + P_SIZE UInt8, + P_CONTAINER LowCardinality(String) +) +ENGINE = MergeTree ORDER BY P_PARTKEY; + +CREATE TABLE supplier +( + S_SUPPKEY UInt32, + S_NAME String, + S_ADDRESS String, + S_CITY LowCardinality(String), + S_NATION LowCardinality(String), + S_REGION LowCardinality(String), + S_PHONE String +) +ENGINE = MergeTree ORDER BY S_SUPPKEY; +``` + +Inserting data: + +``` bash +$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl +$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl +$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl +$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl +``` + +Converting “star schema” to denormalized “flat schema”: + +``` sql +SET max_memory_usage = 20000000000; + +CREATE TABLE lineorder_flat +ENGINE = MergeTree +PARTITION BY toYear(LO_ORDERDATE) +ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS +SELECT + l.LO_ORDERKEY AS LO_ORDERKEY, + l.LO_LINENUMBER AS LO_LINENUMBER, + l.LO_CUSTKEY AS LO_CUSTKEY, + l.LO_PARTKEY AS LO_PARTKEY, + l.LO_SUPPKEY AS LO_SUPPKEY, + l.LO_ORDERDATE AS LO_ORDERDATE, + l.LO_ORDERPRIORITY AS LO_ORDERPRIORITY, + l.LO_SHIPPRIORITY AS LO_SHIPPRIORITY, + l.LO_QUANTITY AS LO_QUANTITY, + l.LO_EXTENDEDPRICE AS LO_EXTENDEDPRICE, + l.LO_ORDTOTALPRICE AS LO_ORDTOTALPRICE, + l.LO_DISCOUNT AS LO_DISCOUNT, + l.LO_REVENUE AS LO_REVENUE, + l.LO_SUPPLYCOST AS LO_SUPPLYCOST, + l.LO_TAX AS LO_TAX, + l.LO_COMMITDATE AS LO_COMMITDATE, + l.LO_SHIPMODE AS LO_SHIPMODE, + c.C_NAME AS C_NAME, + c.C_ADDRESS AS C_ADDRESS, + c.C_CITY AS C_CITY, + c.C_NATION AS C_NATION, + c.C_REGION AS C_REGION, + c.C_PHONE AS C_PHONE, + c.C_MKTSEGMENT AS C_MKTSEGMENT, + s.S_NAME AS S_NAME, + s.S_ADDRESS AS S_ADDRESS, + s.S_CITY AS S_CITY, + s.S_NATION AS S_NATION, + s.S_REGION AS S_REGION, + s.S_PHONE AS S_PHONE, + p.P_NAME AS P_NAME, + p.P_MFGR AS P_MFGR, + p.P_CATEGORY AS P_CATEGORY, + p.P_BRAND AS P_BRAND, + p.P_COLOR AS P_COLOR, + p.P_TYPE AS P_TYPE, + p.P_SIZE 
AS P_SIZE, + p.P_CONTAINER AS P_CONTAINER +FROM lineorder AS l +INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY +INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY +INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; +``` + +Running the queries: + +Q1.1 + +``` sql +SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue +FROM lineorder_flat +WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; +``` + +Q1.2 + +``` sql +SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue +FROM lineorder_flat +WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; +``` + +Q1.3 + +``` sql +SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue +FROM lineorder_flat +WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 + AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; +``` + +Q2.1 + +``` sql +SELECT + sum(LO_REVENUE), + toYear(LO_ORDERDATE) AS year, + P_BRAND +FROM lineorder_flat +WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' +GROUP BY + year, + P_BRAND +ORDER BY + year, + P_BRAND; +``` + +Q2.2 + +``` sql +SELECT + sum(LO_REVENUE), + toYear(LO_ORDERDATE) AS year, + P_BRAND +FROM lineorder_flat +WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA' +GROUP BY + year, + P_BRAND +ORDER BY + year, + P_BRAND; +``` + +Q2.3 + +``` sql +SELECT + sum(LO_REVENUE), + toYear(LO_ORDERDATE) AS year, + P_BRAND +FROM lineorder_flat +WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' +GROUP BY + year, + P_BRAND +ORDER BY + year, + P_BRAND; +``` + +Q3.1 + +``` sql +SELECT + C_NATION, + S_NATION, + toYear(LO_ORDERDATE) AS year, + sum(LO_REVENUE) AS revenue +FROM lineorder_flat +WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 +GROUP BY + C_NATION, + S_NATION, + year +ORDER BY + year ASC, + revenue DESC; +``` + +Q3.2 + +``` sql +SELECT + C_CITY, + S_CITY, + toYear(LO_ORDERDATE) AS year, + sum(LO_REVENUE) AS revenue +FROM lineorder_flat +WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 +GROUP BY + C_CITY, + S_CITY, + year +ORDER BY + year ASC, + revenue DESC; +``` + +Q3.3 + +``` sql +SELECT + C_CITY, + S_CITY, + toYear(LO_ORDERDATE) AS year, + sum(LO_REVENUE) AS revenue +FROM lineorder_flat +WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 +GROUP BY + C_CITY, + S_CITY, + year +ORDER BY + year ASC, + revenue DESC; +``` + +Q3.4 + +``` sql +SELECT + C_CITY, + S_CITY, + toYear(LO_ORDERDATE) AS year, + sum(LO_REVENUE) AS revenue +FROM lineorder_flat +WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = 199712 +GROUP BY + C_CITY, + S_CITY, + year +ORDER BY + year ASC, + revenue DESC; +``` + +Q4.1 + +``` sql +SELECT + toYear(LO_ORDERDATE) AS year, + C_NATION, + sum(LO_REVENUE - LO_SUPPLYCOST) AS profit +FROM lineorder_flat +WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') +GROUP BY + year, + C_NATION +ORDER BY + year ASC, + C_NATION ASC; +``` + +Q4.2 + +``` sql +SELECT + toYear(LO_ORDERDATE) AS year, + S_NATION, + P_CATEGORY, + sum(LO_REVENUE - LO_SUPPLYCOST) AS profit +FROM lineorder_flat +WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') +GROUP BY + year, + S_NATION, + P_CATEGORY +ORDER BY + year ASC, + 
S_NATION ASC,
+    P_CATEGORY ASC;
+```
+
+Q4.3
+
+``` sql
+SELECT
+    toYear(LO_ORDERDATE) AS year,
+    S_CITY,
+    P_BRAND,
+    sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
+FROM lineorder_flat
+WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14'
+GROUP BY
+    year,
+    S_CITY,
+    P_BRAND
+ORDER BY
+    year ASC,
+    S_CITY ASC,
+    P_BRAND ASC;
+```
+
+[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/star_schema/)
diff --git a/docs/en/reference/getting-started/example-datasets/uk-price-paid.md b/docs/en/reference/getting-started/example-datasets/uk-price-paid.md
new file mode 100644
index 00000000000..e19e801dcf9
--- /dev/null
+++ b/docs/en/reference/getting-started/example-datasets/uk-price-paid.md
@@ -0,0 +1,648 @@
+---
+sidebar_label: UK Property Price Paid
+---
+
+# UK Property Price Paid
+
+The dataset contains data about prices paid for real-estate property in England and Wales. The data has been available since 1995.
+The size of the dataset in uncompressed form is about 4 GiB, and it takes about 278 MiB in ClickHouse.
+
+Source: https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads
+Description of the fields: https://www.gov.uk/guidance/about-the-price-paid-data
+
+Contains HM Land Registry data © Crown copyright and database right 2021. This data is licensed under the Open Government Licence v3.0.
+
+## Download the Dataset {#download-dataset}
+
+Run the command:
+
+```bash
+wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv
+```
+
+The download takes about 2 minutes with a good internet connection.
+
+## Create the Table {#create-table}
+
+```sql
+CREATE TABLE uk_price_paid
+(
+    price UInt32,
+    date Date,
+    postcode1 LowCardinality(String),
+    postcode2 LowCardinality(String),
+    type Enum8('terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4, 'other' = 0),
+    is_new UInt8,
+    duration Enum8('freehold' = 1, 'leasehold' = 2, 'unknown' = 0),
+    addr1 String,
+    addr2 String,
+    street LowCardinality(String),
+    locality LowCardinality(String),
+    town LowCardinality(String),
+    district LowCardinality(String),
+    county LowCardinality(String),
+    category UInt8
+) ENGINE = MergeTree ORDER BY (postcode1, postcode2, addr1, addr2);
+```
+
+## Preprocess and Import Data {#preprocess-import-data}
+
+We will use the `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
+
+In this example, we define the structure of the source data from the CSV file and specify a query to preprocess the data with `clickhouse-local`.
+
+The preprocessing is:
+- splitting the postcode into two different columns `postcode1` and `postcode2`, which is better for storage and queries;
+- converting the `time` field to a date, as it only contains 00:00 time;
+- ignoring the [UUID](../sql-reference/data-types/uuid.md) field because we don't need it for analysis;
+- transforming `type` and `duration` into more readable Enum fields with the [transform](../sql-reference/functions/other-functions.md#transform) function;
+- transforming the `is_new` and `category` fields from single-character strings (`Y`/`N` and `A`/`B`) into [UInt8](../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) fields with 0 and 1.
+
+The preprocessed data is piped directly to `clickhouse-client` and inserted into the ClickHouse table in a streaming fashion.
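+
+If you want to see what the key transformations do before running the full import, here is a minimal stand-alone sketch (the postcode value is made up for illustration):
+
+```sql
+SELECT
+    splitByChar(' ', 'SW1A 1AA') AS postcode_parts,  -- ['SW1A', '1AA']
+    transform('T', ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type,  -- 'terraced'
+    'Y' = 'Y' AS is_new;  -- 1
+```
+
+The full preprocessing and import command is below.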
+ +```bash +clickhouse-local --input-format CSV --structure ' + uuid String, + price UInt32, + time DateTime, + postcode String, + a String, + b String, + c String, + addr1 String, + addr2 String, + street String, + locality String, + town String, + district String, + county String, + d String, + e String +' --query " + WITH splitByChar(' ', postcode) AS p + SELECT + price, + toDate(time) AS date, + p[1] AS postcode1, + p[2] AS postcode2, + transform(a, ['T', 'S', 'D', 'F', 'O'], ['terraced', 'semi-detached', 'detached', 'flat', 'other']) AS type, + b = 'Y' AS is_new, + transform(c, ['F', 'L', 'U'], ['freehold', 'leasehold', 'unknown']) AS duration, + addr1, + addr2, + street, + locality, + town, + district, + county, + d = 'B' AS category + FROM table" --date_time_input_format best_effort < pp-complete.csv | clickhouse-client --query "INSERT INTO uk_price_paid FORMAT TSV" +``` + +It will take about 40 seconds. + +## Validate the Data {#validate-data} + +Query: + +```sql +SELECT count() FROM uk_price_paid; +``` + +Result: + +```text +┌──count()─┐ +│ 26321785 │ +└──────────┘ +``` + +The size of dataset in ClickHouse is just 278 MiB, check it. + +Query: + +```sql +SELECT formatReadableSize(total_bytes) FROM system.tables WHERE name = 'uk_price_paid'; +``` + +Result: + +```text +┌─formatReadableSize(total_bytes)─┐ +│ 278.80 MiB │ +└─────────────────────────────────┘ +``` + +## Run Some Queries {#run-queries} + +### Query 1. Average Price Per Year {#average-price} + +Query: + +```sql +SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 1000000, 80) FROM uk_price_paid GROUP BY year ORDER BY year; +``` + +Result: + +```text +┌─year─┬──price─┬─bar(round(avg(price)), 0, 1000000, 80)─┐ +│ 1995 │ 67932 │ █████▍ │ +│ 1996 │ 71505 │ █████▋ │ +│ 1997 │ 78532 │ ██████▎ │ +│ 1998 │ 85436 │ ██████▋ │ +│ 1999 │ 96037 │ ███████▋ │ +│ 2000 │ 107479 │ ████████▌ │ +│ 2001 │ 118885 │ █████████▌ │ +│ 2002 │ 137941 │ ███████████ │ +│ 2003 │ 155889 │ ████████████▍ │ +│ 2004 │ 178885 │ ██████████████▎ │ +│ 2005 │ 189351 │ ███████████████▏ │ +│ 2006 │ 203528 │ ████████████████▎ │ +│ 2007 │ 219378 │ █████████████████▌ │ +│ 2008 │ 217056 │ █████████████████▎ │ +│ 2009 │ 213419 │ █████████████████ │ +│ 2010 │ 236109 │ ██████████████████▊ │ +│ 2011 │ 232805 │ ██████████████████▌ │ +│ 2012 │ 238367 │ ███████████████████ │ +│ 2013 │ 256931 │ ████████████████████▌ │ +│ 2014 │ 279915 │ ██████████████████████▍ │ +│ 2015 │ 297266 │ ███████████████████████▋ │ +│ 2016 │ 313201 │ █████████████████████████ │ +│ 2017 │ 346097 │ ███████████████████████████▋ │ +│ 2018 │ 350116 │ ████████████████████████████ │ +│ 2019 │ 351013 │ ████████████████████████████ │ +│ 2020 │ 369420 │ █████████████████████████████▌ │ +│ 2021 │ 386903 │ ██████████████████████████████▊ │ +└──────┴────────┴────────────────────────────────────────┘ +``` + +### Query 2. 
Average Price per Year in London {#average-price-london} + +Query: + +```sql +SELECT toYear(date) AS year, round(avg(price)) AS price, bar(price, 0, 2000000, 100) FROM uk_price_paid WHERE town = 'LONDON' GROUP BY year ORDER BY year; +``` + +Result: + +```text +┌─year─┬───price─┬─bar(round(avg(price)), 0, 2000000, 100)───────────────┐ +│ 1995 │ 109116 │ █████▍ │ +│ 1996 │ 118667 │ █████▊ │ +│ 1997 │ 136518 │ ██████▋ │ +│ 1998 │ 152983 │ ███████▋ │ +│ 1999 │ 180637 │ █████████ │ +│ 2000 │ 215838 │ ██████████▋ │ +│ 2001 │ 232994 │ ███████████▋ │ +│ 2002 │ 263670 │ █████████████▏ │ +│ 2003 │ 278394 │ █████████████▊ │ +│ 2004 │ 304666 │ ███████████████▏ │ +│ 2005 │ 322875 │ ████████████████▏ │ +│ 2006 │ 356191 │ █████████████████▋ │ +│ 2007 │ 404054 │ ████████████████████▏ │ +│ 2008 │ 420741 │ █████████████████████ │ +│ 2009 │ 427753 │ █████████████████████▍ │ +│ 2010 │ 480306 │ ████████████████████████ │ +│ 2011 │ 496274 │ ████████████████████████▋ │ +│ 2012 │ 519442 │ █████████████████████████▊ │ +│ 2013 │ 616212 │ ██████████████████████████████▋ │ +│ 2014 │ 724154 │ ████████████████████████████████████▏ │ +│ 2015 │ 792129 │ ███████████████████████████████████████▌ │ +│ 2016 │ 843655 │ ██████████████████████████████████████████▏ │ +│ 2017 │ 982642 │ █████████████████████████████████████████████████▏ │ +│ 2018 │ 1016835 │ ██████████████████████████████████████████████████▋ │ +│ 2019 │ 1042849 │ ████████████████████████████████████████████████████▏ │ +│ 2020 │ 1011889 │ ██████████████████████████████████████████████████▌ │ +│ 2021 │ 960343 │ ████████████████████████████████████████████████ │ +└──────┴─────────┴───────────────────────────────────────────────────────┘ +``` + +Something happened in 2013. I don't have a clue. Maybe you have a clue what happened in 2020? + +### Query 3. 
The Most Expensive Neighborhoods {#most-expensive-neighborhoods} + +Query: + +```sql +SELECT + town, + district, + count() AS c, + round(avg(price)) AS price, + bar(price, 0, 5000000, 100) +FROM uk_price_paid +WHERE date >= '2020-01-01' +GROUP BY + town, + district +HAVING c >= 100 +ORDER BY price DESC +LIMIT 100; +``` + +Result: + +```text + +┌─town─────────────────┬─district───────────────┬────c─┬───price─┬─bar(round(avg(price)), 0, 5000000, 100)────────────────────────────┐ +│ LONDON │ CITY OF WESTMINSTER │ 3606 │ 3280239 │ █████████████████████████████████████████████████████████████████▌ │ +│ LONDON │ CITY OF LONDON │ 274 │ 3160502 │ ███████████████████████████████████████████████████████████████▏ │ +│ LONDON │ KENSINGTON AND CHELSEA │ 2550 │ 2308478 │ ██████████████████████████████████████████████▏ │ +│ LEATHERHEAD │ ELMBRIDGE │ 114 │ 1897407 │ █████████████████████████████████████▊ │ +│ LONDON │ CAMDEN │ 3033 │ 1805404 │ ████████████████████████████████████ │ +│ VIRGINIA WATER │ RUNNYMEDE │ 156 │ 1753247 │ ███████████████████████████████████ │ +│ WINDLESHAM │ SURREY HEATH │ 108 │ 1677613 │ █████████████████████████████████▌ │ +│ THORNTON HEATH │ CROYDON │ 546 │ 1671721 │ █████████████████████████████████▍ │ +│ BARNET │ ENFIELD │ 124 │ 1505840 │ ██████████████████████████████ │ +│ COBHAM │ ELMBRIDGE │ 387 │ 1237250 │ ████████████████████████▋ │ +│ LONDON │ ISLINGTON │ 2668 │ 1236980 │ ████████████████████████▋ │ +│ OXFORD │ SOUTH OXFORDSHIRE │ 321 │ 1220907 │ ████████████████████████▍ │ +│ LONDON │ RICHMOND UPON THAMES │ 704 │ 1215551 │ ████████████████████████▎ │ +│ LONDON │ HOUNSLOW │ 671 │ 1207493 │ ████████████████████████▏ │ +│ ASCOT │ WINDSOR AND MAIDENHEAD │ 407 │ 1183299 │ ███████████████████████▋ │ +│ BEACONSFIELD │ BUCKINGHAMSHIRE │ 330 │ 1175615 │ ███████████████████████▌ │ +│ RICHMOND │ RICHMOND UPON THAMES │ 874 │ 1110444 │ ██████████████████████▏ │ +│ LONDON │ HAMMERSMITH AND FULHAM │ 3086 │ 1053983 │ █████████████████████ │ +│ SURBITON │ ELMBRIDGE │ 100 │ 1011800 │ ████████████████████▏ │ +│ RADLETT │ HERTSMERE │ 283 │ 1011712 │ ████████████████████▏ │ +│ SALCOMBE │ SOUTH HAMS │ 127 │ 1011624 │ ████████████████████▏ │ +│ WEYBRIDGE │ ELMBRIDGE │ 655 │ 1007265 │ ████████████████████▏ │ +│ ESHER │ ELMBRIDGE │ 485 │ 986581 │ ███████████████████▋ │ +│ LEATHERHEAD │ GUILDFORD │ 202 │ 977320 │ ███████████████████▌ │ +│ BURFORD │ WEST OXFORDSHIRE │ 111 │ 966893 │ ███████████████████▎ │ +│ BROCKENHURST │ NEW FOREST │ 129 │ 956675 │ ███████████████████▏ │ +│ HINDHEAD │ WAVERLEY │ 137 │ 953753 │ ███████████████████ │ +│ GERRARDS CROSS │ BUCKINGHAMSHIRE │ 419 │ 951121 │ ███████████████████ │ +│ EAST MOLESEY │ ELMBRIDGE │ 192 │ 936769 │ ██████████████████▋ │ +│ CHALFONT ST GILES │ BUCKINGHAMSHIRE │ 146 │ 925515 │ ██████████████████▌ │ +│ LONDON │ TOWER HAMLETS │ 4388 │ 918304 │ ██████████████████▎ │ +│ OLNEY │ MILTON KEYNES │ 235 │ 910646 │ ██████████████████▏ │ +│ HENLEY-ON-THAMES │ SOUTH OXFORDSHIRE │ 540 │ 902418 │ ██████████████████ │ +│ LONDON │ SOUTHWARK │ 3885 │ 892997 │ █████████████████▋ │ +│ KINGSTON UPON THAMES │ KINGSTON UPON THAMES │ 960 │ 885969 │ █████████████████▋ │ +│ LONDON │ EALING │ 2658 │ 871755 │ █████████████████▍ │ +│ CRANBROOK │ TUNBRIDGE WELLS │ 431 │ 862348 │ █████████████████▏ │ +│ LONDON │ MERTON │ 2099 │ 859118 │ █████████████████▏ │ +│ BELVEDERE │ BEXLEY │ 346 │ 842423 │ ████████████████▋ │ +│ GUILDFORD │ WAVERLEY │ 143 │ 841277 │ ████████████████▋ │ +│ HARPENDEN │ ST ALBANS │ 657 │ 841216 │ ████████████████▋ │ +│ LONDON │ HACKNEY │ 3307 │ 837090 │ 
████████████████▋ │ +│ LONDON │ WANDSWORTH │ 6566 │ 832663 │ ████████████████▋ │ +│ MAIDENHEAD │ BUCKINGHAMSHIRE │ 123 │ 824299 │ ████████████████▍ │ +│ KINGS LANGLEY │ DACORUM │ 145 │ 821331 │ ████████████████▍ │ +│ BERKHAMSTED │ DACORUM │ 543 │ 818415 │ ████████████████▎ │ +│ GREAT MISSENDEN │ BUCKINGHAMSHIRE │ 226 │ 802807 │ ████████████████ │ +│ BILLINGSHURST │ CHICHESTER │ 144 │ 797829 │ ███████████████▊ │ +│ WOKING │ GUILDFORD │ 176 │ 793494 │ ███████████████▋ │ +│ STOCKBRIDGE │ TEST VALLEY │ 178 │ 793269 │ ███████████████▋ │ +│ EPSOM │ REIGATE AND BANSTEAD │ 172 │ 791862 │ ███████████████▋ │ +│ TONBRIDGE │ TUNBRIDGE WELLS │ 360 │ 787876 │ ███████████████▋ │ +│ TEDDINGTON │ RICHMOND UPON THAMES │ 595 │ 786492 │ ███████████████▋ │ +│ TWICKENHAM │ RICHMOND UPON THAMES │ 1155 │ 786193 │ ███████████████▋ │ +│ LYNDHURST │ NEW FOREST │ 102 │ 785593 │ ███████████████▋ │ +│ LONDON │ LAMBETH │ 5228 │ 774574 │ ███████████████▍ │ +│ LONDON │ BARNET │ 3955 │ 773259 │ ███████████████▍ │ +│ OXFORD │ VALE OF WHITE HORSE │ 353 │ 772088 │ ███████████████▍ │ +│ TONBRIDGE │ MAIDSTONE │ 305 │ 770740 │ ███████████████▍ │ +│ LUTTERWORTH │ HARBOROUGH │ 538 │ 768634 │ ███████████████▎ │ +│ WOODSTOCK │ WEST OXFORDSHIRE │ 140 │ 766037 │ ███████████████▎ │ +│ MIDHURST │ CHICHESTER │ 257 │ 764815 │ ███████████████▎ │ +│ MARLOW │ BUCKINGHAMSHIRE │ 327 │ 761876 │ ███████████████▏ │ +│ LONDON │ NEWHAM │ 3237 │ 761784 │ ███████████████▏ │ +│ ALDERLEY EDGE │ CHESHIRE EAST │ 178 │ 757318 │ ███████████████▏ │ +│ LUTON │ CENTRAL BEDFORDSHIRE │ 212 │ 754283 │ ███████████████ │ +│ PETWORTH │ CHICHESTER │ 154 │ 754220 │ ███████████████ │ +│ ALRESFORD │ WINCHESTER │ 219 │ 752718 │ ███████████████ │ +│ POTTERS BAR │ WELWYN HATFIELD │ 174 │ 748465 │ ██████████████▊ │ +│ HASLEMERE │ CHICHESTER │ 128 │ 746907 │ ██████████████▊ │ +│ TADWORTH │ REIGATE AND BANSTEAD │ 502 │ 743252 │ ██████████████▋ │ +│ THAMES DITTON │ ELMBRIDGE │ 244 │ 741913 │ ██████████████▋ │ +│ REIGATE │ REIGATE AND BANSTEAD │ 581 │ 738198 │ ██████████████▋ │ +│ BOURNE END │ BUCKINGHAMSHIRE │ 138 │ 735190 │ ██████████████▋ │ +│ SEVENOAKS │ SEVENOAKS │ 1156 │ 730018 │ ██████████████▌ │ +│ OXTED │ TANDRIDGE │ 336 │ 729123 │ ██████████████▌ │ +│ INGATESTONE │ BRENTWOOD │ 166 │ 728103 │ ██████████████▌ │ +│ LONDON │ BRENT │ 2079 │ 720605 │ ██████████████▍ │ +│ LONDON │ HARINGEY │ 3216 │ 717780 │ ██████████████▎ │ +│ PURLEY │ CROYDON │ 575 │ 716108 │ ██████████████▎ │ +│ WELWYN │ WELWYN HATFIELD │ 222 │ 710603 │ ██████████████▏ │ +│ RICKMANSWORTH │ THREE RIVERS │ 798 │ 704571 │ ██████████████ │ +│ BANSTEAD │ REIGATE AND BANSTEAD │ 401 │ 701293 │ ██████████████ │ +│ CHIGWELL │ EPPING FOREST │ 261 │ 701203 │ ██████████████ │ +│ PINNER │ HARROW │ 528 │ 698885 │ █████████████▊ │ +│ HASLEMERE │ WAVERLEY │ 280 │ 696659 │ █████████████▊ │ +│ SLOUGH │ BUCKINGHAMSHIRE │ 396 │ 694917 │ █████████████▊ │ +│ WALTON-ON-THAMES │ ELMBRIDGE │ 946 │ 692395 │ █████████████▋ │ +│ READING │ SOUTH OXFORDSHIRE │ 318 │ 691988 │ █████████████▋ │ +│ NORTHWOOD │ HILLINGDON │ 271 │ 690643 │ █████████████▋ │ +│ FELTHAM │ HOUNSLOW │ 763 │ 688595 │ █████████████▋ │ +│ ASHTEAD │ MOLE VALLEY │ 303 │ 687923 │ █████████████▋ │ +│ BARNET │ BARNET │ 975 │ 686980 │ █████████████▋ │ +│ WOKING │ SURREY HEATH │ 283 │ 686669 │ █████████████▋ │ +│ MALMESBURY │ WILTSHIRE │ 323 │ 683324 │ █████████████▋ │ +│ AMERSHAM │ BUCKINGHAMSHIRE │ 496 │ 680962 │ █████████████▌ │ +│ CHISLEHURST │ BROMLEY │ 430 │ 680209 │ █████████████▌ │ +│ HYTHE │ FOLKESTONE AND HYTHE │ 490 │ 676908 │ █████████████▌ │ +│ MAYFIELD │ 
WEALDEN │ 101 │ 676210 │ █████████████▌ │ +│ ASCOT │ BRACKNELL FOREST │ 168 │ 676004 │ █████████████▌ │ +└──────────────────────┴────────────────────────┴──────┴─────────┴────────────────────────────────────────────────────────────────────┘ +``` + +## Let's Speed Up Queries Using Projections {#speedup-with-projections} + +[Projections](../sql-reference/statements/alter/projection.md) allow to improve queries speed by storing pre-aggregated data. + +### Build a Projection {#build-projection} + +Create an aggregate projection by dimensions `toYear(date)`, `district`, `town`: + +```sql +ALTER TABLE uk_price_paid + ADD PROJECTION projection_by_year_district_town + ( + SELECT + toYear(date), + district, + town, + avg(price), + sum(price), + count() + GROUP BY + toYear(date), + district, + town + ); +``` + +Populate the projection for existing data (without it projection will be created for only newly inserted data): + +```sql +ALTER TABLE uk_price_paid + MATERIALIZE PROJECTION projection_by_year_district_town +SETTINGS mutations_sync = 1; +``` + +## Test Performance {#test-performance} + +Let's run the same 3 queries. + +[Enable](../operations/settings/settings.md#allow-experimental-projection-optimization) projections for selects: + +```sql +SET allow_experimental_projection_optimization = 1; +``` + +### Query 1. Average Price Per Year {#average-price-projections} + +Query: + +```sql +SELECT + toYear(date) AS year, + round(avg(price)) AS price, + bar(price, 0, 1000000, 80) +FROM uk_price_paid +GROUP BY year +ORDER BY year ASC; +``` + +Result: + +```text +┌─year─┬──price─┬─bar(round(avg(price)), 0, 1000000, 80)─┐ +│ 1995 │ 67932 │ █████▍ │ +│ 1996 │ 71505 │ █████▋ │ +│ 1997 │ 78532 │ ██████▎ │ +│ 1998 │ 85436 │ ██████▋ │ +│ 1999 │ 96037 │ ███████▋ │ +│ 2000 │ 107479 │ ████████▌ │ +│ 2001 │ 118885 │ █████████▌ │ +│ 2002 │ 137941 │ ███████████ │ +│ 2003 │ 155889 │ ████████████▍ │ +│ 2004 │ 178885 │ ██████████████▎ │ +│ 2005 │ 189351 │ ███████████████▏ │ +│ 2006 │ 203528 │ ████████████████▎ │ +│ 2007 │ 219378 │ █████████████████▌ │ +│ 2008 │ 217056 │ █████████████████▎ │ +│ 2009 │ 213419 │ █████████████████ │ +│ 2010 │ 236109 │ ██████████████████▊ │ +│ 2011 │ 232805 │ ██████████████████▌ │ +│ 2012 │ 238367 │ ███████████████████ │ +│ 2013 │ 256931 │ ████████████████████▌ │ +│ 2014 │ 279915 │ ██████████████████████▍ │ +│ 2015 │ 297266 │ ███████████████████████▋ │ +│ 2016 │ 313201 │ █████████████████████████ │ +│ 2017 │ 346097 │ ███████████████████████████▋ │ +│ 2018 │ 350116 │ ████████████████████████████ │ +│ 2019 │ 351013 │ ████████████████████████████ │ +│ 2020 │ 369420 │ █████████████████████████████▌ │ +│ 2021 │ 386903 │ ██████████████████████████████▊ │ +└──────┴────────┴────────────────────────────────────────┘ +``` + +### Query 2. 
Average Price Per Year in London {#average-price-london-projections} + +Query: + +```sql +SELECT + toYear(date) AS year, + round(avg(price)) AS price, + bar(price, 0, 2000000, 100) +FROM uk_price_paid +WHERE town = 'LONDON' +GROUP BY year +ORDER BY year ASC; +``` + +Result: + +```text +┌─year─┬───price─┬─bar(round(avg(price)), 0, 2000000, 100)───────────────┐ +│ 1995 │ 109116 │ █████▍ │ +│ 1996 │ 118667 │ █████▊ │ +│ 1997 │ 136518 │ ██████▋ │ +│ 1998 │ 152983 │ ███████▋ │ +│ 1999 │ 180637 │ █████████ │ +│ 2000 │ 215838 │ ██████████▋ │ +│ 2001 │ 232994 │ ███████████▋ │ +│ 2002 │ 263670 │ █████████████▏ │ +│ 2003 │ 278394 │ █████████████▊ │ +│ 2004 │ 304666 │ ███████████████▏ │ +│ 2005 │ 322875 │ ████████████████▏ │ +│ 2006 │ 356191 │ █████████████████▋ │ +│ 2007 │ 404054 │ ████████████████████▏ │ +│ 2008 │ 420741 │ █████████████████████ │ +│ 2009 │ 427753 │ █████████████████████▍ │ +│ 2010 │ 480306 │ ████████████████████████ │ +│ 2011 │ 496274 │ ████████████████████████▋ │ +│ 2012 │ 519442 │ █████████████████████████▊ │ +│ 2013 │ 616212 │ ██████████████████████████████▋ │ +│ 2014 │ 724154 │ ████████████████████████████████████▏ │ +│ 2015 │ 792129 │ ███████████████████████████████████████▌ │ +│ 2016 │ 843655 │ ██████████████████████████████████████████▏ │ +│ 2017 │ 982642 │ █████████████████████████████████████████████████▏ │ +│ 2018 │ 1016835 │ ██████████████████████████████████████████████████▋ │ +│ 2019 │ 1042849 │ ████████████████████████████████████████████████████▏ │ +│ 2020 │ 1011889 │ ██████████████████████████████████████████████████▌ │ +│ 2021 │ 960343 │ ████████████████████████████████████████████████ │ +└──────┴─────────┴───────────────────────────────────────────────────────┘ +``` + +### Query 3. The Most Expensive Neighborhoods {#most-expensive-neighborhoods-projections} + +The condition (date >= '2020-01-01') needs to be modified to match projection dimension (toYear(date) >= 2020). 
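+
+As a side note, if you want to confirm that a query was actually served by the projection, one way (assuming the query log is enabled and your ClickHouse version has the `projections` column in `system.query_log`) is a sketch like this:
+
+```sql
+SELECT query, projections
+FROM system.query_log
+WHERE type = 'QueryFinish' AND query LIKE '%uk_price_paid%'
+ORDER BY event_time DESC
+LIMIT 5;
+```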
+ +Query: + +```sql +SELECT + town, + district, + count() AS c, + round(avg(price)) AS price, + bar(price, 0, 5000000, 100) +FROM uk_price_paid +WHERE toYear(date) >= 2020 +GROUP BY + town, + district +HAVING c >= 100 +ORDER BY price DESC +LIMIT 100; +``` + +Result: + +```text +┌─town─────────────────┬─district───────────────┬────c─┬───price─┬─bar(round(avg(price)), 0, 5000000, 100)────────────────────────────┐ +│ LONDON │ CITY OF WESTMINSTER │ 3606 │ 3280239 │ █████████████████████████████████████████████████████████████████▌ │ +│ LONDON │ CITY OF LONDON │ 274 │ 3160502 │ ███████████████████████████████████████████████████████████████▏ │ +│ LONDON │ KENSINGTON AND CHELSEA │ 2550 │ 2308478 │ ██████████████████████████████████████████████▏ │ +│ LEATHERHEAD │ ELMBRIDGE │ 114 │ 1897407 │ █████████████████████████████████████▊ │ +│ LONDON │ CAMDEN │ 3033 │ 1805404 │ ████████████████████████████████████ │ +│ VIRGINIA WATER │ RUNNYMEDE │ 156 │ 1753247 │ ███████████████████████████████████ │ +│ WINDLESHAM │ SURREY HEATH │ 108 │ 1677613 │ █████████████████████████████████▌ │ +│ THORNTON HEATH │ CROYDON │ 546 │ 1671721 │ █████████████████████████████████▍ │ +│ BARNET │ ENFIELD │ 124 │ 1505840 │ ██████████████████████████████ │ +│ COBHAM │ ELMBRIDGE │ 387 │ 1237250 │ ████████████████████████▋ │ +│ LONDON │ ISLINGTON │ 2668 │ 1236980 │ ████████████████████████▋ │ +│ OXFORD │ SOUTH OXFORDSHIRE │ 321 │ 1220907 │ ████████████████████████▍ │ +│ LONDON │ RICHMOND UPON THAMES │ 704 │ 1215551 │ ████████████████████████▎ │ +│ LONDON │ HOUNSLOW │ 671 │ 1207493 │ ████████████████████████▏ │ +│ ASCOT │ WINDSOR AND MAIDENHEAD │ 407 │ 1183299 │ ███████████████████████▋ │ +│ BEACONSFIELD │ BUCKINGHAMSHIRE │ 330 │ 1175615 │ ███████████████████████▌ │ +│ RICHMOND │ RICHMOND UPON THAMES │ 874 │ 1110444 │ ██████████████████████▏ │ +│ LONDON │ HAMMERSMITH AND FULHAM │ 3086 │ 1053983 │ █████████████████████ │ +│ SURBITON │ ELMBRIDGE │ 100 │ 1011800 │ ████████████████████▏ │ +│ RADLETT │ HERTSMERE │ 283 │ 1011712 │ ████████████████████▏ │ +│ SALCOMBE │ SOUTH HAMS │ 127 │ 1011624 │ ████████████████████▏ │ +│ WEYBRIDGE │ ELMBRIDGE │ 655 │ 1007265 │ ████████████████████▏ │ +│ ESHER │ ELMBRIDGE │ 485 │ 986581 │ ███████████████████▋ │ +│ LEATHERHEAD │ GUILDFORD │ 202 │ 977320 │ ███████████████████▌ │ +│ BURFORD │ WEST OXFORDSHIRE │ 111 │ 966893 │ ███████████████████▎ │ +│ BROCKENHURST │ NEW FOREST │ 129 │ 956675 │ ███████████████████▏ │ +│ HINDHEAD │ WAVERLEY │ 137 │ 953753 │ ███████████████████ │ +│ GERRARDS CROSS │ BUCKINGHAMSHIRE │ 419 │ 951121 │ ███████████████████ │ +│ EAST MOLESEY │ ELMBRIDGE │ 192 │ 936769 │ ██████████████████▋ │ +│ CHALFONT ST GILES │ BUCKINGHAMSHIRE │ 146 │ 925515 │ ██████████████████▌ │ +│ LONDON │ TOWER HAMLETS │ 4388 │ 918304 │ ██████████████████▎ │ +│ OLNEY │ MILTON KEYNES │ 235 │ 910646 │ ██████████████████▏ │ +│ HENLEY-ON-THAMES │ SOUTH OXFORDSHIRE │ 540 │ 902418 │ ██████████████████ │ +│ LONDON │ SOUTHWARK │ 3885 │ 892997 │ █████████████████▋ │ +│ KINGSTON UPON THAMES │ KINGSTON UPON THAMES │ 960 │ 885969 │ █████████████████▋ │ +│ LONDON │ EALING │ 2658 │ 871755 │ █████████████████▍ │ +│ CRANBROOK │ TUNBRIDGE WELLS │ 431 │ 862348 │ █████████████████▏ │ +│ LONDON │ MERTON │ 2099 │ 859118 │ █████████████████▏ │ +│ BELVEDERE │ BEXLEY │ 346 │ 842423 │ ████████████████▋ │ +│ GUILDFORD │ WAVERLEY │ 143 │ 841277 │ ████████████████▋ │ +│ HARPENDEN │ ST ALBANS │ 657 │ 841216 │ ████████████████▋ │ +│ LONDON │ HACKNEY │ 3307 │ 837090 │ ████████████████▋ │ +│ LONDON │ WANDSWORTH │ 6566 │ 832663 │ 
████████████████▋ │ +│ MAIDENHEAD │ BUCKINGHAMSHIRE │ 123 │ 824299 │ ████████████████▍ │ +│ KINGS LANGLEY │ DACORUM │ 145 │ 821331 │ ████████████████▍ │ +│ BERKHAMSTED │ DACORUM │ 543 │ 818415 │ ████████████████▎ │ +│ GREAT MISSENDEN │ BUCKINGHAMSHIRE │ 226 │ 802807 │ ████████████████ │ +│ BILLINGSHURST │ CHICHESTER │ 144 │ 797829 │ ███████████████▊ │ +│ WOKING │ GUILDFORD │ 176 │ 793494 │ ███████████████▋ │ +│ STOCKBRIDGE │ TEST VALLEY │ 178 │ 793269 │ ███████████████▋ │ +│ EPSOM │ REIGATE AND BANSTEAD │ 172 │ 791862 │ ███████████████▋ │ +│ TONBRIDGE │ TUNBRIDGE WELLS │ 360 │ 787876 │ ███████████████▋ │ +│ TEDDINGTON │ RICHMOND UPON THAMES │ 595 │ 786492 │ ███████████████▋ │ +│ TWICKENHAM │ RICHMOND UPON THAMES │ 1155 │ 786193 │ ███████████████▋ │ +│ LYNDHURST │ NEW FOREST │ 102 │ 785593 │ ███████████████▋ │ +│ LONDON │ LAMBETH │ 5228 │ 774574 │ ███████████████▍ │ +│ LONDON │ BARNET │ 3955 │ 773259 │ ███████████████▍ │ +│ OXFORD │ VALE OF WHITE HORSE │ 353 │ 772088 │ ███████████████▍ │ +│ TONBRIDGE │ MAIDSTONE │ 305 │ 770740 │ ███████████████▍ │ +│ LUTTERWORTH │ HARBOROUGH │ 538 │ 768634 │ ███████████████▎ │ +│ WOODSTOCK │ WEST OXFORDSHIRE │ 140 │ 766037 │ ███████████████▎ │ +│ MIDHURST │ CHICHESTER │ 257 │ 764815 │ ███████████████▎ │ +│ MARLOW │ BUCKINGHAMSHIRE │ 327 │ 761876 │ ███████████████▏ │ +│ LONDON │ NEWHAM │ 3237 │ 761784 │ ███████████████▏ │ +│ ALDERLEY EDGE │ CHESHIRE EAST │ 178 │ 757318 │ ███████████████▏ │ +│ LUTON │ CENTRAL BEDFORDSHIRE │ 212 │ 754283 │ ███████████████ │ +│ PETWORTH │ CHICHESTER │ 154 │ 754220 │ ███████████████ │ +│ ALRESFORD │ WINCHESTER │ 219 │ 752718 │ ███████████████ │ +│ POTTERS BAR │ WELWYN HATFIELD │ 174 │ 748465 │ ██████████████▊ │ +│ HASLEMERE │ CHICHESTER │ 128 │ 746907 │ ██████████████▊ │ +│ TADWORTH │ REIGATE AND BANSTEAD │ 502 │ 743252 │ ██████████████▋ │ +│ THAMES DITTON │ ELMBRIDGE │ 244 │ 741913 │ ██████████████▋ │ +│ REIGATE │ REIGATE AND BANSTEAD │ 581 │ 738198 │ ██████████████▋ │ +│ BOURNE END │ BUCKINGHAMSHIRE │ 138 │ 735190 │ ██████████████▋ │ +│ SEVENOAKS │ SEVENOAKS │ 1156 │ 730018 │ ██████████████▌ │ +│ OXTED │ TANDRIDGE │ 336 │ 729123 │ ██████████████▌ │ +│ INGATESTONE │ BRENTWOOD │ 166 │ 728103 │ ██████████████▌ │ +│ LONDON │ BRENT │ 2079 │ 720605 │ ██████████████▍ │ +│ LONDON │ HARINGEY │ 3216 │ 717780 │ ██████████████▎ │ +│ PURLEY │ CROYDON │ 575 │ 716108 │ ██████████████▎ │ +│ WELWYN │ WELWYN HATFIELD │ 222 │ 710603 │ ██████████████▏ │ +│ RICKMANSWORTH │ THREE RIVERS │ 798 │ 704571 │ ██████████████ │ +│ BANSTEAD │ REIGATE AND BANSTEAD │ 401 │ 701293 │ ██████████████ │ +│ CHIGWELL │ EPPING FOREST │ 261 │ 701203 │ ██████████████ │ +│ PINNER │ HARROW │ 528 │ 698885 │ █████████████▊ │ +│ HASLEMERE │ WAVERLEY │ 280 │ 696659 │ █████████████▊ │ +│ SLOUGH │ BUCKINGHAMSHIRE │ 396 │ 694917 │ █████████████▊ │ +│ WALTON-ON-THAMES │ ELMBRIDGE │ 946 │ 692395 │ █████████████▋ │ +│ READING │ SOUTH OXFORDSHIRE │ 318 │ 691988 │ █████████████▋ │ +│ NORTHWOOD │ HILLINGDON │ 271 │ 690643 │ █████████████▋ │ +│ FELTHAM │ HOUNSLOW │ 763 │ 688595 │ █████████████▋ │ +│ ASHTEAD │ MOLE VALLEY │ 303 │ 687923 │ █████████████▋ │ +│ BARNET │ BARNET │ 975 │ 686980 │ █████████████▋ │ +│ WOKING │ SURREY HEATH │ 283 │ 686669 │ █████████████▋ │ +│ MALMESBURY │ WILTSHIRE │ 323 │ 683324 │ █████████████▋ │ +│ AMERSHAM │ BUCKINGHAMSHIRE │ 496 │ 680962 │ █████████████▌ │ +│ CHISLEHURST │ BROMLEY │ 430 │ 680209 │ █████████████▌ │ +│ HYTHE │ FOLKESTONE AND HYTHE │ 490 │ 676908 │ █████████████▌ │ +│ MAYFIELD │ WEALDEN │ 101 │ 676210 │ █████████████▌ │ +│ ASCOT │ BRACKNELL 
FOREST │ 168 │ 676004 │ █████████████▌ │ +└──────────────────────┴────────────────────────┴──────┴─────────┴────────────────────────────────────────────────────────────────────┘ +``` + +### Summary {#summary} + +All 3 queries work much faster and read fewer rows. + +```text +Query 1 + +no projection: 27 rows in set. Elapsed: 0.158 sec. Processed 26.32 million rows, 157.93 MB (166.57 million rows/s., 999.39 MB/s.) + projection: 27 rows in set. Elapsed: 0.007 sec. Processed 105.96 thousand rows, 3.33 MB (14.58 million rows/s., 458.13 MB/s.) + + +Query 2 + +no projection: 27 rows in set. Elapsed: 0.163 sec. Processed 26.32 million rows, 80.01 MB (161.75 million rows/s., 491.64 MB/s.) + projection: 27 rows in set. Elapsed: 0.008 sec. Processed 105.96 thousand rows, 3.67 MB (13.29 million rows/s., 459.89 MB/s.) + +Query 3 + +no projection: 100 rows in set. Elapsed: 0.069 sec. Processed 26.32 million rows, 62.47 MB (382.13 million rows/s., 906.93 MB/s.) + projection: 100 rows in set. Elapsed: 0.029 sec. Processed 8.08 thousand rows, 511.08 KB (276.06 thousand rows/s., 17.47 MB/s.) +``` + +### Test It in Playground {#playground} + +The dataset is also available in the [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==). diff --git a/docs/en/reference/getting-started/example-datasets/wikistat.md b/docs/en/reference/getting-started/example-datasets/wikistat.md new file mode 100644 index 00000000000..1185338a1da --- /dev/null +++ b/docs/en/reference/getting-started/example-datasets/wikistat.md @@ -0,0 +1,32 @@ +--- +sidebar_label: WikiStat +--- + +# WikiStat + +See http://dumps.wikimedia.org/other/pagecounts-raw/ for details. 
+ +Creating a table: + +``` sql +CREATE TABLE wikistat +( + date Date, + time DateTime, + project String, + subproject String, + path String, + hits UInt64, + size UInt64 +) ENGINE = MergeTree(date, (path, time), 8192); +``` + +Loading data: + +``` bash +$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt +$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done +$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done +``` + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/wikistat/) diff --git a/docs/en/reference/getting-started/install.md b/docs/en/reference/getting-started/install.md new file mode 100644 index 00000000000..5682e2a0861 --- /dev/null +++ b/docs/en/reference/getting-started/install.md @@ -0,0 +1,312 @@ +--- +sidebar_label: Installation +sidebar_position: 1 +keywords: [clickhouse, install, installation, docs] +description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. +slug: /en/getting-started/install +--- + +# Installation {#installation} + +## System Requirements {#system-requirements} + +ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. + +Official pre-built binaries are typically compiled for x86_64 and leverage SSE 4.2 instruction set, so unless otherwise stated usage of CPU that supports it becomes an additional system requirement. Here’s the command to check if current CPU has support for SSE 4.2: + +``` bash +$ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" +``` + +To run ClickHouse on processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should [build ClickHouse from sources](#from-sources) with proper configuration adjustments. + +## Available Installation Options {#available-installation-options} + +### From DEB Packages {#install-from-deb-packages} + +It is recommended to use official pre-compiled `deb` packages for Debian or Ubuntu. Run these commands to install packages: + +``` bash +sudo apt-get install -y apt-transport-https ca-certificates dirmngr +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 + +echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee \ + /etc/apt/sources.list.d/clickhouse.list +sudo apt-get update + +sudo apt-get install -y clickhouse-server clickhouse-client + +sudo service clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you've set up a password. +``` + +
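+
+If the installation succeeded and the server is running, a quick smoke test is to ask it for its version (the exact version string will differ on your machine):
+
+``` bash
+clickhouse-client --query "SELECT version()"
+```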
+Deprecated Method for installing deb-packages + +``` bash +sudo apt-get install apt-transport-https ca-certificates dirmngr +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 + +echo "deb https://repo.clickhouse.com/deb/stable/ main/" | sudo tee \ + /etc/apt/sources.list.d/clickhouse.list +sudo apt-get update + +sudo apt-get install -y clickhouse-server clickhouse-client + +sudo service clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you set up a password. +``` + +
+
+You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs.
+
+You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable).
+
+#### Packages {#packages}
+
+- `clickhouse-common-static` — Installs ClickHouse compiled binary files.
+- `clickhouse-server` — Creates a symbolic link for `clickhouse-server` and installs the default server configuration.
+- `clickhouse-client` — Creates a symbolic link for `clickhouse-client` and other client-related tools, and installs client configuration files.
+- `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info.
+
+:::info
+If you need to install a specific version of ClickHouse, you have to install all packages with the same version:
+`sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7`
+:::
+
+### From RPM Packages {#from-rpm-packages}
+
+It is recommended to use the official pre-compiled `rpm` packages for CentOS, RedHat, and all other rpm-based Linux distributions.
+
+First, you need to add the official repository:
+
+``` bash
+sudo yum install -y yum-utils
+sudo yum-config-manager --add-repo https://packages.clickhouse.com/rpm/clickhouse.repo
+sudo yum install -y clickhouse-server clickhouse-client
+
+sudo /etc/init.d/clickhouse-server start
+clickhouse-client # or "clickhouse-client --password" if you set up a password.
+```
+
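+
+As with the `deb` packages, you can install a specific version if you need one. This is only a sketch; it assumes `yum` accepts the usual `name-version` form and reuses the example version number shown above:
+
+``` bash
+sudo yum install clickhouse-server-21.8.5.7 clickhouse-client-21.8.5.7 clickhouse-common-static-21.8.5.7
+```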
+ +Deprecated Method for installing rpm-packages + +``` bash +sudo yum install yum-utils +sudo rpm --import https://repo.clickhouse.com/CLICKHOUSE-KEY.GPG +sudo yum-config-manager --add-repo https://repo.clickhouse.com/rpm/clickhouse.repo +sudo yum install clickhouse-server clickhouse-client + +sudo /etc/init.d/clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you set up a password. +``` + +
+ +If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available. + +Then run these commands to install packages: + +``` bash +sudo yum install clickhouse-server clickhouse-client +``` + +You can also download and install packages manually from [here](https://packages.clickhouse.com/rpm/stable). + +### From Tgz Archives {#from-tgz-archives} + +It is recommended to use official pre-compiled `tgz` archives for all Linux distributions, where installation of `deb` or `rpm` packages is not possible. + +The required version can be downloaded with `curl` or `wget` from repository https://packages.clickhouse.com/tgz/. +After that downloaded archives should be unpacked and installed with installation scripts. Example for the latest stable version: + +``` bash +LATEST_VERSION=$(curl -s https://packages.clickhouse.com/tgz/stable/ | \ + grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) +export LATEST_VERSION +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz" +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz" +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz" +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz" + +tar -xzvf "clickhouse-common-static-$LATEST_VERSION.tgz" +sudo "clickhouse-common-static-$LATEST_VERSION/install/doinst.sh" + +tar -xzvf "clickhouse-common-static-dbg-$LATEST_VERSION.tgz" +sudo "clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh" + +tar -xzvf "clickhouse-server-$LATEST_VERSION.tgz" +sudo "clickhouse-server-$LATEST_VERSION/install/doinst.sh" +sudo /etc/init.d/clickhouse-server start + +tar -xzvf "clickhouse-client-$LATEST_VERSION.tgz" +sudo "clickhouse-client-$LATEST_VERSION/install/doinst.sh" +``` + +
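+
+If you need a specific version rather than the latest one, you can skip the auto-detection step and set the variable by hand before running the same commands (the version number here is just the example used elsewhere on this page):
+
+``` bash
+LATEST_VERSION=21.8.5.7
+export LATEST_VERSION
+```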
+ +Deprecated Method for installing tgz archives + +``` bash +export LATEST_VERSION=$(curl -s https://repo.clickhouse.com/tgz/stable/ | \ + grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz + +tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz +sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh + +tar -xzvf clickhouse-common-static-dbg-$LATEST_VERSION.tgz +sudo clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh + +tar -xzvf clickhouse-server-$LATEST_VERSION.tgz +sudo clickhouse-server-$LATEST_VERSION/install/doinst.sh +sudo /etc/init.d/clickhouse-server start + +tar -xzvf clickhouse-client-$LATEST_VERSION.tgz +sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh +``` +
+
+For production environments, it’s recommended to use the latest `stable` version. You can find its number on the GitHub page https://github.com/ClickHouse/ClickHouse/tags with the postfix `-stable`.
+
+### From Docker Image {#from-docker-image}
+
+To run ClickHouse inside Docker, follow the guide on [Docker Hub](https://hub.docker.com/r/clickhouse/clickhouse-server/). Those images use the official `deb` packages inside.
+
+### Single Binary {#from-single-binary}
+
+You can install ClickHouse on Linux using a single portable binary built from the latest commit of the `master` branch: https://builds.clickhouse.com/master/amd64/clickhouse.
+
+``` bash
+curl -O 'https://builds.clickhouse.com/master/amd64/clickhouse' && chmod a+x clickhouse
+sudo ./clickhouse install
+```
+
+### From Precompiled Binaries for Non-Standard Environments {#from-binaries-non-linux}
+
+For non-Linux operating systems and for the AArch64 CPU architecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours' delay).
+
+- [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse)
+  ```bash
+  curl -O 'https://builds.clickhouse.com/master/macos/clickhouse' && chmod a+x ./clickhouse
+  ```
+- [MacOS Aarch64 (Apple Silicon)](https://builds.clickhouse.com/master/macos-aarch64/clickhouse)
+  ```bash
+  curl -O 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse' && chmod a+x ./clickhouse
+  ```
+- [FreeBSD x86_64](https://builds.clickhouse.com/master/freebsd/clickhouse)
+  ```bash
+  curl -O 'https://builds.clickhouse.com/master/freebsd/clickhouse' && chmod a+x ./clickhouse
+  ```
+- [Linux AArch64](https://builds.clickhouse.com/master/aarch64/clickhouse)
+  ```bash
+  curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse
+  ```
+
+Run `sudo ./clickhouse install` to install ClickHouse system-wide (this also installs the needed configuration files and configures users, etc.). Then run `clickhouse start` to start the clickhouse-server and `clickhouse-client` to connect to it.
+
+Use `clickhouse client` to connect to the server, or `clickhouse local` to process local data.
+
+### From Sources {#from-sources}
+
+To manually compile ClickHouse, follow the instructions for [Linux](./development/build.md) or [Mac OS X](./development/build-osx.md).
+
+You can compile packages and install them, or use the programs without installing packages. Building manually also lets you disable the SSE 4.2 requirement or build for AArch64 CPUs.
+
+      Client: programs/clickhouse-client
+      Server: programs/clickhouse-server
+
+You’ll need to create data and metadata folders and `chown` them for the desired user. Their paths can be changed in the server config (src/programs/server/config.xml); by default they are:
+
+      /var/lib/clickhouse/data/default/
+      /var/lib/clickhouse/metadata/default/
+
+On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources.
+
+## Launch {#launch}
+
+To start the server as a daemon, run:
+
+``` bash
+$ sudo clickhouse start
+```
+
+There are also other ways to run ClickHouse:
+
+``` bash
+$ sudo service clickhouse-server start
+```
+
+If you do not have the `service` command, run as
+
+``` bash
+$ sudo /etc/init.d/clickhouse-server start
+```
+
+If you have the `systemctl` command, run as
+
+``` bash
+$ sudo systemctl start clickhouse-server.service
+```
+
+See the logs in the `/var/log/clickhouse-server/` directory.
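+
+For example, on a host that uses `systemd` you might check the service status and follow the main log (the log file name assumes the default configuration):
+
+``` bash
+$ sudo systemctl status clickhouse-server.service
+$ sudo tail -n 100 /var/log/clickhouse-server/clickhouse-server.log
+```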
+ +If the server does not start, check the configurations in the file `/etc/clickhouse-server/config.xml`. + +You can also manually launch the server from the console: + +``` bash +$ clickhouse-server --config-file=/etc/clickhouse-server/config.xml +``` + +In this case, the log will be printed to the console, which is convenient during development. +If the configuration file is in the current directory, you do not need to specify the `--config-file` parameter. By default, it uses `./config.xml`. + +ClickHouse supports access restriction settings. They are located in the `users.xml` file (next to `config.xml`). +By default, access is allowed from anywhere for the `default` user, without a password. See `user/default/networks`. +For more information, see the section [“Configuration Files”](./operations/configuration-files.md). + +After launching server, you can use the command-line client to connect to it: + +``` bash +$ clickhouse-client +``` + +By default, it connects to `localhost:9000` on behalf of the user `default` without a password. It can also be used to connect to a remote server using `--host` argument. + +The terminal must use UTF-8 encoding. +For more information, see the section [“Command-line client”](./interfaces/cli.md). + +Example: + +``` +$ ./clickhouse-client +ClickHouse client version 0.0.18749. +Connecting to localhost:9000. +Connected to ClickHouse server version 0.0.18749. + +:) SELECT 1 + +SELECT 1 + +┌─1─┐ +│ 1 │ +└───┘ + +1 rows in set. Elapsed: 0.003 sec. + +:) +``` + +**Congratulations, the system works!** + +To continue experimenting, you can download one of the test data sets or go through [tutorial](./tutorial.md). + +[Original article](https://clickhouse.com/docs/en/getting_started/install/) diff --git a/docs/en/reference/getting-started/playground.md b/docs/en/reference/getting-started/playground.md new file mode 100644 index 00000000000..ea7b2ccf2c5 --- /dev/null +++ b/docs/en/reference/getting-started/playground.md @@ -0,0 +1,46 @@ +--- +sidebar_label: Playground +sidebar_position: 2 +keywords: [clickhouse, playground, getting, started, docs] +description: The ClickHouse Playground allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. +slug: /en/getting-started/playground +--- + +# ClickHouse Playground {#clickhouse-playground} + +[ClickHouse Playground](https://play.clickhouse.com/play?user=play) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. +Several example datasets are available in Playground. + +You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](./interfaces/jdbc.md) or [ODBC](./interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](./interfaces/index.md). + +## Credentials {#credentials} + +| Parameter | Value | +|:--------------------|:-----------------------------------| +| HTTPS endpoint | `https://play.clickhouse.com:443/` | +| Native TCP endpoint | `play.clickhouse.com:9440` | +| User | `explorer` or `play` | +| Password | (empty) | + +## Limitations {#limitations} + +The queries are executed as a read-only user. It implies some limitations: + +- DDL queries are not allowed +- INSERT queries are not allowed + +The service also have quotas on its usage. 
+
+## Examples {#examples}
+
+HTTPS endpoint example with `curl`:
+
+``` bash
+curl "https://play.clickhouse.com/?user=explorer" --data-binary "SELECT 'Play ClickHouse'"
+```
+
+TCP endpoint example with [CLI](./interfaces/cli.md):
+
+``` bash
+clickhouse client --secure --host play.clickhouse.com --user explorer
+```
diff --git a/docs/en/reference/images/column-oriented.gif b/docs/en/reference/images/column-oriented.gif
new file mode 100644
index 0000000000000000000000000000000000000000..d5ac7c82848cb633cb0f1b341909c0b955e79fcf
GIT binary patch
literal 43771
z;`oB124LVn{0%w;hT&*R(0~A>KgI&+ zSP;$zVYnmid(l_}5``g*6sL*+w}eN`?-c2aq(ABZuy4o(6R2-F%UHE;yKLl}E?Sui z8XSe1X|8Ko`^vg%Cd$ne;X6*N7OP2vh%o7;v!|DCWS-ZX<#;)Z5v_QcO@yi1^1TA# zb$vEc5|zCEvf~5SHN&V}4PhaRos^W)H&WnDG*^=Q`LNc~MnkyF3{3}IkHrha7-K{k+FDZVo0 zUapU_dM^JoOpnPhD=+xFVODt(lKHE`X~3$c%Cdhb5%99T%7FB_9kk;I1c#mzK*ZHB z2!yjz*$)SxQSAqDOtcY0Ai7JC27^J--S#5qj44KfybqG_gLVV!`XLrAjQhXI<16sD z#u{7K_``CQ>Ggz4>DCYN)lnJ5=tYrfB{E-J7HLae9Jwr0JX58wS7ffWhyb+++w4O- zE3$W2zVM8!^AqMefiumbx}CpYvFl`EX`Sj=y|`KJ6G|$aif5m^lS)!S2&$URjkTLw ztXwC}EM+QRS(mE1x!Ee*BP%FDnnO+Sq_mt+ELQ3KWO1fqXKd)?*rh`nHJJ*3O{-HVaCuy`S&dj>pPzJH|kamQ&Ckh-r977rp(@YOmcL1CIXB z^}qJIt(zh@myMlF?+pCb;PHas!L@q?WA3Ysgng;%dB>LFAkfNd(PgEhVTMSeo7#%c zjK>9_A@X8}K@+2P=*(B;ao@-{!B+f}8+0A&BnVB2(P+h6P!buXD6;lHRj}znKB5)> zT)?5HIkj%gE4Z-h;36s(4)&$Yhc!3tjx&TqAY6k+5WBVBJtN{>`}MDD3UDjnYDy5Iq_#Ap>QtwYN;HtSzQvIbLna_^@PX9 zY(!d+8O`sA0h*S?5QiXBrHhfQfP(Tocz9#P(KG2#(8fTob81db%OTf^y=;2nOcsph zmGV@^+$1X+K<@dhH7IMQNaYcHCr36Q)>o(aiCf2hhAp)R58KOIr5Is84R zMoNAwSQD&JIsNahE@bN>C=pP=PW7AC2Tx(mYl|uX$Uk<}t2HND;+TsvV`KwyEXO zmFo78Xp0os8qckkZuH3st)J4V6H8d@i9;IR-IRqyZ9G5{{hrfXQcB7+$l4=1SEISw zzQzUCMtkEfGH#P(E{^*PJe?mQWQkvBNgnDjt_umq8zoaQp zlo7J;lJLwiGsHhTOvG|UD_kds5o*^cIqAd_lMd`NE-%&Qj~ng(I&I$k^3Xxtet7ef z^|c8ych+<#Gz}1s5{Zb9z@}nONn%Y+XUSkr&dSWi%E?E=&`l}KFUR@X?x|JQ4~Y&7 zXtrvMZD|kI?eJ@E{ZTjM!9NfjZ9g&EHQ6+yf!d=sKf5YFyz#wZ>dU&B8r$pps>50H z?mynVqD?EJuD&Y* z8P1jgf0+^BC0=(@VF-*z*$o6-ZNOU`cCIDfOl=wnKme!oI_>}fVIih&lUoJ*2cFph*?%l?Ce;TPqlqh6`> zuKJ2mf-68DJQ`JEqk-S0IR2<48oU0GPaK7M@Mfy>rpTZ~#O5rnuKA%dc56nW_Giv| z5hNAjBoX3y&}yo*_p16q&*M}pL2R&8n6cuh*#2=gPZHSieq8RmzT)@5!$es#2^A%o zh(uhz%0CY$lmR)6W#l2Qj(yaq!6!O)@j|PacB6mUb!;aH(y|;UVZ?Px^E@f>Twpa; z%Zi6B(=1E-ear#&dwkDj&b|gXM+~a_=c!)${iDT6dMy?z1}~aN>%Qzc>w>Y|HD?r` zajuZ$`tgn%oT&+BVwEPE9_Q^8xeAHuIn{*hMHlUJ77I?jP>(H(cqw3oxTux51WkGL;84O|PS zRXtE2oYlPyQdD)L;M!xht+_s#b%4_JR4Pi2NvE=zvh^-1fzk=BO{Zin%1!5HMcB$x zAZDp0R94`trCWXrx0OrN#7KFCT%(qyYwbwnb`=_@P56*Mkd}PtNa6j$8@m@zvkPxF z^KulQk=C&X`WntW=yS%yHk&>wgmNmD?<~0uzK}lQK&6KM4})E$1yk-dD(&%f{pv5< zn?{}|-Fx7BPcU-=i1An0gaI#$a)5w2=OXzqma4uQO?F;+*xs11_4pojt=^Bw!c{Xm z6D9>)ZECPRMScQKp7SEKH`+ZK*Wswta?@$Y$C@WI-f23JS?jS(Qrzl%9z1(?bIu5l z{ugbF?u739G(yZKqS?vP%Oy(Tq+Hu=6d;reo^Cz+Oh766gD=`OzA+o%(HsIgG8`zz z84l@1#MA6U6{yyceX!;z(2{N&L`9}Q%Lssxjjr`+7tXgpZx+JYV1)YL4l>w34zf7_ z4n+K42N~@Dc94^!mA*V=b;T?qRR#uadkuOjnUw!&kxEgL&CF`f(#g&1U?8xU`>s&; zpBAa`w5FD+oX*)Udxz4o<(1X7m4?ZwskXUYRr;m%qvMm)h|TTG?DpL`Rs6%V-%rm2 z3R_pdZf@t8OTY)kza(<8h8Jq*C!`!sPbgBUE-29`p^zQik3m%9g3-96M_rv$c~VwR zDuD_eE}26KUmV#1;qiu&X($x)xZSl|PAC;(K)w*69JJ(w@>S@tfZeVF> zm8(=cJzZMx=uHX^RL873ynsj!Q;&pp?dN}yr%h&t)w`#3b-&g%A3B&8UxfBBG z-clvYZv_F`(Dg5j#$NEVZa6XN7v?)Z6zYy0H4vs{G#D*hm1&T8qj_x~1geedAe!at zxe~?xtBf?3>H72lB=F_MC{76R%$P5YKx6+iQF+W?0}jl9DnCqF>2%LcRjNGSSNmg? 
zJZ%djDMzn|UUT1`Z@%;>WzAGGI}>@}j66l()%_@W=#ULF!mQ;P6w}9{i6ZmcjHE?= zSdXT8sLjN>S(*oK8CtFn`>K$$8H5TMD3=|LaVbaOd3i#iFRn>K0c`qtoCwJ-u!vN5 zPpcSa1`wu0h2Tk*6}ypcQM*eYSzH$#x{g|fr6@&fFFR3OP!oJFMF)(4lr}G1)UvE% zEblJ9Y^dWqxrCoHvOcX{zjwRJKBIXhVA{Thsqxy+DBdrr+~1^$`E#yaQ4aRPW7V7A z;u#GJM^<$^0MF9Z-*q*-D}sTpZgclJj5#QH9o&a`aZz*SiElnepxsGNN34wn7@A(5 zTyDfKjXoWnpWuBMJ8pQpW@JCHhnVbZr^A@yEc9X+Th73Dnz=9%j|O$b_Y49N_*|*# zW`X9lB6+{=v}9xRoUVw167;b^;8sD<#z&YDS$4ojykJ#US+YOHr>*oc(g*3k;vj~i zi$7hm(znILME+}=g~dh2EsmvX?O>^&Mg7q_4MW!lImgn3FpH_9QS$<+i1Yz5gjhu+ zq+cB2tFya2b2l_QiEA$m>J?3trd~5R?Y`iItVfL(sJs_Ue>TY*s!f|jtH>e?|tM|%GR<@WXx#d3{u8#rga`{;I1BD&>$Tpo822H@58k3h=P zG5Yy>?FLB-Jpbt3Bl3CdoWcJH%ReAT!oQ}?vuh9lgQTzz2;2B|B19?J*3S^I(!ZNe z)YntvTog1KkqePeIUo#i8$ctJzhh(1gG`YZ66To4a7rrNOAQEk9NgPxU27#AkPRV1 zH!2|64a2~h|EcFb0DHqODl`oOu<|I_=}{2IRJ)UEcnHMA7#$|^ofqapKENJd6CKTV z5aM_lz!g;=S?gxvcak{3*ERoIS5)Mc8k)nm$Q$8{oPlHQBgFn3BjBk|R303UWFBGb7fS!EZq~|LvDx~J*0?&Y^{)t=Gh_P%^0e))qXvtGGD%R^utwdz zB~fIwP+6}%tDc?nnl5d;h)bC^FSyT@^h>2hV{s&oK%J|Tqq$5BeXp*@`%|F|%F&tb zb=!U}597}mYWL@dik{Xd${*YwL8k|;897bd#}{kOUOo)He&nwrcdN1%F?(3qzPd1{ zJ72)N=U6+hW~gm1Dkh73GQ-D=7qkVk>0Z23c{3?RGe2-_|Elt zxZBZOR1xSJ^jEhXS6tL!&7EJ=_CeBK)(v4=U)GONR9-erv0q#^&WX`pH7%)GUp23p zRbI7hxn5kg?uOD{w;iThU$>u@RDND}T((?XcixWBez#>@vcBnl+2Fb9!92dW>3zPZ zWpo8#+1&O)FIwLYKu})Z8o`Tu-wvX;lHUy@s8`*|W4poMju5pT-i>{P+Axcdl}6u; z(}5M;PrkO$v2Zc2!VgYyY8y9Bb3MaP&G5X@vC;~F@jcELpzE?O$jBEo&C80*jx9<{ z+cx8>(E9vdF*DI+TQ%_8YhE!->mFXy&#ZPuwMgoI+7xZdX5X@(L}1%?+4SM~Wp&Zb zv19s1z`AP&_k(lK5YLx$Uz>J@M!SajSlzmZ>qN;bhxs%=>Id_g zVx}+mxlH9X(?xY>EYqdLh zSKZx7uF94kX<0Vu1lQq8d)4#vR#$UY>V zZ)dyg%;tR1?4$v$Km#ip3&FE=du%HBU|7Y$0Zf{kkb&@g9W#jx+zkbH5|vQM?*4v6 zyF*@LR-^r_ut9@UMoiwVXe4R#E(*|t2)@8&0GK(iqQ_kn0nsuKwtu)7FaZVMz#kK? zpnQ!B(H_y=AOA#RlzZcC9>m&<^yU7`d)c41^9A2p108QGML~6n;vplEERonns7;9f zYWSm+9GB9%PXtaCQE-fp%X!`><-r$IiAzo>W!xv1$rRJ5k571b0KjXb$?43gC$$!s zA@A%3;BCgGwFa0W-7gU#cm<}6a36qU?*>%WFK~1f5dqrxh3>EDo zS;@@dxDnzgOzuP1WC>ZtOUHC=>jXBAO4-#0lx|z?lI{&cIR_CXRQ))p4mn9AN73K- z!+_JA0i$dyRq%Xlmb0q6i+ry-C4#F-als5``RPr8VjP|!?gIU>MBT=i!G8k&WrWF= zGYjJUkErl3GxGbFH~#ND_&=GEU-KaB|1l33ADn3trjHuAr`i36ZeQy5nAUGzcz-P- zzd$$!1O{mnlDmdcL_~$hQ2f;QB>SiEHH8oaJQExuI9I~iEscycsiZiG5FV@=%+EKk zq0!KVqNoiB$ka_tCHVzvS8%=Vri4HRF{G>@D2Ke0hz?Al6!FJ&oq=eS&7w( zOpXkR?r0v2Dii2L`NABTKP;fk>VDC|WE?Ulaup`tHB}pqR5GSnWE6Y`%38|Vyu|UE zXJ}<}HZ|`q+LP+z3rat;L$PImU_tFP`huB{l^~M2^A!mws0TH$i zVxhRJ4QEqAJXmZSt-k|$3q?f;w({A-kjUo@TrE@U{iCoB442&F0`3mRl4$kU`UI0h zkw7@{pu4sOgQ2LXc6_@)fJaGnq!v`TQ4tVVvw{M;#qA)f0?zFlWC*&cw7^^6RT zS^&lrRz=Oy6qcsUe*gkgqi!Hg*Rx#+Xq0GK0)eHI*Pmf+v|ExnqodHDq`2ZyoT8;R zM-DWK)jUquEg3(~#5TD#R&rvuJW0`H-zYO6aiWi#r#SQixE9uT2z!_NmF6f@~0#}Y~it@AX90*yigN1e*nZu zF47v<`5Gx0p_>hzY6;wK8MnT>4d}3ihauQ+#SrBGGZHE#vC& zg4=p-Z4B0~mV~q3v~NKcx;)t5VZB}&eh})ZHzjBd{A?;AVUcnyOD?c^nC@<{`Az4z zknVh~!R>Q#?+=OVJyYek_^^@-if}eJlQVvQHR?3MS|(rm{#LFGrSDRyZjR6}rn~sc zcB2@l+jGk;{?(4jRQvZ7WRhLFX^E~dbN8h{`O5+CUwG-$8J_uSViGC<;{d z0>?GAtebYA{?Z5*Ma6ivsk)cuLfI(WQ9z8xG;L<)Ime)&Oq`DZoK{goX=R}gyxZkl z4w{#G(_O-}w>W+dT^%GXMN6%kA8h8+u{s-#+9wn8OH#`3my?!H!3&jz*F5zSk^!f7 zMg!1=5m1DszP^AtGUTN!aKo}_o&`pUeMkHe*0MpeyLgclr(FPyrhsrFV?d+`F|mpU z2TC1dV3}8btBHjq*Ak;JXEgpOw*>k(W)%5OxZIl|NM6-q6rUkw5^N`^o+G|`4-3zn zbt~#{T1T3wRpWqfv`542AGaEf3rUQV5;FI96Tm9R6pq0)R!(jf-{WMX{4(lr-u-gV zw~Ji%R?8WI(1nD!V3T7R9BGlBv7hkZ_yeHaoS|H0c}SV$d|-;gWnT=VM4l~5UE(q1 z)CK!eRzsT4P)S^-rIg?#G7(b7er&m=)K6eE0s~9vysq7ntNtb7eTG_tc|>iNsE9Fl zl10}y+{p+bG9%Wr=LJp;WjC*_{i3GT>4FZ$=>5ktMrl#;Jz}Sr^Tjj1%|AV*f3Kru zY%8!Y$Jx9oqT)Ud3f@6qWf5KfJ4`ZXydX5nDt_u0E}Wq-koB#F)itZ7sWVURN#XH$_jGpM6M;`B&fal@#beEQhQ{mt`1CE`d$=!!=E1 
zltmv*Og3m(Vq(q-84vs-Glv>w#mKJf&`) zAczdEd@~RSiY6^g5e^-PXc*o+kPH~-D@Xx2jTb?Ra0-8Z)aeLt@!7d}Owb~{q!W2) ze>@O?KMBXQrHjh{@65<+$IQ2_Q@PfT)#+>3&bRH`*w(HS*&ENzw_h(atv%1vH@7%J}9}{K)m-|X!*7QteM+Tp7%ZExVHZkzP|5c&$f+lAUa6Qrq6)cOVD2O2ILu# zf_^gnVfz)W8Hvq3O;YQWs~YP#T?9E>{~^pF!0^Bir**W}HbGILob3T`PP!$@?Fd98 zJoOI7Tq5q^tpN+Xq{`;m^@tTG6#67oQSRd-d;!>(>BT*Doq*k)jR|u{wvV4TH_$=p znyurlhh|*oqnlCBDF4NJ)36!*(_C)MX03FOnB3sedMS9sP$;&Mq7%qx60+KC^#};KDGBNB66wx? zp@wFL?(Xhp2M>*>;1g@_51_pI?uW9-(9ct$}UuMBjuCdRLIQ* zJn4yvM)SU>X#b{arZn0gS9fici1QV`_qcf2t z{=zF?wFvWt07U|3JV>7=plAoszcFDATETv4{w|%4GYrq7fz)sO#r>+ZZ3RgE1sEMY z0meU^ezWl*TjbYdqA{jG`{B*^RB=1PE_#b8gaU`C%6NMQ6q~KPR1#Ixgj~?4ULYza3 z;~DMc{Os%-2eFbX;|1*P!w%`Ks{KKS)L!IX{vZ;TBjPP(BXR@hg)W>)kHoE%M8Ym+ zMJoq-B#xk{_rpB#$vt1OeLRklLo)*jIywU^+AF|D=2SVLS;Srt;)Op9pq~eZuZr43 zQtT=Gr~{q}Y=?0=1aZekXo})ugr!)q`MonF5=ai`oDARl5k796N<$UI6_6sj7Otl( z6L66913NjH9aw)OgP$DG5Of;&8)5mH?S9e-xTH$S@$1CmniL?Q}>XneP*J$`(k?PD`!rOZSlS^;s&9 zSonaU51TTf!9*7ClrNu{-`ri$0yaHEjh{FrO!8P;GDj2@o2icjbjAr#cE~it$+CEm z$fC&fozj+A3rRQ542;SN5w^%*&wQ{?ugZ+>jx?zdkE%L}Do4xMqI3|gQif*dST^TW zQUrL(lGlXAU}(jf!_ylK!;sc|k7PaxVm=V6h{*1&FGIA?8B+603@m)+d^!R_VNz(s1#>M@|$Jak>lI4Hrz3S+&nLhcbkGTEACcmQw|K+{RSQ7t1 z{y7SyMbcE>8w$)NYxSA$E1)EE4gUF0xk;b7jPs-oYtFiUFZpsU{`|0Z>8g4AI>awL z6c(-J8{`1TiAf9-NlAUlpO%uum64g6nNean0qawpu@4VefJ{wKeO&rvwQ9NHRbE)Io3>wNbF_PU z06MqccCNe-{9gFOdDHz-dBA4*sc>i*IsSzL<^*!vLB?9k^P%tv7Dcs6DB*fUOf6lu zVG~IV@(9ctnW4!v_E)G(H5EJC=`>!;(s-)WJ@Pa`>)l!7MON}cK1cND9PL%vrQ#{* z`ll~v=1Mq=2wcHqCF5`+zITzGWM{;+v0dh_n3=K5joA2#=fb~sR$CS3O#*=skX`}g zN2v=(MwQM$@T-;Q#FQTYreh*#2_7h%Z2ALeX5QygD({qMSsMKHT(L&Fo2*Y4%)Iwz zXemp|zFg=Q&Y!Q_2;uF^RdPZioW2{W3I3d)uwg!XrDhBv_{I@cBvcr2r*x2Q3{=uc zf1DkjP2!8@6%52({r!8$fVsEyjtptvB;g)eat@2?G~+2s=tJDtTvU0`WrU*o7T?%? z^-v5;8wvxlW*4Juzu+4chcMqym}}+8`mILdvKys_)6%WQhf_wfm&VetHy8Vjii1)q zpR;x`h9|<5QcU*9%9PPC_myP;0KDzJB$>a}Hi=jutMZf>5dZsRVxjeY3_^g444NJ} zjLyl&wZta*HTry6Iwq}nOuUgDi=`82r}{;bn#2ifmVC5hIUf3C$$q9x4+mPhJGwDT z#y{68kJ6uD`w%logI_jHP6N(NMO^D&;G?9{1Bj+}<7Wa=|HqN8$PKX_?akc9QB7e6 zaH@V$=x`DWY4Iyi@v8jM#BSF43r>YP^H{F;PbV^m8ue99G1T3HtVm zZ}7YH18j`tCFenr!6}Rn?NMZkZv-8~V)osE8Q+mR;`*4eyvQLlWTQr(MDWIpVfl8PLrRf z7|U~LhBY1XWX^sYPIYg^T1)%0hm>vY13PzO0`bLn-_e84J4_jlyY~GcTMa-5how;kysIV7#&c$Io~`K4)2JnzHLLc&qHpefmZYDePj z9Fx#Tg)5h`{}jb^IfbA zPih+8`mGl%K)#Z{4u_5uHmr=-#m7}ZideXnO5(A~h)ez6=Qtq=kkC86O}uYEpelL|Zdo^TOZbWth_K)nKkF-W&=h?M&< zIbETuBNiCkE7kk~K+%`PHuwGZ*e?@jgKqa;(0}Pt6zgg7zsTFeJdG8S++Bb@E3@g< z7;oF{ewG;SVoXhAq@02UNRj_iQA%$7N=dpE_STeTjP8&hINX$FooW@qjs*hC9>nm_ zG98Omf5azmaW<%z`6>}LYGXc#$-HmZ>Z3ox-Ryz*QeR5uy3E5fK5t+hSSqBe@KMi< z*MpMQjA(iNqicw_e)n64BTfd59$SGDI%bEBOv+&6JdZyyjI(7@frEO_4_2hT=0je_ATS6l|aT9@HE-3A5)( z>_(E;l@+O9q9>bG>gCn+Z3?*KS!2>GWp1koWuCw1eSUDh1(MXAy9pA`$bRlQ4KUJ) zuZk;W`7vx_@TK5ZJ-GEWG2|gqfgr6WzP(VOtX9odCnFWwK@lkvrtY7sRl4z;K}-@7 z79G-mg?y)g`YZcaiQ4&?@O0M;GZ%JhQ!Rv_ZYv1C=7>^4BT(_CBd_4y z0e>zUL#jbLIm3m#dfX>33%!owgGPb?dv#f&p^2cb9vj`=^g{g z5*4r(Em9-~1~%GjG;LXRUx+WGOSGjT-dr-Kf9chd%Sy0ubu_^wkXJ%_egs0^?7}MF)RLfg#uB8}!BqqqpeE}psLIO`zIn7OVl zVc=@QQQhaq)|>vhC`UDY)(P=e!9lkSTeD!OmCS~Ib;N9)_JaxL0PpPwjpygk3MZpG z1G3y71=qC4lCPhJ2?joWbq{x~n&#`(8C)a#>F2Dn%5!)3&dCBrk_X?mdshk>_U5gJ zenS{Zoly~q8|>O)L$w-t(hIW=s1f=FU*XRMt0nyv0BIyXKzbb)z?$U!6S*+)d$ernb^Fp9hda zt2aBT+mmtae79N^TiSQotBXQ<7n_dH4|2Z2wF@T$_A655PxjegcVo~dd!=JA9Pqsk zn5A#NP7QS2pN>-i;8_6<);}D`#C)EP;PuYRGTWD$7@b6^hj@kja6ZsTf-aR@`s94u z{K{aTIABjcORthuoh&W(gShd<^Ff!2v&?&cZM5&$bxB*)NDquxg|{>#^?)u6+qJQN z2z$yarMp_g{A&U?i?=2;uJr34ME_Xet^WD3eyGzm!MoM!Jfv(_pX1+>NH|M)g+{w7 z`juW2O17Q9FNJ(q^azPv+><)7ni5a)C%7r|?!W2M9LgtZ_6Mqq`4d 
z__r7)7EZesh%Up%>aXmX9B!%<4sY>Y7xAwzb1l}3*fR)#A<=J%BcgVN>PH~wopuF~ zaF7;DQI=&XME@K+LTW7{pbCOcp%iuJF{$h3DyVzHs!L?=5iRN&;K8hD^wOeJ=3L+Y zK?L&lKDfyqP`m1dpX}ux>*cBJ&8_6)r|cuIuhJb6$m452o*+nNH-M=B>R!DQBO2IJlsM2HGa(@SFpkbVd;U{4S-L8NfcFYWN z>nz2%90Ti;nCQ%Q0jqNA=w$L^Iq8{Y_{oUtBUXH!nE6em!KedbJQiUK)Pu^QA55}4 zK>}s3;#=30KWBR``A2`+_q@>pqd>`NGQeL}I9N5_FS4Mov17bJP>*V_XysU9l^8#E zINl&%$T2_zlHzXU8zq(^BN{6k6N{*pYBt1Vn()#33<#Y}q2++5Rj2Sf!nNK7qVe6HHfN){|q)8ckSNCbtm+y$(=xstlK{#ju7(eH8Z?AHTk_lQ>z}Su$&ehPK0N+NXBrlkE?+J_4 zYTid9^p0+Vi2y>={2PL7mg=5CrwyQfX~ZkEwtxPlR%KzPC~6@6a1 zex-cjbw@#2fXyODF4|Jssxhm3Nx^jv%?3_hdUF1aRx)Ny(cyMc-W34~3M@vg=jyH$ ze)iD0no_~bC8DY9fCQH!;)Ld z*;C35rubXQI!(eBIQ7bT+g-5sh2CzS)lNa(RB7T`v`;m(N)$NQohImrg zhi9?_id0xqA=YU}kW6vRHM_`2wj`NJt1-AgE_Pg?PQst;Xp-y-G79GJ5$R@Y5N)pt zaabi5%hcxJPB#(LbL~+zsLjG`#_X+PHW;_`rH*W* z1iP|Tx?)0}{Nu{lV6TuqGT>BF@l^gJFCva|u^7OcTxc=E3-xK;iy2-Xe-A1|#Oku8 zr7EYvkIR4Fm3k60^Bs9?y@}FwsEU28JQzo8E+-Cc2s&1|%VlFbI!RO5yD!GN=Q>sH z+n4n{CKQP!qTl(}?6c;JOF?oFS~wONcdv!tvitx7epag89#&1I*FX$dcG;?m+T*s3 zBo$8J-%P3^?F9CyrOxlu?V{cq#0q$Jhbs|4W3GXmgNW^VFnj)Q!vt1oSIA<2LgZ>Uq`+pWP5?uJkSHcJs6Pc?D*6 z%1jmhms}O?ykqSFHHNbhIlcTHS*$H2iy0tOej#wEX`pmEJhJCTFtioJ&e_)COFD+V zf9T9J4;rJXNF&X~PVJ|V?x_~8npQ7RO1D#*)Dvc#L|bNWYKF6qsA#_BDaou#op4?o zKg74CCp6&9-tC0tCH2d!svMOk$@v3hP&c*5Qx(Ni*WGH5(|G4iY6q8cFNTuecFuG7 zd%-M|Cd#MK9`@HQXT9EozSny~T5Zj>QSIhB(sXwpd<6&|jmd$=c|0b#BoamJ%f)1m zfo4MmT}NcB|K8)uyytl(z7)8!Hm-MH1}?Y?)|QNmm>(VAla>%7idx`<4~{s0sFIP! z{i*!L%@2cF3?)C$N`#3ADQo*rI0pSZicfln7!e?fjRB}gDZR@ODRk(&W5M6N;ujx` z*w3Y~;|EE(`|;HB2!cw{;RK@NBO)FnZR;hBzlRfz3v1ejG3rfT1_2pnhqdwxl+6ZN zrEJw*fSUEAbhCI#Cx!ZIMNH@5JI4_jm*gadQCz8^a>MaP6~dnAVgVNr1Lt@*WDNB* z{)Tpk@uD7Q4(3i@1*)8B zxfaV=FIpo+?5l(ySf7FF)kaXK$dO8+o3H`mX{EfV&B3NNsxGm~+&x5c`Xg644}QAH z55%A8WPO{uH{I`2HZxjzG2^%_51jM_2qp(VWCl}jwCXITz0};2f@a%OLdB^?giJlY z<%apr^5jc4T=X_slZ3CQ`Cp9mSt8Wv-idxZ2=&E=pylq`jN@oaT4bBwI<91Csw;K; zX*s^K>Ja5F*y>IHIV5IE_h}fZcIe}|8YO+N{kEIIS?ux1w>1wj;1)Ua^TwIMO;6c}xJuki}w zM)%+V%hlJH$(O_Al5su-$`rj-U?Q%W z^HHl$ssbY#GG{;6Rq*=w2bGZOT#G}3uR2?=yiB>4zedw^eD-p_XvQ(u{eAnvn#RL`A z0U(#W;#|e&a`NapKWr@`ZjTHicRNf0A2bYGmN>XV{o1}X&K?f@MQe6| zykPhRp6tU=Jonm(+d+SR-)Fbr@valR`ncx9h3k~0`d#q(9C%fBDU3go(MK02OF3*U zA>OYeu5{X`N;1#-6e4+MPX13E;X=8!1W4@}Puh*cBpT`weY%C@BWoWHWeM%dA4y5f zA#&~&9#Q^FTl+nCa##``VLQFnUomrfm`py!n1h)fqCv{?yL=YP@#W7Oej&vboi#?~ ztOr|>uO1F+0XXeD^h)9**ofBTYAG|oGn`QhUSdFn&ptLI#}rXxVmNa<^Yejy>8e+8 zX@PoKn}Wi^I_Ry@Ic27tN1boWSyF>JHjTMo`47BUNi{{Pe3r<+f%bhn8Px9D8&S4t}}@YYYM#|(PWAMYRXn&)Ub<-REA z{BoqmAgAm6Hz=*!AH=)7aE;2vRySd}n_5_~7St_y!TyHCxUSV$&ETg5gK%DeY}l)G z)xYPTNjH5};}zA*y3eaM&{mtCT{dT6@t+XpDDs;gTqWcCo+x7x>GD>DWl@bfh&&({ zYpHkq_oQs9J*vt zH35dSG9A`_V|7;8s%yj4znWv!{|7w8rn)|D@~bubW4)c|%=#SkS6eau|A2=KXEs)P zL}&#I*!cw54F}Ntf^nYq7njDG(P4T9pUj+eF`fpD9zGZAem!%3=%{S9P7cOL(HQbm zMTLfWFLf6QNYkD9xK^C<;Ah&AnYW}j0AAjiQNe2JROIim6`bGWm* z0mgVOkwx|}+;43faU%x(uw5X1`>MlLaMS)ijogKasCz)*_Xl<{xmqun&kfh+A3GeY z!A49=2e1iHFrA0u!+oMe{AC+EV1VbLp(juBq!Id?pUz2ZQsIj}m4QCf%M_|9lb+Nx z0mt_e)o(i~X?Y8;PY-T7yTP#C^h&ADBk8m5PXSw$#V9EUS&R+M)N3?U5n+OZJT_v! zGG-cGaS&}BzEU*7jTT3>drtEAn|G-J*TMHt2A;}z4(%3f7F(o=s<9`3Ac z?>kc1vszd<6hR-wUuk&P5r6oL)3^=1;w33t-%Mi@{m17_Cu@DS5OW{d^dSAILp$~d^l-12k;H6DF zt&nunE}589(SyHh>po-EdDG*b!Jl?5gpmk)Cps<+R&}F+ihVb4l!KQ057%(;M6ox? 
zxwhrT$6OWK5@4~q>ubi+FDt)<#|QD*as9m!G;T+5M@Y?%t62UIkt!NhT0tz;_p0>I z-G9BOwJfS4oDq}eTxhbZ`h=Ew(!y(;Q1dXU`M7rB_+@3oDqpDw@_B}8mIqd+7%**p z@%=p5Z7Eh+JuZy&DpqtjP4vNg@H6ctbB0vtvOfR3{D-;lFIInqAV>4=;X~BYcQ`Jh ze#tY80W2Z{2v^HElE=)zc0XUPh-GNj;zLo(>ysV$w^*Z&eix@xjuiyTOfWP@4<)3I-QT~9_-*ek?hi9d_7-4wIX`-> zM-iR?j7l00`=S^XejR2g=E{7`uFvKM9aop_3gPm%<1k`rDvo@fibYEnt>)kfTnp!G zg~_djv#&*XfS8z*VbJ@C8(1`VxS=L{)PAObKBZfd9K9LDHD)YYbq)71JfaTpBIGzW z&LO6JA%vEKC+8lPxfYpF93umAX=E_|@3s@|e?;;B%ga>!&$iS5vzJ+hGdhMdJUP`? zIWS#1K0j8sfCB=}PcMzm_04u}*01dDVq=5qRu1-#HqSt(=iBN1->&mF7FO;*CocU% z!QSsY+0W83LkID6(2e35Mqtsu%>!dUE)T+&NO|R|wMpK@kTQZ(DFye0Tp85;m`6Qr z*&rvgL;2kVbn1CR-kHt^1oj$PGfpg+8^8j@ z_09~M7MRQxPPGb`>*C{?gvUFmjXIQ3i;65~%j?GN$###k>$->P13|aw1TGiamcx6$ z#^wDe_?Wt!zoNTHKDe$_W~vD8FRyuy&ZMw>FO!n+ww?7+Jv=?nA=>Y7RY}6A zaZmA(M>F*yuYO%z?5$#bcRem9-#6FGX?fBz+-S-1dk#bYdzax$1|Oyl*}qu*rW$kN zW9GRJ65T{20cM)HQ=lJChd+qlGq3~a$V~!O$tKOAoS8;M!E%hl#UZ~F)~3xc`>RTW zK>gKL!Gh67WeL6cYo&o(1%6g2BYqL@$#^E>?o_b|hDwkZCA2gxrUACLovdR)mczsv zf`r@Q_>t_T$wgCDiqOGtrv9OMU6qVl;Zw2r?p?BP$lZXP4q6(!9OYzcMvP-j7F`SD zWny@oHR%?oM+&$aVggaYk!#z>W$plplj>f56D;x^2>}ceg#V)hO-U+eolRwF?n!M! z2)M4g?boSJL(Oh(Y+dp7=~)X9Es(2jQ1$m&-Qv=3eV(r61d`Y9FOZAGVhjjhIc zY(3A-a+j8j`^Wg;Y@()J+tvZ)yxmlel;~p|+E#KnF{J%A)*c@XDT5_G{Q334Yh-6w za-@GT;dW1oP*WVmIQyGv>>-YoN%FE_^)C^#?bhE1-Y-d;e@+r58+x|JD0Kdkse`OY z#DrsSIXFT*Yw{1+TT>rX@I&n}zWp^idcAKhi;Es9_xKxwT64ZpV_2p*=W2KGNtieM zk;VcwX)lWh)QWJEG{B*%5+aq;2#UTh!o}_yqMUY$u9URCVy+&h?c|Smb+i1`yvsx+ z9iid}HXj>S?Acl%@*g(+dJim-|5qGg%%Ncf)XFpQ*JAoK$E_ce+K(};c*Js;eFb;> zk5aIDiVD@}+_XkkXz{xXsK4Etubx!AG;eWJSSo$y+#*lOSq|@xW0v0C7Z;$u)F@s2 zG+TS9$jzGiJv2_?&5Q4oxX%o9Qlq#KRD?^k?&n%CGzHmOF@6lvwQDRjm@UcRd=%S# zE3U*EjDP)Vn)MujKdBHwwK6y>d?k~W)x3?>A*1eatrb=$RDnhaB6BTl$(Y2g5}Qz& z^UD31f2>v|hC_)n>~j2^s&roRD;k*lqXJdGR+aSK)Ta=pCLEo$O3@RaX?G3uRP209 z{_B%Z(Gu+?okZ2&Siv;Sn|l%Gs+lbHQ~z617cMCV!{6z7w_A_NlpM?Z81eDXco$~) qJY$4*-jJ!BvYK-1zD1l~R@#n!&EI^Hv<|qfE`gEgcs~OHB>xwH>+8J$ literal 0 HcmV?d00001 diff --git a/docs/en/reference/images/logo.svg b/docs/en/reference/images/logo.svg new file mode 100644 index 00000000000..b5ab923ff65 --- /dev/null +++ b/docs/en/reference/images/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/en/reference/images/play.png b/docs/en/reference/images/play.png new file mode 100644 index 0000000000000000000000000000000000000000..b75aebe40895e1e6d92aa2189e2c910df3eaff5f GIT binary patch literal 26602 zcmb@t2UJr__czSF3V2aLrAfC!=%Vy0N>n-tC3F;N(t9sn5x4{aA)z-3ASLwP0wN+( z0@4X3)X+l-z2-gWectu_*ZSV|e(PJ`C+D2u*)@`@6;zxDbBRGostmz#p(?RyG} zBjC~7B?=0U2NV=5FDWRb6DcT|om1-}vcL-}bJgcc6y)=NpD?)y;K^UkFAUr$C}_W( z|NIfhPs;{8q;^+U2U8QMsp+rKg?)+mKtW-JQ&v*c@g5_dI;a`>pcapdVTBUM-(ZXO zZCQTLw)$)|{MQ4n`$iXkm4ZEg)nmhFjLl0ay6ch2njZo2n3`4h=o9u0Ga(8dY8d&A zhwu|wmR{=8nA4_ID#O3&u#V|S^GYbwOJs+$LBG35Pe1PW?4%T*Y3Ty<8SJ-}CnRu* zyq6`NF6Bbu)Z?R65`l#MRZ6v`;}X zz57r~>X49NR$N#J0*AEFF%l?skRc%=gs$7-^Ziw49VwzP)-TN(+*%4K~o%zsbVl(;0{YxXQ-D@~UlwGat0Ny4p-W z-NpUu810$m>6a_`;bH2{8G1hU2DWqa`%3qe}G{O!2)!GjD zPoAAv={Ve3JCYE70}gMWYpD8$eyQF0+I&OIA zyi`rXzZ#an`o675Ja7EfSjk9;S;^&$qGpkL>8ibQJ%;9~p#AkpLH_oYl@)3(|IHR^ zE}bbTAX2OQ$6n5J#ip2o?tANwK9X(=Y5)x*C(Q1!k<;nPqC&It%saz-Q5-VE!^745 zr%H7vBTYFvW)LQ3W?{1?|Ap?Px%O!CJdeU*hi0Jb$-&km>mK7Ke;*f@tv7rC+rF0s zo!UjTk=7eFnvT~Sm?8)x&1bs+J4+-e-onzl2);G87z{?HA>MAJm^ZKy(mf|w0e2;iRb`ya^9UuV5~U&{B7_pqRaI4;%@xouxIvp; zB0k@kB=lhRjh)AlS;qD^`|r8!-`{PK%t~3=*SS(v8c7N^wl22r0tX+0q7DYN$j%RT z*MgFV4!35u4hJ>3NPRQ;OSBu`56kfQvbIIiTlFgn+tgRQuj3~_r;xHoX` z5ueJIjh3LG=9DQF6&1ceyzoAYWYXyw z;>D3!YOlafe}>Vih3QyPh-v3i%{#u0FKXwjWb^oRXdeMckAvVo>G<^W<;&w3w~>(% 
zudtVvkx*&o98?$s=@O~3O?VOe#B18@5U>` zxzn`WBhy)xD($h-u5eZ+El5urmJ+l>_BiVaa+MF*nH^)xLT7ZV(xV%yO#-{%=VIib zgIpacfteC^MNgFsR;w!1w;P6_@d*iWhlr`Oy_&X+Q-W|%es_+nI+%BfXNB5qBfh){ zjyjQHr)54l95(>tMn?~TrGz5#Fh#VPmoZc#1!4Xe8b&Zwh})o8^4_~v3@sXGQ9YV= zVJ;&7omUIbQIQYE{1GQRaih|(o4aDDPfnC$YS4e;O--V@RRqTErF~N=+@1Pz+qu zm-S1j=i!?;Mn}*GPeyf5NRvt(msr4pF;tUX^l z7w@}TbiQgdgJx95eKs)rDtgFVgR!hDzC(dfS;4GIq{4SZ$rr`265~1&uzD3) zH)%DacgMu$vc3&&ZXURM2g=KeO*n`c9Bnh-wq#Xl+kGN~2Bq{ZZ!K?{i%KhqO+QGI zZZ|u4muAbS;_ckkFqhA-Vp~5ITgQRcwl(1On(`WJ2>YJfIy3!EE4R&Jxz`k5Q3Q&3 zs)q=0_H_Y@ysVu3d}U(yBz-uk0+bTb>WM8v5SH|FrH<{5uF~45sF)XwnD;O+7|_xt zsz+sha#h!UeM(z#6i2Xv@R`Em7~A?~kM+5K}}BN#2j(FWd?;=ua@vV6-MMJ)F#=3i*&F0a9ID;B~EKN~B)_#lHF&OE%q2*x~E{n^-fPBG4$ z^OBY0`pLm#&z>%W%Bj~bKO8%aBV8{G?5)Y_?*G*B*N)d7z*rFr)%*(wEVvAHHdFzv z_s~ejFU=F-b4$P}lvhm(GMv4vb6c-pCM@Mi51j8dNkyK7vr=j5 z!SK}bk+@_oBC(QxTfwTq&Bol;{?;Sv9G{PKuhY1@ru|*63m+!Vf2O1^v^svN@3tr% z7h7H@x%R-|I2R@c9@)It;`EEf(o` zZT2!yk%@?#y=}Vu`aGY3z+zR$WSHu^Bbmd6|d>taH|=u_ZcC>`qV&g^K3W)%M3=vfG1UlP zyL2FnTip_cKfw%f(S$t7e4!Sr-SUT>ry3%8T`WZ=F6Mk)=nICZmVFH-@T;6l1&4tC zPI(<`8f)nyzEN+gx3{TRLBT={N6r0pdwaVj{Pww~>-msGI+qA}+2)&(LZR))MDDJ~ z58{e$Z71K!PGix(N;@jpQ%^{oJf6beSN*O z!rB9w_AE3yU)>-+Zs=*$-&+7le{bnp{msqK?@pO2q?0qa%9Gbl=UO&T{JqUfA64wU z@j#8+>#dr9R=4kTH3$AK-;2@+tDq|ZmP%Lta-e#SGN3dBFSKCa!ON52)M3)t_^$@Y zq3O4iYd1Rurwy^y(K^mw3?*fdG<0L?!|Q>$-~g+HLQuF%Zb!D7MH|4I`1QVS_D=)m z<+Py^{H7x-N;Mr5a!l!FsdSoGqyfX@^-F(gSp_ck)vDx&3Pzt%?=J@`LjNK_qI;vI zv4@ujEDWNey_hF9yB`fs;HK;(jH)hIis$-ailp1)M zM;J4!zBuHceMB(oTvxY@sTk8V78>zn_SJPeDeg3@NZ{|Z&Ir)+dDU|{1Kbw~YgzXv zb}Pp*pS`Q;+Ry{@dS#cG6oOzSrju>(a{j?!FGf*&V(X{93=W-7BcD+IJAKbP$7N@n zYp0#^NYy`|a|P_|Fy_pgEjs8TwNfBIC44F_7sRG>r)k+;?QrcM$fLLA$Rsh-mk+~3 z^@7?$OqIQ|K!XkDu3MC$g;np0`|fs#SYz_+oZAY_mhN|jFaHsSv)|xkU-Qyue>yXC z<8mxuW?+{R2h53Bwk?aPTYxoNNeO28WHr(?FU&r6g|0xGmcZmoVA2qz9d`>RyAWLq zhNeHY_DiKEQSibb=lb-7bgwIAJJ?6UM!Lbaz&R6QTTtU|h*ALYyIo9XD>hh)Bj0sc zpb0A%x2srN;c>HSF|1@5s=V$zD*8M@9+;Ww znT=Bp4`1(2#B9!V_hcuk?Va@6@O3WO`Uf^n701t=gqW@v8;7iR`Z^qA0X7Wj@=ESv24kLXtF^KIp!*bia#wvQcnDNmJrd?sCI9>-m^=%NK=170s` zWxZ-%$-ImA+mMSeK)%~xy?IlsIPaz9NJIf>d|b%PCo`cRDt|B?%v}3(b-$U6>DTo1 z${eZieqy(%Mk5~^NVH%5)Y@5njd1neu6AdC2?x7-t#>BO1&mvGQj*=Ec#YM#j{Bp# z*d_B|quA3TDn;H~K6mid-=(umc~*9)FrXzPe@1@znv>W+Rl^@iB`244m>-^(ba}PBeFEmuV31m``1pr5*a_{p{Y(uC^OMP$Xs?8GcTL-cs+)wwI$ zjAgYG()1tX+m(`d{-!GAuBdZlMQNk&pB5Dam2FP-%uC`c?&3)WEsnoj1} z{-0=%WPwxqh?N2GWOz9Xhg-n_IJroew!)JGhClwpssd;H^C2pWw&JxwaW~YE>RtX& zzG;NVtwQ&{d$$Jv{;DGQ4M5wNJ4AFbUkdp%ooQ~E@&3)LTmB(WRv$9$^GKUS@=UO z)cHn-UFgZoKeUnMyh=+L_G)Wqw>sWLK~<>AoknuMW}RjwZ`lBI-N1Ga2C*=}t95H$ z*&oB=PJe`NX&~yn?i9!9t9Oh`E;(2|$utJLSdF;xCZxS^V1hANzV_U=aTZ~Xex6HJ zI0lb3>d>(S<5Jga0MUh%cU!rwXAHMEhrr8O*(+0s7bxkOLIAsvGW@TctnANzk4)}s zLq4nMBD+1a^L3>FHDHvgTgwdAP~T5>W9*xdlbn8#!VwG}@XL|#{Za0!Uy|u8EBR8) zfZEk)Joti@_M$W3TRqDSL(*b+6AD0wnhE_Qx>}IX!n#&b5PI}0qa&)E5+AE@A&D1s z;r`w%zZ?G~J;ZX+-_ayxA=7}pIz@(>nth==Eh$AGvX8RoKKD%A!t-t=T%qU**06U8FA0|QUstI3;I5nzX@ZkMj^l*5wA1^%JTH92s=Hur zKWB2^@VdFV$;dBl4(!ug>S^E}=(zd~0K3=T4l$?pBZ$W^6JP4*Of1EdHePl_ zJK5Vaq%she>7?1Un0%GTLQJ38y@*GwQp6w=o~AhGMuAbHp|)jpw} z{(tINl``=x2us{}HvLL!S?orVraO9d!P3Csvvz5JK^_XEO5)!|sNsG@P3z_QZb!Y- zDg9~ZGFqH=r=t*-v@Wq2T92k*DV3CYKubkkG-hOa5DaKuSL$v)k%jIxSgcXasS zt1z5Yed3FIvn`4Zb$+tRFF?C5c>e&~1Tb$J z!S6WnC5S4l5889_7J$(7lU9~0cN$vmGYZ)-F!FV-xHq4N7&I-dwOwBa5>1iVgW7>q zqnh1ReidV05ITpZ(-!wGHi^zH*=^s{`-}Zbk8Bq`y+(!A{X~+ape;Y(nv-n3f4mZx z2~ASPsRDNNySX(xvo$2xY&`i1uPL~?vNBtgJ!ezLG`O^#Uq!m%>M0)MC;SR9^)NHe z`f~SQws<0)B~N-}HgxK=h?FoSpXOxkTzvVQw#2%MLZ6nJ7`VrgQ?Rofie0lHi!ryOuhwK?V zBG`B=!q=bh58| 
z_}lYIDl|)SrE_v=(Jm!RU4eR?xflWgHg46$Pvv#qjZ^7tuJ^yEaaXuWrjF`_)_2nG z*@(&(PTtQ*23C2IUc~22d!Z<40Uv&g8^C-aFhTZ!SqRvL>pJ!CQ!i<%g)nFJEtz{4 zjwBbo?km%%=opieX8dJFYO3+_bm#eICJ40i)C+Rk?U%a#MSmfgfe^1jpCEk3wSm!Gr)Wa5;7lQ|84y>#qPL}-Fd9=m1U$o>b)kI0Wf0cx%4Q-%L&) z(TdVy=ju`rfdjQuI+;q=lERtVsl&L-?fRhFlPfnGPPTyv$#D_4>i)%X=-v$v7nqaN zM#k9=PmadPvS!c^=EtQ5+J}orP;si9)ck;g2tD; zax!eVN@rzO)4cGI9trY}rVT*~rUgMR;y6n{ofBpC>RNIWGw~5F{APvyP+vd4Ayj#7 z?dbTp5h0%t$8rURfaG>{hv`T!l~&~se*!`ygKR02A_nn)7B5H#yAHmBSU(WA_-r7` ztY3`+F^}rI|C~u|sM8asS^4|Jqiq4`mn6BzWo0-HRZ40iQ&o5#b(f$d!B1Sk9jT3SZo!4L)*IeB@e<5K7mf|rdRqS44LBT}RoWX9816aC zuN83GUR%gt8q}&;IHDR_{zn~aT^O~#Ws?s_v7=!=x74g6hVc2tHDhXObon)7E6%iP zmNX{Lr31E*&Q>dT^IL9V(p0%HLL-p}h_4Zoj*))Wf%cE3NNx)7h2Klj#2OR%)FOEd*{NHL-sT=(4Q8+P#kU*+CXi~&x6iKY-_MqHEsIMxKep9X z1>$)0XY$z|F;>mKaos$EM30%-ciqRvC47w2eucZq2sYrY2(D;mW;aKd^NpD6-rPx- zba`^zbP`U_14PZ!95_S{DV-8Da|Z&On@NVk4ywFf^Tn>Y@%zW8K#a0sRCBPSrsLrFThf9g=z#}hMVuM6WB0He~6Pl=+px4*wXI%3v#f{6ROK}>x zb6W=h(iTt8I}Y4DfA(y4%lpWK`O-x`*%`onz&yV77>xTH01rn^pLQsSeCy#?;p2?_ zp4C$3Xx9Ng_Vz(5-(ih>zL-nJ5#J+He}9t|073xAf(~%#X;gap`=Kz{2!Q8k&0w2G z8c@@O0uTVi9__9IFsc{73W#A0_smBNk_QqF{`S5iFLSLCmHqAl6QaYO$e#F*qw8Xm z3hH$l5F|F=;9RhQQsjYtA*8V#^P8K_cVb410%Kp?3k?fHbljtRz!(AtzET`G#LOGrVkzv zuPrfIS$3L`YO%Cfja&dZ-Q0Yz+OTJM9tVEGjp&q;zOOthED2z}EXuJ+L;w#$SscK6 zP1Vl33Pf-L9s-zhXZTEw08giUJb=DbC^IFY*_GPF*In1vogUFY~Bn0-bUOLjWXFP(Z-AIq(!f3e5bs z+J62j%x3;Bgw8mid;9|epK|Q&?LlDhGT%AY{;9E+UwCJSp0BeD3VF*1^Us0uR-u8E ziIGs!jou?6Ey$)yHG?jYfzfzGV>-pM`4;|eB)b@>0As$uCOZQEI>QT~{uGU}?`8Vw z#ey{Ch8Om}*NnX%;ky3h((U(+UnUI4C$`8pIE_i>+FC+11!2{3m+l`chgiAVIUuf` z*SYqjKrN&gTAp)>@-*8*inliE=ZXaI^!!!1W#>>_EciYQ0zX!Ek?J$WuC9LFdz|)c7`#(noBo&*P-JKhLUm?rs7birFrmqTk~(Ajvr#!?C)utacfIdkU`h6%*-djnV0E=e0rjtOv40p zbpSqOK2hKW@i20dQo2@u^`Rkd_EqT7M)&g@Y!dWrk8nX2&>hRRgfBlss1sys+G{@5 zGcd=w3Ig3%*{4^R;@$t!D^T={F>8)A&sZtRKTbACM?3vv-K3)r3&B2SAIWr~LbA2< zKJrbJIl$nM_PhX3nN;==z=VHMeQEIe&3Rc~EonYGS}pk%-6O96vU9Y8rl#X`1|Y;) zM&0t9c|N%3*}a$-KY+Or9zzti8%zZr6M@xpywsP4{Z*Wr4!M~kMBHau+R&hw7YyC! 
zA4!LxiZ^ESfv-%zLJFnd>pm-+^50gMPP1a@3XXu)qw)m7X)mtBbG58s_tF8em?p4% zWr{W*^1=-)yNj|~E`0^6RiO%Bw9)fE*f#!m+fGrf0?SK$O2; zsys;gO{yAV-{FP3uYA;HcJ>wE9dn5dxPyI;Kj;n^l>Q_W$X$4w;Lm_=>goZ-123P0 z`!urWVMs=0DY9~(wjxOsUT?M0wL`CbrWah?!ok0l|FKFBw)hW-JZIN;87^(TmW02h znU!Z8GrAOCDUpi>6-ZD6%EzC5AxZm;IlGi7{6FnF*I3ix@dnbfW5(%u|TZz94e2@Ph$U|FpYrUa2X7*uNQX$efI8z zsCAUJJmR`(Z63O{ z#KzvFCBQ_IV%vREeRR%d8yokdCuREgrr+c1Q@phnfeVhJOII1&#tX*@WvW}F z(=1Ujd-&jpkzy2}1sLEGC}aspNYe0-W%HG?P${53>41uvBMMC8-VaI^ve)5BTe7Ne%n@_ zW4l_+YVp#Fk#U_-!j4P|vcn8wHg2}BA>^W4==+dUD?gSO@ykow^iV}pu zBeZ^D!eLW9+{erAm6uvQo06hEN$0?A)nV9j{KYo|9(ZUyci6}zs<#Am9i|=N=E!UE z2@A5~j7{wATj)Mr6R|eg9asbe$@!9zYsr&3NFf6tH_O1(vayX#=sAF6o~*|M_#5nC zmd)l7O~?m_q?Q-m)((Y35%07fI77a_kl$?^fbcLyDXDz8+8G_=?Tq$z-m2GQsLHw2 zrVWxv7*W+@0I)mPqt~$v+toL>JPYtV5mY@B z&$6*`r8JL4Uhy&ffbz|3V#2(%+4_FZK;vk9MJb>HR)9iXl71S1=0PH3|F8}g+7FIv zzUm5aXSkYgH8(cU4%5_=iD!d;gYd8#p0;^;*TG|V#M#_;^@5=B1dAV&7C*&mX~;}R zm-$s%+I*AU&7W>9A^;q-sAqJ=I#Q5gx-QB*?)DmxuLGoG!9#^^+ZE+Gxm%jOx{8t> z)H02+2an|B4bn&KoWnQb=R^`~+7i*XWePxy-(Seq8Y$Fz z13@V)VRtqh>7c`a_1p&3!SSj<-FBeVb`Yg!tFJ3JHNEq{`IQI6iojlu>?p&>`Ox zl>u!f>o;jYJh_VgsW{lkSP+5(5YpEiAWx_7qnz_w{qb4BrVIRf$i&=^TTFY`oDdq< zE-J!cE3-h#olDkecZ$!7oiX~h5fDtFf=tEwy%yCwNM3QcaT&j){hVSfp?mGct6avM ziJRfn3GgRioF2HZkcaPJcN_xlqg_W7huYHJWO5md!-ZUTlUTWeZR&4cX#+xadYi@J z4Y##9XwC`0Ss3gL`QG_jvIH31ndt z3fbhgb3_%Uz}?~caQ=33o(7}*9nQwb1hhP3=F=|u}!~&a4 z7if42HPW=%p1Sg+1d6UB@hLCE?}Qi&ULCvsIADPY+Tqaf?I_7a;Jl#D%dnjP#*4saqbbj06SoSJARW4FU+ zd$Wr%K`*CbwEG@TdC!fuB)6LMHAb`l#J}gsH{&SEKg1s*Ndp|-WW)T3q|@A|V#iTu zvxcH^rwT-{Md0kI*i_eXxOFr~q0~vl;TrjDff#@j1_iN|D{P!pZJV^?<$awqO36#S zyLZyNy$Es?M#(@cK4xYl+UC9Wv*f$EG#lgJnA>SL#=Q?=Lu| zC+nJCEx`nR;+9!@k=Dyb$@n3r^TSoX4}eOj@ps-Tz@5TTXI6SAyV`u6PSr5T`%|Z& z7`{pUbXlKRmG0i29v&6O$k@9xhsCYOv`s-pYa2c;VC7V4!-@^8F-NTnz|oeV=Ps3#Z=) znVmM6ACglJfH|%d-qyU)*ErH0%XptWi^`1}ytHzRTAV7(_t}<6PEh(AkZi>`jHMrA z&QgA9o6JR^-3;f=Q{e^r@THTrNCS2H+O4xcX12u?yu(Rc7 zD;5#GU+z8c#INb_ZM8KV+eNu! 
za|cs6fke5z5q4=$!5GA$+AJ`0%(VU+%x2xsvijH-qdD+3Mamo$6zRL5z3*kgUw<{1 zud@^J`Z?WNmmZQQUl=d1)}1wYr6t%0$g`YlcWb8`VV4!p zk;jO?lXK!6gWbLK1lx2WY^u32GjL&RqRxwM114WV{yDyQhVmEn2E=dXwH(=dXSyjl zIr%gUFDsIBm=0tUYhr>PU!y&J{a`;PvTK8L)T?q)I66X|Lq-R&UsbTRPsdOs!ANV> z?Ebo7k=RPTH*`ttc#q3>`+#6c#L9c_{5)e-%)@zY{%FVz#u-U?9TA@%+wBn*oHv)s zVwx}dMXi?rZ%@0TxuP5EN)D2vQV&%3!sO%=FiSBt`$PU4?t6cBO@-FQ04!TC$7UvascX#ridh1fDFl1c<%U(ykzXiJ8+L0`}Yz`7lJwV6GO3fTC079a+c zV_Ht18lL03f*JCL{;m7XeurCgg!I|oyc++z-bbfd)cYfs-jzC$GdDQclKievd~rMy zSLJACy7H*JpEA1$Cx_+Ht=}vCR+E_GQ zh+|%nu!Ea|xtuO)`fmW-3^=A^Kf)_Sku6ua=r+(euLG`JP9R6y`;b(2mwJyVX3C1;4y5`?R5=+B@~<2R1D z<;5VAyPVNzMzGwHvtvVJ{m)e^^Ve2Ba-7i##<%oOkN^+7J)uAct6pAeC@;Seu7v6X z?VnGPMw7a&I(+i)%E5JM5*(@#1KIl=R3CZ@QE zSgD2+xcj3rmq%qHn99A}gI0w$E-KD@7wBTO`w00zM5&_66NYXes>aNBm8l5I@WskU?G9BN(q$qWSLoIB+=ARdn~M9 z38@$BwepN8paL)rP<&e$!_Mo?XdsSa@J0T7NF)Wyp^LM0|k=b7yk6q`@@RPgz1asPQQT!5PA z=tF-(bqf?Z-;%^LuKNps&Rb7T$9p!OLU zQG&q|BjDBm4j~9G{nP}}nlY%ykUKJFGZ_9Zuf)MMWKjGTrLo{HRT>D6YN{^7vS~ME z83>E}((pY6+?3%4bH<=cywf(KD_}@&i9=<*p%^2Wh8K4465tB`TnT@(68=o*=YLgvUKN0?ihDFZmItfknyxhoZo$a|a`9h*?=hq`69if$gFC-Q_p^5miO_f9`d4ysdx;4;wx$ z7idrQND6xMY-Rmf-_O~SgGUEOS5aj+HiV_ra?S867%-rp3>!kGmf$bS1wvAYI4~;T3IRj z8-Tl@-6Mu*r%}}Bb0I2G2ScnOo>0i0DCxe@g0x3JNL5QA69svRXw$)(10Wp`QA`CH zL8b`)Rmt+GE7u?EYl48q6W&(g0f#}^iU$%K7x?}$TCt*Bl_b7=5liMq#4d2(G!%1a zA1UO={8ZgmkIhwPNNKbMUl-Mu23*lS>sU+Pq~p&YtHejdfB6#gyh1mXy@K;bDoYY8 zglax?j~5Pz)!va+1bPU>luf}gukrB!jK5;XmOP%C{upWm5v6?WD)G;eRR&b~iVxy` zY=fc>lQ%fk!#wTo9qO?URl6Rc+|l=ZShwxK;qo(YZh*1H`yYoA?F2+Js`7jCMpgm$ zLWAL=!+j9o&jeb#VfsG#BQzMkaYq|MNRE*;%H57NF?Z6dzmHAqU#iP^uxy7OxC}^d zQ~uD*UVFB&7(Gos>^>#?RVv6J%PN~up%7PBBf!tc;QVsb4jpY?9ven;Ymn(7C&5=WKEG950}(!dUZXt7z#fFoOg3N$!q=ApljOt%zQ#606aW-aXLRBf=^qs}0%z)U@Sw ztq%bN2Z)sF%wzZHBtw7I1j|cel5V*>zb+<)5(Y% zKx`tE$?TUSU6kLG2#cHt1n+@p!_zVg0VbOF4^MB+=FxjZrgtwKf0oYDDD?X7$_R#y ztVDE_vjb5Ha6p76be-xvRSj5r;0eLVDyo)G{?Fns6j7N3Ec`SBR)$_~S2c6Ti2b#z z>gylxbwPg&K+)RGyt8=&L)BKfL4QV3P3pA+C%AyXV=laW_d2GN(_M18LVTFj9R*@w zm#pyz@=kcCZHs>;J1^U%u-8rQU4L%k*!lu+*e@Arpg_8)+kQ5$b4MU08U^1Pi*|j} zG^`&C*VI0}u8B}&7`kBF;reX=n#&AiM|&KNEv%^&WKWH~{24Bd$t><*`RC9u5(UJ; zz{ox5_=>0a5Fa&%V>gGn)U0FiP=qw5W%TDlWQg%qUmzT_%d23&u-Wnsv=Vu>$PwF; z@N_-iH_J797Mjr)(HRrHnqu;YXV$s2z+-{Lf@eq|59fSnFZcO76T+|l()Sd_y516|L>_d4^W(s`2T^y z6#uu`_#Z>~@5(w1|7Q~a*CC(x=09`t-<1Ib?w95ApX+~<_)q=wZ~rp||3`uP@5=vk z!v8~}1PcC(Tl+Vm-U`%31n)0mZ(-Re|D@61^XM(lNa*_ZeVCE3K$8B&y)PBTfY|?! 
zq5me;2;>F!3JL-zZqEDH=uJN91yDV%`A)!$6Z{1oQcUA^>2pxPkrpGnlqcS&*MB#- zz03m~zZ4%21FrY^RsQ#LHzE%ZZ{p)qsm}RLK{hL}yt1-ThoLG1`3q?Hx9_Noi3YD; z`!jOD>27Q_Dvu5FkTJ*m&c8HSb`c>Vlm_wi;YwQ!KBIgpPc zvEBWw|AJnCImL+lk6ApnYeIQC-<#iF2ZvVY%ri4Oz<6PXzc8>%xBVWpsVfRzA9IBk zXUy}dgv3LQb*y07v;2XM;G79v`rpu->B~Vr=(Q>(n=a*$bMi(#ITvBs#+}Be5i0BL z+`o~wMjZ%`EU8PmE|9c1_n(ZnYSg^BrH@gp;5l@>z+BKCAX zYDcN|u3TR=!!CL#oWTJC9EDQFKdr2Z{|)Ss{><`(yArZL0eu6`354%ORDyG+rA-$M z^=6uG{Y&$AF}FP^h~rcb!Ha zRyvHHS5i?loh~o7h12v>{X0k<#{3C2n$8ZibOf|KRYxH~={J*n(s_&T6FNIMmOG3M z19$gOIv}1Q&3nI`lxt5+U#N)F=cIys+MDaxV1f{3-0|p^m`83_TYTkAXGek6{qZgE3_^zujckZf@nljk4W^TD zR5g-@o>z7SX#EFXJ?~A?wo_BObkoExY}F))EydPg`GhO}N&lX(3Twl({h?y37cC2U zsAX(}{f&RuEN5evoZ63dU1Xymh`%3g2+_+1S}WYW12nhioZZ&3P!hD?yDv<3TI~r+ z;yjT^!IO7m_Q_RJ2PN7)0Ucc#X9O=nRRsNz>6nRG*L94Rn&?@(=93CtlOI|98DkFM zvB`?eO4)>Xp?x$tQ0fMY$7p@eB>HGIfw^(7MznRYwghI=*iYMyzu-6wReN)valbEA(4dcAKtRT^ zFg_8p);zybKE9SKyU&SH0i=WGexe!%RRKzR?XN2m^EKlQFc=FuOS#6IM?1m^57vH5(pe~WFf}60-8=3?~4X}o*eer z>&Y^XC>@uUZ#FwxokQ?ARMRJLS}2r@_P?&!RkO1D9_(gPy>|xS>c`mgQD)So^2$bk z7j_?GU-ZW%A+$CpTvFzCik|y5q9&qB<^%i>pG8l4o}5e&VQ7?1t`-m{uYS~3hOl2G z4Z%@qFEUZ*v*0IJX2vZ|wmn5WJ)eX_I`UqhHF=>gj{FIU;DMcuV0&qUj>9%OgHF7r z{7JCYH1cx+y=9b@aP>)5X;t#w9)&e?u#Y!+_D3Cp%ZsySe2kdJGm(`b<`kwGh__t5 zxl0)kAeTlIH}Y%GA2>U%n^``Jb(-%Er)|Jm*?GCAFUfg{z#a{soXw<1bNs5NJhC|R zwmMy|aYJruv2=W#!=z^$foNIb@4P#e>uHp*_Z%VAK&*}q(9UU+m&93Dq_KuC$UmG@ zu&mhinIi$Kv|&-^RmJ{<{s1yZl|#J2&XD)-%hodX?Y&`ccQqKeC8mVVY)wL)%-H!25pW0^Z>g(qXlYu95j~3tU3p-}D%WwMISM2s) zR1Msl`Osyxx+QGPDHVWfb*#u}suk$X$?52r7|60#GEw9?oY>o&01Ku9^0{WK>hDfy zY#o-7Li99TOqrZHIH>gVwS3NK35NCrm7faT?xbyI5ATQ-0FHpQ&A){?I(%$fl#W29 z3WH^0pryn7+K{IDzBx%Bp^e)Ke-1PaobqtcHVAHaZy&B(;>nlrbt1DFDLCbrfa$~0 z^~QSSX_^7cK3#)veH5^M8z#40%%&C}A-Wp2+a;WLkysJgChP`{M!JwGr4fOrL9KIiIa)%|jtT5Af!=hsEtAyr z?dR*qPFU{=6JpdveSSx8gT46iR&-dve)jk(^ri2?`^mumW@L(+^G;fWhy+XBMl)lR z_nT2ao^3vVf`0-wuzX`>&D-MD$@d%tZOYsLZmrg>W8AJcaBhMS(-ale89#Yuv%aT2^A= zB!+S-=Mim6mi0rK^uSu@z1M&O+o|g)RVaTTAJFP1e;jF~$Ivi$(EgNpLx3^?_NU96 z2>ybD$6=0Fy++qKGBix<+@`p8_5&se&N?QLc&D9F2adaz+~u4-wf5DX&tbibGx-OG z59)6i2Z?$t>ogQqEN&-l$y}a8WbtrR@0m73yRA+F#M^(y?HY&K*Lc-GLRjthdg7$z zgFdG9t+<;cb3{*MG(K8PX>|7_wQLuFk_>3m*5Xt*YvdeMaXkVm9*p;Qk9(z~8>;TR zws|$7C#^P?xHythF*i~%Yx`;IMEjbIx)UvcW7-KCW9+SG96HFnsW%GC5Im!Py3F|N z8k~%#>844eh@YkJ!feh=x==%rwd+C>4Eelfge$HDWRiP&h<@oLcS+AJ$19Yh8jD6^NL1VPS2_f6n{aucXb2!;pd!pX#_94Oc$t9eTvXF*&iTmmPa@SNG`CyW#`8aB) zgKWZj=+!CEITsTYw{n;v9<)XfscUr`W z%GL={jtoM`$bQH~I2dcTk-{YVZmdO8<4_o6XOP{s+I5ASNk zjaT$aK$(!{)$?&!7mT^D@T+Tzina?Eskpcu`*Qf@1h+_40|dP3G;Ot&u5GxD+AnQt z4d)d=?CP~;9U}VJ+rz6cz`j33-C8Ial@XblP-Q$B$_Z6XUBOfVv$!DVs|qGXBHDjZ zJ_EVE7~gwwP1b>Ny-+nCG2&tS7|DPKYzpNO8z0z0)T%Dha(onvKX0u7bL?q&Z%m{4 zim-kj87&j@^)c&aww2O8I~tqIB*la?&-sT^^RS;cw9AnJw+%&VxuDwu+F#zW4DR$X zG$PI5V8Ja~XB&nEF6MU@>0?&Wl`VS({JO^@E0%AaXuL+p6}~UrsUdmg8D$==D`0Ab zd6NaQ5rGR2Si#eO>1*=IK#}$ASnilhMsoC;+gm=`2d>N8VJpM4))RSM^;yY@Av%Z* z?oCx02Wqr`oJ}(?Dv|j(SbY?vAJb1y$|})8KH4t6QG3sNB1K^cv8arTE5~MXdn;6w zdnHRt{Xtb7y7}-`dDt$DS=8$e7nBtKtvn15HC+zp`hEDC;&O>}8M8_ZF6ZlSc!g1< z=o?X=sm-bc^wmv8#R10!87?!SZAm7i88or40n7D`t1I^0B7F@=*E)L*YnhoOXlT@H zFLLoRJk8G@@#RPMx;n48`%+O^6b4ts_iab{H>eFDmuS3>Zjk)x@(!*=D7oIgbjtYL zjA6s|cGu7Gn*|a;@XM)Fe^1m@-j(MnKOfMzTWM0u+P22yVFD8ouG0j8QPJmt4OA6; z?KoI|6D4|oM5lU@@YOvYw&KEFpLAErUrWZVt$h9)Ffu{gG z`WK4SnOX9r1V61mIS}%h=_2Rju3C$J27!jW@=wXJi9(zffrdEE()EJ)fCsqkTHgzz z1Au5mvimsfY&{~OSFAo&$(f_Bpxw#aHDp2cMa5@c4x;MLTQsj`W>%`1m+5#<)?e;R z_Hm0FAtx6X${U$k3!3aa+U zN-snB%H^*FBmEyw5%oO}ORF6psfybfP+*bQa8#~Jj~d781~s_^1k2xjwOn#TDMDHa z(`;4=*}w!iF5u6brMk&-W;@9zgqe|xX_l~NCD+_-RzpupM8GuD9;&LkAKQ}BiGzTk 
z_1nr;Clc<;nsr4FE*EZUfRT~bEmOYUT@h@I0dytbkdLjP`q!Tb(^}bE`AcsaymB+r zB-s>f?526mp)IdP>&2NZ&WLCbdn$IlTXIXKmb$LvI#h-n*^MwH4tZ)(X(R>0-IIO&6H)DUy!;(4UF{JHyHVu+!6Wl z)t$YLx3k~{CO@~$ect^ab8qb?V@FB9rQYny8 zz~WT*4{Qu$WnA1&^hd4Q4j0QMPn*cr>UOYX5Qa84^T*1k(ZEbcI62ksWMvPFOd1uN zD>yw$X)vEKF9ZK9K_=2(pk`{jZn>)COG0eP&p-EagM4}GFPH?9jWKnhRV%CperZF< zZ3VMk#QtUT z#G0(%9rMlC*FfNTpN+5k+JbvYx|ZjLgmuP%jswgEN;s8aZ%{9Gb;!Y&&3XY{;*&k; zDC;5+E1P5FoCBV$42F5`d~818_C?$y0(9(q^E{PedMTPW89F*2)^2LR(o!wNR31o2 zb8!d$nE3INam^)ptFGNf(H|vUsoygmS=q5m^(k`IG?yQk@Mvo&5V*ox*`BZ1G&5^n zQK#rEG=b=-rE6lq3l=>V_Q)npFt{MhiG@Sy#Hw}fI|+MT9O>2WbKF}nV5p%#WQ~3+ zty5-JZcX`{n)Z={cVEGc1^0AcV6AO0TzfC;rn8!K&CtW;eF@_ZrBEZk{n_#v3+L`HZ6Cn@%Xo8ioO8fIUCEwSP|wrg zi7VY{77=HmUvvKx(NF~ER3dGjd(n7h&P+T1wcF$w8oRQ2}yS{x!S?*|vo2BL-3 zNqC4YN`x$H>6S!MB&HAzg*V4MRrlv3f*wTgzeQEYmmw9yba`el9?C$=-2q_;Dsbww ze_(sTE0B$}kb60Capn`*Q6Cdz92tbZYVJA8MQkZ2prh9R4BJWuaY)Vmz=7^6nP6h7 zR3N#bJ2w|1G;!G!Rc%^iALljMT@+;g<{$Nhlb8|iogP+eU7RK?w=S=|KRFf^`o2(r zq|G(o;4erIRhoOeRoi7}XCHK*d2g$3mZ?sTHWytVmiKIS8y2;F_sx59;d>t+E;meU zI1QxKh&iW|0t@cx?A7vih8bu|ViY{4xkzW1>s(+{>{%4qpIevY~2F}-D0NORIT3sb;6uH73+GByOx)5ykK4GSly3 z>KV9l;84dn2u^#k?Nr)`ZmVu9tRW{>x;T6Cmnl}mf&6YNI86bQl=oD1Z6B3t_bE{N*6gb>?o{f!hb`4`><@*MamQ6N8#n;e z9Sm6BiMh>uwHaUP!pgn5Wm~%TNJ!l(KLlY7p73lDLDO+vHI(X879?d{UAXm+wrJ?~ z{QZe@L2>6;WDWoP*Uz z449C$`m`Q&E)EKHPrX1CpLzV)XXMln$`b5WUKQ4Y2JY4_HJI&9LVfnuq$c*)D;u`G zUg0=52e*^mmo4~3_uWu&O1I?c>_9pujXRR&l-FmoF&9` zX?xwWht_`VaqD!m>+I93VkfE3G|#7Wdipk^;DgOL#g5 zyj-@XzXO_e>gzh5z4KPB%qv-;vluMVNpq*MEg!5i&;T};V22X8p?gM5wbU18g(k#EqsbKSgnD3 zZ|BuDAtGn>NrEV1;eTKW{9y62=9Vj7ry1~$lKbT*aL%>;b0mZKbILESrN1rjY}oW6 zZ@3Ow6ngDC*ymYi{S3TE@38GHSZX<`xRbVLL4tOL1^VawP+UKGyla82uT?*G=~(^&jVj87ccT1di}E?5P_J^AwsA+>%gINxA?`xeF8eEG(m z*wb8t1;u$KSh*V4374I|V(J{il*`@Qloggz(;w5&>~5RcLahU<{w9dK*z!#PgSRo$ z7UbZpT{|_rpIQr|l>EehzH4aFUHNnW?DY2!L%n7u^|G6l?a^PV%jDMAR>zZ>#IoLx zx8A$Ob`NE#c%GHiMC#et)SAsYJ;o|(0Sb$#;O=s zC@vR!ShNR*^7@kRcmW$S!N>?a6zbn1niIY^vN%KYTw%tkvWvy*tVtKue96u8-(#jI zP5C2wQjR$6XW=fYG=*yx`SvowZ9!-{bzPc5O*X_}^`uTo2sKvZK-wkbJx5-oq;~!l&~^qWf0oj*uZN1UlocXr0DbkM zRqCH$+h)@`n1>%j5p3D zx|Gb;b;ZesHlv*~wT;!XLE^g_?Ycc!%xmhjGC6MXU%NMn|G9f3dvA`fD~mL`ZZk%h zSLfvSRY=9V+P2KX1Y(KA1{mzFVnPC^t6+=qP!f9*?xb zraj|SS|xlx2PQFdOm1fX zA=o8#vNE$O3A4bjf7GoM7WbI1HHR0DE?0cCP)K`H7ePhqE~#rkqrbW;D;|D)Mp{3P0JC-y zUd_`nQirk~UH`sJWg}ut>5*3d?CItLZ=Bqi&19BT72#JPmJBi!v%Up zR;#T>m()Nc`T$3l74OIn_v^NiO`8{kX4K-SjMbP|FlD46zv2 zzDdd)vu|@-;6GT(&tA_a{mM1TzEL}Ws8Tz2%w7*98+TREI?h*FtZ6E=Od6b!_84%D z_eHFc%mP84^f2?uL)xx5_$ZlVaF`CdaG39{5F()UEw5$~$`QY0WXIk|2&w;2Ayf?d zoB;{eMl19IDRIA+mCx-ToZvOxiXzAe?JfW(dy<_C8F`7x&W8v^x$ZF^zP1PohWZJB zmxIcW&_A3B*h6@6n8_^`0vZbwoUov}k|IdDYQg^XWN5OC^qPmRhTDuT)qlBFiKa>V zs7Pj*t)E(gwXZXA0EO6#tzf4*FX+!)=UGdo8#}FgeII?`H~ku%zJBlxUBf=|&Ty0Y z8P3!!6S&%ZB^V%eyNyy43yFv?6Q`cw;WTnFE=i3z=t=80#lUj=iqg4f!J&*?ax1~> z0jtlISWYPUO-I)fviYxvswkaXRSj63U}9W2p>&RM&;4q$R`#;zZh-a=o(2@_dn=Lze%lrFO`ttgChyVN)-&@IAle;r&Z*dU5+1J)4@sZ~n zd_DR6*Q0rXOm2(YO#iJGrux!HU*@qd07RjomcXeDU5r0z)~6^Bp$po$Ga38=XNKxt zH-{(RfeR8MJ8KDk=8&i=5V=Sq6;i8hkGhf&>u8 zgX}q2R8()+nNg-NFu=*;B>XL?XOC+ph*Rpn`zQ0zeT_^_!M{cZjYEzVpB4n480q8o zsw&2gnIW>lNLcww5ewT$qu%*d!m#ftrPd`-ofEzku<{&2_n~pUSv>Ryy#-}K*4Og- zXE^@YeJ6d|CI%o6-Y)^F^C3D-b~P9jO*kETHk66`!bznYbwB#*=bf9-QL@;f^yR%d zJh%UTG2+~b@yN~_Ubg`B;R$LSgwrsOG;F5PHOi!XCg7Hk-qYFaOA1vR6IYUXUOeYE z2NRgtH-GwJvf?dQ9eyy)ecWF^fgubD{(m^5e^2J5VeN5X*nn%UkQ+r`1!(ZGzXtw0 zf%1RxKmNY1Go2=U~|f4-ctWeK@j zW8*~C-HmuF6ATDNpsKX2o|%?Y&*$5Ch=m-kPYr(}A!!LtiL6;Yg)?^O^Aio(?sjbH%r3_xn!PVPw?TJVrbhR{7XR6MY-X;}Rt z$bs1J>?4KOqhS|Z23eP(Z9?oIS_4EK{qt+tqUT;VJrJ`9dwO7Jb>Mk#V78JoU0Cl+ 
z19%AG>J2*{4F83`%Z61zZwfjMFICnQPzLo8<&REaMGo>5MbFrY%}}vcgB~Qomn`j~ z5RxET5x1cRbgDcTO?E~uW!iG@vE5#y&G!rCy?~2gOf-77jk7Gj@ypp~$_hp_h_sKh zryb13*>Eij$c3;Ib%UpZ-^gYfg`lI{rN}34m-1h`+x8?sH$C^A;W-X7V*UA&LHZgW zq#B@%zuDQ{m;)|o)7A6V70pB*dJl&^Y&-{8s@(&XC_mcTbMZZ1M&p&IB=aCNq*3ys z2bPB%9&VV5>wrA&Y=VT037@@#clC)Lxzq6yE@;4F98qOTUIpg=c@3MF8oG+(ES6^w zS+5J$Znz9hS~_wqdvd&>{W1hVw?gms`J6`n{A!ykF$c^o3;>5sw+~o7ME<|0{r(?t zmghkuPaG`SXaL&#lz7+|5gL2<>-IbZ-hbhP|Fc2huU&8(Tu8*r(fHb9tR!oo_07JU zO)C|D>=>xuV2F8A(vq3q12U6vz6-!Xj2U_(2WkD`rZaU>9uBQwHu09zn$xLU7(S44>wfJP~s;qBnLEPG^R1Hc^Ec) zDy5-M-sM6-3Z9pTea2ait?!v<0JW!IcM+}5g**JqI2PdDw*4~ZC;>1o+*dUHOZ(_!=fW!SC$)gOUnFq1$KQ%{D0r4wD^M)++o;vO^N>i@;kY{ zXdv~UTMH`Y(Qy&SK{w>~(HAt|rg*_3<-)PUbA}vGSkB})D6dpr1nOhRb~U)mZj^16 z!af>{k`w*OUe_ci1wfN7{b%^teNF;JX_1&LYlhkD;VE@O3Fl)8`Wl=y33Y<}Wag{%X*UupIF$Z{{Q1KJv! zhO?`vse5|w$8}Yps&$uqp^TWNGqyYFU#gu8!S-Atm{s*g94|c8u}U*pPiFPu^_upS z02uIJ>PZNU)%)>5ePs6;@XY%MtmU`zpUlq6SlhI z^1#+lkk7&%9Ei{h`$!!jt?`Q#Yz|QhXUKm0o6db;Z9yrj($n8`(V^LJmesS2xXI;p zl)MOTYf;QALu+$T?fQl9CLF&LmopF6JZQGn^|PHrf#e+KEMM~$8)&ewUw3n=fl?=Mz$`vR^EKnnG+>1b_kMschG_m z6uGpWrVmMxpnbo{G2dJo+6eLpgqmt zf=(*L8d~spc*omUg1>3eqk$>h(aF#B7nfLEiY_6ss2hMC`XdhV{wQl!KMSG45M>P@ zCY8f+yk`_tuNJ^0>$5m$+y!Sn_8w}x-PUP{>d3PG0^mbFQ*@07Lb^E))JEU~JX?9Z zu?g>CfKV4JaB(L;>#un_b&KwzWkI5LUaE3iKE&EoJpkV08RME2a)bk3%JiPW#U6K% z26&w3jgY8Z2&;*@3+&^-bYRYTATy42WQ`MTaL*rfQbq)6$YC>WTo*ixSSVf(sN09kq>*4tlI(rbSqb zb}#}16)@6QA7nj>(Nffvlpw}ho@3f8AUetZ5rBVtdFjx zA2@vLHQpE|Crm61VK6_1f;uj(%s$NjdV{!y1QE%8NV?n zvea*{v(Gu--q-(seb-*DbqQn5@s8&m^U3?UpZA6Ea}G`=6J%theFy|#!vfey0PqJa z6hQJaaO)Dl@&KTx0Q|LpTqPjH9#CQfxEKMg_kfx(;I08cZGbXgz|Ib!!3ApUfI?G% z@CJ~=2*^ADc58vJ5I}?jsEP-M4S_K`pe7Gkumd_EKzlwgSqDsO0kyS&q9PC*3j}@! zGGhT)EMTq#%x40wB0#7w0OJJKVu4|PpfLpKmIY)40e*fU1$>GpV08}=5dq8}0U;1T zM->jpve`WKn8XyfL`09wL)Ms8`ww$3?V==1lTDD z3LwB20bt1)*ckv0hk%1#;CnN$;sfkf0f%G2!8c&13s{T+PL_e~0^oQa*qz z6r}S@GU6{upDGzEfq(NnHZxFC(NQxdRaVo`(&HuHtf(O;)l%gpf5$4$EN&pErl}=l zVW{@nLgI~zg^miRD*0nRQXVrdGhG8+HDe`GGu=;mMqFmRPJZR9E+}Pa7Fasr{1oZx=RtV{V|vB&BAg|HV*64g5F- z{80v6f`)2J#`=bD^z}boyv3KA`o{W3n)(K$f`W**MoJ^Dq@twg$psHqNE>H?jz_JbmB;fY;lcji_uZZCt<8<~wbhm7rNxE$x!IZNsmY1)vC)y?p}~RvZ+*Qz-LS6Cj`p_J zmgc6$hWfg%wKdgMl@;Y>r6t8hg$4O}xjETcnHlM6sVT`xi3#y>u`$t6krClxp&`LR zfdT%0zCKWIFHaA5H&+*DCr1Z+J6juTD@zM=GgFf<#zuyp4fOSNKj~;|X=xI5;{vySTc!dw6X!XqN1qGMv?;u8{+ zl2cOC(lau%vU76t@(T)!ic3n%$}1|Xs%vV$*3~yOHZ`}jwzYS3cEP%Pdi%ch50rl% z{%k(zYC3FgI$k|FJQHDTJiEHqY+~`*!eHIO*yOXdiPhrwLQ4~~nNEy?`*Jmo=%0sQ zz0e@$wRGGv5XIM#h!nVlrI6T?H=FKkLwJmF9SutSWn`v%7$rqA+RI@lb+_Qv%Q6n- zCks7&8b0Dy*FR&9)H8Gt8^n=f&LlMZ;PX=d7!zYRO2Fr|usX(|{u2M<#lGS|#G!8D z-GG&ix%)@{5^Y)5z*He+1X@?S@7oF-sgt6dNk7*$x9h@2uo`CLovpY8X)59@hp7)8 zDy~ivE%bc{m*x|?<|uw6YciczttKGdW0*vsg@tDW&(LSF5SRd*la%R zX8%I%ie6HfygNZiv`i}lyDPEjC-h0#pliu8sw-Ypa>kIw^>|Q+6Q|CZJ5(D4DSHz5$oxbT#=RLwBx#l5+;rD#t zbrQ*EtpEv~&#I`eG_`Tie0ZK5Z^wq`G;+F%2K_*h^u$?4OCZ_mSV9vG!Djg=Jp5e- zFF<`kYx$v4?bl`61U-uD@l__3oai8b99k}lpBR4^d-qeb-%QZ2vl z4mmQ$14eZ&bJF0#6~x?a9_s| zY0H_Jk<#%z>7 zO6$9RvKrVSak3sfQVrcG<*u{btUcH}VXa5=5!!B|p2Xex#+*}0+Vz;pnziS}YwMjs z$(*W(!$$hY`<}+QXTV{T8_esdG(qz0xF$8m>m;Whc6M6Y^6~88JFc?r*=c?L;|D*GQCN9=ZX|Mx6qSG zYXfuY#7T6~mb#~zxwn{3MX$&S2`(k&5zfvwXbD8Vu9%6~-}&;M!zIqEem8RmGr z5SgcxEZgpnjmdAs(LAHT@pFG08IMWdB)?4{^ps`C4{Ak8dHsIt zljr&VE|rv21Zl0}hW>zmbRi`T@362GlqJxcCy7$DOjugEH^{+B?Vehh$XiDL;P@-| z7%wkh;*VzuskyYkWO~%9G?E|G{79WyKTJ%O_F-s)YBFO4XRBH-OL$nKI_t+Y@h8eg z7(Huho5AKSdR@Jd!<5tH8$v!cD=jpTY1-)EL-mk#S}uAMO{!XGOvzefBI0P9&4U@P@=-eK`Eb&!hC*f zO?7jN_g>Ni$x*t?>RN=dy~@Rflt1V-Z9QbW(%DiH_Lgk`at^P1_lwO1Domo-5 z_rvJPQzgEfznGzGXM6FV#Xp$TTMX@2o90?)=?k)qeqK#Bt@+}uue2G_v!z$$H@mK9 
zQ*tsCug9K?wV^*nG(Ce4D{W$r-hSX^KTC>x)X2-Y;~!8tM+I=SvD@qf8CA|R-sbF3 zVKfTri=1amIqLWly&K_Exkxi<-r>Aq7_Tz6MEAm?+n(4kY4&tMqWO@-a&b50Vb#hj zH1mN7o4vqyKbGVVjs`r5P4ZQ$)>r|qp*W~Xc82`2itX{}=$TRR==l6=>b&xU4NX^E zo@s1=webg^8Q?;4cMJ0|&&>6h)mp&X7ts4qdt2hrT~1;9DEoAtWb?4=A>ZyEqtz@n z({Zm6pRXTN_z1VeK@!Y*?}eM?_($B6qQU6Bq7dsfIVRH)+^7S>!5?em8z%+8*+z47 zMWYd})yy5+z4F^}iH9DABFtTx|*k?WZ;DN2yK7 z@Ez}_TK6{t_CxHB3omR4g&g?~9r-m)=HEHdEjr{pG~YJ0!#OmMZFYjV*w%_CUGJyX+_55j}S>{%$|S#0K68tPeI>RCDD zS$*hPi|eVsi)ph9AE#!#<;X!`sJz(ZN6zV-v>OD5(J#pwg zg%6$K0HAX+&;>K-QYdt#6uLG9-8h79;rr}pdSBu9-Zk?%4D~rK^*J5#IXm=$5cndS zd+!Z#w`)Q=+y~^Q(EVCy{91+l>NWkU-2AFj{c2kLN|*d9(EZDo{Kz@| zuM>FP6ZX8%>3NUSyQ<#ZJJo~v$iqe`AcH2r6+IxyEg(`eAfYATF=qfjr#n|!KsHAp z`}@GurGTu|K%rsx7X(3G_?{m_ZMO#P#1CEGrMbx6xBb*2_Zi>i-7}Y?+rbTNt}F3Q z%4s&hZD&0hXPIG_JgSgH7u#(0V0ri85M)PAO$SmMC*D%~2by2Lu{+rjgm!6!dI*Q| zgaTG$uAx)6jf9rCtlo#YYB`0Lg?6)tRt<&dG0PT11kvNmQM2!(wb;W+o5N!Tqas?Pri7yh--iyj zhH=wG@4JU@4TpJ|$8423Y^6oVHkm5AMT(V1eBz8P2@5KIZ`UzwgUY2|HEdoJM*X!l z2rE3c$vjRHJ)|`(0zEypKFti5Q0rQ_-E~^ZwdLrAwD2_ZFvhl+{-fvzZSj-B2@K^x z3=?evt3@Re=Y7lscCG}T@Pwtbc<%5-E|Elz@E8`EM6vWl$&rNZvhW`-oMqDD zE)m8z4=25sjl{;(SF*6}YK`b7uz#2COn@076CS63lEAc!^bNe)+4Fn+G(TiX>PQs?u4mt zR?_@1Q{z1x!;e#;BWZCf30mcG?`a}sS6tMG(n-pbR6HV#j!h{r!9Z(n8BToDBaZYq z{VQQcopzieZOV7)3|!Ncs}{+M0~t%N(jsYXBEubam+imFX4wo!dz7cW^+->^1kyuR zvJx$9dRw!cxU%MHv)9tICuP$~mQvD=v!`j324qvuhO-FR)BLGpOK3Cdf-~~8G9h=I zKhZ{F)8=B7XY8S6HE(Bln&wIhWg25<-j&V#Dx25FrGts(+C}TSD4RNmnY|~HbMGW= zOFP|nB%4krpFt;w4ZnaUBR_@q3mIMF{X01v+&N4cIUC_w6ru&x5ovs@`CJhNFWL(z zxwDtc(zDBx6**lBwDaV`gYLA)z4OfI5Q!&=h{ZX%eO0dLrd+ICg`J*zo~B%GZFrIP zXr6p)F4AF<&Ge^%CPjNHJ-cLGo&8Vw{kmhxC3YFF&FD&}O-k&!HLt1b-F61_gqL#E zxr?Rcicv(1{7!PDT8eM46yu69fDl{4@N-U(6n&dPcBT5F< zOKn9WB11m*Jw1!Zp<_T);o@@2#Ab0&}t*e~g)elv8!Ij-jmAGD2 zJ)A&emT4Q2Aw5 zo!F|%##7hIvXX$lYGAaHjHl|hRg*Jao%WSiUKI+^)dn7`^3056@zd(WlZH9&;QNua zPwC^HU#VxOFV`z;sAp|9UaP-+rNN56p>@BRim*lBqk)4bPwcc|^iK7w4n0er@(H;{ zg)fb_yc#E^+tfZarA9V=s%%sE)JA&xHS@z)mD9Gu$|l{@m-=JjpS_xe#d2M&TGU(W z3^SV}#4KLZ*ZXFs_B@2MxhB_PblSQymfKF>)j8H| z9@*PL-_@6y%y`-aI%NAwMkc=~Gfl&Q%y z)-C1kh^mp!^i>A^Q~A10qKSwf{Ze;l-q24jM@%%(R)_OUDOdGW`~V88qM9|M>Sc+> z^1UagPp6L-##ORL@^nU%7^a|=13j%>=r(;Xa>OiW=B4-0uuJ9^K* z(;L6R+KnVTL&C72M>|+?H#eIhSxX`rC3;W?tWQK2W~(Nd&&CD#W@F`MGe#FUp>y05 z1H5{(5SvlVsL`)iCM&aMs1z3VvSzS&#^m(IUTw?=WLJtRjQKInOXAE2Pt5ga%^zn@ z_lRFsgmNZkO$N&^3h>PU42uh=9p1#Ftk_HAYqOf!D}n2iCQ+F>jPW$kj(q3@5p=k? 
zD~LaO)fBq)D0`{MyZ_qwq6NcPNx&4&#BxIQ@}S;)zr^wzK43l?x(;GJE!-Q^(D8}r zaUO+L2gXG*?-B0I(YENt4u!QGqE#-w)#auQ4c?_(=xA+aH>E;XRduG-#v-h3hH!nx z45!?Ladw-y#$sc{40~OlaowtO&VOS4t;BSA_4d5Y#s}h^k*@Wiu14a?d19uONr@Ho zk4s;o1~#B0I3!y}$163uo2X1{%NzZj*_+MT6Xg><*?e1fPgWnUFB#x8A1llm6EE?{ ztUZFQdWlWGmYedV93c`|AI90BgYDgeZI5p3k>lTgg;?#D)ND@yJVe_Vl>Ewbx_TGu_skwi@MY8B+dt%Ic;DEbT|M9S6vN(TpK|A_1 zJ7G_L^C(9DSgB@zl<9cjY;R(6TOBs91zVpWcEUR=`X#pqeU92l zmb`(r9orSqZ+)H|zHEDh20QWDJW+68HD)@kVEkb=x!GKEfSNrQQL{Hoay(CRmZX1{ z0KOE-{VArigqSnFs^$EvvCzh^U#kC!+5R&&O(K0Zq68qV|iQ>L!J6ZCH{{x78eZ{+;j z9sMah@c*>*|GMzNpWo8|Pv8B&F8#kQJn)~X_J6&4;QvLV{Y(0PU3lPs>oEUym-BCw z{$Cd!0As+fExifJCZ{W;Z!Z*V&wm_INH13)LRTmtTKHtNK%^oY{F$&hC;fl$nco7; zuf_k?8~^|NiNJrtCjZGE!~UCZ`+xpTR-vHFAX>>z3JR{CUJmZwzF;%iBf!N!*fliF z8*C;A#e~GgL?gSN}NXQTf z>RWz010i}spO!?!)=yk9-KoYRS+Y4pnI`S)7BzzXs$z^0Ff}rZPU4Kwb#@+{(1iK@ zhkUS!-IFi2ruGgtRxY+~b`DxjULJZ-AAjdSPxoMBt4jhupM(UV-$fFFAc21%$jDET zehILPBz$pM?BX)`X#$8aWE3tJ3Oj!J*co%1a7Go!YG(zYNCQ>XdT#gGO?%Kn|Cavl zJI9jX*OvWqH7{PWcwO>r3KCsbL5>m^L;m#$2;>F0$IIXz(`y~zdvw2LcdqAS@$ znQRe>7By43W@09?C7Q-19eNd$TcuVKIis70l{T==6Tr^?S0~?03#ui29nr}TLW6`v z1cyb&1fps9XorI=g_QK`X&LER*-^=PnP6|fFuyPt1SBfU%K~ci8X}q^Tbiq5@!1ib z{2mf7U6T8lqhsR}^haSU5FbYwH>tn!ny>et;kpz?j>6`g^|(40q7pe>5>UH9d8oc4VY)VYy>zWo2!> zXLF00nsxu+@aUL&Xsc^y8Pc`7C^^Myi*}9ep^SGNt1C8!x+Pg`e4{7+Lxn&Hxl^+b zsi4OdjQH7>07~)r56gZIm%N$Tr4(E9DLW&Gb!QsVj22s>`2;Gxgx^43#5{hx^Ty+b z=IdngD86UmhAH2|U*;Xtw-wF2N*DgR(6Q2|Ht2e-l3u^v5lFM8vzub+>JV1h}?-* z+AZFdNe^Ge{-Lr=f+qu)nTk8*hD_`g`*UIm8Uz_`K%$_ffF=PN5a#c8C3ayzf7unn z+{FLkGJ>5V0V)${g_j>Gl7p7y|H?Q!GqiXMt9j*9+rInade=eGB*WqEP4_3a4b|sZ zjbN|7v8ge>?~h~Vj(56{2Cl zM`8-l_W6f|=?e9w6J`u$>L2?KEaYNrH;%qYYuYMXJ8Lo-dOR_qdI>1~Hm<_% zd#A>r2eRX@n5BqreVL3aFm#iY%?Q&JBU1AfL8HwzZoV5BEUM2e=Y(+TYx9%ZLzv~W z&Fm7AJ0m!ZW~}&Mr@(+HUR%tpviPqq2|@w1;vpm+PxxJXI8Ux8x-(3AT_#p~^ocyTX*!hX7X6Kuig!$tH8v;*ouZ|=)AerG=@dFj z^CpmeDMX`rhVnpdeyB6iVV?JhzHI6@_=KyB{7^Qg9~>cn_`qTUgg$%gxAbrz-~ znKQwCvcJ(;|AL9+%<^P!(H16oW@8DgAPGg;EnXWUu^lH6nzX{+>YhO1drC9ybVHhB z8l6HWb=u{catqHisLtrhF67eq%I>$K-N;>Ovs=jB8N<;*P44AT@gka_QTC?Waf|oC zLw+Cc%e_)c=Eu_HmgvNL>B8gxrJ3+Z^tsj%p-@!6yfiCOj!-%x5>2Jt6y>9=Un;cd z1if^Z1Kdk5w^i6M!FG>qZ0#T@mz`Z*Kwg-=gQ+*v(J#O^zzh;>B;x526^RlQ7Uv%p z9-3sGVwvEV3C;eJYm%Rr?r`VA*&?Kn1l1ZIcepvq$GR@JAmUCI8omRy6&bU`Omo(! 
z)oKH)XBzK8)>fOrL3v>&mmu`W5J|`m5(>B;3(eEE(XH0J^xxuG zv}SrXd_8)YvF@Z<_NEmKxjOb|6sGpQSO&Ca0P+g52e*Axs@i`H(`-nE@Gu>o2@he6 z5zg3}SAUOwaEt$sGE|D2&6hn1m{r;$g^j2@Ls*SQ8qx}=#ZCwqWpF-|)u+HBo(I%D zO@DL0FPSywiQ2U{OE05es)>k~f1w@7ls8qs$<3xd6shqg`PoVTqFsr>Y+KZw1BhdV z+3w1Cdr8h@jolAI4*n7Yc@+C|L!E85#JCg9Q_DDaZm4Ai8dcaV9>n4ohT2(Pw0dr+ z5rmE-zZohPT8mx&Z-&~PAR2=(R50wMRFgR%!VVfRU4OW{R23o#v*A72UzqeXWwPb@ z0aj?UFcMqf^z3B$HmTiN+KVYm?!5_(Hj{UmfnA@#2}~D%Cq-&R_l`3zh=?02U&RWIazK zshJ-jc&Sik58pt)fS^!MT58uQH`=f;|F}fYgruaDRNwRr=E&$gR`-m|khGGbtnxCS ziprwk)SCFJ##E;KHiyEm#a+ow84>L+)Un+U8oEb^$Lo9JbNih;C#C}@o5z+$)>qf2 zYO^=Jn2^CV^5paw4BW_=cZG6E1+GG{X?a2o;_B^gP=1eC@~NwLMQ75!68d?z2}+FZ za0TNDd1C=4qlw zJ&WTC3g4x99w2gn5vBc>tUvly1>a39woh*(C|hwTZ?8Uep)mkJdlog-b1C_j&{a8_F;@--&>)AQcvo| znS5ETPjLloqD~LT=R0k0GxA!0K4_TUwej7XvpYHbD~&{z`N)TaNF%9IryS0(Q*j-y zK1T#z&1vUT-Ki;;yCzH1uB0wBGj5c@nltV+$*D6Q#4NQlp3F@&vtEw|G-tgz7HA+S zG~btIefUw25|A;`wdVYI@!aQJ{Y4mCRsAGThgAcgNC@BamsW7U7bq*zniTxr;Yck+ z7`m()`Yv8LDV#5#ASps4+?+B}!9+O4haD2P7;Uw`OdazCeS0z1X@x*D&W+eZGv0x5 zB_$quhM|?{EFq$m@ymRISE0{CueO4rtUC^qAFM)U-abknvv3{XWT$`b-_D$AA#NS)siA*& z-3At=ZxwB(u^(Y;^W`NI&Kvx5J@fu^$OY%;FEOK@`co5dhRHWJyMwP7BoGa^)X6qA{HI;xt zr&lcH(p_`5pV_K8*eN?d*SEB{)^+lLzU8=BA9N-Vhzw|5-&BUz=;LeHVmO{KmKGIwW^}dC3I`kM=PO1m{vKQ$OhhgV{ErqF_l5%i z689uYo+_^^kPUD7tunCZ&J#hUOQLCMK65`xO+%WmSZBB3OEGX~s zsBEq7&+aNn=!N2?4w7{Y&P>g<52u%od4HQ!GB$HdhD?DTQ6AB>8Gn^XBaxs9#CY zb3RlTiD11w(LnH;ymc#z*a-EWyP{nWyb5@x9hcUZ@-jgrN~9>gKSQ*DrK+tcV=zat z-Xki#D6=O|voBq~y*O*MM2bB#LbN1%{I}!+1&dg$H1|HL%SCbluL9`3?Hv)xMP!)( zN>{Al$9?+pN6Wp*j}s=WELax@vY*vM8RS@X2a2^|15qdYOB1Q4&9^qfIn-w}{Ls*d zWjQxSvCSH6rVW1%2KGRK^eyM_NPJCQFX4c;>T~V_KsppHoq{;e{+k;=^`xx?PC||!#~u^;N@8)dB`<*;(ZB& zxkXq9(z!zLhx*Hc>JR4DAT;wI8dJw=;aY3}j6W$ANJOadOqIa9oXIL{05e+$<;hsR zq3#vp4%58Za&MByQ2O^w^KI;uq!S&|Po~;TnYeFxA3fCUwG&8@*1cWyanM$&hK={o z?vG*?i$vm&Viukw+eADc_#0?usqeLF>@R4h`o{@B!O~A?29;*Y?i`wd$J8rqOH;qc zR94dZ^O#EHL5d8<)Y3RNs$@514_au;j4NF=y0RBbxR8oBTe%yB7uSqgf)DS8nX)ha znM{Ho3wdkYRevUy_mqD3xLRkSQie-{q`!h21fl>c38+Gs?PAH`kc-q8coC_WRAH|k?Ms1p!VwsYu&Q9cXsI3fU>f5=PySN*=d3bnvoBH@d9Grqt41E1;ppoJJG0`TmapBh9iFRqW z8K(GTzl36ge1udZgyNXG2HwSY5-TX%t$J{?rtnj}@ zvFe_7B(S`@LeyrI))y`w!{<%=mbyP3*Nc4ILQ(x~hCqtT4~w9+p-8n(o6EGU41ERq zF)Kq46z%NFP1k3}GfMOouS?loymGRNx* zsY)({dO~bUu2&7wm7U0;W=if<(T7SNWZ9wQ9!%veF$T7iHEG-ix zzX_jDtYW`OUf$IF?k*cmduIo%tLJVDIG;H9?!v+lVk-ZqhsZ}4A#x$sKJGk3CjAv6 zMJZl@Arb|5zuh!XrOtx{SC2mI^IWs@H63;Cl6-|We*w!T(h;NO$`ra39P6V^?ZK?e zHC-7LtF1`C!$>7jk{5XIDf?&Z&c9$JOBAy|yBAA+86RP1C;R*7-3u^8f@LR~T|B&d zfqEDNmWg2Z;>R_&DF;l(7Dam;)FDMDWC>#3n*2w&+8U1}M)ySa$Mal3ntBdaEqK_c@ijDu=%aPV1bHC?AGu zpcJ97T97msyzEpqPYRLOGgl22vw5EsCKcbB5H6Yjo+|8FdFyFqlKn#Q8PYi)KM$FcBfMo}Kwsvs9b7*vMWNh-= z4LsOf_l>EkiKW$*)wSM@vGr}z+4%$Hp6$`yr61q7_FtX6^N!_q+=O60bpu;rR#$_T zueUIfK69OFcxFsBeYes6*5g}O?i;EK`;x5cTCwF(J`W)O$VM6Fo%|$(O@p!H{pZOx zO9n|7-(l}XF7Lz9ai+u{3End7(mfPqUf}wdEh02fw*pMDw5k=}puHCi$xsrZ3NRw2M%t z^Lz`1pT7pKOzU(neQRoLk57J3E7~FBw0yKbyym88gUNXK(GiEwOwkEL0-wz3wyY4D z9pNpvI7bpjd}V72a+!EnqGMBK=Ns8FG1&YK2n>TG;p&s2)LH0*nU+}}zB6=H=#v3_ zRX;XjcS?F65W}GKlOV)M3J{m#R14%)xUU){YUrLA!s75gF;q^wj3P|xxqGsgz@>8# z%;3D;tqr+>aMF~TAiPidds}&qCK80>wU(>#=}HWAC-*XAFcA6DP0;&-xr{cbc_N5R z|KFu~NdG&{L;81W-p7C#en(6q!W$ScYnBS4XNyR!Pz5R^qQeeu$t=ozSa2sg%$dVdadV<37aM3puN_=~YE&jq}Eet$mj zjkMN$kc@iTeDDX8<@pdrm-`E$s=-8h4>%ES`kw=#?!*W9gNT#kcE~Y*cgWRqk|E=vA#x$T3x& ztavh&oh7&DRHD?#85E-xMC8^Yc~%%yAM!*RS01*pWQY}XJeajI>LlCzWZ1=o(wX0V>v~jv&*PKR2fb{A z9fggT&`I~cy?Lj**Ds@ywKwp=bbW77(UoC;NHs)ve^@gmYk%ZZ$@>22YZS=AnDLl8p?7 zJ8o^qJv-P(fjzm1RtRt`|2#uOi?zc#k5&+z{|n*<*F(ff5MOEe8d_Rwn%X)+>PB;S 
zJ*=;xf1nL)5A=?Gn;09NnVxK)@17lA8Ca{`=-XOeYTcQd-2VzWIsJhExj@7{_A3J9 zYRQeOcfCp>5-F4iCvF@OVO(UJ2%!r=p}{m3&B{*lTxcPo{|liBU!4p6j^KYUbgi`g z@4#jDUol_sTp$^l{|kXcE}aX+Mofk{Potfbw6muJ*4y7E#MUs<_-tU{+r%{4)XenU zd|&rc3+eFa=B1|9<<-T#mEGCHp8bQP)*p$NGud8F@#$Z_MDqdu7M<~YPZJJuVmT=B zh_zK|;_9ASlSC0+!C*;zW`C6}{SAK4T#G(aV-zL+bwzO__NvapW!&*DRg8C-y2AOY z)iJ%ynAStv6jD)~i@fuRsDNEC=cVYhbCy;kYSKIzf zAUj|FN+7pg{<{RS+PZ#y13`be@Vxx?@C1^E)Q+)~8Kl1)zf*L$0w$1-sC+cBj(9ji zO3oMxZb~lK^f;7U?`VU}7h+nW8Ao#EP-SPjv%z>Lrg$@QhX*!GO5Q-yzcYah{XK!) zMJfWLIoN9alY|C=6oWbjUcSV9wN*GHNo2fJZDR_}Wkl0`!8v)nt$5zDDMzj^-_?h0 zG^Iwk`_bGwM`IX0r|{F?3c&M4K+8cjt_?;I7-+tIZoNSj(M|m*J(4hKfP!sw%&?+= zll62jxK)|jLVKaOU{h&~OCu+2FOTDKrSTb@W@b-M@8srg>*C=Jb`f2D9eo0vf`UE3 zlFUCcFgh|KJ}$;H$u}V^Jvh@YJ0LeL)jdBprr3`7qU1uDa4xu;bHW%31kvGjCU!eN zfyVinpWBVV^Al*C@uEDUex5)R>W~nZmJJPv2+k=iN{fliD*k6D(EQevtj0k!B^OGk zufC*vxv{mm{eAc93=D7iWNGyf7>P!Q`yBxXAudfBgx|s8HM0}ij%)D1zhm(-^x8}# z^yXuSH5x<8R(GoV!H>!wB-P{4$ZLCX8mfsz&}&(ITgKDqieWcb|1NyZ^SS{t z{Bm+rbKzK1baGUFNqlByO>IfVSFjr%hljwL+QGsEaavD5LC@gm5Li+4QrzpBgHcRP z4KJE?vH)DL%z-Gd@8e_QH3X8!wvD4qX7@qnSk7 zTdsk56)ByRuYoYKvoG1>_z!7@_uAbVxr@Cp94M*WA8>X(EeoLPh`ebjdt=3aNi>Gv zTm6m2gBi(K;uxwR%rB~~0WTe;Iz{}J-((Ax>=eG2qVJ7V_!2%}#`aXUND&w>#y+{Z z{=rjXws-zGIN_}*)$G2w`Czs}*?S%Jy8ETM(z%WYZN`;>hTjT8J*C)c%L*1@vAm05 zQH!7>B_s>T7OnPYeQakGr+Be68m|H~>9i=(A1SwmZ7@WdZcP?_rd#AGFV&u@i{ox4 z9XH?mHt4D0V{^J^vN^71E14x|ho~rCJdXW6uUp+Ms5vLsfe5bE@lDMMJM_O0+#Mvv zDUgl?c18a#uOkz2oALNPuNxvq!JI?nb)^YdjE6LHzMROybAHc>-BlfYMfO8g1Nitb z?)gc9UC{urJWUFEYgnch{LCgzHRQEFMp778{BUBpiU%iUgseJ73iLii36a-PM?XQ& z1M|9(VU1WP91+bp8{+hoIFF51%>;WsOszzBiE_=P0Fm(f$-$Uy%W>gP2(=RuCf=vV zr#Kwnk4;n1*2*yb)er^Mau=!W@BLKyzr#<#7x@?bRL8&Ir-=UWQx)fepjp}xo=0|K z;{SbQccMhln`;FH>ziAu8(TZTv$C3CbzQyn-}+lYPt`NpH$FNtJvGrb2b&pM?q980 z@7-KlY}uZi*sCG^Bh!FKf%9(8zpKXke+FG}Jx3Q-$)7mJf{F3^Q_>^ry!bOAhd z$@*kOyyI|vrpcQ`FqzKW(CH333rI~Y+x4yrn3Y!N8DUxKPTF&kt|HXauFTnOaD1W z3xl2cyv^sEqo-yE%Q4qeMG)x1FNOrXn=^sNZ~w~|fDDN^$dxw;U*M*MO=z+d=X}?O zM%j%VDx>U58J()+L2J{Z?8#WSH0?>-I~4EDJj0RTb!W#c4tf*LOr)AlpxZwRHzdk=`cAtI zY~@eomTEPIi)ZH{NRC^julYeLx>?1uE=H={U2aYNz`8Wh#!MgUV z`2>MfgR+2J1hg!>Tjz$43PC!D@gNs49t0H)9&}^eK_o9RB|IcGF*zeJD;o(sBrmrZMr~+gP;(6i;)uLXT9)o_{p~D$0|P@N<74Bn$*Jy{+0o&JiKXSP z`Gt+;t+k!Swa)KThl8v8$19tsXN_1wx~P|h;oblLqcD1?J=T&39(M;a+V2XpO#AZx zUX)zyAmUHnmj^29E%<0apfB|NkUvjUG(?Jp8uRmzzibHNkiSCQo8pKRtNP}R+J>6C z#+Is^7-HO=Ac)=#!zc$&MjRXwA08b6SZf@<(?KT`W zo*W-EN-F+Lv2yYL*g-(RAM(QW?myGDbvpeM*QSekTL+~;yr(1wB!5jJ$X_ZT`RfIQ zzBJ#x*oSOWx02IH6&AHabmY474>rhxY6*&jgHRuUr^!#q|58R||rF{$RQFy0*3wMC1mGTq&7u20dIs4c&kA-E`_ zY93ob&MyDfsE)P0^60NdwUmsYGd_!fk_*0G3z-wK4Ze~axvb0#=-QSPJ!sMoNj<6S zWa7N;_J+oJ6P6#wL+OIuX5DDdmK0H*-9W%s=84bH$!S3N>KO~hd?1+Ss0Bz}EpDqBk}{9|2fQsndAu%syc@*~P<_2AYNKiZ#&>dyh9 zub^hZ+g?=Y|2U`lURw_f4`xfQ?9_xxl&T{q|edC^5**Uov? 
diff --git a/docs/en/reference/interfaces/cli.md b/docs/en/reference/interfaces/cli.md
new file mode 100644
index 00000000000..9ef1cea280a
--- /dev/null
+++ b/docs/en/reference/interfaces/cli.md
@@ -0,0 +1,181 @@
+---
+sidebar_position: 17
+sidebar_label: Command-Line Client
+---
+
+# Command-line Client {#command-line-client}
+
+ClickHouse provides a native command-line client: `clickhouse-client`.
The client supports command-line options and configuration files. For more information, see [Configuring](#interfaces_cli_configuration). + +[Install](../../quick-start.mdx) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. + +``` bash +$ clickhouse-client +ClickHouse client version 20.13.1.5273 (official build). +Connecting to localhost:9000 as user default. +Connected to ClickHouse server version 20.13.1 revision 54442. + +:) +``` + +Different client and server versions are compatible with one another, but some features may not be available in older clients. We recommend using the same version of the client as the server app. When you try to use a client of the older version, then the server, `clickhouse-client` displays the message: + + ClickHouse client version is older than ClickHouse server. It may lack support for new features. + +## Usage {#cli_usage} + +The client can be used in interactive and non-interactive (batch) mode. To use batch mode, specify the ‘query’ parameter, or send data to ‘stdin’ (it verifies that ‘stdin’ is not a terminal), or both. Similar to the HTTP interface, when using the ‘query’ parameter and sending data to ‘stdin’, the request is a concatenation of the ‘query’ parameter, a line feed, and the data in ‘stdin’. This is convenient for large INSERT queries. + +Example of using the client to insert data: + +``` bash +$ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; + +$ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +3, 'some text', '2016-08-14 00:00:00' +4, 'some more text', '2016-08-14 00:00:01' +_EOF + +$ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +``` + +In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query. + +By default, you can only process a single query in batch mode. To make multiple queries from a “script,” use the `--multiquery` parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators. Similarly, to process a large number of queries, you can run ‘clickhouse-client’ for each query. Note that it may take tens of milliseconds to launch the ‘clickhouse-client’ program. + +In interactive mode, you get a command line where you can enter queries. + +If ‘multiline’ is not specified (the default): To run the query, press Enter. The semicolon is not necessary at the end of the query. To enter a multiline query, enter a backslash `\` before the line feed. After you press Enter, you will be asked to enter the next line of the query. + +If multiline is specified: To run a query, end it with a semicolon and press Enter. If the semicolon was omitted at the end of the entered line, you will be asked to enter the next line of the query. + +Only a single query is run, so everything after the semicolon is ignored. + +You can specify `\G` instead of or after the semicolon. This indicates Vertical format. In this format, each value is printed on a separate line, which is convenient for wide tables. This unusual feature was added for compatibility with the MySQL CLI. + +The command line is based on ‘replxx’ (similar to ‘readline’). In other words, it uses the familiar keyboard shortcuts and keeps a history. The history is written to `~/.clickhouse-client-history`. 
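+
+Tying together the batch-mode behavior described above, here is a minimal sketch; the database, table and values are hypothetical placeholders, and only the `--query`, `--multiquery` and `--database` options documented below are used:
+
+``` bash
+# Two statements in one batch invocation; results are printed consecutively
+# in the default TabSeparated format, with no extra separator between them.
+$ clickhouse-client --multiquery --query="SELECT 1 AS x; SELECT 2 AS x;"
+
+# The query text, a line feed and the data from stdin are concatenated,
+# so this inserts one TabSeparated row into a hypothetical table.
+$ echo -e "1\t2022-04-08" | clickhouse-client --database=demo_db --query="INSERT INTO demo_table FORMAT TabSeparated"
+```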
+
+By default, the format used is PrettyCompact. You can change the format in the FORMAT clause of the query, by specifying `\G` at the end of the query, by using the `--format` or `--vertical` argument in the command line, or by using the client configuration file.
+
+To exit the client, press Ctrl+D, or enter one of the following instead of a query: “exit”, “quit”, “logout”, “exit;”, “quit;”, “logout;”, “q”, “Q”, “:q”
+
+When processing a query, the client shows:
+
+1. Progress, which is updated no more than 10 times per second (by default). For quick queries, the progress might not have time to be displayed.
+2. The formatted query after parsing, for debugging.
+3. The result in the specified format.
+4. The number of lines in the result, the time passed, and the average speed of query processing.
+
+You can cancel a long query by pressing Ctrl+C. However, you will still need to wait a little for the server to abort the request. It is not possible to cancel a query at certain stages. If you do not wait and press Ctrl+C a second time, the client will exit.
+
+The command-line client allows passing external data (external temporary tables) for querying. For more information, see the section “External data for query processing”.
+
+### Queries with Parameters {#cli-queries-with-parameters}
+
+You can create a query with parameters and pass values to them from the client application. This allows you to avoid formatting a query with specific dynamic values on the client side. For example:
+
+``` bash
+$ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}"
+```
+
+#### Query Syntax {#cli-queries-with-parameters-syntax}
+
+Format a query as usual, then place the values that you want to pass from the app parameters to the query in braces in the following format:
+
+``` sql
+{<name>:<data type>}
+```
+
+- `name` — Placeholder identifier. In the console client it should be used in app parameters as `--param_<name> = value`.
+- `data type` — [Data type](../sql-reference/data-types/index.md) of the app parameter value. For example, a data structure like `(integer, ('string', integer))` can have the `Tuple(UInt8, Tuple(String, UInt8))` data type (you can also use other [integer](../sql-reference/data-types/int-uint.md) types). It is also possible to pass table, database and column names as a parameter; in that case you would need to use `Identifier` as the data type.
+
+#### Example {#example}
+
+``` bash
+$ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}"
+$ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="number" --query "SELECT {col:Identifier} FROM {db:Identifier}.{tbl:Identifier} LIMIT 10"
+```
+
+## Configuring {#interfaces_cli_configuration}
+
+You can pass parameters to `clickhouse-client` (all parameters have a default value) using:
+
+- Command-line options.
+
+    Command-line options override the default values and settings in configuration files.
+
+- Configuration files.
+
+    Settings in the configuration files override the default values.
+
+### Command Line Options {#command-line-options}
+
+- `--host, -h` – The server name, ‘localhost’ by default. You can use either the name or the IPv4 or IPv6 address.
+- `--port` – The port to connect to. Default value: 9000. Note that the HTTP interface and the native interface use different ports.
+- `--user, -u` – The username. Default value: default.
+- `--password` – The password. Default value: empty string.
+- `--query, -q` – The query to process when using non-interactive mode. You must specify either the `query` or the `queries-file` option.
+- `--queries-file, -qf` – File path with queries to execute. You must specify either the `query` or the `queries-file` option.
+- `--database, -d` – Select the current default database. Default value: the current database from the server settings (‘default’ by default).
+- `--multiline, -m` – If specified, allow multiline queries (do not send the query on Enter).
+- `--multiquery, -n` – If specified, allow processing multiple queries separated by semicolons.
+- `--format, -f` – Use the specified default format to output the result.
+- `--vertical, -E` – If specified, use the [Vertical format](../interfaces/formats.md#vertical) by default to output the result. This is the same as `--format=Vertical`. In this format, each value is printed on a separate line, which is helpful when displaying wide tables.
+- `--time, -t` – If specified, print the query execution time to ‘stderr’ in non-interactive mode.
+- `--stacktrace` – If specified, also print the stack trace if an exception occurs.
+- `--config-file` – The name of the configuration file.
+- `--secure` – If specified, connect to the server over a secure connection.
+- `--history_file` — Path to a file containing command history.
+- `--param_<name>` — Value for a [query with parameters](#cli-queries-with-parameters).
+- `--hardware-utilization` — Print hardware utilization information in the progress bar.
+- `--print-profile-events` – Print `ProfileEvents` packets.
+- `--profile-events-delay-ms` – Delay between printing `ProfileEvents` packets (-1 - print only totals, 0 - print every single packet).
+
+Since version 20.5, `clickhouse-client` has automatic syntax highlighting (always enabled).
+
+### Configuration Files {#configuration_files}
+
+`clickhouse-client` uses the first existing file of the following:
+
+- Defined in the `--config-file` parameter.
+- `./clickhouse-client.xml`
+- `~/.clickhouse-client/config.xml`
+- `/etc/clickhouse-client/config.xml`
+
+Example of a config file:
+
+```xml
+<config>
+    <user>username</user>
+    <password>password</password>
+    <secure>False</secure>
+</config>
+```
+
+### Query ID Format {#query-id-format}
+
+In interactive mode `clickhouse-client` shows the query ID for every query. By default, the ID is formatted like this:
+
+```sql
+Query id: 927f137d-00f1-4175-8914-0dd066365e96
+```
+
+A custom format may be specified in a configuration file inside a `query_id_formats` tag. The `{query_id}` placeholder in the format string is replaced with the ID of a query. Several format strings are allowed inside the tag.
+This feature can be used to generate URLs to facilitate profiling of queries.
+
+**Example**
+
+```xml
+<config>
+  <query_id_formats>
+    <speedscope>http://speedscope-host/#profileURL=qp%3Fid%3D{query_id}</speedscope>
+  </query_id_formats>
+</config>
+```
+
+If the configuration above is applied, the ID of a query is shown in the following format:
+
+``` text
+speedscope:http://speedscope-host/#profileURL=qp%3Fid%3Dc8ecc783-e753-4b38-97f1-42cddfb98b7d
+```
+
diff --git a/docs/en/reference/interfaces/cpp.md b/docs/en/reference/interfaces/cpp.md
new file mode 100644
index 00000000000..a7b4188799e
--- /dev/null
+++ b/docs/en/reference/interfaces/cpp.md
@@ -0,0 +1,10 @@
+---
+sidebar_position: 24
+sidebar_label: C++ Client Library
+---
+
+# C++ Client Library {#c-client-library}
+
+See the README at the [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp) repository.
+ +[Original article](https://clickhouse.com/docs/en/interfaces/cpp/) diff --git a/docs/en/reference/interfaces/formats.md b/docs/en/reference/interfaces/formats.md new file mode 100644 index 00000000000..801b7c1a14f --- /dev/null +++ b/docs/en/reference/interfaces/formats.md @@ -0,0 +1,1707 @@ +--- +sidebar_position: 21 +sidebar_label: Input and Output Formats +--- + +# Formats for Input and Output Data {#formats} + +ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read an external dictionary. A format supported for output can be used to arrange the +results of a `SELECT`, and to perform `INSERT`s into a file-backed table. + +The supported formats are: + +| Format | Input | Output | +|-----------------------------------------------------------------------------------------|-------|--------| +| [TabSeparated](#tabseparated) | ✔ | ✔ | +| [TabSeparatedRaw](#tabseparatedraw) | ✔ | ✔ | +| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | +| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [TabSeparatedRawWithNames](#tabseparatedrawwithnames) | ✔ | ✔ | +| [TabSeparatedRawWithNamesAndTypes](#tabseparatedrawwithnamesandtypes) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | +| [CSV](#csv) | ✔ | ✔ | +| [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CSVWithNamesAndTypes](#csvwithnamesandtypes) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | +| [CustomSeparatedWithNames](#customseparatedwithnames) | ✔ | ✔ | +| [CustomSeparatedWithNamesAndTypes](#customseparatedwithnamesandtypes) | ✔ | ✔ | +| [Values](#data-format-values) | ✔ | ✔ | +| [Vertical](#vertical) | ✗ | ✔ | +| [JSON](#json) | ✗ | ✔ | +| [JSONAsString](#jsonasstring) | ✔ | ✗ | +| [JSONStrings](#jsonstrings) | ✗ | ✔ | +| [JSONCompact](#jsoncompact) | ✗ | ✔ | +| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ | +| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | +| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ | +| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ | +| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ | +| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ | +| [JSONCompactEachRowWithNames](#jsoncompacteachrowwithnames) | ✔ | ✔ | +| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ | +| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ | +| [JSONCompactStringsEachRowWithNames](#jsoncompactstringseachrowwithnames) | ✔ | ✔ | +| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ | +| [TSKV](#tskv) | ✔ | ✔ | +| [Pretty](#pretty) | ✗ | ✔ | +| [PrettyCompact](#prettycompact) | ✗ | ✔ | +| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | +| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | +| [PrettySpace](#prettyspace) | ✗ | ✔ | +| [Protobuf](#protobuf) | ✔ | ✔ | +| [ProtobufSingle](#protobufsingle) | ✔ | ✔ | +| [Avro](#data-format-avro) | ✔ | ✔ | +| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | +| [Parquet](#data-format-parquet) | ✔ | ✔ | +| [Arrow](#data-format-arrow) | ✔ | ✔ | +| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ | +| [ORC](#data-format-orc) | ✔ | ✔ | +| [RowBinary](#rowbinary) | ✔ | ✔ | +| [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| 
[RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| [Native](#native) | ✔ | ✔ | +| [Null](#null) | ✗ | ✔ | +| [XML](#xml) | ✗ | ✔ | +| [CapnProto](#capnproto) | ✔ | ✔ | +| [LineAsString](#lineasstring) | ✔ | ✗ | +| [Regexp](#data-format-regexp) | ✔ | ✗ | +| [RawBLOB](#rawblob) | ✔ | ✔ | +| [MsgPack](#msgpack) | ✔ | ✔ | + +You can control some format processing parameters with the ClickHouse settings. For more information read the [Settings](../operations/settings/settings.md) section. + +## TabSeparated {#tabseparated} + +In TabSeparated format, data is written by row. Each row contains values separated by tabs. Each value is followed by a tab, except the last value in the row, which is followed by a line feed. Strictly Unix line feeds are assumed everywhere. The last row also must contain a line feed at the end. Values are written in text format, without enclosing quotation marks, and with special characters escaped. + +This format is also available under the name `TSV`. + +The `TabSeparated` format is convenient for processing data using custom programs and scripts. It is used by default in the HTTP interface, and in the command-line client’s batch mode. This format also allows transferring data between different DBMSs. For example, you can get a dump from MySQL and upload it to ClickHouse, or vice versa. + +The `TabSeparated` format supports outputting total values (when using WITH TOTALS) and extreme values (when ‘extremes’ is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example: + +``` sql +SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated +``` + +``` text +2014-03-17 1406958 +2014-03-18 1383658 +2014-03-19 1405797 +2014-03-20 1353623 +2014-03-21 1245779 +2014-03-22 1031592 +2014-03-23 1046491 + +1970-01-01 8873898 + +2014-03-17 1031592 +2014-03-23 1406958 +``` + +### Data Formatting {#data-formatting} + +Integer numbers are written in decimal form. Numbers can contain an extra “+” character at the beginning (ignored when parsing, and not recorded when formatting). Non-negative numbers can’t contain the negative sign. When reading, it is allowed to parse an empty string as a zero, or (for signed types) a string consisting of just a minus sign as a zero. Numbers that do not fit into the corresponding data type may be parsed as a different number, without an error message. + +Floating-point numbers are written in decimal form. The dot is used as the decimal separator. Exponential entries are supported, as are ‘inf’, ‘+inf’, ‘-inf’, and ‘nan’. An entry of floating-point numbers may begin or end with a decimal point. +During formatting, accuracy may be lost on floating-point numbers. +During parsing, it is not strictly required to read the nearest machine-representable number. + +Dates are written in YYYY-MM-DD format and parsed in the same format, but with any characters as separators. +Dates with times are written in the format `YYYY-MM-DD hh:mm:ss` and parsed in the same format, but with any characters as separators. +This all occurs in the system time zone at the time the client or server starts (depending on which of them formats data). For dates with times, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times. 
+During a read operation, incorrect dates and dates with times can be parsed with natural overflow or as null dates and times, without an error message. + +As an exception, parsing dates with times is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats YYYY-MM-DD hh:mm:ss and NNNNNNNNNN are differentiated automatically. + +Strings are output with backslash-escaped special characters. The following escape sequences are used for output: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. Parsing also supports the sequences `\a`, `\v`, and `\xHH` (hex escape sequences) and any `\c` sequences, where `c` is any character (these sequences are converted to `c`). Thus, reading data supports formats where a line feed can be written as `\n` or `\`, or as a line feed. For example, the string `Hello world` with a line feed between the words instead of space can be parsed in any of the following variations: + +``` text +Hello\nworld + +Hello\ +world +``` + +The second variant is supported because MySQL uses it when writing tab-separated dumps. + +The minimum set of characters that you need to escape when passing data in TabSeparated format: tab, line feed (LF) and backslash. + +Only a small set of symbols are escaped. You can easily stumble onto a string value that your terminal will ruin in output. + +Arrays are written as a list of comma-separated values in square brackets. Number items in the array are formatted as normally. `Date` and `DateTime` types are written in single quotes. Strings are written in single quotes with the same escaping rules as above. + +[NULL](../sql-reference/syntax.md) is formatted according to setting [format_tsv_null_representation](../operations/settings/settings.md#settings-format_tsv_null_representation) (default value is `\N`). + + +If setting [input_format_tsv_empty_as_default](../operations/settings/settings.md#settings-input_format_tsv_empty_as_default) is enabled, +empty input fields are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too. + +Each element of [Nested](../sql-reference/data-types/nested-data-structures/nested.md) structures is represented as array. + +In input data, ENUM values can be represented as names or as ids. First, we try to match the input value to the ENUM name. If we fail and the input value is a number, we try to match this number to ENUM id. +If input data contains only ENUM ids, it's recommended to enable the setting [input_format_tsv_enum_as_number](../operations/settings/settings.md#settings-input_format_tsv_enum_as_number) to optimize ENUM parsing. + +For example: + +``` sql +CREATE TABLE nestedt +( + `id` UInt8, + `aux` Nested( + a UInt8, + b String + ) +) +ENGINE = TinyLog +``` + +``` sql +INSERT INTO nestedt Values ( 1, [1], ['a']) +``` + +``` sql +SELECT * FROM nestedt FORMAT TSV +``` + +``` text +1 [1] ['a'] +``` + +## TabSeparatedRaw {#tabseparatedraw} + +Differs from `TabSeparated` format in that the rows are written without escaping. +When parsing with this format, tabs or linefeeds are not allowed in each field. + +This format is also available under the name `TSVRaw`. + +## TabSeparatedWithNames {#tabseparatedwithnames} + +Differs from the `TabSeparated` format in that the column names are written in the first row. + +During parsing, the first row is expected to contain the column names. 
You can use column names to determine their position and to check their correctness.
+
+If setting [input_format_with_names_use_header](../operations/settings/settings.md#settings-input_format_with_names_use_header) is set to 1,
+the columns from input data will be mapped to the columns from the table by their names; columns with unknown names will be skipped if setting [input_format_skip_unknown_fields](../operations/settings/settings.md#settings-input_format_skip_unknown_fields) is set to 1.
+Otherwise, the first row will be skipped.
+
+This format is also available under the name `TSVWithNames`.
+
+## TabSeparatedWithNamesAndTypes {#tabseparatedwithnamesandtypes}
+
+Differs from the `TabSeparated` format in that the column names are written to the first row, while the column types are in the second row.
+The first row with names is processed the same way as in `TabSeparatedWithNames` format.
+If setting [input_format_with_types_use_header](../operations/settings/settings.md#settings-input_format_with_types_use_header) is set to 1,
+the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
+
+This format is also available under the name `TSVWithNamesAndTypes`.
+
+## TabSeparatedRawWithNames {#tabseparatedrawwithnames}
+
+Differs from `TabSeparatedWithNames` format in that the rows are written without escaping.
+When parsing with this format, tabs or linefeeds are not allowed in each field.
+
+This format is also available under the name `TSVRawWithNames`.
+
+## TabSeparatedRawWithNamesAndTypes {#tabseparatedrawwithnamesandtypes}
+
+Differs from `TabSeparatedWithNamesAndTypes` format in that the rows are written without escaping.
+When parsing with this format, tabs or linefeeds are not allowed in each field.
+
+This format is also available under the name `TSVRawWithNamesAndTypes`.
+
+## Template {#format-template}
+
+This format allows specifying a custom format string with placeholders for values with a specified escaping rule.
+
+It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further).
+
+Setting `format_template_row` specifies the path to a file which contains a format string for rows with the following syntax:
+
+`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`,
+
+where `delimiter_i` is a delimiter between values (the `$` symbol can be escaped as `$$`),
+`column_i` is a name or index of a column whose values are to be selected or inserted (if empty, then the column will be skipped),
+`serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported:
+
+- `CSV`, `JSON`, `XML` (similarly to the formats of the same names)
+- `Escaped` (similarly to `TSV`)
+- `Quoted` (similarly to `Values`)
+- `Raw` (without escaping, similarly to `TSVRaw`)
+- `None` (no escaping rule, see further)
+
+If an escaping rule is omitted, then `None` will be used. `XML` is suitable only for output.
+
+So, for the following format string:
+
+    `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};`
+
+the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase:`, `, count:`, `, ad price: $` and `;` delimiters respectively.
For example:
+
+`Search phrase: 'bathroom interior design', count: 2166, ad price: $3;`
+
+The `format_template_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default).
+
+Setting `format_template_resultset` specifies the path to a file which contains a format string for the resultset. A format string for the resultset has the same syntax as a format string for rows and allows specifying a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names:
+
+- `data` is the rows with data in `format_template_row` format, separated by `format_template_rows_between_delimiter`. This placeholder must be the first placeholder in the format string.
+- `totals` is the row with total values in `format_template_row` format (when using WITH TOTALS)
+- `min` is the row with minimum values in `format_template_row` format (when extremes are set to 1)
+- `max` is the row with maximum values in `format_template_row` format (when extremes are set to 1)
+- `rows` is the total number of output rows
+- `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT.
+- `time` is the request execution time in seconds
+- `rows_read` is the number of rows that have been read
+- `bytes_read` is the number of bytes (uncompressed) that have been read
+
+The placeholders `data`, `totals`, `min` and `max` must not have an escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified.
+If the `format_template_resultset` setting is an empty string, `${data}` is used as the default value.
+For insert queries, the format allows skipping some columns or fields if a prefix or suffix is set (see example).
+
+Select example:
+
+``` sql
+SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS
+format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = '\n '
+```
+
+`/some/path/resultset.format`:
+
+``` text
+<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+    ${data}
+  </table>
+  <table border="1"> <caption>Max</caption>
+    ${max}
+  </table>
+  <b>Processed ${rows_read:XML} rows in ${time:XML} sec</b>
+ </body>
+</html>
+```
+
+`/some/path/row.format`:
+
+``` text
+<tr> <td>${0:XML}</td> <td>${1:XML}</td> </tr>
+```
+
+Result:
+
+``` html
+<!DOCTYPE HTML>
+<html> <head> <title>Search phrases</title> </head>
+ <body>
+  <table border="1"> <caption>Search phrases</caption>
+    <tr> <th>Search phrase</th> <th>Count</th> </tr>
+    <tr> <td></td> <td>8267016</td> </tr>
+    <tr> <td>bathroom interior design</td> <td>2166</td> </tr>
+    <tr> <td>clickhouse</td> <td>1655</td> </tr>
+    <tr> <td>spring 2014 fashion</td> <td>1549</td> </tr>
+    <tr> <td>freeform photos</td> <td>1480</td> </tr>
+  </table>
+  <table border="1"> <caption>Max</caption>
+    <tr> <td>8873898</td> </tr>
+  </table>
+ Processed 3095973 rows in 0.1569913 sec + + +``` + +Insert example: + +``` text +Some header +Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 +Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 +Total rows: 2 +``` + +``` sql +INSERT INTO UserActivity FORMAT Template SETTINGS +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format' +``` + +`/some/path/resultset.format`: + +``` text +Some header\n${data}\nTotal rows: ${:CSV}\n +``` + +`/some/path/row.format`: + +``` text +Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} +``` + +`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows:` in suffix will be ignored. +All delimiters in the input data must be strictly equal to delimiters in specified format strings. + +## TemplateIgnoreSpaces {#templateignorespaces} + +This format is suitable only for input. +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. +It’s possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): + +``` sql +INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = ',' +``` + +`/some/path/resultset.format`: + +``` text +{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}} +``` + +`/some/path/row.format`: + +``` text +{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}} +``` + +## TSKV {#tskv} + +Similar to TabSeparated, but outputs a value in name=value format. Names are escaped the same way as in TabSeparated format, and the = symbol is also escaped. + +``` text +SearchPhrase= count()=8267016 +SearchPhrase=bathroom interior design count()=2166 +SearchPhrase=clickhouse count()=1655 +SearchPhrase=2014 spring fashion count()=1549 +SearchPhrase=freeform photos count()=1480 +SearchPhrase=angelina jolie count()=1245 +SearchPhrase=omsk count()=1112 +SearchPhrase=photos of dog breeds count()=1091 +SearchPhrase=curtain designs count()=1064 +SearchPhrase=baku count()=1000 +``` + +[NULL](../sql-reference/syntax.md) is formatted as `\N`. + +``` sql +SELECT * FROM t_null FORMAT TSKV +``` + +``` text +x=1 y=\N +``` + +When there is a large number of small columns, this format is ineffective, and there is generally no reason to use it. Nevertheless, it is no worse than JSONEachRow in terms of efficiency. + +Both data output and parsing are supported in this format. For parsing, any order is supported for the values of different columns. It is acceptable for some values to be omitted – they are treated as equal to their default values. 
In this case, zeros and blank rows are used as default values. Complex values that could be specified in the table are not supported as defaults. + +Parsing allows the presence of the additional field `tskv` without the equal sign or a value. This field is ignored. + +## CSV {#csv} + +Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). + +When formatting, rows are enclosed in double-quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double-quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double-quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). + +``` bash +$ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv +``` + +\*By default, the delimiter is `,`. See the [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) setting for more information. + +When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. + +If setting [input_format_csv_empty_as_default](../operations/settings/settings.md#settings-input_format_csv_empty_as_default) is enabled, +empty unquoted input values are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too. + +`NULL` is formatted according to setting [format_csv_null_representation](../operations/settings/settings.md#settings-format_csv_null_representation) (default value is `\N`). + +In input data, ENUM values can be represented as names or as ids. First, we try to match the input value to the ENUM name. If we fail and the input value is a number, we try to match this number to ENUM id. +If input data contains only ENUM ids, it's recommended to enable the setting [input_format_csv_enum_as_number](../operations/settings/settings.md#settings-input_format_csv_enum_as_number) to optimize ENUM parsing. + +The CSV format supports the output of totals and extremes the same way as `TabSeparated`. + +## CSVWithNames {#csvwithnames} + +Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + +## CSVWithNamesAndTypes {#csvwithnamesandtypes} + +Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). 
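+
+As a minimal sketch of the header-aware CSV formats above (the table name `test.csv_demo` and the file name are placeholders; only options and settings already described in this document are used):
+
+``` bash
+# Export with a header row of column names.
+$ clickhouse-client --query="SELECT * FROM test.csv_demo FORMAT CSVWithNames" > demo.csv
+
+# Load the file back; with input_format_with_names_use_header enabled,
+# input columns are matched to table columns by name.
+$ clickhouse-client --query="INSERT INTO test.csv_demo FORMAT CSVWithNames" < demo.csv
+```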
+ +## CustomSeparated {#format-customseparated} + +Similar to [Template](#format-template), but it prints or reads all names and types of columns and uses escaping rule from [format_custom_escaping_rule](../operations/settings/settings.md#format-custom-escaping-rule) setting and delimiters from [format_custom_field_delimiter](../operations/settings/settings.md#format-custom-field-delimiter), [format_custom_row_before_delimiter](../operations/settings/settings.md#format-custom-row-before-delimiter), [format_custom_row_after_delimiter](../operations/settings/settings.md#format-custom-row-after-delimiter), [format_custom_row_between_delimiter](../operations/settings/settings.md#format-custom-row-between-delimiter), [format_custom_result_before_delimiter](../operations/settings/settings.md#format-custom-result-before-delimiter) and [format_custom_result_after_delimiter](../operations/settings/settings.md#format-custom-result-after-delimiter) settings, not from format strings. + +There is also `CustomSeparatedIgnoreSpaces` format, which is similar to [TemplateIgnoreSpaces](#templateignorespaces). + +## CustomSeparatedWithNames {#customseparatedwithnames} + +Also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + +## CustomSeparatedWithNamesAndTypes {#customseparatedwithnamesandtypes} + +Also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). + +## JSON {#json} + +Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information: the total number of output rows, and the number of rows that could have been output if there weren’t a LIMIT. Example: + +``` sql +SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON +``` + +``` json +{ + "meta": + [ + { + "name": "'hello'", + "type": "String" + }, + { + "name": "multiply(42, number)", + "type": "UInt64" + }, + { + "name": "range(5)", + "type": "Array(UInt8)" + } + ], + + "data": + [ + { + "'hello'": "hello", + "multiply(42, number)": "0", + "range(5)": [0,1,2,3,4] + }, + { + "'hello'": "hello", + "multiply(42, number)": "42", + "range(5)": [0,1,2,3,4] + }, + { + "'hello'": "hello", + "multiply(42, number)": "84", + "range(5)": [0,1,2,3,4] + } + ], + + "rows": 3, + + "rows_before_limit_at_least": 3 +} +``` + +The JSON is compatible with JavaScript. To ensure this, some characters are additionally escaped: the slash `/` is escaped as `\/`; alternative line breaks `U+2028` and `U+2029`, which break some browsers, are escaped as `\uXXXX`. ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab are replaced with `\b`, `\f`, `\n`, `\r`, `\t` , as well as the remaining bytes in the 00-1F range using `\uXXXX` sequences. Invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. For compatibility with JavaScript, Int64 and UInt64 integers are enclosed in double-quotes by default. To remove the quotes, you can set the configuration parameter [output_format_json_quote_64bit_integers](../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) to 0. + +`rows` – The total number of output rows. + +`rows_before_limit_at_least` The minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. 
+If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. + +`totals` – Total values (when using WITH TOTALS). + +`extremes` – Extreme values (when extremes are set to 1). + +This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table). + +ClickHouse supports [NULL](../sql-reference/syntax.md), which is displayed as `null` in the JSON output. To enable `+nan`, `-nan`, `+inf`, `-inf` values in output, set the [output_format_json_quote_denormals](../operations/settings/settings.md#settings-output_format_json_quote_denormals) to 1. + +**See Also** + +- [JSONEachRow](#jsoneachrow) format +- [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows) setting + +## JSONStrings {#jsonstrings} + +Differs from JSON only in that data fields are output in strings, not in typed JSON values. + +Example: + +```json +{ + "meta": + [ + { + "name": "'hello'", + "type": "String" + }, + { + "name": "multiply(42, number)", + "type": "UInt64" + }, + { + "name": "range(5)", + "type": "Array(UInt8)" + } + ], + + "data": + [ + { + "'hello'": "hello", + "multiply(42, number)": "0", + "range(5)": "[0,1,2,3,4]" + }, + { + "'hello'": "hello", + "multiply(42, number)": "42", + "range(5)": "[0,1,2,3,4]" + }, + { + "'hello'": "hello", + "multiply(42, number)": "84", + "range(5)": "[0,1,2,3,4]" + } + ], + + "rows": 3, + + "rows_before_limit_at_least": 3 +} +``` + +## JSONAsString {#jsonasstring} + +In this format, a single JSON object is interpreted as a single value. If the input has several JSON objects (comma separated), they are interpreted as separate rows. If the input data is enclosed in square brackets, it is interpreted as an array of JSONs. + +This format can only be parsed for table with a single field of type [String](../sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted. Once you collect whole JSON object to string you can use [JSON functions](../sql-reference/functions/json-functions.md) to process it. + +**Examples** + +Query: + +``` sql +DROP TABLE IF EXISTS json_as_string; +CREATE TABLE json_as_string (json String) ENGINE = Memory; +INSERT INTO json_as_string (json) FORMAT JSONAsString {"foo":{"bar":{"x":"y"},"baz":1}},{},{"any json stucture":1} +SELECT * FROM json_as_string; +``` + +Result: + +``` text +┌─json──────────────────────────────┐ +│ {"foo":{"bar":{"x":"y"},"baz":1}} │ +│ {} │ +│ {"any json stucture":1} │ +└───────────────────────────────────┘ +``` + +**An array of JSON objects** + +Query: + +``` sql +CREATE TABLE json_square_brackets (field String) ENGINE = Memory; +INSERT INTO json_square_brackets FORMAT JSONAsString [{"id": 1, "name": "name1"}, {"id": 2, "name": "name2"}]; + +SELECT * FROM json_square_brackets; +``` + +Result: + +```text +┌─field──────────────────────┐ +│ {"id": 1, "name": "name1"} │ +│ {"id": 2, "name": "name2"} │ +└────────────────────────────┘ +``` + +## JSONCompact {#jsoncompact} +## JSONCompactStrings {#jsoncompactstrings} + +Differs from JSON only in that data rows are output in arrays, not in objects. 
+ +Example: + +``` +// JSONCompact +{ + "meta": + [ + { + "name": "'hello'", + "type": "String" + }, + { + "name": "multiply(42, number)", + "type": "UInt64" + }, + { + "name": "range(5)", + "type": "Array(UInt8)" + } + ], + + "data": + [ + ["hello", "0", [0,1,2,3,4]], + ["hello", "42", [0,1,2,3,4]], + ["hello", "84", [0,1,2,3,4]] + ], + + "rows": 3, + + "rows_before_limit_at_least": 3 +} +``` + +``` +// JSONCompactStrings +{ + "meta": + [ + { + "name": "'hello'", + "type": "String" + }, + { + "name": "multiply(42, number)", + "type": "UInt64" + }, + { + "name": "range(5)", + "type": "Array(UInt8)" + } + ], + + "data": + [ + ["hello", "0", "[0,1,2,3,4]"], + ["hello", "42", "[0,1,2,3,4]"], + ["hello", "84", "[0,1,2,3,4]"] + ], + + "rows": 3, + + "rows_before_limit_at_least": 3 +} +``` + +## JSONEachRow {#jsoneachrow} +## JSONStringsEachRow {#jsonstringseachrow} +## JSONCompactEachRow {#jsoncompacteachrow} +## JSONCompactStringsEachRow {#jsoncompactstringseachrow} + +When using these formats, ClickHouse outputs rows as separated, newline-delimited JSON values, but the data as a whole is not valid JSON. + +``` json +{"some_int":42,"some_str":"hello","some_tuple":[1,"a"]} // JSONEachRow +[42,"hello",[1,"a"]] // JSONCompactEachRow +["42","hello","(2,'a')"] // JSONCompactStringsEachRow +``` + +When inserting the data, you should provide a separate JSON value for each row. + +## JSONEachRowWithProgress {#jsoneachrowwithprogress} +## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress} + +Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yield progress information as JSON values. + +```json +{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}} +{"row":{"'hello'":"hello","multiply(42, number)":"42","range(5)":[0,1,2,3,4]}} +{"row":{"'hello'":"hello","multiply(42, number)":"84","range(5)":[0,1,2,3,4]}} +{"progress":{"read_rows":"3","read_bytes":"24","written_rows":"0","written_bytes":"0","total_rows_to_read":"3"}} +``` + +## JSONCompactEachRowWithNames {#jsoncompacteachrowwithnames} + +Differs from `JSONCompactEachRow` format in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + +## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes} + +Differs from `JSONCompactEachRow` format in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). + +## JSONCompactStringsEachRowWithNames {#jsoncompactstringseachrowwithnames} + +Differs from `JSONCompactStringsEachRow` in that in that it also prints the header row with column names, similar to [TabSeparatedWithNames](#tabseparatedwithnames). + +## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes} + +Differs from `JSONCompactStringsEachRow` in that it also prints two header rows with column names and types, similar to [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes). + +```json +["'hello'", "multiply(42, number)", "range(5)"] +["String", "UInt64", "Array(UInt8)"] +["hello", "0", [0,1,2,3,4]] +["hello", "42", [0,1,2,3,4]] +["hello", "84", [0,1,2,3,4]] +``` + +### Inserting Data {#inserting-data} + +``` sql +INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021466249494", "Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} +``` + +ClickHouse allows: + +- Any order of key-value pairs in the object. 
+- Omitting some values. + +ClickHouse ignores spaces between elements and commas after the objects. You can pass all the objects in one line. You do not have to separate them with line breaks. + +**Omitted values processing** + +ClickHouse substitutes omitted values with the default values for the corresponding [data types](../sql-reference/data-types/index.md). + +If `DEFAULT expr` is specified, ClickHouse uses different substitution rules depending on the [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting. + +Consider the following table: + +``` sql +CREATE TABLE IF NOT EXISTS example_table +( + x UInt32, + a DEFAULT x * 2 +) ENGINE = Memory; +``` + +- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type). +- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`. + +:::warning +When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`. +::: + +### Selecting Data {#selecting-data} + +Consider the `UserActivity` table as an example: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +The query `SELECT * FROM UserActivity FORMAT JSONEachRow` returns: + +``` text +{"UserID":"4324182021466249494","PageViews":5,"Duration":146,"Sign":-1} +{"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} +``` + +Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 sequences. Values are escaped in the same way as for `JSON`. + +:::info +Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information. +::: + +### Usage of Nested Structures {#jsoneachrow-nested} + +If you have a table with [Nested](../sql-reference/data-types/nested-data-structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. + +For example, consider the following table: + +``` sql +CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory +``` + +As you can see in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column (`n.s` and `n.i` for our table). You can insert data in the following way: + +``` sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} +``` + +To insert data as a hierarchical JSON object, set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). + +``` json +{ + "n": { + "s": ["abc", "def"], + "i": [1, 23] + } +} +``` + +Without this setting, ClickHouse throws an exception. 
+ +``` sql +SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' +``` + +``` text +┌─name────────────────────────────┬─value─┐ +│ input_format_import_nested_json │ 0 │ +└─────────────────────────────────┴───────┘ +``` + +``` sql +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +``` + +``` text +Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) +``` + +``` sql +SET input_format_import_nested_json=1 +INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} +SELECT * FROM json_each_row_nested +``` + +``` text +┌─n.s───────────┬─n.i────┐ +│ ['abc','def'] │ [1,23] │ +└───────────────┴────────┘ +``` + +## Native {#native} + +The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is “columnar” – it does not convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. + +You can use this format to quickly generate dumps that can only be read by the ClickHouse DBMS. It does not make sense to work with this format yourself. + +## Null {#null} + +Nothing is output. However, the query is processed, and when using the command-line client, data is transmitted to the client. This is used for tests, including performance testing. +Obviously, this format is only appropriate for output, not for parsing. + +## Pretty {#pretty} + +Outputs data as Unicode-art tables, also using ANSI-escape sequences for setting colours in the terminal. +A full grid of the table is drawn, and each row occupies two lines in the terminal. +Each result block is output as a separate table. This is necessary so that blocks can be output without buffering results (buffering would be necessary in order to pre-calculate the visible width of all the values). + +[NULL](../sql-reference/syntax.md) is output as `ᴺᵁᴸᴸ`. + +Example (shown for the [PrettyCompact](#prettycompact) format): + +``` sql +SELECT * FROM t_null +``` + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +└───┴──────┘ +``` + +Rows are not escaped in Pretty\* formats. Example is shown for the [PrettyCompact](#prettycompact) format: + +``` sql +SELECT 'String with \'quotes\' and \t character' AS Escaping_test +``` + +``` text +┌─Escaping_test────────────────────────┐ +│ String with 'quotes' and character │ +└──────────────────────────────────────┘ +``` + +To avoid dumping too much data to the terminal, only the first 10,000 rows are printed. If the number of rows is greater than or equal to 10,000, the message “Showed first 10 000” is printed. +This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table). + +The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when ‘extremes’ is set to 1). In these cases, total values and extreme values are output after the main data, in separate tables. 
Example (shown for the [PrettyCompact](#prettycompact) format): + +``` sql +SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact +``` + +``` text +┌──EventDate─┬───────c─┐ +│ 2014-03-17 │ 1406958 │ +│ 2014-03-18 │ 1383658 │ +│ 2014-03-19 │ 1405797 │ +│ 2014-03-20 │ 1353623 │ +│ 2014-03-21 │ 1245779 │ +│ 2014-03-22 │ 1031592 │ +│ 2014-03-23 │ 1046491 │ +└────────────┴─────────┘ + +Totals: +┌──EventDate─┬───────c─┐ +│ 1970-01-01 │ 8873898 │ +└────────────┴─────────┘ + +Extremes: +┌──EventDate─┬───────c─┐ +│ 2014-03-17 │ 1031592 │ +│ 2014-03-23 │ 1406958 │ +└────────────┴─────────┘ +``` + +## PrettyCompact {#prettycompact} + +Differs from [Pretty](#pretty) in that the grid is drawn between rows and the result is more compact. +This format is used by default in the command-line client in interactive mode. + +## PrettyCompactMonoBlock {#prettycompactmonoblock} + +Differs from [PrettyCompact](#prettycompact) in that up to 10,000 rows are buffered, then output as a single table, not by blocks. + +## PrettyNoEscapes {#prettynoescapes} + +Differs from Pretty in that ANSI-escape sequences aren’t used. This is necessary for displaying this format in a browser, as well as for using the ‘watch’ command-line utility. + +Example: + +``` bash +$ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" +``` + +You can use the HTTP interface for displaying in the browser. + +### PrettyCompactNoEscapes {#prettycompactnoescapes} + +The same as the previous setting. + +### PrettySpaceNoEscapes {#prettyspacenoescapes} + +The same as the previous setting. + +## PrettySpace {#prettyspace} + +Differs from [PrettyCompact](#prettycompact) in that whitespace (space characters) is used instead of the grid. + +## RowBinary {#rowbinary} + +Formats and parses data by row in binary format. Rows and values are listed consecutively, without separators. +This format is less efficient than the Native format since it is row-based. + +Integers use fixed-length little-endian representation. For example, UInt64 uses 8 bytes. +DateTime is represented as UInt32 containing the Unix timestamp as the value. +Date is represented as a UInt16 object that contains the number of days since 1970-01-01 as the value. +String is represented as a varint length (unsigned [LEB128](https://en.wikipedia.org/wiki/LEB128)), followed by the bytes of the string. +FixedString is represented simply as a sequence of bytes. + +Array is represented as a varint length (unsigned [LEB128](https://en.wikipedia.org/wiki/LEB128)), followed by successive elements of the array. + +For [NULL](../sql-reference/syntax.md#null-literal) support, an additional byte containing 1 or 0 is added before each [Nullable](../sql-reference/data-types/nullable.md) value. If 1, then the value is `NULL` and this byte is interpreted as a separate value. If 0, the value after the byte is not `NULL`. + +## RowBinaryWithNames {#rowbinarywithnames} + +Similar to [RowBinary](#rowbinary), but with added header: + +- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) +- N `String`s specifying column names + +## RowBinaryWithNamesAndTypes {#rowbinarywithnamesandtypes} + +Similar to [RowBinary](#rowbinary), but with added header: + +- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) +- N `String`s specifying column names +- N `String`s specifying column types + +## Values {#data-format-values} + +Prints every row in brackets. 
Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in a decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format. During formatting, extra spaces aren’t inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../sql-reference/syntax.md) is represented as `NULL`. + +The minimum set of characters that you need to escape when passing data in Values ​​format: single quotes and backslashes. + +This is the format that is used in `INSERT INTO t VALUES ...`, but you can also use it for formatting query results. + +See also: [input_format_values_interpret_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) and [input_format_values_deduce_templates_of_expressions](../operations/settings/settings.md#settings-input_format_values_deduce_templates_of_expressions) settings. + +## Vertical {#vertical} + +Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows if each row consists of a large number of columns. + +[NULL](../sql-reference/syntax.md) is output as `ᴺᵁᴸᴸ`. + +Example: + +``` sql +SELECT * FROM t_null FORMAT Vertical +``` + +``` text +Row 1: +────── +x: 1 +y: ᴺᵁᴸᴸ +``` + +Rows are not escaped in Vertical format: + +``` sql +SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical +``` + +``` text +Row 1: +────── +test: string with 'quotes' and with some special + characters +``` + +This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table). + +## XML {#xml} + +XML format is suitable only for output, not for parsing. Example: + +``` xml + + + + + + SearchPhrase + String + + + count() + UInt64 + + + + + + + 8267016 + + + bathroom interior design + 2166 + + + clickhouse + 1655 + + + 2014 spring fashion + 1549 + + + freeform photos + 1480 + + + angelina jolie + 1245 + + + omsk + 1112 + + + photos of dog breeds + 1091 + + + curtain designs + 1064 + + + baku + 1000 + + + 10 + 141137 + +``` + +If the column name does not have an acceptable format, just ‘field’ is used as the element name. In general, the XML structure follows the JSON structure. +Just as for JSON, invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. + +In string values, the characters `<` and `&` are escaped as `<` and `&`. + +Arrays are output as `HelloWorld...`,and tuples as `HelloWorld...`. + +## CapnProto {#capnproto} + +CapnProto is a binary message format similar to [Protocol Buffers](https://developers.google.com/protocol-buffers/) and [Thrift](https://en.wikipedia.org/wiki/Apache_Thrift), but not like [JSON](#json) or [MessagePack](https://msgpack.org/). + +CapnProto messages are strictly typed and not self-describing, meaning they need an external schema description. The schema is applied on the fly and cached for each query. + +See also [Format Schema](#formatschema). + +### Data Types Matching {#data_types-matching-capnproto} + +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. 
+ +| CapnProto data type (`INSERT`) | ClickHouse data type | CapnProto data type (`SELECT`) | +|--------------------------------|-----------------------------------------------------------|--------------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md), [Date](../sql-reference/data-types/date.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md), [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md), [DateTime64](../sql-reference/data-types/datetime.md) | `INT64` | +| `FLOAT32` | [Float32](../sql-reference/data-types/float.md) | `FLOAT32` | +| `FLOAT64` | [Float64](../sql-reference/data-types/float.md) | `FLOAT64` | +| `TEXT, DATA` | [String](../sql-reference/data-types/string.md), [FixedString](../sql-reference/data-types/fixedstring.md) | `TEXT, DATA` | +| `union(T, Void), union(Void, T)` | [Nullable(T)](../sql-reference/data-types/date.md) | `union(T, Void), union(Void, T)` | +| `ENUM` | [Enum(8\|16)](../sql-reference/data-types/enum.md) | `ENUM` | +| `LIST` | [Array](../sql-reference/data-types/array.md) | `LIST` | +| `STRUCT` | [Tuple](../sql-reference/data-types/tuple.md) | `STRUCT` | + +For working with `Enum` in CapnProto format use the [format_capn_proto_enum_comparising_mode](../operations/settings/settings.md#format-capn-proto-enum-comparising-mode) setting. + +Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` type also can be nested. + +### Inserting and Selecting Data {#inserting-and-selecting-data-capnproto} + +You can insert CapnProto data from a file into ClickHouse table by the following command: + +``` bash +$ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema = 'schema:Message'" +``` + +Where `schema.capnp` looks like this: + +``` capnp +struct Message { + SearchPhrase @0 :Text; + c @1 :Uint64; +} +``` + +You can select data from a ClickHouse table and save them into some file in the CapnProto format by the following command: + +``` bash +$ clickhouse-client --query = "SELECT * FROM test.hits FORMAT CapnProto SETTINGS format_schema = 'schema:Message'" +``` + +## Protobuf {#protobuf} + +Protobuf - is a [Protocol Buffers](https://developers.google.com/protocol-buffers/) format. + +This format requires an external format schema. The schema is cached between queries. +ClickHouse supports both `proto2` and `proto3` syntaxes. Repeated/optional/required fields are supported. 
+ +Usage examples: + +``` sql +SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType' +``` + +``` bash +cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" +``` + +where the file `schemafile.proto` looks like this: + +``` capnp +syntax = "proto3"; + +message MessageType { + string name = 1; + string surname = 2; + uint32 birthDate = 3; + repeated string phoneNumbers = 4; +}; +``` + +To find the correspondence between table columns and fields of Protocol Buffers’ message type ClickHouse compares their names. +This comparison is case-insensitive and the characters `_` (underscore) and `.` (dot) are considered as equal. +If types of a column and a field of Protocol Buffers’ message are different the necessary conversion is applied. + +Nested messages are supported. For example, for the field `z` in the following message type + +``` capnp +message MessageType { + message XType { + message YType { + int32 z; + }; + repeated YType y; + }; + XType x; +}; +``` + +ClickHouse tries to find a column named `x.y.z` (or `x_y_z` or `X.y_Z` and so on). +Nested messages are suitable to input or output a [nested data structures](../sql-reference/data-types/nested-data-structures/nested.md). + +Default values defined in a protobuf schema like this + +``` capnp +syntax = "proto2"; + +message MessageType { + optional int32 result_per_page = 3 [default = 10]; +} +``` + +are not applied; the [table defaults](../sql-reference/statements/create/table.md#create-default-values) are used instead of them. + +ClickHouse inputs and outputs protobuf messages in the `length-delimited` format. +It means before every message should be written its length as a [varint](https://developers.google.com/protocol-buffers/docs/encoding#varints). +See also [how to read/write length-delimited protobuf messages in popular languages](https://cwiki.apache.org/confluence/display/GEODE/Delimiting+Protobuf+Messages). + +## ProtobufSingle {#protobufsingle} + +Same as [Protobuf](#protobuf) but for storing/parsing single Protobuf message without length delimiters. + +## Avro {#data-format-avro} + +[Apache Avro](https://avro.apache.org/) is a row-oriented data serialization framework developed within Apache’s Hadoop project. + +ClickHouse Avro format supports reading and writing [Avro data files](https://avro.apache.org/docs/current/spec.html#Object+Container+Files). + +### Data Types Matching {#data_types-matching} + +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. 
+ +| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | +|---------------------------------------------|-----------------------------------------------------------------------------------------------------------------------|------------------------------| +| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32)](../sql-reference/data-types/int-uint.md), [UInt(8\|16\|32)](../sql-reference/data-types/int-uint.md) | `int` | +| `boolean`, `int`, `long`, `float`, `double` | [Int64](../sql-reference/data-types/int-uint.md), [UInt64](../sql-reference/data-types/int-uint.md) | `long` | +| `boolean`, `int`, `long`, `float`, `double` | [Float32](../sql-reference/data-types/float.md) | `float` | +| `boolean`, `int`, `long`, `float`, `double` | [Float64](../sql-reference/data-types/float.md) | `double` | +| `bytes`, `string`, `fixed`, `enum` | [String](../sql-reference/data-types/string.md) | `bytes` or `string` \* | +| `bytes`, `string`, `fixed` | [FixedString(N)](../sql-reference/data-types/fixedstring.md) | `fixed(N)` | +| `enum` | [Enum(8\|16)](../sql-reference/data-types/enum.md) | `enum` | +| `array(T)` | [Array(T)](../sql-reference/data-types/array.md) | `array(T)` | +| `union(null, T)`, `union(T, null)` | [Nullable(T)](../sql-reference/data-types/date.md) | `union(null, T)` | +| `null` | [Nullable(Nothing)](../sql-reference/data-types/special-data-types/nothing.md) | `null` | +| `int (date)` \** | [Date](../sql-reference/data-types/date.md) | `int (date)` \** | +| `long (timestamp-millis)` \** | [DateTime64(3)](../sql-reference/data-types/datetime.md) | `long (timestamp-millis)` \* | +| `long (timestamp-micros)` \** | [DateTime64(6)](../sql-reference/data-types/datetime.md) | `long (timestamp-micros)` \* | + +\* `bytes` is default, controlled by [output_format_avro_string_column_pattern](../operations/settings/settings.md#settings-output_format_avro_string_column_pattern) +\** [Avro logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types) + +Unsupported Avro data types: `record` (non-root), `map` + +Unsupported Avro logical data types: `time-millis`, `time-micros`, `duration` + +### Inserting Data {#inserting-data-1} + +To insert data from an Avro file into ClickHouse table: + +``` bash +$ cat file.avro | clickhouse-client --query="INSERT INTO {some_table} FORMAT Avro" +``` + +The root schema of input Avro file must be of `record` type. + +To find the correspondence between table columns and fields of Avro schema ClickHouse compares their names. This comparison is case-sensitive. +Unused fields are skipped. + +Data types of ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to corresponding column type. + +### Selecting Data {#selecting-data-1} + +To select data from ClickHouse table into an Avro file: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro +``` + +Column names must: + +- start with `[A-Za-z_]` +- subsequently contain only `[A-Za-z0-9_]` + +Output Avro file compression and sync interval can be configured with [output_format_avro_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output_format_avro_sync_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. 
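+
+For example, a possible invocation that writes a compressed Avro file by passing these settings as client options (the table name, output file name, and setting values here are only illustrative):
+
+``` bash
+$ clickhouse-client --output_format_avro_codec=snappy --output_format_avro_sync_interval=32768 --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro
+```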
+ +## AvroConfluent {#data-format-avro-confluent} + +AvroConfluent supports decoding single-object Avro messages commonly used with [Kafka](https://kafka.apache.org/) and [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html). + +Each Avro message embeds a schema id that can be resolved to the actual schema with help of the Schema Registry. + +Schemas are cached once resolved. + +Schema Registry URL is configured with [format_avro_schema_registry_url](../operations/settings/settings.md#format_avro_schema_registry_url). + +### Data Types Matching {#data_types-matching-1} + +Same as [Avro](#data-format-avro). + +### Usage {#usage} + +To quickly verify schema resolution you can use [kafkacat](https://github.com/edenhill/kafkacat) with [clickhouse-local](../operations/utilities/clickhouse-local.md): + +``` bash +$ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table' +1 a +2 b +3 c +``` + +To use `AvroConfluent` with [Kafka](../engines/table-engines/integrations/kafka.md): + +``` sql +CREATE TABLE topic1_stream +( + field1 String, + field2 String +) +ENGINE = Kafka() +SETTINGS +kafka_broker_list = 'kafka-broker', +kafka_topic_list = 'topic1', +kafka_group_name = 'group1', +kafka_format = 'AvroConfluent'; + +SET format_avro_schema_registry_url = 'http://schema-registry'; + +SELECT * FROM topic1_stream; +``` + +:::warning +Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. Also you can use the `format_avro_schema_registry_url` setting of the `Kafka` table engine. +::: + +## Parquet {#data-format-parquet} + +[Apache Parquet](https://parquet.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. ClickHouse supports read and write operations for this format. + +### Data Types Matching {#data_types-matching-2} + +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. 
+ +| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | +|------------------------------|-----------------------------------------------------------|------------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| — | [FixedString](../sql-reference/data-types/fixedstring.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `LIST` | [Array](../sql-reference/data-types/array.md) | `LIST` | +| `STRUCT` | [Tuple](../sql-reference/data-types/tuple.md) | `STRUCT` | +| `MAP` | [Map](../sql-reference/data-types/map.md) | `MAP` | + +Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. + +ClickHouse supports configurable precision of `Decimal` type. The `INSERT` query treats the Parquet `DECIMAL` type as the ClickHouse `Decimal128` type. + +Unsupported Parquet data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. + +Data types of ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [cast](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) the data to that data type which is set for the ClickHouse table column. + +### Inserting and Selecting Data {#inserting-and-selecting-data} + +You can insert Parquet data from a file into ClickHouse table by the following command: + +``` bash +$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" +``` + +To insert data into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs values you must switch on the [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested) setting. + +You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} +``` + +To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md). + +## Arrow {#data-format-arrow} + +[Apache Arrow](https://arrow.apache.org/) comes with two built-in columnar storage formats. ClickHouse supports read and write operations for these formats. + +`Arrow` is Apache Arrow’s "file mode" format. It is designed for in-memory random access. 
+ +### Data Types Matching {#data_types-matching-arrow} + +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. + +| Arrow data type (`INSERT`) | ClickHouse data type | Arrow data type (`SELECT`) | +|----------------------------|-----------------------------------------------------|----------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT32` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `FLOAT64` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `STRING`, `BINARY` | [FixedString](../sql-reference/data-types/fixedstring.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `DECIMAL256` | [Decimal256](../sql-reference/data-types/decimal.md)| `DECIMAL256` | +| `LIST` | [Array](../sql-reference/data-types/array.md) | `LIST` | +| `STRUCT` | [Tuple](../sql-reference/data-types/tuple.md) | `STRUCT` | +| `MAP` | [Map](../sql-reference/data-types/map.md) | `MAP` | + +Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. + +The `DICTIONARY` type is supported for `INSERT` queries, and for `SELECT` queries there is an [output_format_arrow_low_cardinality_as_dictionary](../operations/settings/settings.md#output-format-arrow-low-cardinality-as-dictionary) setting that allows to output [LowCardinality](../sql-reference/data-types/lowcardinality.md) type as a `DICTIONARY` type. + +ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the Arrow `DECIMAL` type as the ClickHouse `Decimal128` type. + +Unsupported Arrow data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. + +The data types of ClickHouse table columns do not have to match the corresponding Arrow data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. + +### Inserting Data {#inserting-data-arrow} + +You can insert Arrow data from a file into ClickHouse table by the following command: + +``` bash +$ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" +``` + +To insert data into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs values you must switch on the [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested) setting. 
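+
+A minimal sketch of such an insert, with the setting passed as a client option (the file name `nested.arrow` and the table name `some_table` are only placeholders):
+
+``` bash
+$ cat nested.arrow | clickhouse-client --input_format_arrow_import_nested=1 --query="INSERT INTO some_table FORMAT Arrow"
+```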
+ +### Selecting Data {#selecting-data-arrow} + +You can select data from a ClickHouse table and save them into some file in the Arrow format by the following command: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Arrow" > {filename.arrow} +``` + +## ArrowStream {#data-format-arrow-stream} + +`ArrowStream` is Apache Arrow’s “stream mode” format. It is designed for in-memory stream processing. + +## ORC {#data-format-orc} + +[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem. + +### Data Types Matching {#data_types-matching-3} + +The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries. + +| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | +|--------------------------|-----------------------------------------------------|--------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `LIST` | [Array](../sql-reference/data-types/array.md) | `LIST` | +| `STRUCT` | [Tuple](../sql-reference/data-types/tuple.md) | `STRUCT` | +| `MAP` | [Map](../sql-reference/data-types/map.md) | `MAP` | + +Arrays can be nested and can have a value of the `Nullable` type as an argument. `Tuple` and `Map` types also can be nested. + +ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. + +Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. + +The data types of ClickHouse table columns do not have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. + +### Inserting Data {#inserting-data-2} + +You can insert ORC data from a file into ClickHouse table by the following command: + +``` bash +$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" +``` + +To insert data into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs values you must switch on the [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested) setting. 
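+
+As with Arrow above, the setting can be enabled per query, for example with a `SETTINGS` clause in the `INSERT` statement (a sketch; the file and table names are only placeholders):
+
+``` bash
+$ cat nested.orc | clickhouse-client --query="INSERT INTO some_table SETTINGS input_format_orc_import_nested = 1 FORMAT ORC"
+```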
+
+### Selecting Data {#selecting-data-2}
+
+You can select data from a ClickHouse table and save it to a file in the ORC format with the following command:
+
+``` bash
+$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
+```
+
+To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md).
+
+## LineAsString {#lineasstring}
+
+In this format, every line of input data is interpreted as a single string value. This format can only be parsed for a table with a single field of type [String](../sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted.
+
+**Example**
+
+Query:
+
+``` sql
+DROP TABLE IF EXISTS line_as_string;
+CREATE TABLE line_as_string (field String) ENGINE = Memory;
+INSERT INTO line_as_string FORMAT LineAsString "I love apple", "I love banana", "I love orange";
+SELECT * FROM line_as_string;
+```
+
+Result:
+
+``` text
+┌─field─────────────────────────────────────────────┐
+│ "I love apple", "I love banana", "I love orange"; │
+└───────────────────────────────────────────────────┘
+```
+
+## Regexp {#data-format-regexp}
+
+Each line of imported data is parsed according to the regular expression.
+
+When working with the `Regexp` format, you can use the following settings:
+
+- `format_regexp` — [String](../sql-reference/data-types/string.md). Contains the regular expression in the [re2](https://github.com/google/re2/wiki/Syntax) format.
+
+- `format_regexp_escaping_rule` — [String](../sql-reference/data-types/string.md). The following escaping rules are supported:
+
+    - CSV (similarly to [CSV](#csv))
+    - JSON (similarly to [JSONEachRow](#jsoneachrow))
+    - Escaped (similarly to [TSV](#tabseparated))
+    - Quoted (similarly to [Values](#data-format-values))
+    - Raw (extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](#tabseparatedraw))
+
+- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Defines whether to throw an exception if the `format_regexp` expression does not match the imported data. Can be set to `0` or `1`.
+
+**Usage**
+
+The regular expression from the `format_regexp` setting is applied to every line of imported data. The number of subpatterns in the regular expression must be equal to the number of columns in the imported dataset.
+
+Lines of the imported data must be separated by the newline character `'\n'` or the DOS-style newline `"\r\n"`.
+
+The content of every matched subpattern is parsed with the method of the corresponding data type, according to the `format_regexp_escaping_rule` setting.
+
+If the regular expression does not match the line and `format_regexp_skip_unmatched` is set to 1, the line is silently skipped. If `format_regexp_skip_unmatched` is set to 0, an exception is thrown.
+
+**Example**
+
+Consider the file `data.tsv`:
+
+```text
+id: 1 array: [1,2,3] string: str1 date: 2020-01-01
+id: 2 array: [1,2,3] string: str2 date: 2020-01-02
+id: 3 array: [1,2,3] string: str3 date: 2020-01-03
+```
+and the table:
+
+```sql
+CREATE TABLE imp_regex_table (id UInt32, array Array(UInt32), string String, date Date) ENGINE = Memory;
+```
+
+Import command:
+
+```bash
+$ cat data.tsv | clickhouse-client --query "INSERT INTO imp_regex_table FORMAT Regexp SETTINGS format_regexp='id: (.+?) array: (.+?) string: (.+?) date: (.+?)', format_regexp_escaping_rule='Escaped', format_regexp_skip_unmatched=0;"
+```
+
+Query:
+
+```sql
+SELECT * FROM imp_regex_table;
+```
+
+Result:
+
+```text
+┌─id─┬─array───┬─string─┬───────date─┐
+│  1 │ [1,2,3] │ str1   │ 2020-01-01 │
+│  2 │ [1,2,3] │ str2   │ 2020-01-02 │
+│  3 │ [1,2,3] │ str3   │ 2020-01-03 │
+└────┴─────────┴────────┴────────────┘
+```
+
+## Format Schema {#formatschema}
+
+The file name containing the format schema is set by the setting `format_schema`.
+This setting is required when using one of the formats `Cap'n Proto` and `Protobuf`.
+The format schema is a combination of a file name and the name of a message type in this file, delimited by a colon,
+e.g. `schemafile.proto:MessageType`.
+If the file has the standard extension for the format (for example, `.proto` for `Protobuf`),
+it can be omitted and in this case, the format schema looks like `schemafile:MessageType`.
+
+If you input or output data via the [client](../interfaces/cli.md) in the [interactive mode](../interfaces/cli.md#cli_usage), the file name specified in the format schema
+can contain an absolute path or a path relative to the current directory on the client.
+If you use the client in the [batch mode](../interfaces/cli.md#cli_usage), the path to the schema must be relative, for security reasons.
+
+If you input or output data via the [HTTP interface](../interfaces/http.md), the file name specified in the format schema
+should be located in the directory specified in [format_schema_path](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-format_schema_path)
+in the server configuration.
+
+## Skipping Errors {#skippingerrors}
+
+Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip a broken row if a parsing error occurs and continue parsing from the beginning of the next row. See the [input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) and
+[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) settings.
+Limitations:
+- In case of a parsing error, `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly.
+- `Template` and `CustomSeparated` use the delimiter after the last column and the delimiter between rows to find the beginning of the next row, so skipping errors works only if at least one of them is not empty.
+
+## RawBLOB {#rawblob}
+
+In this format, all input data is read to a single value. It is possible to parse only a table with a single field of type [String](../sql-reference/data-types/string.md) or similar.
+The result is output in binary format without delimiters and escaping. If more than one value is output, the format is ambiguous, and it will be impossible to read the data back.
+
+Below is a comparison of the formats `RawBLOB` and [TabSeparatedRaw](#tabseparatedraw).
+`RawBLOB`:
+- data is output in binary format, no escaping;
+- there are no delimiters between values;
+- no newline at the end of each value.
+[TabSeparatedRaw](#tabseparatedraw):
+- data is output without escaping;
+- the rows contain values separated by tabs;
+- there is a line feed after the last value in every row.
+
+The following is a comparison of the `RawBLOB` and [RowBinary](#rowbinary) formats.
+`RawBLOB`:
+- String fields are output without being prefixed by length.
+`RowBinary`: +- String fields are represented as length in varint format (unsigned [LEB128] (https://en.wikipedia.org/wiki/LEB128)), followed by the bytes of the string. + +When an empty data is passed to the `RawBLOB` input, ClickHouse throws an exception: + +``` text +Code: 108. DB::Exception: No data to insert +``` + +**Example** + +``` bash +$ clickhouse-client --query "CREATE TABLE {some_table} (a String) ENGINE = Memory;" +$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT RawBLOB" +$ clickhouse-client --query "SELECT * FROM {some_table} FORMAT RawBLOB" | md5sum +``` + +Result: + +``` text +f9725a22f9191e064120d718e26862a9 - +``` + +## MsgPack {#msgpack} + +ClickHouse supports reading and writing [MessagePack](https://msgpack.org/) data files. + +### Data Types Matching {#data-types-matching-msgpack} + +| MessagePack data type (`INSERT`) | ClickHouse data type | MessagePack data type (`SELECT`) | +|--------------------------------------------------------------------|-----------------------------------------------------------|------------------------------------| +| `uint N`, `positive fixint` | [UIntN](../sql-reference/data-types/int-uint.md) | `uint N` | +| `int N` | [IntN](../sql-reference/data-types/int-uint.md) | `int N` | +| `bool` | [UInt8](../sql-reference/data-types/int-uint.md) | `uint 8` | +| `fixstr`, `str 8`, `str 16`, `str 32`, `bin 8`, `bin 16`, `bin 32` | [String](../sql-reference/data-types/string.md) | `bin 8`, `bin 16`, `bin 32` | +| `fixstr`, `str 8`, `str 16`, `str 32`, `bin 8`, `bin 16`, `bin 32` | [FixedString](../sql-reference/data-types/fixedstring.md) | `bin 8`, `bin 16`, `bin 32` | +| `float 32` | [Float32](../sql-reference/data-types/float.md) | `float 32` | +| `float 64` | [Float64](../sql-reference/data-types/float.md) | `float 64` | +| `uint 16` | [Date](../sql-reference/data-types/date.md) | `uint 16` | +| `uint 32` | [DateTime](../sql-reference/data-types/datetime.md) | `uint 32` | +| `uint 64` | [DateTime64](../sql-reference/data-types/datetime.md) | `uint 64` | +| `fixarray`, `array 16`, `array 32` | [Array](../sql-reference/data-types/array.md) | `fixarray`, `array 16`, `array 32` | +| `fixmap`, `map 16`, `map 32` | [Map](../sql-reference/data-types/map.md) | `fixmap`, `map 16`, `map 32` | + +Example: + +Writing to a file ".msgpk": + +```sql +$ clickhouse-client --query="CREATE TABLE msgpack (array Array(UInt8)) ENGINE = Memory;" +$ clickhouse-client --query="INSERT INTO msgpack VALUES ([0, 1, 2, 3, 42, 253, 254, 255]), ([255, 254, 253, 42, 3, 2, 1, 0])"; +$ clickhouse-client --query="SELECT * FROM msgpack FORMAT MsgPack" > tmp_msgpack.msgpk; +``` diff --git a/docs/en/reference/interfaces/grpc.md b/docs/en/reference/interfaces/grpc.md new file mode 100644 index 00000000000..6ada38c6220 --- /dev/null +++ b/docs/en/reference/interfaces/grpc.md @@ -0,0 +1,99 @@ +--- +sidebar_position: 19 +sidebar_label: gRPC Interface +--- + +# gRPC Interface {#grpc-interface} + +## Introduction {#grpc-interface-introduction} + +ClickHouse supports [gRPC](https://grpc.io/) interface. It is an open source remote procedure call system that uses HTTP/2 and [Protocol Buffers](https://en.wikipedia.org/wiki/Protocol_Buffers). The implementation of gRPC in ClickHouse supports: + +- SSL; +- authentication; +- sessions; +- compression; +- parallel queries through the same channel; +- cancellation of queries; +- getting progress and logs; +- external tables. 
+ +The specification of the interface is described in [clickhouse_grpc.proto](https://github.com/ClickHouse/ClickHouse/blob/master/src/Server/grpc_protos/clickhouse_grpc.proto). + +## gRPC Configuration {#grpc-interface-configuration} + +To use the gRPC interface set `grpc_port` in the main [server configuration](../operations/configuration-files.md). Other configuration options see in the following example: + +```xml +9100 + + false + + + /path/to/ssl_cert_file + /path/to/ssl_key_file + + + false + + + /path/to/ssl_ca_cert_file + + + deflate + + + medium + + + -1 + -1 + + + false + +``` + +## Built-in Client {#grpc-client} + +You can write a client in any of the programming languages supported by gRPC using the provided [specification](https://github.com/ClickHouse/ClickHouse/blob/master/src/Server/grpc_protos/clickhouse_grpc.proto). +Or you can use a built-in Python client. It is placed in [utils/grpc-client/clickhouse-grpc-client.py](https://github.com/ClickHouse/ClickHouse/blob/master/utils/grpc-client/clickhouse-grpc-client.py) in the repository. The built-in client requires [grpcio and grpcio-tools](https://grpc.io/docs/languages/python/quickstart) Python modules. + +The client supports the following arguments: + +- `--help` – Shows a help message and exits. +- `--host HOST, -h HOST` – A server name. Default value: `localhost`. You can use IPv4 or IPv6 addresses also. +- `--port PORT` – A port to connect to. This port should be enabled in the ClickHouse server configuration (see `grpc_port`). Default value: `9100`. +- `--user USER_NAME, -u USER_NAME` – A user name. Default value: `default`. +- `--password PASSWORD` – A password. Default value: empty string. +- `--query QUERY, -q QUERY` – A query to process when using non-interactive mode. +- `--database DATABASE, -d DATABASE` – A default database. If not specified, the current database set in the server settings is used (`default` by default). +- `--format OUTPUT_FORMAT, -f OUTPUT_FORMAT` – A result output [format](formats.md). Default value for interactive mode: `PrettyCompact`. +- `--debug` – Enables showing debug information. + +To run the client in an interactive mode call it without `--query` argument. + +In a batch mode query data can be passed via `stdin`. + +**Client Usage Example** + +In the following example a table is created and loaded with data from a CSV file. Then the content of the table is queried. + +``` bash +./clickhouse-grpc-client.py -q "CREATE TABLE grpc_example_table (id UInt32, text String) ENGINE = MergeTree() ORDER BY id;" +echo "0,Input data for" > a.txt ; echo "1,gRPC protocol example" >> a.txt +cat a.txt | ./clickhouse-grpc-client.py -q "INSERT INTO grpc_example_table FORMAT CSV" + +./clickhouse-grpc-client.py --format PrettyCompact -q "SELECT * FROM grpc_example_table;" +``` + +Result: + +``` text +┌─id─┬─text──────────────────┐ +│ 0 │ Input data for │ +│ 1 │ gRPC protocol example │ +└────┴───────────────────────┘ +``` diff --git a/docs/en/reference/interfaces/http.md b/docs/en/reference/interfaces/http.md new file mode 100644 index 00000000000..a97cf6671b2 --- /dev/null +++ b/docs/en/reference/interfaces/http.md @@ -0,0 +1,664 @@ +--- +sidebar_position: 19 +sidebar_label: HTTP Interface +--- + +# HTTP Interface {#http-interface} + +The HTTP interface lets you use ClickHouse on any platform from any programming language. We use it for working from Java and Perl, as well as shell scripts. In other departments, the HTTP interface is used from Perl, Python, and Go. 
The HTTP interface is more limited than the native interface, but it has better compatibility.
+
+By default, `clickhouse-server` listens for HTTP on port 8123 (this can be changed in the config).
+
+Sometimes, the `curl` command is not available on the user's operating system. On Ubuntu or Debian, run `sudo apt install curl`. Refer to this [documentation](https://curl.se/download.html) to install it before running the examples.
+
+If you make a `GET /` request without parameters, it returns the 200 response code and the string defined in [http_server_default_response](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-http_server_default_response) (the default value is “Ok.”, with a line feed at the end):
+
+``` bash
+$ curl 'http://localhost:8123/'
+Ok.
+```
+
+The Web UI can be accessed here: `http://localhost:8123/play`.
+
+![Web UI](../images/play.png)
+
+
+In health-check scripts, use the `GET /ping` request. This handler always returns “Ok.” (with a line feed at the end). Available from version 18.12.13. See also `/replicas_status` to check a replica's delay.
+
+``` bash
+$ curl 'http://localhost:8123/ping'
+Ok.
+$ curl 'http://localhost:8123/replicas_status'
+Ok.
+```
+
+Send the request as a URL ‘query’ parameter, or as a POST. Or send the beginning of the query in the ‘query’ parameter, and the rest in the POST (we’ll explain later why this is necessary). The size of the URL is limited to 16 KB, so keep this in mind when sending large queries.
+
+If successful, you receive the 200 response code and the result in the response body.
+If an error occurs, you receive the 500 response code and an error description text in the response body.
+
+When using the GET method, ‘readonly’ is set. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body or in the URL parameter.
+
+Examples:
+
+``` bash
+$ curl 'http://localhost:8123/?query=SELECT%201'
+1
+
+$ wget -nv -O- 'http://localhost:8123/?query=SELECT 1'
+1
+
+$ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123
+HTTP/1.0 200 OK
+Date: Wed, 27 Nov 2019 10:30:18 GMT
+Connection: Close
+Content-Type: text/tab-separated-values; charset=UTF-8
+X-ClickHouse-Server-Display-Name: clickhouse.ru-central1.internal
+X-ClickHouse-Query-Id: 5abe861c-239c-467f-b955-8a201abb8b7f
+X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}
+
+1
+```
+
+As you can see, `curl` is somewhat inconvenient in that spaces must be URL escaped.
+Although `wget` escapes everything itself, we do not recommend using it because it does not work well over HTTP 1.1 when using keep-alive and Transfer-Encoding: chunked.
+
+``` bash
+$ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @-
+1
+
+$ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @-
+1
+
+$ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @-
+1
+```
+
+If part of the query is sent in the parameter, and part in the POST, a line feed is inserted between these two data parts.
+Example (this won’t work): + +``` bash +$ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- +Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL +ECT 1 +, expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception +``` + +By default, data is returned in [TabSeparated](formats.md#tabseparated) format. + +You use the FORMAT clause of the query to request any other format. + +Also, you can use the ‘default_format’ URL parameter or the ‘X-ClickHouse-Format’ header to specify a default format other than TabSeparated. + +``` bash +$ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- +┏━━━┓ +┃ 1 ┃ +┡━━━┩ +│ 1 │ +└───┘ +``` + +The POST method of transmitting data is necessary for `INSERT` queries. In this case, you can write the beginning of the query in the URL parameter, and use POST to pass the data to insert. The data to insert could be, for example, a tab-separated dump from MySQL. In this way, the `INSERT` query replaces `LOAD DATA LOCAL INFILE` from MySQL. + +**Examples** + +Creating a table: + +``` bash +$ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- +``` + +Using the familiar INSERT query for data insertion: + +``` bash +$ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- +``` + +Data can be sent separately from the query: + +``` bash +$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- +``` + +You can specify any data format. The ‘Values’ format is the same as what is used when writing INSERT INTO t VALUES: + +``` bash +$ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- +``` + +To insert data from a tab-separated dump, specify the corresponding format: + +``` bash +$ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- +``` + +Reading the table contents. Data is output in random order due to parallel query processing: + +``` bash +$ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' +7 +8 +9 +10 +11 +12 +1 +2 +3 +4 +5 +6 +``` + +Deleting the table. + +``` bash +$ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- +``` + +For successful requests that do not return a data table, an empty response body is returned. + + +## Compression {#compression} + +You can use compression to reduce network traffic when transmitting a large amount of data or for creating dumps that are immediately compressed. + +You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you need `clickhouse-compressor` program to work with it. It is installed with the `clickhouse-client` package. To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. + +If you specify `compress=1` in the URL, the server will compress the data it sends to you. If you specify `decompress=1` in the URL, the server will decompress the data which you pass in the `POST` method. + +You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). 
ClickHouse supports the following [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): + +- `gzip` +- `br` +- `deflate` +- `xz` + +To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. +In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. + +:::info +Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. +::: + +**Examples** + +``` bash +# Sending compressed data to the server +$ echo "SELECT 1" | gzip -c | \ + curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +``` + +``` bash +# Receiving compressed data archive from the server +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' +$ zcat result.gz +0 +1 +2 +``` + +```bash +# Receiving compressed data from the server and using the gunzip to receive decompressed data +$ curl -sS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' -d 'SELECT number FROM system.numbers LIMIT 3' | gunzip - +0 +1 +2 +``` + +## Default Database {#default-database} + +You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database. + +``` bash +$ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +``` + +By default, the database that is registered in the server settings is used as the default database. By default, this is the database called ‘default’. Alternatively, you can always specify the database using a dot before the table name. + +The username and password can be indicated in one of three ways: + +1. Using HTTP Basic Authentication. Example: + + + +``` bash +$ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- +``` + +1. In the ‘user’ and ‘password’ URL parameters. Example: + + + +``` bash +$ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- +``` + +1. Using ‘X-ClickHouse-User’ and ‘X-ClickHouse-Key’ headers. Example: + + + +``` bash +$ echo 'SELECT 1' | curl -H 'X-ClickHouse-User: user' -H 'X-ClickHouse-Key: password' 'http://localhost:8123/' -d @- +``` + +If the user name is not specified, the `default` name is used. If the password is not specified, the empty password is used. +You can also use the URL parameters to specify any settings for processing a single query or entire profiles of settings. Example:http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1 + +For more information, see the [Settings](../operations/settings/index.md) section. + +``` bash +$ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +``` + +For information about other parameters, see the section “SET”. + +Similarly, you can use ClickHouse sessions in the HTTP protocol. 
To do this, you need to add the `session_id` GET parameter to the request. You can use any string as the session ID. By default, the session is terminated after 60 seconds of inactivity. To change this timeout, modify the `default_session_timeout` setting in the server configuration, or add the `session_timeout` GET parameter to the request. To check the session status, use the `session_check=1` parameter. Only one query at a time can be executed within a single session. + +You can receive information about the progress of a query in `X-ClickHouse-Progress` response headers. To do this, enable [send_progress_in_http_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Example of the header sequence: + +``` text +X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} +X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} +X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} +``` + +Possible header fields: + +- `read_rows` — Number of rows read. +- `read_bytes` — Volume of data read in bytes. +- `total_rows_to_read` — Total number of rows to be read. +- `written_rows` — Number of rows written. +- `written_bytes` — Volume of data written in bytes. + +Running requests do not stop automatically if the HTTP connection is lost. Parsing and data formatting are performed on the server-side, and using the network might be ineffective. +The optional ‘query_id’ parameter can be passed as the query ID (any string). For more information, see the section “Settings, replace_running_query”. + +The optional ‘quota_key’ parameter can be passed as the quota key (any string). For more information, see the section “Quotas”. + +The HTTP interface allows passing external data (external temporary tables) for querying. For more information, see the section “External data for query processing”. + +## Response Buffering {#response-buffering} + +You can enable response buffering on the server-side. The `buffer_size` and `wait_end_of_query` URL parameters are provided for this purpose. + +`buffer_size` determines the number of bytes in the result to buffer in the server memory. If a result body is larger than this threshold, the buffer is written to the HTTP channel, and the remaining data is sent directly to the HTTP channel. + +To ensure that the entire response is buffered, set `wait_end_of_query=1`. In this case, the data that is not stored in memory will be buffered in a temporary server file. + +Example: + +``` bash +$ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' +``` + +Use buffering to avoid situations where a query processing error occurred after the response code and HTTP headers were sent to the client. In this situation, an error message is written at the end of the response body, and on the client-side, the error can only be detected at the parsing stage. + +### Queries with Parameters {#cli-queries-with-parameters} + +You can create a query with parameters and pass values for them from the corresponding HTTP request parameters. For more information, see [Queries with Parameters for CLI](../interfaces/cli.md#cli-queries-with-parameters). + +### Example {#example} + +``` bash +$ curl -sS "

?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" +``` + +## Predefined HTTP Interface {#predefined_http_interface} + +ClickHouse supports specific queries through the HTTP interface. For example, you can write data to a table as follows: + +``` bash +$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- +``` + +ClickHouse also supports Predefined HTTP Interface which can help you more easily integrate with third-party tools like [Prometheus exporter](https://github.com/percona-lab/clickhouse_exporter). + +Example: + +- First of all, add this section to server configuration file: + + + +``` xml + + + /predefined_query + POST,GET + + predefined_query_handler + SELECT * FROM system.metrics LIMIT 5 FORMAT Template SETTINGS format_template_resultset = 'prometheus_template_output_format_resultset', format_template_row = 'prometheus_template_output_format_row', format_template_rows_between_delimiter = '\n' + + + ... + ... + +``` + +- You can now request the URL directly for data in the Prometheus format: + + + +``` bash +$ curl -v 'http://localhost:8123/predefined_query' +* Trying ::1... +* Connected to localhost (::1) port 8123 (#0) +> GET /predefined_query HTTP/1.1 +> Host: localhost:8123 +> User-Agent: curl/7.47.0 +> Accept: */* +> +< HTTP/1.1 200 OK +< Date: Tue, 28 Apr 2020 08:52:56 GMT +< Connection: Keep-Alive +< Content-Type: text/plain; charset=UTF-8 +< X-ClickHouse-Server-Display-Name: i-mloy5trc +< Transfer-Encoding: chunked +< X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a +< X-ClickHouse-Format: Template +< X-ClickHouse-Timezone: Asia/Shanghai +< Keep-Alive: timeout=3 +< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} +< +# HELP "Query" "Number of executing queries" +# TYPE "Query" counter +"Query" 1 + +# HELP "Merge" "Number of executing background merges" +# TYPE "Merge" counter +"Merge" 0 + +# HELP "PartMutation" "Number of mutations (ALTER DELETE/UPDATE)" +# TYPE "PartMutation" counter +"PartMutation" 0 + +# HELP "ReplicatedFetch" "Number of data parts being fetched from replica" +# TYPE "ReplicatedFetch" counter +"ReplicatedFetch" 0 + +# HELP "ReplicatedSend" "Number of data parts being sent to replicas" +# TYPE "ReplicatedSend" counter +"ReplicatedSend" 0 + +* Connection #0 to host localhost left intact + +* Connection #0 to host localhost left intact +``` + +As you can see from the example if `http_handlers` is configured in the config.xml file and `http_handlers` can contain many `rules`. ClickHouse will match the HTTP requests received to the predefined type in `rule` and the first matched runs the handler. Then ClickHouse will execute the corresponding predefined query if the match is successful. + +Now `rule` can configure `method`, `headers`, `url`, `handler`: +- `method` is responsible for matching the method part of the HTTP request. `method` fully conforms to the definition of [method](https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods) in the HTTP protocol. It is an optional configuration. If it is not defined in the configuration file, it does not match the method portion of the HTTP request. + +- `url` is responsible for matching the URL part of the HTTP request. It is compatible with [RE2](https://github.com/google/re2)’s regular expressions. It is an optional configuration. 
If it is not defined in the configuration file, it does not match the URL portion of the HTTP request. + +- `headers` are responsible for matching the header part of the HTTP request. It is compatible with RE2’s regular expressions. It is an optional configuration. If it is not defined in the configuration file, it does not match the header portion of the HTTP request. + +- `handler` contains the main processing part. Now `handler` can configure `type`, `status`, `content_type`, `response_content`, `query`, `query_param_name`. + `type` currently supports three types: [predefined_query_handler](#predefined_query_handler), [dynamic_query_handler](#dynamic_query_handler), [static](#static). + + - `query` — use with `predefined_query_handler` type, executes query when the handler is called. + + - `query_param_name` — use with `dynamic_query_handler` type, extracts and executes the value corresponding to the `query_param_name` value in HTTP request params. + + - `status` — use with `static` type, response status code. + + - `content_type` — use with `static` type, response [content-type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type). + + - `response_content` — use with `static` type, response content sent to client, when using the prefix ‘file://’ or ‘config://’, find the content from the file or configuration sends to client. + +Next are the configuration methods for different `type`. + +### predefined_query_handler {#predefined_query_handler} + +`predefined_query_handler` supports setting `Settings` and `query_params` values. You can configure `query` in the type of `predefined_query_handler`. + +`query` value is a predefined query of `predefined_query_handler`, which is executed by ClickHouse when an HTTP request is matched and the result of the query is returned. It is a must configuration. + +The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` settings, then queries the system table to check whether these settings were set successfully. + +:::warning +To keep the default `handlers` such as` query`, `play`,` ping`, add the `` rule. +::: + +Example: + +``` xml + + + [^/]+)(/(?P[^/]+))?]]> + GET + + TEST_HEADER_VALUE + [^/]+)(/(?P[^/]+))?]]> + + + predefined_query_handler + SELECT value FROM system.settings WHERE name = {name_1:String} + SELECT name, value FROM system.settings WHERE name = {name_2:String} + + + + +``` + +``` bash +$ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost:8123/query_param_with_url/1/max_threads/max_final_threads?max_threads=1&max_final_threads=2' +1 +max_final_threads 2 +``` + +:::warning +In one `predefined_query_handler` only supports one `query` of an insert type. +::: + +### dynamic_query_handler {#dynamic_query_handler} + +In `dynamic_query_handler`, the query is written in the form of param of the HTTP request. The difference is that in `predefined_query_handler`, the query is written in the configuration file. You can configure `query_param_name` in `dynamic_query_handler`. + +ClickHouse extracts and executes the value corresponding to the `query_param_name` value in the URL of the HTTP request. The default value of `query_param_name` is `/query` . It is an optional configuration. If there is no definition in the configuration file, the param is not passed in. 
+ +To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` and `queries` whether the settings were set successfully. + +Example: + +``` xml + + + + TEST_HEADER_VALUE_DYNAMIC + + dynamic_query_handler + query_param + + + + +``` + +``` bash +$ curl -H 'XXX:TEST_HEADER_VALUE_DYNAMIC' 'http://localhost:8123/own?max_threads=1&max_final_threads=2¶m_name_1=max_threads¶m_name_2=max_final_threads&query_param=SELECT%20name,value%20FROM%20system.settings%20where%20name%20=%20%7Bname_1:String%7D%20OR%20name%20=%20%7Bname_2:String%7D' +max_threads 1 +max_final_threads 2 +``` + +### static {#static} + +`static` can return [content_type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type), [status](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) and `response_content`. `response_content` can return the specified content. + +Example: + +Return a message. + +``` xml + + + GET + xxx + /hi + + static + 402 + text/html; charset=UTF-8 + Say Hi! + + + + +``` + +``` bash +$ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi' +* Trying ::1... +* Connected to localhost (::1) port 8123 (#0) +> GET /hi HTTP/1.1 +> Host: localhost:8123 +> User-Agent: curl/7.47.0 +> Accept: */* +> XXX:xxx +> +< HTTP/1.1 402 Payment Required +< Date: Wed, 29 Apr 2020 03:51:26 GMT +< Connection: Keep-Alive +< Content-Type: text/html; charset=UTF-8 +< Transfer-Encoding: chunked +< Keep-Alive: timeout=3 +< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} +< +* Connection #0 to host localhost left intact +Say Hi!% +``` + +Find the content from the configuration send to client. + +``` xml +
]]>
+ + + + GET + xxx + /get_config_static_handler + + static + config://get_config_static_handler + + + +``` + +``` bash +$ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler' +* Trying ::1... +* Connected to localhost (::1) port 8123 (#0) +> GET /get_config_static_handler HTTP/1.1 +> Host: localhost:8123 +> User-Agent: curl/7.47.0 +> Accept: */* +> XXX:xxx +> +< HTTP/1.1 200 OK +< Date: Wed, 29 Apr 2020 04:01:24 GMT +< Connection: Keep-Alive +< Content-Type: text/plain; charset=UTF-8 +< Transfer-Encoding: chunked +< Keep-Alive: timeout=3 +< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} +< +* Connection #0 to host localhost left intact +
% +``` + +Find the content from the file send to client. + +``` xml + + + GET + xxx + /get_absolute_path_static_handler + + static + text/html; charset=UTF-8 + file:///absolute_path_file.html + + + + GET + xxx + /get_relative_path_static_handler + + static + text/html; charset=UTF-8 + file://./relative_path_file.html + + + +``` + +``` bash +$ user_files_path='/var/lib/clickhouse/user_files' +$ sudo echo "Relative Path File" > $user_files_path/relative_path_file.html +$ sudo echo "Absolute Path File" > $user_files_path/absolute_path_file.html +$ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler' +* Trying ::1... +* Connected to localhost (::1) port 8123 (#0) +> GET /get_absolute_path_static_handler HTTP/1.1 +> Host: localhost:8123 +> User-Agent: curl/7.47.0 +> Accept: */* +> XXX:xxx +> +< HTTP/1.1 200 OK +< Date: Wed, 29 Apr 2020 04:18:16 GMT +< Connection: Keep-Alive +< Content-Type: text/html; charset=UTF-8 +< Transfer-Encoding: chunked +< Keep-Alive: timeout=3 +< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} +< +Absolute Path File +* Connection #0 to host localhost left intact +$ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler' +* Trying ::1... +* Connected to localhost (::1) port 8123 (#0) +> GET /get_relative_path_static_handler HTTP/1.1 +> Host: localhost:8123 +> User-Agent: curl/7.47.0 +> Accept: */* +> XXX:xxx +> +< HTTP/1.1 200 OK +< Date: Wed, 29 Apr 2020 04:18:31 GMT +< Connection: Keep-Alive +< Content-Type: text/html; charset=UTF-8 +< Transfer-Encoding: chunked +< Keep-Alive: timeout=3 +< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} +< +Relative Path File +* Connection #0 to host localhost left intact +``` diff --git a/docs/en/reference/interfaces/index.md b/docs/en/reference/interfaces/index.md new file mode 100644 index 00000000000..16e97ed7c62 --- /dev/null +++ b/docs/en/reference/interfaces/index.md @@ -0,0 +1,28 @@ +--- +sidebar_label: Interfaces +sidebar_position: 34 +keywords: [clickhouse, network, interfaces, http, tcp, grpc, command-line, client, jdbc, odbc, driver] +description: ClickHouse provides three network interfaces +--- + +# Interfaces {#interfaces} + +ClickHouse provides three network interfaces (they can be optionally wrapped in TLS for additional security): + +- [HTTP](http.md), which is documented and easy to use directly. +- [Native TCP](../interfaces/tcp.md), which has less overhead. +- [gRPC](grpc.md). + +In most cases it is recommended to use an appropriate tool or library instead of interacting with those directly. 
The following are officially supported by ClickHouse: + +- [Command-line client](../interfaces/cli.md) +- [JDBC driver](../interfaces/jdbc.md) +- [ODBC driver](../interfaces/odbc.md) +- [C++ client library](../interfaces/cpp.md) + +There are also a wide range of third-party libraries for working with ClickHouse: + +- [Client libraries](../interfaces/third-party/client-libraries.md) +- [Integrations](../interfaces/third-party/integrations.md) +- [Visual interfaces](../interfaces/third-party/gui.md) + diff --git a/docs/en/reference/interfaces/jdbc.md b/docs/en/reference/interfaces/jdbc.md new file mode 100644 index 00000000000..4bea0600a2a --- /dev/null +++ b/docs/en/reference/interfaces/jdbc.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 22 +sidebar_label: JDBC Driver +--- + +# JDBC Driver {#jdbc-driver} + +Use the [official JDBC driver](https://github.com/ClickHouse/clickhouse-jdbc) (and Java client) to access ClickHouse from your Java applications. + +- Third-party drivers: + - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) + - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) + +[Original article](https://clickhouse.com/docs/en/interfaces/jdbc/) diff --git a/docs/en/reference/interfaces/mysql.md b/docs/en/reference/interfaces/mysql.md new file mode 100644 index 00000000000..df8ef38d671 --- /dev/null +++ b/docs/en/reference/interfaces/mysql.md @@ -0,0 +1,53 @@ +--- +sidebar_position: 20 +sidebar_label: MySQL Interface +--- + +# MySQL Interface {#mysql-interface} + +ClickHouse supports MySQL wire protocol. It can be enabled by [mysql_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-mysql_port) setting in configuration file: + +``` xml +9004 +``` + +Example of connecting using command-line tool `mysql`: + +``` bash +$ mysql --protocol tcp -u default -P 9004 +``` + +Output if a connection succeeded: + +``` text +Welcome to the MySQL monitor. Commands end with ; or \g. +Your MySQL connection id is 4 +Server version: 20.2.1.1-ClickHouse + +Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. + +Oracle is a registered trademark of Oracle Corporation and/or its +affiliates. Other names may be trademarks of their respective +owners. + +Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. + +mysql> +``` + +For compatibility with all MySQL clients, it is recommended to specify user password with [double SHA1](../operations/settings/settings-users.md#password_double_sha1_hex) in configuration file. +If user password is specified using [SHA256](../operations/settings/settings-users.md#password_sha256_hex), some clients won’t be able to authenticate (mysqljs and old versions of command-line tool MySQL and MariaDB). + +Restrictions: + +- prepared queries are not supported + +- some data types are sent as strings + +To cancel a long query use `KILL QUERY connection_id` statement (it is replaced with `KILL QUERY WHERE query_id = connection_id` while proceeding). 
For example: + +``` bash +$ mysql --protocol tcp -h mysql_server -P 9004 default -u default --password=123 -e "KILL QUERY 123456;" +``` + +[Original article](https://clickhouse.com/docs/en/interfaces/mysql/) diff --git a/docs/en/reference/interfaces/odbc.md b/docs/en/reference/interfaces/odbc.md new file mode 100644 index 00000000000..4c807654c28 --- /dev/null +++ b/docs/en/reference/interfaces/odbc.md @@ -0,0 +1,12 @@ +--- +sidebar_position: 23 +sidebar_label: ODBC Driver +--- + +# ODBC Driver {#odbc-driver} + +Use the [official ODBC driver](https://github.com/ClickHouse/clickhouse-odbc) for accessing ClickHouse as a data source. + + + +[Original article](https://clickhouse.com/docs/en/interfaces/odbc/) diff --git a/docs/en/reference/interfaces/tcp.md b/docs/en/reference/interfaces/tcp.md new file mode 100644 index 00000000000..5f2f400799f --- /dev/null +++ b/docs/en/reference/interfaces/tcp.md @@ -0,0 +1,10 @@ +--- +sidebar_position: 18 +sidebar_label: Native Interface (TCP) +--- + +# Native Interface (TCP) {#native-interface-tcp} + +The native protocol is used in the [command-line client](../interfaces/cli.md), for inter-server communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse-engineered from ClickHouse source code (starting [around here](https://github.com/ClickHouse/ClickHouse/tree/master/src/Client)) and/or by intercepting and analyzing TCP traffic. + +[Original article](https://clickhouse.com/docs/en/interfaces/tcp/) diff --git a/docs/en/reference/interfaces/third-party/client-libraries.md b/docs/en/reference/interfaces/third-party/client-libraries.md new file mode 100644 index 00000000000..885e9f430f2 --- /dev/null +++ b/docs/en/reference/interfaces/third-party/client-libraries.md @@ -0,0 +1,74 @@ +--- +sidebar_position: 26 +sidebar_label: Client Libraries +--- + +# Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} + +:::warning +ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. 
+::: + +- Python + - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) + - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) + - [clickhouse-client](https://github.com/yurial/clickhouse-client) + - [aiochclient](https://github.com/maximdanilchenko/aiochclient) + - [asynch](https://github.com/long2ice/asynch) +- PHP + - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) + - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) + - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) + - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) + - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) + - [SeasClick C++ client](https://github.com/SeasX/SeasClick) + - [one-ck](https://github.com/lizhichao/one-ck) + - [glushkovds/phpclickhouse-laravel](https://packagist.org/packages/glushkovds/phpclickhouse-laravel) + - [kolya7k ClickHouse PHP extension](https://github.com//kolya7k/clickhouse-php) +- Go + - [clickhouse](https://github.com/kshvakov/clickhouse/) + - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [chconn](https://github.com/vahid-sohrabloo/chconn) + - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) + - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) +- Swift + - [ClickHouseNIO](https://github.com/patrick-zippenfenig/ClickHouseNIO) + - [ClickHouseVapor ORM](https://github.com/patrick-zippenfenig/ClickHouseVapor) +- NodeJs + - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) + - [node-clickhouse](https://github.com/apla/node-clickhouse) + - [nestjs-clickhouse](https://github.com/depyronick/nestjs-clickhouse) + - [clickhouse-client](https://github.com/depyronick/clickhouse-client) +- Perl + - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) + - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) + - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) +- Ruby + - [ClickHouse (Ruby)](https://github.com/shlima/click_house) + - [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord) +- Rust + - [Klickhouse](https://github.com/Protryon/klickhouse) +- R + - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) + - [RClickHouse](https://github.com/IMSMWU/RClickHouse) +- Java + - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) + - [clickhouse-client](https://github.com/Ecwid/clickhouse-client) +- Scala + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- Kotlin + - [AORM](https://github.com/TanVD/AORM) +- C# + - [Octonica.ClickHouseClient](https://github.com/Octonica/ClickHouseClient) + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) +- Elixir + - [clickhousex](https://github.com/appodeal/clickhousex/) + - [pillar](https://github.com/sofakingworld/pillar) +- Nim + - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) +- Haskell + - [hdbc-clickhouse](https://github.com/zaneli/hdbc-clickhouse) + +[Original article](https://clickhouse.com/docs/en/interfaces/third-party/client_libraries/) diff --git a/docs/en/reference/interfaces/third-party/gui.md b/docs/en/reference/interfaces/third-party/gui.md new file mode 100644 index 
00000000000..92d00f2812c --- /dev/null +++ b/docs/en/reference/interfaces/third-party/gui.md @@ -0,0 +1,247 @@ +--- +sidebar_position: 28 +sidebar_label: Visual Interfaces +--- + +# Visual Interfaces from Third-party Developers {#visual-interfaces-from-third-party-developers} + +## Open-Source {#open-source} + +### Tabix {#tabix} + +Web interface for ClickHouse in the [Tabix](https://github.com/tabixio/tabix) project. + +Features: + +- Works with ClickHouse directly from the browser, without the need to install additional software. +- Query editor with syntax highlighting. +- Auto-completion of commands. +- Tools for graphical analysis of query execution. +- Colour scheme options. + +[Tabix documentation](https://tabix.io/doc/). + +### HouseOps {#houseops} + +[HouseOps](https://github.com/HouseOps/HouseOps) is a UI/IDE for OSX, Linux and Windows. + +Features: + +- Query builder with syntax highlighting. View the response in a table or JSON view. +- Export query results as CSV or JSON. +- List of processes with descriptions. Write mode. Ability to stop (`KILL`) a process. +- Database graph. Shows all tables and their columns with additional information. +- A quick view of the column size. +- Server configuration. + +The following features are planned for development: + +- Database management. +- User management. +- Real-time data analysis. +- Cluster monitoring. +- Cluster management. +- Monitoring replicated and Kafka tables. + +### LightHouse {#lighthouse} + +[LightHouse](https://github.com/VKCOM/lighthouse) is a lightweight web interface for ClickHouse. + +Features: + +- Table list with filtering and metadata. +- Table preview with filtering and sorting. +- Read-only queries execution. + +### Redash {#redash} + +[Redash](https://github.com/getredash/redash) is a platform for data visualization. + +Supports for multiple data sources including ClickHouse, Redash can join results of queries from different data sources into one final dataset. + +Features: + +- Powerful editor of queries. +- Database explorer. +- Visualization tools, that allow you to represent data in different forms. + +### Grafana {#grafana} + +[Grafana](https://grafana.com/grafana/plugins/vertamedia-clickhouse-datasource) is a platform for monitoring and visualization. + +"Grafana allows you to query, visualize, alert on and understand your metrics no matter where they are stored. Create, explore, and share dashboards with your team and foster a data driven culture. Trusted and loved by the community" — grafana.com. + +ClickHouse datasource plugin provides a support for ClickHouse as a backend database. + +### DBeaver {#dbeaver} + +[DBeaver](https://dbeaver.io/) - universal desktop database client with ClickHouse support. + +Features: + +- Query development with syntax highlight and autocompletion. +- Table list with filters and metadata search. +- Table data preview. +- Full-text search. + +By default, DBeaver does not connect using a session (the CLI for example does). If you require session support (for example to set settings for your session), edit the driver connection properties and set `session_id` to a random string (it uses the http connection under the hood). Then you can use any setting from the query window. + +### clickhouse-cli {#clickhouse-cli} + +[clickhouse-cli](https://github.com/hatarist/clickhouse-cli) is an alternative command-line client for ClickHouse, written in Python 3. + +Features: + +- Autocompletion. +- Syntax highlighting for the queries and data output. +- Pager support for the data output. 
+- Custom PostgreSQL-like commands.
+
+### clickhouse-flamegraph {#clickhouse-flamegraph}
+
+[clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) is a specialized tool to visualize the `system.trace_log` as a [flamegraph](http://www.brendangregg.com/flamegraphs.html).
+
+### clickhouse-plantuml {#clickhouse-plantuml}
+
+[clickhouse-plantuml](https://pypi.org/project/clickhouse-plantuml/) is a script to generate a [PlantUML](https://plantuml.com/) diagram of table schemas.
+
+### xeus-clickhouse {#xeus-clickhouse}
+
+[xeus-clickhouse](https://github.com/wangfenjin/xeus-clickhouse) is a Jupyter kernel for ClickHouse, which supports querying ClickHouse data with SQL from Jupyter.
+
+### MindsDB Studio {#mindsdb}
+
+[MindsDB](https://mindsdb.com/) is an open-source AI layer for databases, including ClickHouse, that allows you to effortlessly develop, train and deploy state-of-the-art machine learning models. MindsDB Studio (GUI) allows you to train new models from the database, interpret predictions made by the model, identify potential data biases, and evaluate and visualize model accuracy using the Explainable AI function to adapt and tune your machine learning models faster.
+
+### DBM {#dbm}
+
+[DBM](https://dbm.incubator.edurt.io/) is a visual management tool for ClickHouse.
+
+Features:
+
+- Supports query history (pagination, clear all, etc.)
+- Supports running selected SQL clauses as queries
+- Supports terminating queries
+- Supports table management (metadata, delete, preview)
+- Supports database management (delete, create)
+- Supports custom queries
+- Supports management of multiple data sources (connection test, monitoring)
+- Supports monitoring (processors, connections, queries)
+- Supports data migration
+
+### Bytebase {#bytebase}
+
+[Bytebase](https://bytebase.com) is a web-based, open-source schema change and version control tool for teams. It supports various databases, including ClickHouse.
+
+Features:
+
+- Schema review between developers and DBAs.
+- Database-as-Code: version control the schema in a VCS such as GitLab and trigger the deployment upon code commit.
+- Streamlined deployment with per-environment policy.
+- Full migration history.
+- Schema drift detection.
+- Backup and restore.
+- RBAC.
+
+### Zeppelin-Interpreter-for-ClickHouse {#zeppelin-interpreter-for-clickhouse}
+
+[Zeppelin-Interpreter-for-ClickHouse](https://github.com/SiderZhang/Zeppelin-Interpreter-for-ClickHouse) is a [Zeppelin](https://zeppelin.apache.org) interpreter for ClickHouse. Compared with the JDBC interpreter, it provides better timeout control for long-running queries.
+
+## Commercial {#commercial}
+
+### DataGrip {#datagrip}
+
+[DataGrip](https://www.jetbrains.com/datagrip/) is a database IDE from JetBrains with dedicated support for ClickHouse. It is also embedded in other IntelliJ-based tools: PyCharm, IntelliJ IDEA, GoLand, PhpStorm and others.
+
+Features:
+
+- Very fast code completion.
+- ClickHouse syntax highlighting.
+- Support for features specific to ClickHouse, for example, nested columns and table engines.
+- Data Editor.
+- Refactorings.
+- Search and Navigation.
+
+### Yandex DataLens {#yandex-datalens}
+
+[Yandex DataLens](https://cloud.yandex.ru/services/datalens) is a data visualization and analytics service.
+
+Features:
+
+- A wide range of available visualizations, from simple bar charts to complex dashboards.
+- Dashboards can be made publicly available.
+- Support for multiple data sources, including ClickHouse.
+- Storage for materialized data based on ClickHouse.
+ +DataLens is [available for free](https://cloud.yandex.com/docs/datalens/pricing) for low-load projects, even for commercial use. + +- [DataLens documentation](https://cloud.yandex.com/docs/datalens/). +- [Tutorial](https://cloud.yandex.com/docs/solutions/datalens/data-from-ch-visualization) on visualizing data from a ClickHouse database. + +### Holistics Software {#holistics-software} + +[Holistics](https://www.holistics.io/) is a full-stack data platform and business intelligence tool. + +Features: + +- Automated email, Slack and Google Sheet schedules of reports. +- SQL editor with visualizations, version control, auto-completion, reusable query components and dynamic filters. +- Embedded analytics of reports and dashboards via iframe. +- Data preparation and ETL capabilities. +- SQL data modelling support for relational mapping of data. + +### Looker {#looker} + +[Looker](https://looker.com) is a data platform and business intelligence tool with support for 50+ database dialects including ClickHouse. Looker is available as a SaaS platform and self-hosted. Users can use Looker via the browser to explore data, build visualizations and dashboards, schedule reports, and share their insights with colleagues. Looker provides a rich set of tools to embed these features in other applications, and an API +to integrate data with other applications. + +Features: + +- Easy and agile development using LookML, a language which supports curated + [Data Modeling](https://looker.com/platform/data-modeling) to support report writers and end-users. +- Powerful workflow integration via Looker’s [Data Actions](https://looker.com/platform/actions). + +[How to configure ClickHouse in Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) + +### SeekTable {#seektable} + +[SeekTable](https://www.seektable.com) is a self-service BI tool for data exploration and operational reporting. It is available both as a cloud service and a self-hosted version. Reports from SeekTable may be embedded into any web-app. + +Features: + +- Business users-friendly reports builder. +- Powerful report parameters for SQL filtering and report-specific query customizations. +- Can connect to ClickHouse both with a native TCP/IP endpoint and a HTTP(S) interface (2 different drivers). +- It is possible to use all power of ClickHouse SQL dialect in dimensions/measures definitions. +- [Web API](https://www.seektable.com/help/web-api-integration) for automated reports generation. +- Supports reports development flow with account data [backup/restore](https://www.seektable.com/help/self-hosted-backup-restore); data models (cubes) / reports configuration is a human-readable XML and can be stored under version control system. + +SeekTable is [free](https://www.seektable.com/help/cloud-pricing) for personal/individual usage. + +[How to configure ClickHouse connection in SeekTable.](https://www.seektable.com/help/clickhouse-pivot-table) + +### Chadmin {#chadmin} + +[Chadmin](https://github.com/bun4uk/chadmin) is a simple UI where you can visualize your currently running queries on your ClickHouse cluster and info about them and kill them if you want. + +### TABLUM.IO {#tablum_io} + +[TABLUM.IO](https://tablum.io/) — an online query and analytics tool for ETL and visualization. It allows connecting to ClickHouse, query data via a versatile SQL console as well as to load data from static files and 3rd party services. TABLUM.IO can visualize data results as charts and tables. 
+ +Features: +- ETL: data loading from popular databases, local and remote files, API invocations. +- Versatile SQL console with syntax highlight and visual query builder. +- Data visualization as charts and tables. +- Data materialization and sub-queries. +- Data reporting to Slack, Telegram or email. +- Data pipelining via proprietary API. +- Data export in JSON, CSV, SQL, HTML formats. +- Web-based interface. + +TABLUM.IO can be run as a self-hosted solution (as a docker image) or in the cloud. +License: [commercial](https://tablum.io/pricing) product with 3-month free period. + +Try it out for free [in the cloud](https://tablum.io/try). +Learn more about the product at [TABLUM.IO](https://tablum.io/) + +[Original article](https://clickhouse.com/docs/en/interfaces/third-party/gui/) diff --git a/docs/en/reference/interfaces/third-party/index.md b/docs/en/reference/interfaces/third-party/index.md new file mode 100644 index 00000000000..c9be2b6ada9 --- /dev/null +++ b/docs/en/reference/interfaces/third-party/index.md @@ -0,0 +1,17 @@ +--- +toc_folder_title: Third-Party +sidebar_position: 24 +--- + +# Third-Party Interfaces {#third-party-interfaces} + +This is a collection of links to third-party tools that provide some sort of interface to ClickHouse. It can be either visual interface, command-line interface or an API: + +- [Client libraries](../../interfaces/third-party/client-libraries.md) +- [Integrations](../../interfaces/third-party/integrations.md) +- [GUI](../../interfaces/third-party/gui.md) +- [Proxies](../../interfaces/third-party/proxy.md) + +:::note +Generic tools that support common API like [ODBC](../../interfaces/odbc.md) or [JDBC](../../interfaces/jdbc.md) usually can work with ClickHouse as well, but are not listed here because there are way too many of them. +::: \ No newline at end of file diff --git a/docs/en/reference/interfaces/third-party/integrations.md b/docs/en/reference/interfaces/third-party/integrations.md new file mode 100644 index 00000000000..ae055d63a9d --- /dev/null +++ b/docs/en/reference/interfaces/third-party/integrations.md @@ -0,0 +1,112 @@ +--- +sidebar_position: 27 +sidebar_label: Integrations +--- + +# Integration Libraries from Third-party Developers {#integration-libraries-from-third-party-developers} + +:::warning Disclaimer +ClickHouse, Inc. does **not** maintain the tools and libraries listed below and haven’t done extensive testing to ensure their quality. 
+::: + +## Infrastructure Products {#infrastructure-products} + +- Relational database management systems + - [MySQL](https://www.mysql.com) + - [mysql2ch](https://github.com/long2ice/mysql2ch) + - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) + - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) + - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) + - [PostgreSQL](https://www.postgresql.org) + - [clickhousedb_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) + - [infi.clickhouse_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (uses [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pg2ch](https://github.com/mkabilov/pg2ch) + - [clickhouse_fdw](https://github.com/adjust/clickhouse_fdw) + - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) + - [ClickHouseMigrator](https://github.com/zlzforever/ClickHouseMigrator) +- Message queues + - [Kafka](https://kafka.apache.org) + - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (uses [Go client](https://github.com/ClickHouse/clickhouse-go/)) + - [stream-loader-clickhouse](https://github.com/adform/stream-loader) +- Stream processing + - [Flink](https://flink.apache.org) + - [flink-clickhouse-sink](https://github.com/ivi-ru/flink-clickhouse-sink) +- Object storages + - [S3](https://en.wikipedia.org/wiki/Amazon_S3) + - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) +- Container orchestration + - [Kubernetes](https://kubernetes.io) + - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) +- Configuration management + - [puppet](https://puppet.com) + - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) + - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) +- Monitoring + - [Graphite](https://graphiteapp.org) + - [graphouse](https://github.com/ClickHouse/graphouse) + - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) + - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in [\*GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../engines/table-engines/mergetree-family/graphitemergetree.md#rollup-configuration) could be applied + - [Grafana](https://grafana.com/) + - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) + - [Prometheus](https://prometheus.io/) + - [clickhouse_exporter](https://github.com/f1yegor/clickhouse_exporter) + - [PromHouse](https://github.com/Percona-Lab/PromHouse) + - [clickhouse_exporter](https://github.com/hot-wifi/clickhouse_exporter) (uses [Go client](https://github.com/kshvakov/clickhouse/)) + - [Nagios](https://www.nagios.org/) + - [check_clickhouse](https://github.com/exogroup/check_clickhouse/) + - [check_clickhouse.py](https://github.com/innogames/igmonplugins/blob/master/src/check_clickhouse.py) + - [Zabbix](https://www.zabbix.com) + - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) + - [Sematext](https://sematext.com/) + - [clickhouse integration](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) +- Logging + - [rsyslog](https://www.rsyslog.com/) + - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) + - [fluentd](https://www.fluentd.org) + - 
[loghouse](https://github.com/flant/loghouse) (for [Kubernetes](https://kubernetes.io)) + - [logagent](https://www.sematext.com/logagent) + - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) +- Geo + - [MaxMind](https://dev.maxmind.com/geoip/) + - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) +- AutoML + - [MindsDB](https://mindsdb.com/) + - [MindsDB](https://github.com/mindsdb/mindsdb) - Predictive AI layer for ClickHouse database. + +## Programming Language Ecosystems {#programming-language-ecosystems} + +- Python + - [SQLAlchemy](https://www.sqlalchemy.org) + - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (uses [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pandas](https://pandas.pydata.org) + - [pandahouse](https://github.com/kszucs/pandahouse) +- PHP + - [Doctrine](https://www.doctrine-project.org/) + - [dbal-clickhouse](https://packagist.org/packages/friendsofdoctrine/dbal-clickhouse) +- R + - [dplyr](https://db.rstudio.com/dplyr/) + - [RClickHouse](https://github.com/IMSMWU/RClickHouse) (uses [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) +- Java + - [Hadoop](http://hadoop.apache.org) + - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (uses [JDBC](../../sql-reference/table-functions/jdbc.md)) +- Scala + - [Akka](https://akka.io) + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- C# + - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) +- Elixir + - [Ecto](https://github.com/elixir-ecto/ecto) + - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) +- Ruby + - [Ruby on Rails](https://rubyonrails.org/) + - [activecube](https://github.com/bitquery/activecube) + - [ActiveRecord](https://github.com/PNixx/clickhouse-activerecord) + - [GraphQL](https://github.com/graphql) + - [activecube-graphql](https://github.com/bitquery/activecube-graphql) + +[Original article](https://clickhouse.com/docs/en/interfaces/third-party/integrations/) diff --git a/docs/en/reference/interfaces/third-party/proxy.md b/docs/en/reference/interfaces/third-party/proxy.md new file mode 100644 index 00000000000..45077cb6a89 --- /dev/null +++ b/docs/en/reference/interfaces/third-party/proxy.md @@ -0,0 +1,44 @@ +--- +sidebar_position: 29 +sidebar_label: Proxies +--- + +# Proxy Servers from Third-party Developers {#proxy-servers-from-third-party-developers} + +## chproxy {#chproxy} + +[chproxy](https://github.com/Vertamedia/chproxy), is an HTTP proxy and load balancer for ClickHouse database. + +Features: + +- Per-user routing and response caching. +- Flexible limits. +- Automatic SSL certificate renewal. + +Implemented in Go. + +## KittenHouse {#kittenhouse} + +[KittenHouse](https://github.com/VKCOM/kittenhouse) is designed to be a local proxy between ClickHouse and application server in case it’s impossible or inconvenient to buffer INSERT data on your application side. + +Features: + +- In-memory and on-disk data buffering. +- Per-table routing. +- Load-balancing and health checking. + +Implemented in Go. 
+ +## ClickHouse-Bulk {#clickhouse-bulk} + +[ClickHouse-Bulk](https://github.com/nikepan/clickhouse-bulk) is a simple ClickHouse insert collector. + +Features: + +- Group requests and send by threshold or interval. +- Multiple remote servers. +- Basic authentication. + +Implemented in Go. + +[Original article](https://clickhouse.com/docs/en/interfaces/third-party/proxy/) diff --git a/docs/en/reference/operations/_category_.yml b/docs/en/reference/operations/_category_.yml new file mode 100644 index 00000000000..3dcbd254a3f --- /dev/null +++ b/docs/en/reference/operations/_category_.yml @@ -0,0 +1,4 @@ +position: 70 +label: 'Operations' +collapsible: true +collapsed: true \ No newline at end of file diff --git a/docs/en/reference/operations/access-rights.md b/docs/en/reference/operations/access-rights.md new file mode 100644 index 00000000000..7d75c47df2b --- /dev/null +++ b/docs/en/reference/operations/access-rights.md @@ -0,0 +1,154 @@ +--- +sidebar_position: 48 +sidebar_label: Access Control and Account Management +--- + +# Access Control and Account Management {#access-control} + +ClickHouse supports access control management based on [RBAC](https://en.wikipedia.org/wiki/Role-based_access_control) approach. + +ClickHouse access entities: +- [User account](#user-account-management) +- [Role](#role-management) +- [Row Policy](#row-policy-management) +- [Settings Profile](#settings-profiles-management) +- [Quota](#quotas-management) + +You can configure access entities using: + +- SQL-driven workflow. + + You need to [enable](#enabling-access-control) this functionality. + +- Server [configuration files](../operations/configuration-files.md) `users.xml` and `config.xml`. + +We recommend using SQL-driven workflow. Both of the configuration methods work simultaneously, so if you use the server configuration files for managing accounts and access rights, you can smoothly switch to SQL-driven workflow. + +:::warning +You can’t manage the same access entity by both configuration methods simultaneously. +::: + +To see all users, roles, profiles, etc. and all their grants use [SHOW ACCESS](../sql-reference/statements/show.md#show-access-statement) statement. + +## Usage {#access-control-usage} + +By default, the ClickHouse server provides the `default` user account which is not allowed using SQL-driven access control and account management but has all the rights and permissions. The `default` user account is used in any cases when the username is not defined, for example, at login from client or in distributed queries. In distributed query processing a default user account is used, if the configuration of the server or cluster does not specify the [user and password](../engines/table-engines/special/distributed.md) properties. + +If you just started using ClickHouse, consider the following scenario: + +1. [Enable](#enabling-access-control) SQL-driven access control and account management for the `default` user. +2. Log in to the `default` user account and create all the required users. Don’t forget to create an administrator account (`GRANT ALL ON *.* TO admin_user_account WITH GRANT OPTION`). +3. [Restrict permissions](../operations/settings/permissions-for-queries.md#permissions_for_queries) for the `default` user and disable SQL-driven access control and account management for it. + +### Properties of Current Solution {#access-control-properties} + +- You can grant permissions for databases and tables even if they do not exist. 
+- If a table was deleted, all the privileges that correspond to this table are not revoked. This means that even if you create a new table with the same name later, all the privileges remain valid. To revoke privileges corresponding to the deleted table, you need to execute, for example, the `REVOKE ALL PRIVILEGES ON db.table FROM ALL` query. +- There are no lifetime settings for privileges. + +## User Account {#user-account-management} + +A user account is an access entity that allows to authorize someone in ClickHouse. A user account contains: + +- Identification information. +- [Privileges](../sql-reference/statements/grant.md#grant-privileges) that define a scope of queries the user can execute. +- Hosts allowed to connect to the ClickHouse server. +- Assigned and default roles. +- Settings with their constraints applied by default at user login. +- Assigned settings profiles. + +Privileges can be granted to a user account by the [GRANT](../sql-reference/statements/grant.md) query or by assigning [roles](#role-management). To revoke privileges from a user, ClickHouse provides the [REVOKE](../sql-reference/statements/revoke.md) query. To list privileges for a user, use the [SHOW GRANTS](../sql-reference/statements/show.md#show-grants-statement) statement. + +Management queries: + +- [CREATE USER](../sql-reference/statements/create/user.md) +- [ALTER USER](../sql-reference/statements/alter/user.md#alter-user-statement) +- [DROP USER](../sql-reference/statements/drop.md) +- [SHOW CREATE USER](../sql-reference/statements/show.md#show-create-user-statement) +- [SHOW USERS](../sql-reference/statements/show.md#show-users-statement) + +### Settings Applying {#access-control-settings-applying} + +Settings can be configured differently: for a user account, in its granted roles and in settings profiles. At user login, if a setting is configured for different access entities, the value and constraints of this setting are applied as follows (from higher to lower priority): + +1. User account settings. +2. The settings of default roles of the user account. If a setting is configured in some roles, then order of the setting application is undefined. +3. The settings from settings profiles assigned to a user or to its default roles. If a setting is configured in some profiles, then order of setting application is undefined. +4. Settings applied to all the server by default or from the [default profile](../operations/server-configuration-parameters/settings.md#default-profile). + +## Role {#role-management} + +Role is a container for access entities that can be granted to a user account. + +Role contains: + +- [Privileges](../sql-reference/statements/grant.md#grant-privileges) +- Settings and constraints +- List of assigned roles + +Management queries: + +- [CREATE ROLE](../sql-reference/statements/create/role.md) +- [ALTER ROLE](../sql-reference/statements/alter/role.md#alter-role-statement) +- [DROP ROLE](../sql-reference/statements/drop.md) +- [SET ROLE](../sql-reference/statements/set-role.md) +- [SET DEFAULT ROLE](../sql-reference/statements/set-role.md#set-default-role-statement) +- [SHOW CREATE ROLE](../sql-reference/statements/show.md#show-create-role-statement) +- [SHOW ROLES](../sql-reference/statements/show.md#show-roles-statement) + +Privileges can be granted to a role by the [GRANT](../sql-reference/statements/grant.md) query. To revoke privileges from a role ClickHouse provides the [REVOKE](../sql-reference/statements/revoke.md) query. 
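+
+As an illustration, a minimal sketch of this workflow from `clickhouse-client` might look as follows (the role, database, and user names are hypothetical):
+
+``` bash
+# Create a role and grant it read access to one database (names are examples)
+clickhouse-client --query "CREATE ROLE accountant"
+clickhouse-client --query "GRANT SELECT ON accounting.* TO accountant"
+
+# Assign the role to an existing user and make it their default role
+clickhouse-client --query "GRANT accountant TO mira"
+clickhouse-client --query "SET DEFAULT ROLE accountant TO mira"
+
+# Later, revoke the privilege from the role if it is no longer needed
+clickhouse-client --query "REVOKE SELECT ON accounting.* FROM accountant"
+```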
+ +## Row Policy {#row-policy-management} + +Row policy is a filter that defines which of the rows are available to a user or a role. Row policy contains filters for one particular table, as well as a list of roles and/or users which should use this row policy. + +:::warning +Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. +::: + +Management queries: + +- [CREATE ROW POLICY](../sql-reference/statements/create/row-policy.md) +- [ALTER ROW POLICY](../sql-reference/statements/alter/row-policy.md#alter-row-policy-statement) +- [DROP ROW POLICY](../sql-reference/statements/drop.md#drop-row-policy-statement) +- [SHOW CREATE ROW POLICY](../sql-reference/statements/show.md#show-create-row-policy-statement) +- [SHOW POLICIES](../sql-reference/statements/show.md#show-policies-statement) + +## Settings Profile {#settings-profiles-management} + +Settings profile is a collection of [settings](../operations/settings/index.md). Settings profile contains settings and constraints, as well as a list of roles and/or users to which this profile is applied. + +Management queries: + +- [CREATE SETTINGS PROFILE](../sql-reference/statements/create/settings-profile.md#create-settings-profile-statement) +- [ALTER SETTINGS PROFILE](../sql-reference/statements/alter/settings-profile.md#alter-settings-profile-statement) +- [DROP SETTINGS PROFILE](../sql-reference/statements/drop.md#drop-settings-profile-statement) +- [SHOW CREATE SETTINGS PROFILE](../sql-reference/statements/show.md#show-create-settings-profile-statement) +- [SHOW PROFILES](../sql-reference/statements/show.md#show-profiles-statement) + +## Quota {#quotas-management} + +Quota limits resource usage. See [Quotas](../operations/quotas.md). + +Quota contains a set of limits for some durations, as well as a list of roles and/or users which should use this quota. + +Management queries: + +- [CREATE QUOTA](../sql-reference/statements/create/quota.md) +- [ALTER QUOTA](../sql-reference/statements/alter/quota.md#alter-quota-statement) +- [DROP QUOTA](../sql-reference/statements/drop.md#drop-quota-statement) +- [SHOW CREATE QUOTA](../sql-reference/statements/show.md#show-create-quota-statement) +- [SHOW QUOTA](../sql-reference/statements/show.md#show-quota-statement) +- [SHOW QUOTAS](../sql-reference/statements/show.md#show-quotas-statement) + +## Enabling SQL-driven Access Control and Account Management {#enabling-access-control} + +- Setup a directory for configurations storage. + + ClickHouse stores access entity configurations in the folder set in the [access_control_path](../operations/server-configuration-parameters/settings.md#access_control_path) server configuration parameter. + +- Enable SQL-driven access control and account management for at least one user account. + + By default, SQL-driven access control and account management is disabled for all users. You need to configure at least one user in the `users.xml` configuration file and set the value of the [access_management](../operations/settings/settings-users.md#access_management-user-setting) setting to 1. 
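+
+For example, once `access_management` is enabled for the `default` user, the bootstrap scenario described above might look like the following sketch (the administrator account name and password are placeholders):
+
+``` bash
+# Create an administrator account and grant it all privileges with the right to grant them further
+clickhouse-client --user default --query "CREATE USER admin_user_account IDENTIFIED WITH sha256_password BY 'ChangeMe123'"
+clickhouse-client --user default --query "GRANT ALL ON *.* TO admin_user_account WITH GRANT OPTION"
+
+# Verify the result
+clickhouse-client --user admin_user_account --password 'ChangeMe123' --query "SHOW GRANTS"
+```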
+ +[Original article](https://clickhouse.com/docs/en/operations/access_rights/) diff --git a/docs/en/reference/operations/backup.md b/docs/en/reference/operations/backup.md new file mode 100644 index 00000000000..c39658aa4b0 --- /dev/null +++ b/docs/en/reference/operations/backup.md @@ -0,0 +1,41 @@ +--- +sidebar_position: 49 +sidebar_label: Data Backup +--- + +# Data Backup {#data-backup} + +While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented. + +In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**. + +Each company has different resources available and business requirements, so there’s no universal solution for ClickHouse backups and restores that will fit every situation. What works for one gigabyte of data likely won’t work for tens of petabytes. There are a variety of possible approaches with their own pros and cons, which will be discussed below. It is a good idea to use several approaches instead of just one in order to compensate for their various shortcomings. + +:::note +Keep in mind that if you backed something up and never tried to restore it, chances are that restore will not work properly when you actually need it (or at least it will take longer than business can tolerate). So whatever backup approach you choose, make sure to automate the restore process as well, and practice it on a spare ClickHouse cluster regularly. +::: + +## Duplicating Source Data Somewhere Else {#duplicating-source-data-somewhere-else} + +Often data that is ingested into ClickHouse is delivered through some sort of persistent queue, such as [Apache Kafka](https://kafka.apache.org). In this case it is possible to configure an additional set of subscribers that will read the same data stream while it is being written to ClickHouse and store it in cold storage somewhere. Most companies already have some default recommended cold storage, which could be an object store or a distributed filesystem like [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html). + +## Filesystem Snapshots {#filesystem-snapshots} + +Some local filesystems provide snapshot functionality (for example, [ZFS](https://en.wikipedia.org/wiki/ZFS)), but they might not be the best choice for serving live queries. A possible solution is to create additional replicas with this kind of filesystem and exclude them from the [Distributed](../engines/table-engines/special/distributed.md) tables that are used for `SELECT` queries. Snapshots on such replicas will be out of reach of any queries that modify data. As a bonus, these replicas might have special hardware configurations with more disks attached per server, which would be cost-effective. 
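+
+As a sketch, on a replica that stores its data on ZFS, such a point-in-time snapshot could be taken with a single command (the pool and dataset names below are hypothetical):
+
+``` bash
+# Create a read-only snapshot of the dataset that holds /var/lib/clickhouse
+zfs snapshot tank/clickhouse@backup-2022-04-08
+
+# List existing snapshots
+zfs list -t snapshot
+```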
+
+## clickhouse-copier {#clickhouse-copier}
+
+[clickhouse-copier](../operations/utilities/clickhouse-copier.md) is a versatile tool that was initially created to re-shard petabyte-sized tables. It can also be used for backup and restore purposes because it reliably copies data between ClickHouse tables and clusters.
+
+For smaller volumes of data, a simple `INSERT INTO ... SELECT ...` to remote tables might work as well.
+
+## Manipulations with Parts {#manipulations-with-parts}
+
+ClickHouse allows using the `ALTER TABLE ... FREEZE PARTITION ...` query to create a local copy of table partitions. This is implemented using hardlinks to the `/var/lib/clickhouse/shadow/` folder, so it usually does not consume extra disk space for old data. The created copies of files are not handled by the ClickHouse server, so you can just leave them there: you will have a simple backup that does not require any additional external system, but it will still be prone to hardware issues. For this reason, it’s better to remotely copy them to another location and then remove the local copies. Distributed filesystems and object stores are still good options for this, but normal attached file servers with a large enough capacity might work as well (in this case the transfer will occur via the network filesystem or maybe [rsync](https://en.wikipedia.org/wiki/Rsync)).
+Data can be restored from a backup using the `ALTER TABLE ... ATTACH PARTITION ...` query.
+
+For more information about queries related to partition manipulations, see the [ALTER documentation](../sql-reference/statements/alter/partition.md#alter_manipulations-with-partitions).
+
+A third-party tool is available to automate this approach: [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup).
+
+[Original article](https://clickhouse.com/docs/en/operations/backup/)
diff --git a/docs/en/reference/operations/caches.md b/docs/en/reference/operations/caches.md
new file mode 100644
index 00000000000..f2427810184
--- /dev/null
+++ b/docs/en/reference/operations/caches.md
@@ -0,0 +1,29 @@
+---
+sidebar_position: 65
+sidebar_label: Caches
+---
+
+# Cache Types {#cache-types}
+
+When performing queries, ClickHouse uses different caches.
+
+Main cache types:
+
+- `mark_cache` — Cache of marks used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family.
+- `uncompressed_cache` — Cache of uncompressed data used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family.
+
+Additional cache types:
+
+- DNS cache.
+- [Regexp](../interfaces/formats.md#data-format-regexp) cache.
+- Compiled expressions cache.
+- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache.
+- [Dictionaries](../sql-reference/dictionaries/index.md) data cache.
+
+Indirectly used:
+
+- OS page cache.
+
+To drop a cache, use the [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md) statements.
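+
+For example, the mark, uncompressed, and DNS caches can be dropped from `clickhouse-client` as shown in the sketch below (useful mainly for benchmarking or troubleshooting):
+
+``` bash
+# Drop individual caches; each statement affects only the named cache
+clickhouse-client --query "SYSTEM DROP MARK CACHE"
+clickhouse-client --query "SYSTEM DROP UNCOMPRESSED CACHE"
+clickhouse-client --query "SYSTEM DROP DNS CACHE"
+```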
+ +[Original article](https://clickhouse.com/docs/en/operations/caches/) diff --git a/docs/en/reference/operations/clickhouse-keeper.md b/docs/en/reference/operations/clickhouse-keeper.md new file mode 100644 index 00000000000..81547736441 --- /dev/null +++ b/docs/en/reference/operations/clickhouse-keeper.md @@ -0,0 +1,325 @@ +--- +sidebar_position: 66 +sidebar_label: ClickHouse Keeper +--- + +# [pre-production] ClickHouse Keeper {#clickHouse-keeper} + +ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper. + +:::warning +This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. +::: + +## Implementation details {#implementation-details} + +ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, has quite a simple and powerful data model. ZooKeeper's coordination algorithm called ZAB (ZooKeeper Atomic Broadcast) doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper ClickHouse Keeper is written in C++ and uses [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows to have linearizability for reads and writes, has several open-source implementations in different languages. + +By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but `clickhouse-keeper-converter` tool allows to convert ZooKeeper data to ClickHouse Keeper snapshot. Interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so mixed ZooKeeper / ClickHouse Keeper cluster is impossible. + +ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64. + +:::note +External integrations are not supported. +::: + +## Configuration {#configuration} + +ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is ``. Keeper configuration has the following parameters: + +- `tcp_port` — Port for a client to connect (default for ZooKeeper is `2181`). +- `tcp_port_secure` — Secure port for an SSL connection between client and keeper-server. +- `server_id` — Unique server id, each participant of the ClickHouse Keeper cluster must have a unique number (1, 2, 3, and so on). +- `log_storage_path` — Path to coordination logs, better to store logs on the non-busy device (same for ZooKeeper). +- `snapshot_storage_path` — Path to coordination snapshots. + +Other common parameters are inherited from the ClickHouse server config (`listen_host`, `logger`, and so on). 
+ +Internal coordination settings are located in `.` section: + +- `operation_timeout_ms` — Timeout for a single client operation (ms) (default: 10000). +- `min_session_timeout_ms` — Min timeout for client session (ms) (default: 10000). +- `session_timeout_ms` — Max timeout for client session (ms) (default: 100000). +- `dead_session_check_period_ms` — How often ClickHouse Keeper check dead sessions and remove them (ms) (default: 500). +- `heart_beat_interval_ms` — How often a ClickHouse Keeper leader will send heartbeats to followers (ms) (default: 500). +- `election_timeout_lower_bound_ms` — If the follower didn't receive heartbeats from the leader in this interval, then it can initiate leader election (default: 1000). +- `election_timeout_upper_bound_ms` — If the follower didn't receive heartbeats from the leader in this interval, then it must initiate leader election (default: 2000). +- `rotate_log_storage_interval` — How many log records to store in a single file (default: 100000). +- `reserved_log_items` — How many coordination log records to store before compaction (default: 100000). +- `snapshot_distance` — How often ClickHouse Keeper will create new snapshots (in the number of records in logs) (default: 100000). +- `snapshots_to_keep` — How many snapshots to keep (default: 3). +- `stale_log_gap` — Threshold when leader considers follower as stale and sends the snapshot to it instead of logs (default: 10000). +- `fresh_log_gap` — When node became fresh (default: 200). +- `max_requests_batch_size` - Max size of batch in requests count before it will be sent to RAFT (default: 100). +- `force_sync` — Call `fsync` on each write to coordination log (default: true). +- `quorum_reads` — Execute read requests as writes through whole RAFT consensus with similar speed (default: false). +- `raft_logs_level` — Text logging level about coordination (trace, debug, and so on) (default: system default). +- `auto_forwarding` — Allow to forward write requests from followers to the leader (default: true). +- `shutdown_timeout` — Wait to finish internal connections and shutdown (ms) (default: 5000). +- `startup_timeout` — If the server doesn't connect to other quorum participants in the specified timeout it will terminate (ms) (default: 30000). +- `four_letter_word_white_list` — White list of 4lw commands (default: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro"). + +Quorum configuration is located in `.` section and contain servers description. + +The only parameter for the whole quorum is `secure`, which enables encrypted connection for communication between quorum participants. The parameter can be set `true` if SSL connection is required for internal communication between nodes, or left unspecified otherwise. + +The main parameters for each `` are: + +- `id` — Server identifier in a quorum. +- `hostname` — Hostname where this server is placed. +- `port` — Port where this server listens for connections. + + +Examples of configuration for quorum with three nodes can be found in [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) with `test_keeper_` prefix. 
Example configuration for server #1: + +```xml + + 2181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 10000 + 30000 + trace + + + + + 1 + zoo1 + 9444 + + + 2 + zoo2 + 9444 + + + 3 + zoo3 + 9444 + + + +``` + +## How to run {#how-to-run} + +ClickHouse Keeper is bundled into the ClickHouse server package, just add configuration of `` and start ClickHouse server as always. If you want to run standalone ClickHouse Keeper you can start it in a similar way with: + +```bash +clickhouse-keeper --config /etc/your_path_to_config/config.xml +``` + +If you don't have the symlink (`clickhouse-keeper`) you can create it or specify `keeper` as argument: + +```bash +clickhouse keeper --config /etc/your_path_to_config/config.xml +``` + +## Four Letter Word Commands {#four-letter-word-commands} + +ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. + +The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". + +You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. + +``` +echo mntr | nc localhost 9181 +``` + +Bellow is the detailed 4lw commands: + +- `ruok`: Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information. + +``` +imok +``` + +- `mntr`: Outputs a list of variables that could be used for monitoring the health of the cluster. + +``` +zk_version v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +zk_avg_latency 0 +zk_max_latency 0 +zk_min_latency 0 +zk_packets_received 68 +zk_packets_sent 68 +zk_num_alive_connections 1 +zk_outstanding_requests 0 +zk_server_state leader +zk_znode_count 4 +zk_watch_count 1 +zk_ephemerals_count 0 +zk_approximate_data_size 723 +zk_open_file_descriptor_count 310 +zk_max_file_descriptor_count 10240 +zk_followers 0 +zk_synced_followers 0 +``` + +- `srvr`: Lists full details for the server. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Latency min/avg/max: 0/0/0 +Received: 2 +Sent : 2 +Connections: 1 +Outstanding: 0 +Zxid: 34 +Mode: leader +Node count: 4 +``` + +- `stat`: Lists brief details for the server and connected clients. + +``` +ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +Clients: + 192.168.1.1:52852(recved=0,sent=0) + 192.168.1.1:52042(recved=24,sent=48) +Latency min/avg/max: 0/0/0 +Received: 4 +Sent : 4 +Connections: 1 +Outstanding: 0 +Zxid: 36 +Mode: leader +Node count: 4 +``` + +- `srst`: Reset server statistics. The command will affect the result of `srvr`, `mntr` and `stat`. + +``` +Server stats reset. +``` + +- `conf`: Print details about serving configuration. 
+ +``` +server_id=1 +tcp_port=2181 +four_letter_word_white_list=* +log_storage_path=./coordination/logs +snapshot_storage_path=./coordination/snapshots +max_requests_batch_size=100 +session_timeout_ms=30000 +operation_timeout_ms=10000 +dead_session_check_period_ms=500 +heart_beat_interval_ms=500 +election_timeout_lower_bound_ms=1000 +election_timeout_upper_bound_ms=2000 +reserved_log_items=1000000000000000 +snapshot_distance=10000 +auto_forwarding=true +shutdown_timeout=5000 +startup_timeout=240000 +raft_logs_level=information +snapshots_to_keep=3 +rotate_log_storage_interval=100000 +stale_log_gap=10000 +fresh_log_gap=200 +max_requests_batch_size=100 +quorum_reads=false +force_sync=false +compress_logs=true +compress_snapshots_with_zstd_format=true +configuration_change_tries_count=20 +``` + +- `cons`: List full connection/session details for all clients connected to this server. Includes information on numbers of packets received/sent, session id, operation latencies, last operation performed, etc... + +``` + 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) + 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) +``` + +- `crst`: Reset connection/session statistics for all connections. + +``` +Connection stats reset. +``` + +- `envi`: Print details about serving environment + +``` +Environment: +clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +host.name=ZBMAC-C02D4054M.local +os.name=Darwin +os.arch=x86_64 +os.version=19.6.0 +cpu.count=12 +user.name=root +user.home=/Users/JackyWoo/ +user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ +user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ +``` + + +- `dirs`: Shows the total size of snapshot and log files in bytes + +``` +snapshot_dir_size: 0 +log_dir_size: 3875 +``` + +- `isro`: Tests if server is running in read-only mode. The server will respond with "ro" if in read-only mode or "rw" if not in read-only mode. + +``` +rw +``` + +- `wchs`: Lists brief information on watches for the server. + +``` +1 connections watching 1 paths +Total watches:1 +``` + +- `wchc`: Lists detailed information on watches for the server, by session. This outputs a list of sessions (connections) with associated watches (paths). Note, depending on the number of watches this operation may be expensive (ie impact server performance), use it carefully. + +``` +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +- `wchp`: Lists detailed information on watches for the server, by path. This outputs a list of paths (znodes) with associated sessions. Note, depending on the number of watches this operation may be expensive (i. e. impact server performance), use it carefully. + +``` +/clickhouse/task_queue/ddl + 0x0000000000000001 +``` + +- `dump`: Lists the outstanding sessions and ephemeral nodes. This only works on the leader. + +``` +Sessions dump (2): +0x0000000000000001 +0x0000000000000002 +Sessions with Ephemerals (1): +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +## [experimental] Migration from ZooKeeper {#migration-from-zookeeper} + +Seamlessly migration from ZooKeeper to ClickHouse Keeper is impossible you have to stop your ZooKeeper cluster, convert data and start ClickHouse Keeper. 
`clickhouse-keeper-converter` tool allows converting ZooKeeper logs and snapshots to ClickHouse Keeper snapshot. It works only with ZooKeeper > 3.4. Steps for migration: + +1. Stop all ZooKeeper nodes. + +2. Optional, but recommended: find ZooKeeper leader node, start and stop it again. It will force ZooKeeper to create a consistent snapshot. + +3. Run `clickhouse-keeper-converter` on a leader, for example: + +```bash +clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --zookeeper-snapshots-dir /var/lib/zookeeper/version-2 --output-dir /path/to/clickhouse/keeper/snapshots +``` + +4. Copy snapshot to ClickHouse server nodes with a configured `keeper` or start ClickHouse Keeper instead of ZooKeeper. The snapshot must persist on all nodes, otherwise, empty nodes can be faster and one of them can become a leader. + +[Original article](https://clickhouse.com/docs/en/operations/clickhouse-keeper/) diff --git a/docs/en/reference/operations/configuration-files.md b/docs/en/reference/operations/configuration-files.md new file mode 100644 index 00000000000..582e90544e0 --- /dev/null +++ b/docs/en/reference/operations/configuration-files.md @@ -0,0 +1,159 @@ +--- +sidebar_position: 50 +sidebar_label: Configuration Files +--- + +# Configuration Files {#configuration_files} + +ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml` or `/etc/clickhouse-server/config.yaml`. Other files must be in the `/etc/clickhouse-server/config.d` directory. Note, that any configuration file can be written either in XML or YAML, but mixing formats in one file is not supported. For example, you can have main configs as `config.xml` and `users.xml` and write additional files in `config.d` and `users.d` directories in `.yaml`. + +All XML files should have the same root element, usually ``. As for YAML, `clickhouse:` should not be present, the parser will insert it automatically. + +## Override {#override} + +Some settings specified in the main configuration file can be overridden in other configuration files: + +- The `replace` or `remove` attributes can be specified for the elements of these configuration files. +- If neither is specified, it combines the contents of elements recursively, replacing values of duplicate children. +- If `replace` is specified, it replaces the entire element with the specified one. +- If `remove` is specified, it deletes the element. + +You can also declare attributes as coming from environment variables by using `from_env="VARIABLE_NAME"`: + +```xml + + + + + + + +``` + +## Substitution {#substitution} + +The config can also define “substitutions”. If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-include_from) element in the server config. The substitution values are specified in `/clickhouse/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](../operations/server-configuration-parameters/settings.md#macros)). + +If you want to replace an entire element with a substitution use `include` as element name. 
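+
+As a hedged illustration of the `from_env` attribute described in the Override section above, an element whose value is taken from an environment variable could look like this; the setting and variable names are placeholders, not recommendations:
+
+```xml
+<clickhouse>
+    <!-- The value of this element is read from the MAX_SERVER_MEMORY environment variable at server startup. -->
+    <max_server_memory_usage from_env="MAX_SERVER_MEMORY"/>
+</clickhouse>
+```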
+ +XML substitution example: + +```xml + + + + + + + + + + +``` + +Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node and it will be fully inserted into the source element. + +## User Settings {#user-settings} + +The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the `users_config` element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`. + +Users configuration can be splitted into separate files similar to `config.xml` and `config.d/`. +Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`. +Directory `users.d` is used by default, as `users_config` defaults to `users.xml`. + +Note that configuration files are first merged taking into account [Override](#override) settings and includes are processed after that. + +## XML example {#example} + +For example, you can have separate config file for each user like this: + +``` bash +$ cat /etc/clickhouse-server/users.d/alice.xml +``` + +``` xml + + + + analytics + + ::/0 + + ... + analytics + + + +``` + +## YAML examples {#example} + +Here you can see default config written in YAML: [config.yaml.example](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config.yaml.example). + +There are some differences between YAML and XML formats in terms of ClickHouse configurations. Here are some tips for writing a configuration in YAML format. + +You should use a Scalar node to write a key-value pair: +``` yaml +key: value +``` + +To create a node, containing other nodes you should use a Map: +``` yaml +map_key: + key1: val1 + key2: val2 + key3: val3 +``` + +To create a list of values or nodes assigned to one tag you should use a Sequence: +``` yaml +seq_key: + - val1 + - val2 + - key1: val3 + - map: + key2: val4 + key3: val5 +``` + +If you want to write an attribute for a Sequence or Map node, you should use a @ prefix before the attribute key. Note, that @ is reserved by YAML standard, so you should also to wrap it into double quotes: + +``` yaml +map: + "@attr1": value1 + "@attr2": value2 + key: 123 +``` + +From that Map we will get these XML nodes: + +``` xml + + 123 + +``` + +You can also set attributes for Sequence: + +``` yaml +seq: + - "@attr1": value1 + - "@attr2": value2 + - 123 + - abc +``` + +So, we can get YAML config equal to this XML one: + +``` xml +123 +abc +``` + +## Implementation Details {#implementation-details} + +For each config file, the server also generates `file-preprocessed.xml` files when starting. These files contain all the completed substitutions and overrides, and they are intended for informational use. If ZooKeeper substitutions were used in the config files but ZooKeeper is not available on the server start, the server loads the configuration from the preprocessed file. + +The server tracks changes in config files, as well as files and ZooKeeper nodes that were used when performing substitutions and overrides, and reloads the settings for users and clusters on the fly. This means that you can modify the cluster, users, and their settings without restarting the server. 
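+
+A quick way to inspect the result of merging, substitutions, and overrides is to look at the preprocessed files mentioned above. The directory below is an assumption for a default package installation and may differ on your system:
+
+```bash
+# List the preprocessed configuration files generated at server startup
+# (path is an assumption for a default install; adjust to your setup).
+ls -l /var/lib/clickhouse/preprocessed_configs/
+
+# View the fully merged server configuration with all substitutions applied.
+less /var/lib/clickhouse/preprocessed_configs/config.xml
+```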
+ +[Original article](https://clickhouse.com/docs/en/operations/configuration-files/) diff --git a/docs/en/reference/operations/external-authenticators/index.md b/docs/en/reference/operations/external-authenticators/index.md new file mode 100644 index 00000000000..af2ba713ec1 --- /dev/null +++ b/docs/en/reference/operations/external-authenticators/index.md @@ -0,0 +1,16 @@ +--- +sidebar_position: 48 +sidebar_label: External User Authenticators and Directories +--- + +# External User Authenticators and Directories + +ClickHouse supports authenticating and managing users using external services. + +The following external authenticators and directories are supported: + +- [LDAP](./ldap.md#external-authenticators-ldap) [Authenticator](./ldap.md#ldap-external-authenticator) and [Directory](./ldap.md#ldap-external-user-directory) +- Kerberos [Authenticator](./kerberos.md#external-authenticators-kerberos) +- [SSL X.509 authentication](./ssl-x509.md#ssl-external-authentication) + +[Original article](https://clickhouse.com/docs/en/operations/external-authenticators/index/) diff --git a/docs/en/reference/operations/external-authenticators/kerberos.md b/docs/en/reference/operations/external-authenticators/kerberos.md new file mode 100644 index 00000000000..3711bac79c3 --- /dev/null +++ b/docs/en/reference/operations/external-authenticators/kerberos.md @@ -0,0 +1,118 @@ +# Kerberos {#external-authenticators-kerberos} + +Existing and properly configured ClickHouse users can be authenticated via Kerberos authentication protocol. + +Currently, Kerberos can only be used as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths. Those users may only use HTTP requests and must be able to authenticate using GSS-SPNEGO mechanism. + +For this approach, Kerberos must be configured in the system and must be enabled in ClickHouse config. + + +## Enabling Kerberos in ClickHouse {#enabling-kerberos-in-clickhouse} + +To enable Kerberos, one should include `kerberos` section in `config.xml`. This section may contain additional parameters. + +#### Parameters: + +- `principal` - canonical service principal name that will be acquired and used when accepting security contexts. + - This parameter is optional, if omitted, the default principal will be used. + + +- `realm` - a realm, that will be used to restrict authentication to only those requests whose initiator's realm matches it. + - This parameter is optional, if omitted, no additional filtering by realm will be applied. + +Example (goes into `config.xml`): + +```xml + + + + +``` + +With principal specification: + +```xml + + + + HTTP/clickhouse.example.com@EXAMPLE.COM + + +``` + +With filtering by realm: + +```xml + + + + EXAMPLE.COM + + +``` + +:::warning +You can define only one `kerberos` section. The presence of multiple `kerberos` sections will force ClickHouse to disable Kerberos authentication. +::: + +:::warning +`principal` and `realm` sections cannot be specified at the same time. The presence of both `principal` and `realm` sections will force ClickHouse to disable Kerberos authentication. +::: + +## Kerberos as an external authenticator for existing users {#kerberos-as-an-external-authenticator-for-existing-users} + +Kerberos can be used as a method for verifying the identity of locally defined users (users defined in `users.xml` or in local access control paths). Currently, **only** requests over the HTTP interface can be *kerberized* (via GSS-SPNEGO mechanism). 
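+
+Since only requests over the HTTP interface can be kerberized, a manual check with `curl` and GSS-SPNEGO might look like the following. The hostname, realm, and principal are placeholders, a valid Kerberos ticket is required, and `curl` must be built with GSS support:
+
+```bash
+# Obtain a ticket for the client principal first (realm is illustrative).
+kinit my_user@EXAMPLE.COM
+
+# --negotiate makes curl use GSS-SPNEGO; the empty "-u :" lets the Kerberos ticket supply the identity.
+curl --negotiate -u : 'http://clickhouse.example.com:8123/?query=SELECT%201'
+```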
+ +Kerberos principal name format usually follows this pattern: + +- *primary/instance@REALM* + +The */instance* part may occur zero or more times. **The *primary* part of the canonical principal name of the initiator is expected to match the kerberized user name for authentication to succeed**. + +### Enabling Kerberos in `users.xml` {#enabling-kerberos-in-users-xml} + +In order to enable Kerberos authentication for the user, specify `kerberos` section instead of `password` or similar sections in the user definition. + +Parameters: + +- `realm` - a realm that will be used to restrict authentication to only those requests whose initiator's realm matches it. + - This parameter is optional, if omitted, no additional filtering by realm will be applied. + +Example (goes into `users.xml`): + +```xml + + + + + + + + EXAMPLE.COM + + + + +``` + +:::warning +Note that Kerberos authentication cannot be used alongside with any other authentication mechanism. The presence of any other sections like `password` alongside `kerberos` will force ClickHouse to shutdown. +::: + +:::info Reminder +Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in the main `config.xml` file as described previously. +::: + +### Enabling Kerberos using SQL {#enabling-kerberos-using-sql} + +When [SQL-driven Access Control and Account Management](../access-rights.md#access-control) is enabled in ClickHouse, users identified by Kerberos can also be created using SQL statements. + +```sql +CREATE USER my_user IDENTIFIED WITH kerberos REALM 'EXAMPLE.COM' +``` + +...or, without filtering by realm: + +```sql +CREATE USER my_user IDENTIFIED WITH kerberos +``` diff --git a/docs/en/reference/operations/external-authenticators/ldap.md b/docs/en/reference/operations/external-authenticators/ldap.md new file mode 100644 index 00000000000..57e6ec1a087 --- /dev/null +++ b/docs/en/reference/operations/external-authenticators/ldap.md @@ -0,0 +1,182 @@ +# LDAP {#external-authenticators-ldap} + +LDAP server can be used to authenticate ClickHouse users. There are two different approaches for doing this: + +- Use LDAP as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths. +- Use LDAP as an external user directory and allow locally undefined users to be authenticated if they exist on the LDAP server. + +For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config so that other parts of the config can refer to it. + +## LDAP Server Definition {#ldap-server-definition} + +To define LDAP server you must add `ldap_servers` section to the `config.xml`. + +**Example** + +```xml + + + + + + localhost + 636 + uid={user_name},ou=users,dc=example,dc=com + 300 + yes + tls1.2 + demand + /path/to/tls_cert_file + /path/to/tls_key_file + /path/to/tls_ca_cert_file + /path/to/tls_ca_cert_dir + ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:AES256-GCM-SHA384 + + + + + localhost + 389 + EXAMPLE\{user_name} + + CN=Users,DC=example,DC=com + (&(objectClass=user)(sAMAccountName={user_name})) + + no + + + +``` + +Note, that you can define multiple LDAP servers inside the `ldap_servers` section using distinct names. + +**Parameters** + +- `host` — LDAP server hostname or IP, this parameter is mandatory and cannot be empty. +- `port` — LDAP server port, default is `636` if `enable_tls` is set to `true`, `389` otherwise. +- `bind_dn` — Template used to construct the DN to bind to. 
+ - The resulting DN will be constructed by replacing all `{user_name}` substrings of the template with the actual user name during each authentication attempt. +- `user_dn_detection` — Section with LDAP search parameters for detecting the actual user DN of the bound user. + - This is mainly used in search filters for further role mapping when the server is Active Directory. The resulting user DN will be used when replacing `{user_dn}` substrings wherever they are allowed. By default, user DN is set equal to bind DN, but once search is performed, it will be updated with to the actual detected user DN value. + - `base_dn` — Template used to construct the base DN for the LDAP search. + - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` substrings of the template with the actual user name and bind DN during the LDAP search. + - `scope` — Scope of the LDAP search. + - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default). + - `search_filter` — Template used to construct the search filter for the LDAP search. + - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` substrings of the template with the actual user name, bind DN, and base DN during the LDAP search. + - Note, that the special characters must be escaped properly in XML. +- `verification_cooldown` — A period of time, in seconds, after a successful bind attempt, during which the user will be assumed to be successfully authenticated for all consecutive requests without contacting the LDAP server. + - Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request. +- `enable_tls` — A flag to trigger the use of the secure connection to the LDAP server. + - Specify `no` for plain text `ldap://` protocol (not recommended). + - Specify `yes` for LDAP over SSL/TLS `ldaps://` protocol (recommended, the default). + - Specify `starttls` for legacy StartTLS protocol (plain text `ldap://` protocol, upgraded to TLS). +- `tls_minimum_protocol_version` — The minimum protocol version of SSL/TLS. + - Accepted values are: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (the default). +- `tls_require_cert` — SSL/TLS peer certificate verification behavior. + - Accepted values are: `never`, `allow`, `try`, `demand` (the default). +- `tls_cert_file` — Path to certificate file. +- `tls_key_file` — Path to certificate key file. +- `tls_ca_cert_file` — Path to CA certificate file. +- `tls_ca_cert_dir` — Path to the directory containing CA certificates. +- `tls_cipher_suite` — Allowed cipher suite (in OpenSSL notation). + +## LDAP External Authenticator {#ldap-external-authenticator} + +A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). To achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. + +At each login attempt, ClickHouse tries to "bind" to the specified DN defined by the `bind_dn` parameter in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user is considered authenticated. This is often called a "simple bind" method. + +**Example** + +```xml + + + + + + + + my_ldap_server + + + + +``` + +Note, that user `my_user` refers to `my_ldap_server`. This LDAP server must be configured in the main `config.xml` file as described previously. 
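+
+A hedged sketch of the user definition described above, with illustrative tag names: per the prose, an `ldap` section with the server name replaces the `password` (or similar) section in the user definition:
+
+```xml
+<clickhouse>
+    <users>
+        <my_user>
+            <!-- Instead of a password section, refer to an LDAP server
+                 defined in the ldap_servers section of config.xml. -->
+            <ldap>
+                <server>my_ldap_server</server>
+            </ldap>
+        </my_user>
+    </users>
+</clickhouse>
+```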
+ +When SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled, users that are authenticated by LDAP servers can also be created using the [CREATE USER](../../sql-reference/statements/create/user.md#create-user-statement) statement. + +Query: + +```sql +CREATE USER my_user IDENTIFIED WITH ldap SERVER 'my_ldap_server'; +``` + +## LDAP Exernal User Directory {#ldap-external-user-directory} + +In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. To achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section of the `config.xml` file. + +At each login attempt, ClickHouse tries to find the user definition locally and authenticate it as usual. If the user is not defined, ClickHouse will assume the definition exists in the external LDAP directory and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled and roles are created using the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. + +**Example** + +Goes into `config.xml`. + +```xml + + + + + + my_ldap_server + + + + + + ou=groups,dc=example,dc=com + subtree + (&(objectClass=groupOfNames)(member={bind_dn})) + cn + clickhouse_ + + + + + + my_ad_server + + CN=Users,DC=example,DC=com + CN + subtree + (&(objectClass=group)(member={user_dn})) + clickhouse_ + + + + +``` + +Note that `my_ldap_server` referred in the `ldap` section inside the `user_directories` section must be a previously defined LDAP server that is configured in the `config.xml` (see [LDAP Server Definition](#ldap-server-definition)). + +**Parameters** + +- `server` — One of LDAP server names defined in the `ldap_servers` config section above. This parameter is mandatory and cannot be empty. +- `roles` — Section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. + - If no roles are specified here or assigned during role mapping (below), user will not be able to perform any actions after authentication. +- `role_mapping` — Section with LDAP search parameters and mapping rules. + - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` and the name of the logged-in user. For each entry found during that search, the value of the specified attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. + - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. + - `base_dn` — Template used to construct the base DN for the LDAP search. 
+      - The resulting DN will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{user_dn}` substrings of the template with the actual user name, bind DN, and user DN during each LDAP search.
+    - `scope` — Scope of the LDAP search.
+      - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default).
+    - `search_filter` — Template used to construct the search filter for the LDAP search.
+      - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, `{user_dn}`, and `{base_dn}` substrings of the template with the actual user name, bind DN, user DN, and base DN during each LDAP search.
+      - Note that the special characters must be escaped properly in XML.
+    - `attribute` — Attribute name whose values will be returned by the LDAP search. `cn`, by default.
+    - `prefix` — Prefix that is expected to be in front of each string in the original list of strings returned by the LDAP search. The prefix will be removed from the original strings and the resulting strings will be treated as local role names. Empty by default.
+
+[Original article](https://clickhouse.com/docs/en/operations/external-authenticators/ldap/) diff --git a/docs/en/reference/operations/external-authenticators/ssl-x509.md b/docs/en/reference/operations/external-authenticators/ssl-x509.md new file mode 100644 index 00000000000..dd4f35257bb --- /dev/null +++ b/docs/en/reference/operations/external-authenticators/ssl-x509.md @@ -0,0 +1,24 @@ +# SSL X.509 certificate authentication {#ssl-external-authentication}
+
+The [SSL 'strict' option](../server-configuration-parameters/settings.md#server_configuration_parameters-openssl) enables mandatory certificate validation for incoming connections. In this case, only connections with trusted certificates can be established; connections with untrusted certificates are rejected. Certificate validation thus allows you to uniquely authenticate an incoming connection. The `Common Name` field of the certificate is used to identify the connected user, which makes it possible to associate multiple certificates with the same user. Additionally, reissuing and revoking certificates does not affect the ClickHouse configuration.
+
+To enable SSL certificate authentication, a list of `Common Name`'s for each ClickHouse user must be specified in the settings file `config.xml`:
+
+**Example**
+```xml
+ 
+ 
+ 
+ 
+ host.domain.com:example_user
+ host.domain.com:example_user_dev
+ 
+ 
+ 
+ 
+```
+
+For the SSL [`chain of trust`](https://en.wikipedia.org/wiki/Chain_of_trust) to work correctly, it is also important to make sure that the [`caConfig`](../server-configuration-parameters/settings.md#server_configuration_parameters-openssl) parameter is configured properly.
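+
+A hedged sketch of the per-user `Common Name` list described in the example above; the element names are an assumption based on the prose, while the values come from the example:
+
+```xml
+<clickhouse>
+    <users>
+        <example_user>
+            <!-- Certificates whose Common Name matches one of these entries
+                 authenticate as example_user. -->
+            <ssl_certificates>
+                <common_name>host.domain.com:example_user</common_name>
+                <common_name>host.domain.com:example_user_dev</common_name>
+            </ssl_certificates>
+        </example_user>
+    </users>
+</clickhouse>
+```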
\ No newline at end of file diff --git a/docs/en/reference/operations/index.md b/docs/en/reference/operations/index.md new file mode 100644 index 00000000000..824e851e997 --- /dev/null +++ b/docs/en/reference/operations/index.md @@ -0,0 +1,25 @@ +--- +sidebar_position: 41 +sidebar_label: Operations +--- + +# Operations {#operations} + +ClickHouse operations manual consists of the following major sections: + +- [Requirements](../operations/requirements.md) +- [Monitoring](../operations/monitoring.md) +- [Troubleshooting](../operations/troubleshooting.md) +- [Usage Recommendations](../operations/tips.md) +- [Update Procedure](../operations/update.md) +- [Access Rights](../operations/access-rights.md) +- [Data Backup](../operations/backup.md) +- [Configuration Files](../operations/configuration-files.md) +- [Quotas](../operations/quotas.md) +- [System Tables](../operations/system-tables/index.md) +- [Server Configuration Parameters](../operations/server-configuration-parameters/index.md) +- [How To Test Your Hardware With ClickHouse](../operations/performance-test.md) +- [Settings](../operations/settings/index.md) +- [Utilities](../operations/utilities/index.md) + +[Original article](https://clickhouse.com/docs/en/operations/) diff --git a/docs/en/reference/operations/monitoring.md b/docs/en/reference/operations/monitoring.md new file mode 100644 index 00000000000..437122e106d --- /dev/null +++ b/docs/en/reference/operations/monitoring.md @@ -0,0 +1,44 @@ +--- +sidebar_position: 45 +sidebar_label: Monitoring +--- + +# Monitoring {#monitoring} + +You can monitor: + +- Utilization of hardware resources. +- ClickHouse server metrics. + +## Resource Utilization {#resource-utilization} + +ClickHouse does not monitor the state of hardware resources by itself. + +It is highly recommended to set up monitoring for: + +- Load and temperature on processors. + + You can use [dmesg](https://en.wikipedia.org/wiki/Dmesg), [turbostat](https://www.linux.org/docs/man8/turbostat.html) or other instruments. + +- Utilization of storage system, RAM and network. + +## ClickHouse Server Metrics {#clickhouse-server-metrics} + +ClickHouse server has embedded instruments for self-state monitoring. + +To track server events use server logs. See the [logger](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-logger) section of the configuration file. + +ClickHouse collects: + +- Different metrics of how the server uses computational resources. +- Common statistics on query processing. + +You can find metrics in the [system.metrics](../operations/system-tables/metrics.md#system_tables-metrics), [system.events](../operations/system-tables/events.md#system_tables-events), and [system.asynchronous_metrics](../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) tables. + +You can configure ClickHouse to export metrics to [Graphite](https://github.com/graphite-project). See the [Graphite section](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-graphite) in the ClickHouse server configuration file. Before configuring export of metrics, you should set up Graphite by following their official [guide](https://graphite.readthedocs.io/en/latest/install.html). + +You can configure ClickHouse to export metrics to [Prometheus](https://prometheus.io). See the [Prometheus section](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-prometheus) in the ClickHouse server configuration file. 
Before configuring export of metrics, you should set up Prometheus by following their official [guide](https://prometheus.io/docs/prometheus/latest/installation/). + +Additionally, you can monitor server availability through the HTTP API. Send the `HTTP GET` request to `/ping`. If the server is available, it responds with `200 OK`. + +To monitor servers in a cluster configuration, you should set the [max_replica_delay_for_distributed_queries](../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) parameter and use the HTTP resource `/replicas_status`. A request to `/replicas_status` returns `200 OK` if the replica is available and is not delayed behind the other replicas. If a replica is delayed, it returns `503 HTTP_SERVICE_UNAVAILABLE` with information about the gap. diff --git a/docs/en/reference/operations/named-collections.md b/docs/en/reference/operations/named-collections.md new file mode 100644 index 00000000000..52520ba76b7 --- /dev/null +++ b/docs/en/reference/operations/named-collections.md @@ -0,0 +1,230 @@ +--- +sidebar_position: 69 +sidebar_label: "Named connections" +--- + +# Storing details for connecting to external sources in configuration files {#named-collections} + +Details for connecting to external sources (dictionaries, tables, table functions) can be saved +in configuration files and thus simplify the creation of objects and hide credentials +from users with only SQL access. + +Parameters can be set in XML `CSV` and overridden in SQL `, format = 'TSV'`. +The parameters in SQL can be overridden using format `key` = `value`: `compression_method = 'gzip'`. + +Named connections are stored in the `config.xml` file of the ClickHouse server in the `` section and are applied when ClickHouse starts. + +Example of configuration: +```xml +$ cat /etc/clickhouse-server/config.d/named_collections.xml + + + ... + + +``` + +## Named connections for accessing S3. + +The description of parameters see [s3 Table Function](../sql-reference/table-functions/s3.md). + +Example of configuration: +```xml + + + + AKIAIOSFODNN7EXAMPLE + wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + CSV + https://s3.us-east-1.amazonaws.com/yourbucket/mydata/ + + + +``` + +### Example of using named connections with the s3 function + +```sql +INSERT INTO FUNCTION s3(s3_mydata, filename = 'test_file.tsv.gz', + format = 'TSV', structure = 'number UInt64', compression_method = 'gzip') +SELECT * FROM numbers(10000); + +SELECT count() +FROM s3(s3_mydata, filename = 'test_file.tsv.gz') + +┌─count()─┐ +│ 10000 │ +└─────────┘ +1 rows in set. Elapsed: 0.279 sec. Processed 10.00 thousand rows, 90.00 KB (35.78 thousand rows/s., 322.02 KB/s.) +``` + +### Example of using named connections with an S3 table + +```sql +CREATE TABLE s3_engine_table (number Int64) +ENGINE=S3(s3_mydata, url='https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz', format = 'TSV') +SETTINGS input_format_with_names_use_header = 0; + +SELECT * FROM s3_engine_table LIMIT 3; +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +└────────┘ +``` + +## Named connections for accessing MySQL database + +The description of parameters see [mysql](../sql-reference/table-functions/mysql.md). 
+ +Example of configuration: +```xml + + + + myuser + mypass + 127.0.0.1 + 3306 + test + 8 + 1 + 1 + + + +``` + +### Example of using named connections with the mysql function + +```sql +SELECT count() FROM mysql(mymysql, table = 'test'); + +┌─count()─┐ +│ 3 │ +└─────────┘ +``` + +### Example of using named connections with an MySQL table + +```sql +CREATE TABLE mytable(A Int64) ENGINE = MySQL(mymysql, table = 'test', connection_pool_size=3, replace_query=0); +SELECT count() FROM mytable; + +┌─count()─┐ +│ 3 │ +└─────────┘ +``` + +### Example of using named connections with database with engine MySQL + +```sql +CREATE DATABASE mydatabase ENGINE = MySQL(mymysql); + +SHOW TABLES FROM mydatabase; + +┌─name───┐ +│ source │ +│ test │ +└────────┘ +``` + +### Example of using named connections with an external dictionary with source MySQL + +```sql +CREATE DICTIONARY dict (A Int64, B String) +PRIMARY KEY A +SOURCE(MYSQL(NAME mymysql TABLE 'source')) +LIFETIME(MIN 1 MAX 2) +LAYOUT(HASHED()); + +SELECT dictGet('dict', 'B', 2); + +┌─dictGet('dict', 'B', 2)─┐ +│ two │ +└─────────────────────────┘ +``` + +## Named connections for accessing PostgreSQL database + +The description of parameters see [postgresql](../sql-reference/table-functions/postgresql.md). + +Example of configuration: +```xml + + + + pguser + jw8s0F4 + 127.0.0.1 + 5432 + test + test_schema + 8 + + + +``` + +### Example of using named connections with the postgresql function + +```sql +SELECT * FROM postgresql(mypg, table = 'test'); + +┌─a─┬─b───┐ +│ 2 │ two │ +│ 1 │ one │ +└───┴─────┘ + + +SELECT * FROM postgresql(mypg, table = 'test', schema = 'public'); + +┌─a─┐ +│ 1 │ +│ 2 │ +│ 3 │ +└───┘ +``` + + +### Example of using named connections with database with engine PostgreSQL + +```sql +CREATE TABLE mypgtable (a Int64) ENGINE = PostgreSQL(mypg, table = 'test', schema = 'public'); + +SELECT * FROM mypgtable; + +┌─a─┐ +│ 1 │ +│ 2 │ +│ 3 │ +└───┘ +``` + +### Example of using named connections with database with engine PostgreSQL + +```sql +CREATE DATABASE mydatabase ENGINE = PostgreSQL(mypg); + +SHOW TABLES FROM mydatabase + +┌─name─┐ +│ test │ +└──────┘ +``` + +### Example of using named connections with an external dictionary with source POSTGRESQL + +```sql +CREATE DICTIONARY dict (a Int64, b String) +PRIMARY KEY a +SOURCE(POSTGRESQL(NAME mypg TABLE test)) +LIFETIME(MIN 1 MAX 2) +LAYOUT(HASHED()); + +SELECT dictGet('dict', 'b', 2); + +┌─dictGet('dict', 'b', 2)─┐ +│ two │ +└─────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/reference/operations/opentelemetry.md b/docs/en/reference/operations/opentelemetry.md new file mode 100644 index 00000000000..740537d88bc --- /dev/null +++ b/docs/en/reference/operations/opentelemetry.md @@ -0,0 +1,65 @@ +--- +sidebar_position: 62 +sidebar_label: OpenTelemetry Support +--- + +# [experimental] OpenTelemetry Support + +[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from the distributed application. ClickHouse has some support for OpenTelemetry. + +:::warning +This is an experimental feature that will change in backwards-incompatible ways in future releases. +::: + +## Supplying Trace Context to ClickHouse + +ClickHouse accepts trace context HTTP headers, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts trace context over a native protocol that is used for communication between ClickHouse servers or between the client and server. 
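+
+For example, a W3C trace context can be attached to a plain HTTP query. The header values below are the sample ones from the W3C recommendation, and the host and port are placeholders for a local server:
+
+```bash
+# Pass the W3C traceparent/tracestate headers with an HTTP query.
+curl -H 'traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01' \
+     -H 'tracestate: congo=t61rcWkgMzE' \
+     'http://localhost:8123/?query=SELECT%201'
+```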
For manual testing, trace context headers conforming to the Trace Context recommendation can be supplied to `clickhouse-client` using `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags. + +If no parent trace context is supplied or the provided trace context does not comply with W3C standard above, ClickHouse can start a new trace, with probability controlled by the [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability) setting. + +## Propagating the Trace Context + +The trace context is propagated to downstream services in the following cases: + +* Queries to remote ClickHouse servers, such as when using [Distributed](../engines/table-engines/special/distributed.md) table engine. + +* [url](../sql-reference/table-functions/url.md) table function. Trace context information is sent in HTTP headers. + +## Tracing the ClickHouse Itself + +ClickHouse creates `trace spans` for each query and some of the query execution stages, such as query planning or distributed queries. + +To be useful, the tracing information has to be exported to a monitoring system that supports OpenTelemetry, such as [Jaeger](https://jaegertracing.io/) or [Prometheus](https://prometheus.io/). ClickHouse avoids a dependency on a particular monitoring system, instead only providing the tracing data through a system table. OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table. + +The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the default config file `config.xml`. It is enabled by default. + +The tags or attributes are saved as two parallel arrays, containing the keys and values. Use [ARRAY JOIN](../sql-reference/statements/select/array-join.md) to work with them. + +## Integration with monitoring systems + +At the moment, there is no ready tool that can export the tracing data from ClickHouse to a monitoring system. + +For testing, it is possible to setup the export using a materialized view with the [URL](../engines/table-engines/special/url.md) engine over the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format: + +```sql +CREATE MATERIALIZED VIEW default.zipkin_spans +ENGINE = URL('http://127.0.0.1:9411/api/v2/spans', 'JSONEachRow') +SETTINGS output_format_json_named_tuples_as_objects = 1, + output_format_json_array_of_rows = 1 AS +SELECT + lower(hex(trace_id)) AS traceId, + case when parent_span_id = 0 then '' else lower(hex(parent_span_id)) end AS parentId, + lower(hex(span_id)) AS id, + operation_name AS name, + start_time_us AS timestamp, + finish_time_us - start_time_us AS duration, + cast(tuple('clickhouse'), 'Tuple(serviceName text)') AS localEndpoint, + cast(tuple( + attribute.values[indexOf(attribute.names, 'db.statement')]), + 'Tuple("db.statement" text)') AS tags +FROM system.opentelemetry_span_log +``` + +In case of any errors, the part of the log data for which the error has occurred will be silently lost. Check the server log for error messages if the data does not arrive. 
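+
+Before wiring up an exporter, you can check that spans are being recorded at all by querying the span log directly; the column names follow the materialized-view example above:
+
+```sql
+-- Most recent spans recorded by the server.
+SELECT
+    operation_name,
+    lower(hex(trace_id)) AS trace_id,
+    finish_time_us - start_time_us AS duration_us
+FROM system.opentelemetry_span_log
+ORDER BY start_time_us DESC
+LIMIT 10;
+```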
+ +[Original article](https://clickhouse.com/docs/en/operations/opentelemetry/) diff --git a/docs/en/reference/operations/optimizing-performance/index.md b/docs/en/reference/operations/optimizing-performance/index.md new file mode 100644 index 00000000000..ef9c6a4b664 --- /dev/null +++ b/docs/en/reference/operations/optimizing-performance/index.md @@ -0,0 +1,8 @@ +--- +sidebar_label: Optimizing Performance +sidebar_position: 52 +--- + +# Optimizing Performance {#optimizing-performance} + +- [Sampling query profiler](../../operations/optimizing-performance/sampling-query-profiler.md) diff --git a/docs/en/reference/operations/optimizing-performance/sampling-query-profiler.md b/docs/en/reference/operations/optimizing-performance/sampling-query-profiler.md new file mode 100644 index 00000000000..39e83545506 --- /dev/null +++ b/docs/en/reference/operations/optimizing-performance/sampling-query-profiler.md @@ -0,0 +1,62 @@ +--- +sidebar_position: 54 +sidebar_label: Query Profiling +--- + +# Sampling Query Profiler {#sampling-query-profiler} + +ClickHouse runs sampling profiler that allows analyzing query execution. Using profiler you can find source code routines that used the most frequently during query execution. You can trace CPU time and wall-clock time spent including idle time. + +To use profiler: + +- Setup the [trace_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) section of the server configuration. + + This section configures the [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log) system table containing the results of the profiler functioning. It is configured by default. Remember that data in this table is valid only for a running server. After the server restart, ClickHouse does not clean up the table and all the stored virtual memory address may become invalid. + +- Setup the [query_profiler_cpu_time_period_ns](../../operations/settings/settings.md#query_profiler_cpu_time_period_ns) or [query_profiler_real_time_period_ns](../../operations/settings/settings.md#query_profiler_real_time_period_ns) settings. Both settings can be used simultaneously. + + These settings allow you to configure profiler timers. As these are the session settings, you can get different sampling frequency for the whole server, individual users or user profiles, for your interactive session, and for each individual query. + +The default sampling frequency is one sample per second and both CPU and real timers are enabled. This frequency allows collecting enough information about ClickHouse cluster. At the same time, working with this frequency, profiler does not affect ClickHouse server’s performance. If you need to profile each individual query try to use higher sampling frequency. + +To analyze the `trace_log` system table: + +- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../install.md#install-from-deb-packages). + +- Allow introspection functions by the [allow_introspection_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) setting. + + For security reasons, introspection functions are disabled by default. + +- Use the `addressToLine`, `addressToLineWithInlines`, `addressToSymbol` and `demangle` [introspection functions](../../sql-reference/functions/introspection.md) to get function names and their positions in ClickHouse code. To get a profile for some query, you need to aggregate data from the `trace_log` table. 
You can aggregate data by individual functions or by the whole stack traces. + +If you need to visualize `trace_log` info, try [flamegraph](../../interfaces/third-party/gui/#clickhouse-flamegraph) and [speedscope](https://github.com/laplab/clickhouse-speedscope). + +## Example {#example} + +In this example we: + +- Filtering `trace_log` data by a query identifier and the current date. + +- Aggregating by stack trace. + +- Using introspection functions, we will get a report of: + + - Names of symbols and corresponding source code functions. + - Source code locations of these functions. + + + +``` sql +SELECT + count(), + arrayStringConcat(arrayMap(x -> concat(demangle(addressToSymbol(x)), '\n ', addressToLine(x)), trace), '\n') AS sym +FROM system.trace_log +WHERE (query_id = 'ebca3574-ad0a-400a-9cbc-dca382f5998c') AND (event_date = today()) +GROUP BY trace +ORDER BY count() DESC +LIMIT 10 +``` + +``` text +{% include "examples/sampling_query_profiler_result.txt" %} +``` diff --git a/docs/en/reference/operations/performance-test.md b/docs/en/reference/operations/performance-test.md new file mode 100644 index 00000000000..0ba3a9908a5 --- /dev/null +++ b/docs/en/reference/operations/performance-test.md @@ -0,0 +1,83 @@ +--- +sidebar_position: 54 +sidebar_label: Testing Hardware +--- + +# How to Test Your Hardware with ClickHouse {#how-to-test-your-hardware-with-clickhouse} + +You can run basic ClickHouse performance test on any server without installation of ClickHouse packages. + + +## Automated Run + +You can run benchmark with a single script. + +1. Download the script. +``` +wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/hardware.sh +``` + +2. Run the script. +``` +chmod a+x ./hardware.sh +./hardware.sh +``` + +3. Copy the output and send it to feedback@clickhouse.com + +All the results are published here: https://clickhouse.com/benchmark/hardware/ + + +## Manual Run + +Alternatively you can perform benchmark in the following steps. + +1. ssh to the server and download the binary with wget: +```bash +# For amd64: +wget https://builds.clickhouse.com/master/amd64/clickhouse +# For aarch64: +wget https://builds.clickhouse.com/master/aarch64/clickhouse +# For powerpc64le: +wget https://builds.clickhouse.com/master/powerpc64le/clickhouse +# For freebsd: +wget https://builds.clickhouse.com/master/freebsd/clickhouse +# For freebsd-aarch64: +wget https://builds.clickhouse.com/master/freebsd-aarch64/clickhouse +# For freebsd-powerpc64le: +wget https://builds.clickhouse.com/master/freebsd-powerpc64le/clickhouse +# For macos: +wget https://builds.clickhouse.com/master/macos/clickhouse +# For macos-aarch64: +wget https://builds.clickhouse.com/master/macos-aarch64/clickhouse +# Then do: +chmod a+x clickhouse +``` +2. Download benchmark files: +```bash +wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/benchmark-new.sh +chmod a+x benchmark-new.sh +wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql +``` +3. Download the [web analytics dataset](../example-datasets/metrica.md) (“hits” table containing 100 million rows). +```bash +wget https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz +tar xvf hits_100m_obfuscated_v1.tar.xz -C . +mv hits_100m_obfuscated_v1/* . +``` +4. Run the server: +```bash +./clickhouse server +``` +5. 
Check the data: ssh to the server in another terminal +```bash +./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated" +100000000 +``` +6. Run the benchmark: +```bash +./benchmark-new.sh hits_100m_obfuscated +``` +7. Send the numbers and the info about your hardware configuration to feedback@clickhouse.com + +All the results are published here: https://clickhouse.com/benchmark/hardware/ diff --git a/docs/en/reference/operations/quotas.md b/docs/en/reference/operations/quotas.md new file mode 100644 index 00000000000..77b0697d483 --- /dev/null +++ b/docs/en/reference/operations/quotas.md @@ -0,0 +1,120 @@ +--- +sidebar_position: 51 +sidebar_label: Quotas +--- + +# Quotas {#quotas} + +Quotas allow you to limit resource usage over a period of time or track the use of resources. +Quotas are set up in the user config, which is usually ‘users.xml’. + +The system also has a feature for limiting the complexity of a single query. See the section [Restrictions on query complexity](../operations/settings/query-complexity.md). + +In contrast to query complexity restrictions, quotas: + +- Place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. +- Account for resources spent on all remote servers for distributed query processing. + +Let’s look at the section of the ‘users.xml’ file that defines quotas. + +``` xml + + + + + + + + 3600 + + + 0 + 0 + 0 + 0 + 0 + 0 + 0 + + +``` + +By default, the quota tracks resource consumption for each hour, without limiting usage. +The resource consumption calculated for each interval is output to the server log after each request. + +``` xml + + + + + 3600 + + 1000 + 100 + 100 + 100 + 1000000000 + 100000000000 + 900 + + + + 86400 + + 10000 + 10000 + 10000 + 1000 + 5000000000 + 500000000000 + 7200 + + +``` + +For the ‘statbox’ quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted, starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval does not necessarily begin at midnight. + +When the interval ends, all collected values are cleared. For the next hour, the quota calculation starts over. + +Here are the amounts that can be restricted: + +`queries` – The total number of requests. + +`query_selects` – The total number of select requests. + +`query_inserts` – The total number of insert requests. + +`errors` – The number of queries that threw an exception. + +`result_rows` – The total number of rows given as a result. + +`read_rows` – The total number of source rows read from tables for running the query on all remote servers. + +`execution_time` – The total query execution time, in seconds (wall time). + +If the limit is exceeded for at least one time interval, an exception is thrown with a text about which restriction was exceeded, for which interval, and when the new interval begins (when queries can be sent again). + +Quotas can use the “quota key” feature to report on resources for multiple keys independently. Here is an example of this: + +``` xml + + + + +``` + +The quota is assigned to users in the ‘users’ section of the config. See the section “Access rights”. + +For distributed query processing, the accumulated amounts are stored on the requestor server. So if the user goes to another server, the quota there will “start over”. + +When the server is restarted, quotas are reset. 
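+
+When SQL-driven access control is enabled, similar limits can also be managed with SQL. Below is a hedged sketch: the interval lengths and limits loosely mirror the 'statbox' example above, and `default` is just an illustrative user name:
+
+``` sql
+-- Two intervals, hourly and daily, with limits on queries, errors, rows read, and execution time.
+CREATE QUOTA statbox
+    FOR INTERVAL 1 hour MAX queries = 1000, errors = 100, read_rows = 100000000000, execution_time = 900,
+    FOR INTERVAL 1 day MAX queries = 10000, errors = 1000, read_rows = 500000000000, execution_time = 7200
+    TO default;
+
+-- Show consumption against the quotas assigned to the current user.
+SHOW QUOTA;
+```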
+ +[Original article](https://clickhouse.com/docs/en/operations/quotas/) diff --git a/docs/en/reference/operations/requirements.md b/docs/en/reference/operations/requirements.md new file mode 100644 index 00000000000..4b13033096f --- /dev/null +++ b/docs/en/reference/operations/requirements.md @@ -0,0 +1,59 @@ +--- +sidebar_position: 44 +sidebar_label: Requirements +--- + +# Requirements + +## CPU {#cpu} + +For installation from prebuilt deb packages, use a CPU with x86_64 architecture and support for SSE 4.2 instructions. To run ClickHouse with processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should build ClickHouse from sources. + +ClickHouse implements parallel data processing and uses all the hardware resources available. When choosing a processor, take into account that ClickHouse works more efficiently at configurations with a large number of cores but a lower clock rate than at configurations with fewer cores and a higher clock rate. For example, 16 cores with 2600 MHz is preferable to 8 cores with 3600 MHz. + +It is recommended to use **Turbo Boost** and **hyper-threading** technologies. It significantly improves performance with a typical workload. + +## RAM {#ram} + +We recommend using a minimum of 4GB of RAM to perform non-trivial queries. The ClickHouse server can run with a much smaller amount of RAM, but it requires memory for processing queries. + +The required volume of RAM depends on: + +- The complexity of queries. +- The amount of data that is processed in queries. + +To calculate the required volume of RAM, you should estimate the size of temporary data for [GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct), [JOIN](../sql-reference/statements/select/join.md#select-join) and other operations you use. + +ClickHouse can use external memory for temporary data. See [GROUP BY in External Memory](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) for details. + +## Swap File {#swap-file} + +Disable the swap file for production environments. + +## Storage Subsystem {#storage-subsystem} + +You need to have 2GB of free disk space to install ClickHouse. + +The volume of storage required for your data should be calculated separately. Assessment should include: + +- Estimation of the data volume. + + You can take a sample of the data and get the average size of a row from it. Then multiply the value by the number of rows you plan to store. + +- The data compression coefficient. + + To estimate the data compression coefficient, load a sample of your data into ClickHouse, and compare the actual size of the data with the size of the table stored. For example, clickstream data is usually compressed by 6-10 times. + +To calculate the final volume of data to be stored, apply the compression coefficient to the estimated data volume. If you plan to store data in several replicas, then multiply the estimated volume by the number of replicas. + +## Network {#network} + +If possible, use networks of 10G or higher class. + +The network bandwidth is critical for processing distributed queries with a large amount of intermediate data. Besides, network speed affects replication processes. + +## Software {#software} + +ClickHouse is developed primarily for the Linux family of operating systems. The recommended Linux distribution is Ubuntu. The `tzdata` package should be installed in the system. 
+ +ClickHouse can also work in other operating system families. See details in the [install guide](../install.md) section of the documentation. diff --git a/docs/en/reference/operations/server-configuration-parameters/index.md b/docs/en/reference/operations/server-configuration-parameters/index.md new file mode 100644 index 00000000000..1e4ddc6368e --- /dev/null +++ b/docs/en/reference/operations/server-configuration-parameters/index.md @@ -0,0 +1,16 @@ +--- +sidebar_position: 54 +sidebar_label: Server Configuration Parameters +--- + +# Server Configuration Parameters + +This section contains descriptions of server settings that cannot be changed at the session or query level. + +These settings are stored in the `config.xml` file on the ClickHouse server. + +Other settings are described in the “[Settings](../../operations/settings/index.md#session-settings-intro)” section. + +Before studying the settings, read the [Configuration files](../../operations/configuration-files.md#configuration_files) section and note the use of substitutions (the `incl` and `optional` attributes). + +[Original article](https://clickhouse.com/docs/en/operations/server_configuration_parameters/) diff --git a/docs/en/reference/operations/server-configuration-parameters/settings.md b/docs/en/reference/operations/server-configuration-parameters/settings.md new file mode 100644 index 00000000000..985dc626ea4 --- /dev/null +++ b/docs/en/reference/operations/server-configuration-parameters/settings.md @@ -0,0 +1,1630 @@ +--- +sidebar_position: 57 +sidebar_label: Server Settings +--- + +# Server Settings {#server-settings} + +## builtin_dictionaries_reload_interval {#builtin-dictionaries-reload-interval} + +The interval in seconds before reloading built-in dictionaries. + +ClickHouse reloads built-in dictionaries every x seconds. This makes it possible to edit dictionaries “on the fly” without restarting the server. + +Default value: 3600. + +**Example** + +``` xml +3600 +``` + +## compression {#server-settings-compression} + +Data compression settings for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine tables. + +:::warning +Don’t use it if you have just started using ClickHouse. +::: + +Configuration template: + +``` xml + + + ... + ... + ... + ... + + ... + +``` + +`` fields: + +- `min_part_size` – The minimum size of a data part. +- `min_part_size_ratio` – The ratio of the data part size to the table size. +- `method` – Compression method. Acceptable values: `lz4`, `lz4hc`, `zstd`. +- `level` – Compression level. See [Codecs](../../sql-reference/statements/create/table.md#create-query-general-purpose-codecs). + +You can configure multiple `` sections. + +Actions when conditions are met: + +- If a data part matches a condition set, ClickHouse uses the specified compression method. +- If a data part matches multiple condition sets, ClickHouse uses the first matched condition set. + +If no conditions met for a data part, ClickHouse uses the `lz4` compression. + +**Example** + +``` xml + + + 10000000000 + 0.01 + zstd + 1 + + +``` + +## encryption {#server-settings-encryption} + +Configures a command to obtain a key to be used by [encryption codecs](../../sql-reference/statements/create/table.md#create-query-encryption-codecs). Key (or keys) should be written in environment variables or set in the configuration file. + +Keys can be hex or string with a length equal to 16 bytes. 
+ +**Example** + +Loading from config: + +```xml + + + 1234567812345678 + + +``` + +:::note +Storing keys in the configuration file is not recommended. It isn't secure. You can move the keys into a separate config file on a secure disk and put a symlink to that config file to `config.d/` folder. +::: + +Loading from config, when the key is in hex: + +```xml + + + 00112233445566778899aabbccddeeff + + +``` + +Loading key from the environment variable: + +```xml + + + + + +``` + +Here `current_key_id` sets the current key for encryption, and all specified keys can be used for decryption. + +Each of these methods can be applied for multiple keys: + +```xml + + + 00112233445566778899aabbccddeeff + + 1 + + +``` + +Here `current_key_id` shows current key for encryption. + +Also, users can add nonce that must be 12 bytes long (by default encryption and decryption processes use nonce that consists of zero bytes): + +```xml + + + 012345678910 + + +``` + +Or it can be set in hex: + +```xml + + + abcdefabcdef + + +``` + +Everything mentioned above can be applied for `aes_256_gcm_siv` (but the key must be 32 bytes long). + + +## custom_settings_prefixes {#custom_settings_prefixes} + +List of prefixes for [custom settings](../../operations/settings/index.md#custom_settings). The prefixes must be separated with commas. + +**Example** + +```xml +custom_ +``` + +**See Also** + +- [Custom settings](../../operations/settings/index.md#custom_settings) + +## core_dump {#server_configuration_parameters-core_dump} + +Configures soft limit for core dump file size. + +Possible values: + +- Positive integer. + +Default value: `1073741824` (1 GB). + +:::note +Hard limit is configured via system tools +::: + +**Example** + +```xml + + 1073741824 + +``` + +## database_atomic_delay_before_drop_table_sec {#database_atomic_delay_before_drop_table_sec} + +Sets the delay before remove table data in seconds. If the query has `SYNC` modifier, this setting is ignored. + +Default value: `480` (8 minute). + +## default_database {#default-database} + +The default database. + +To get a list of databases, use the [SHOW DATABASES](../../sql-reference/statements/show.md#show-databases) query. + +**Example** + +``` xml +default +``` + +## default_profile {#default-profile} + +Default settings profile. + +Settings profiles are located in the file specified in the parameter `user_config`. + +**Example** + +``` xml +default +``` + +## default_replica_path {#default_replica_path} + +The path to the table in ZooKeeper. + +**Example** + +``` xml +/clickhouse/tables/{uuid}/{shard} +``` +## default_replica_name {#default_replica_name} + + The replica name in ZooKeeper. + +**Example** + +``` xml +{replica} +``` + +## dictionaries_config {#server_configuration_parameters-dictionaries_config} + +The path to the config file for external dictionaries. + +Path: + +- Specify the absolute path or the path relative to the server config file. +- The path can contain wildcards \* and ?. + +See also “[External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)”. + +**Example** + +``` xml +*_dictionary.xml +``` + +## dictionaries_lazy_load {#server_configuration_parameters-dictionaries_lazy_load} + +Lazy loading of dictionaries. + +If `true`, then each dictionary is created on first use. If dictionary creation failed, the function that was using the dictionary throws an exception. 
+ +If `false`, all dictionaries are created when the server starts, if the dictionary or dictionaries are created too long or are created with errors, then the server boots without of these dictionaries and continues to try to create these dictionaries. + +The default is `true`. + +**Example** + +``` xml +true +``` + +## format_schema_path {#server_configuration_parameters-format_schema_path} + +The path to the directory with the schemes for the input data, such as schemas for the [CapnProto](../../interfaces/formats.md#capnproto) format. + +**Example** + +``` xml + + format_schemas/ +``` + +## graphite {#server_configuration_parameters-graphite} + +Sending data to [Graphite](https://github.com/graphite-project). + +Settings: + +- host – The Graphite server. +- port – The port on the Graphite server. +- interval – The interval for sending, in seconds. +- timeout – The timeout for sending data, in seconds. +- root_path – Prefix for keys. +- metrics – Sending data from the [system.metrics](../../operations/system-tables/metrics.md#system_tables-metrics) table. +- events – Sending deltas data accumulated for the time period from the [system.events](../../operations/system-tables/events.md#system_tables-events) table. +- events_cumulative – Sending cumulative data from the [system.events](../../operations/system-tables/events.md#system_tables-events) table. +- asynchronous_metrics – Sending data from the [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) table. + +You can configure multiple `` clauses. For instance, you can use this for sending different data at different intervals. + +**Example** + +``` xml + + localhost + 42000 + 0.1 + 60 + one_min + true + true + false + true + +``` + +## graphite_rollup {#server_configuration_parameters-graphite-rollup} + +Settings for thinning data for Graphite. + +For more details, see [GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md). + +**Example** + +``` xml + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + +``` + +## http_port/https_port {#http-porthttps-port} + +The port for connecting to the server over HTTP(s). + +If `https_port` is specified, [openSSL](#server_configuration_parameters-openssl) must be configured. + +If `http_port` is specified, the OpenSSL configuration is ignored even if it is set. + +**Example** + +``` xml +9999 +``` + +## http_server_default_response {#server_configuration_parameters-http_server_default_response} + +The page that is shown by default when you access the ClickHouse HTTP(s) server. +The default value is “Ok.” (with a line feed at the end) + +**Example** + +Opens `https://tabix.io/` when accessing `http://localhost: http_port`. + +``` xml + +
]]> +
+``` +## hsts_max_age {#hsts-max-age} + +Expired time for HSTS in seconds. The default value is 0 means clickhouse disabled HSTS. If you set a positive number, the HSTS will be enabled and the max-age is the number you set. + +**Example** + +```xml +600000 +``` + +## include_from {#server_configuration_parameters-include_from} + +The path to the file with substitutions. + +For more information, see the section “[Configuration files](../../operations/configuration-files.md#configuration_files)”. + +**Example** + +``` xml +/etc/metrica.xml +``` + +## interserver_http_port {#interserver-http-port} + +Port for exchanging data between ClickHouse servers. + +**Example** + +``` xml +9009 +``` + +## interserver_http_host {#interserver-http-host} + +The hostname that can be used by other servers to access this server. + +If omitted, it is defined in the same way as the `hostname-f` command. + +Useful for breaking away from a specific network interface. + +**Example** + +``` xml +example.clickhouse.com +``` + +## interserver_https_port {#interserver-https-port} + +Port for exchanging data between ClickHouse servers over `HTTPS`. + +**Example** + +``` xml +9010 +``` + +## interserver_https_host {#interserver-https-host} + +Similar to `interserver_http_host`, except that this hostname can be used by other servers to access this server over `HTTPS`. + +**Example** + +``` xml +example.clickhouse.com +``` + +## interserver_http_credentials {#server-settings-interserver-http-credentials} + +A username and a password used to connect to other servers during [replication](../../engines/table-engines/mergetree-family/replication.md). Also the server authenticates other replicas using these credentials. So, `interserver_http_credentials` must be the same for all replicas in a cluster. + +By default, if `interserver_http_credentials` section is omitted, authentication is not used during replication. + +:::note +`interserver_http_credentials` settings do not relate to a ClickHouse client credentials [configuration](../../interfaces/cli.md#configuration_files). +::: + +:::note +These credentials are common for replication via `HTTP` and `HTTPS`. +::: + +The section contains the following parameters: + +- `user` — Username. +- `password` — Password. +- `allow_empty` — If `true`, then other replicas are allowed to connect without authentication even if credentials are set. If `false`, then connections without authentication are refused. Default value: `false`. +- `old` — Contains old `user` and `password` used during credential rotation. Several `old` sections can be specified. + +**Credentials Rotation** + +ClickHouse supports dynamic interserver credentials rotation without stopping all replicas at the same time to update their configuration. Credentials can be changed in several steps. + +To enable authentication, set `interserver_http_credentials.allow_empty` to `true` and add credentials. This allows connections with authentication and without it. + +``` xml + + admin + 111 + true + +``` + +After configuring all replicas set `allow_empty` to `false` or remove this setting. It makes authentication with new credentials mandatory. + +To change existing credentials, move the username and the password to `interserver_http_credentials.old` section and update `user` and `password` with new values. At this point the server uses new credentials to connect to other replicas and accepts connections with either new or old credentials. 
+ +``` xml + + admin + 222 + + admin + 111 + + + temp + 000 + + +``` + +When new credentials are applied to all replicas, old credentials may be removed. + +## keep_alive_timeout {#keep-alive-timeout} + +The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 10 seconds. + +**Example** + +``` xml +10 +``` + +## listen_host {#server_configuration_parameters-listen_host} + +Restriction on hosts that requests can come from. If you want the server to answer all of them, specify `::`. + +Examples: + +``` xml +::1 +127.0.0.1 +``` + +## listen_backlog {#server_configuration_parameters-listen_backlog} + +Backlog (queue size of pending connections) of the listen socket. + +Default value: `4096` (as in linux [5.4+](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=19f92a030ca6d772ab44b22ee6a01378a8cb32d4)). + +Usually this value does not need to be changed, since: +- default value is large enough, +- and for accepting client's connections server has separate thread. + +So even if you have `TcpExtListenOverflows` (from `nstat`) non zero and this counter grows for ClickHouse server it does not mean that this value need to be increased, since: +- usually if 4096 is not enough it shows some internal ClickHouse scaling issue, so it is better to report an issue. +- and it does not mean that the server can handle more connections later (and even if it could, by that moment clients may be gone or disconnected). + +Examples: + +``` xml +4096 +``` + +## logger {#server_configuration_parameters-logger} + +Logging settings. + +Keys: + +- `level` – Logging level. Acceptable values: `trace`, `debug`, `information`, `warning`, `error`. +- `log` – The log file. Contains all the entries according to `level`. +- `errorlog` – Error log file. +- `size` – Size of the file. Applies to `log` and `errorlog`. Once the file reaches `size`, ClickHouse archives and renames it, and creates a new log file in its place. +- `count` – The number of archived log files that ClickHouse stores. + +**Example** + +``` xml + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + +``` + +Writing to the syslog is also supported. Config example: + +``` xml + + 1 + +
syslog.remote:10514
+ myhost.local + LOG_LOCAL6 + syslog +
+
+``` + +Keys for syslog: + +- use_syslog — Required setting if you want to write to the syslog. +- address — The host\[:port\] of syslogd. If omitted, the local daemon is used. +- hostname — Optional. The name of the host that logs are sent from. +- facility — [The syslog facility keyword](https://en.wikipedia.org/wiki/Syslog#Facility) in uppercase letters with the “LOG_” prefix: (`LOG_USER`, `LOG_DAEMON`, `LOG_LOCAL3`, and so on). + Default value: `LOG_USER` if `address` is specified, `LOG_DAEMON` otherwise. +- format – Message format. Possible values: `bsd` and `syslog.` + +## send_crash_reports {#server_configuration_parameters-send_crash_reports} + +Settings for opt-in sending crash reports to the ClickHouse core developers team via [Sentry](https://sentry.io). +Enabling it, especially in pre-production environments, is highly appreciated. + +The server will need access to the public Internet via IPv4 (at the time of writing IPv6 is not supported by Sentry) for this feature to be functioning properly. + +Keys: + +- `enabled` – Boolean flag to enable the feature, `false` by default. Set to `true` to allow sending crash reports. +- `endpoint` – You can override the Sentry endpoint URL for sending crash reports. It can be either a separate Sentry account or your self-hosted Sentry instance. Use the [Sentry DSN](https://docs.sentry.io/error-reporting/quickstart/?platform=native#configure-the-sdk) syntax. +- `anonymize` - Avoid attaching the server hostname to the crash report. +- `http_proxy` - Configure HTTP proxy for sending crash reports. +- `debug` - Sets the Sentry client into debug mode. +- `tmp_path` - Filesystem path for temporary crash report state. + +**Recommended way to use** + +``` xml + + true + +``` + +## macros {#macros} + +Parameter substitutions for replicated tables. + +Can be omitted if replicated tables are not used. + +For more information, see the section [Creating replicated tables](../../engines/table-engines/mergetree-family/replication.md#creating-replicated-tables). + +**Example** + +``` xml + +``` + +## mark_cache_size {#server-mark-cache-size} + +Approximate size (in bytes) of the cache of marks used by table engines of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family. + +The cache is shared for the server and memory is allocated as needed. + +**Example** + +``` xml +5368709120 +``` + +## max_server_memory_usage {#max_server_memory_usage} + +Limits total RAM usage by the ClickHouse server. + +Possible values: + +- Positive integer. +- 0 — Auto. + +Default value: `0`. + +**Additional Info** + +The default `max_server_memory_usage` value is calculated as `memory_amount * max_server_memory_usage_to_ram_ratio`. + +**See also** + +- [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) +- [max_server_memory_usage_to_ram_ratio](#max_server_memory_usage_to_ram_ratio) + +## max_server_memory_usage_to_ram_ratio {#max_server_memory_usage_to_ram_ratio} + +Defines the fraction of total physical RAM amount, available to the ClickHouse server. If the server tries to utilize more, the memory is cut down to the appropriate amount. + +Possible values: + +- Positive double. +- 0 — The ClickHouse server can use all available RAM. + +Default value: `0.9`. + +**Usage** + +On hosts with low RAM and swap, you possibly need setting `max_server_memory_usage_to_ram_ratio` larger than 1. 
+ +**Example** + +``` xml +0.9 +``` + +**See Also** + +- [max_server_memory_usage](#max_server_memory_usage) + +## max_concurrent_queries {#max-concurrent-queries} + +The maximum number of simultaneously processed queries related to MergeTree table. +Queries may be limited by other settings: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). + +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `100`. + +**Example** + +``` xml +100 +``` + +## max_concurrent_insert_queries {#max-concurrent-insert-queries} + +The maximum number of simultaneously processed `INSERT` queries. + +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +100 +``` + +## max_concurrent_select_queries {#max-concurrent-select-queries} + +The maximum number of simultaneously processed `SELECT` queries. + +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +100 +``` + +## max_concurrent_queries_for_user {#max-concurrent-queries-for-user} + +The maximum number of simultaneously processed queries related to MergeTree table per user. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +5 +``` + +## max_concurrent_queries_for_all_users {#max-concurrent-queries-for-all-users} + +Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries. + +Example: `max_concurrent_queries_for_all_users` can be set to 99 for all users and database administrator can set it to 100 for itself to run queries for investigation even when the server is overloaded. + +Modifying the setting for one query or user does not affect other queries. + +Possible values: + +- Positive integer. +- 0 — No limit. + +Default value: `0`. + +**Example** + +``` xml +99 +``` + +**See Also** + +- [max_concurrent_queries](#max-concurrent-queries) + +## min_marks_to_honor_max_concurrent_queries {#min-marks-to-honor-max-concurrent-queries} + +The minimal number of marks read by the query for applying the [max_concurrent_queries](#max-concurrent-queries) setting. + +Possible values: + +- Positive integer. +- 0 — Disabled. + +**Example** + +``` xml +10 +``` + +## max_connections {#max-connections} + +The maximum number of inbound connections. + +**Example** + +``` xml +4096 +``` + +## max_open_files {#max-open-files} + +The maximum number of open files. + +By default: `maximum`. + +We recommend using this option in Mac OS X since the `getrlimit()` function returns an incorrect value. + +**Example** + +``` xml +262144 +``` + +## max_table_size_to_drop {#max-table-size-to-drop} + +Restriction on deleting tables. 
+ +If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can’t delete it using a DROP query. + +If you still need to delete the table without restarting the ClickHouse server, create the `/flags/force_drop_table` file and run the DROP query. + +Default value: 50 GB. + +The value 0 means that you can delete all tables without any restrictions. + +**Example** + +``` xml +0 +``` + +## max_thread_pool_size {#max-thread-pool-size} + +ClickHouse uses threads from the Global Thread pool to process queries. If there is no idle thread to process a query, then a new thread is created in the pool. `max_thread_pool_size` limits the maximum number of threads in the pool. + +Possible values: + +- Positive integer. + +Default value: `10000`. + +**Example** + +``` xml +12000 +``` + +## max_thread_pool_free_size {#max-thread-pool-free-size} + +If the number of **idle** threads in the Global Thread pool is greater than `max_thread_pool_free_size`, then ClickHouse releases resources occupied by some threads and the pool size is decreased. Threads can be created again if necessary. + +Possible values: + +- Positive integer. + +Default value: `1000`. + +**Example** + +``` xml +1200 +``` + +## thread_pool_queue_size {#thread-pool-queue-size} + +The maximum number of jobs that can be scheduled on the Global Thread pool. Increasing queue size leads to larger memory usage. It is recommended to keep this value equal to [max_thread_pool_size](#max-thread-pool-size). + +Possible values: + +- Positive integer. + +Default value: `10000`. + +**Example** + +``` xml +12000 +``` + +## merge_tree {#server_configuration_parameters-merge_tree} + +Fine tuning for tables in the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). + +For more information, see the MergeTreeSettings.h header file. + +**Example** + +``` xml + + 5 + +``` + +## metric_log {#metric_log} + +It is enabled by default. If it`s not, you can do this manually. + +**Enabling** + +To manually turn on metrics history collection [`system.metric_log`](../../operations/system-tables/metric_log.md), create `/etc/clickhouse-server/config.d/metric_log.xml` with the following content: + +``` xml + + + system + metric_log
+ 7500 + 1000 +
+
+``` + +**Disabling** + +To disable `metric_log` setting, you should create the following file `/etc/clickhouse-server/config.d/disable_metric_log.xml` with the following content: + +``` xml + + + +``` + +## replicated_merge_tree {#server_configuration_parameters-replicated_merge_tree} + +Fine tuning for tables in the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/mergetree.md). + +This setting has a higher priority. + +For more information, see the MergeTreeSettings.h header file. + +**Example** + +``` xml + + 5 + +``` + +## openSSL {#server_configuration_parameters-openssl} + +SSL client/server configuration. + +Support for SSL is provided by the `libpoco` library. The interface is described in the file [SSLManager.h](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h) + +Keys for server/client settings: + +- privateKeyFile – The path to the file with the secret key of the PEM certificate. The file may contain a key and certificate at the same time. +- certificateFile – The path to the client/server certificate file in PEM format. You can omit it if `privateKeyFile` contains the certificate. +- caConfig – The path to the file or directory that contains trusted root certificates. +- verificationMode – The method for checking the node’s certificates. Details are in the description of the [Context](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h) class. Possible values: `none`, `relaxed`, `strict`, `once`. +- verificationDepth – The maximum length of the verification chain. Verification will fail if the certificate chain length exceeds the set value. +- loadDefaultCAFile – Indicates that built-in CA certificates for OpenSSL will be used. Acceptable values: `true`, `false`. \| +- cipherList – Supported OpenSSL encryptions. For example: `ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH`. +- cacheSessions – Enables or disables caching sessions. Must be used in combination with `sessionIdContext`. Acceptable values: `true`, `false`. +- sessionIdContext – A unique set of random characters that the server appends to each generated identifier. The length of the string must not exceed `SSL_MAX_SSL_SESSION_ID_LENGTH`. This parameter is always recommended since it helps avoid problems both if the server caches the session and if the client requested caching. Default value: `${application.name}`. +- sessionCacheSize – The maximum number of sessions that the server caches. Default value: 1024\*20. 0 – Unlimited sessions. +- sessionTimeout – Time for caching the session on the server. +- extendedVerification – Automatically extended verification of certificates after the session ends. Acceptable values: `true`, `false`. +- requireTLSv1 – Require a TLSv1 connection. Acceptable values: `true`, `false`. +- requireTLSv1_1 – Require a TLSv1.1 connection. Acceptable values: `true`, `false`. +- requireTLSv1_2 – Require a TLSv1.2 connection. Acceptable values: `true`, `false`. +- fips – Activates OpenSSL FIPS mode. Supported if the library’s OpenSSL version supports FIPS. +- privateKeyPassphraseHandler – Class (PrivateKeyPassphraseHandler subclass) that requests the passphrase for accessing the private key. For example: ``, `KeyFileHandler`, `test`, ``. +- invalidCertificateHandler – Class (a subclass of CertificateHandler) for verifying invalid certificates. For example: ` ConsoleCertificateHandler ` . +- disableProtocols – Protocols that are not allowed to use. 
+- preferServerCiphers – Preferred server ciphers on the client. + +**Example of settings:** + +``` xml + + + + /etc/clickhouse-server/server.crt + /etc/clickhouse-server/server.key + + /etc/clickhouse-server/dhparam.pem + none + true + true + sslv2,sslv3 + true + + + true + true + sslv2,sslv3 + true + + + + RejectCertificateHandler + + + +``` + +## part_log {#server_configuration_parameters-part-log} + +Logging events that are associated with [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). For instance, adding or merging data. You can use the log to simulate merge algorithms and compare their characteristics. You can visualize the merge process. + +Queries are logged in the [system.part_log](../../operations/system-tables/part_log.md#system_tables-part-log) table, not in a separate file. You can configure the name of this table in the `table` parameter (see below). + +Use the following parameters to configure logging: + +- `database` – Name of the database. +- `table` – Name of the system table. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. + +**Example** + +``` xml + + system + part_log
+ toMonday(event_date) + 7500 +
+``` + +## path {#server_configuration_parameters-path} + +The path to the directory containing data. + +:::note +The trailing slash is mandatory. +::: + +**Example** + +``` xml +/var/lib/clickhouse/ +``` + +## prometheus {#server_configuration_parameters-prometheus} + +Exposing metrics data for scraping from [Prometheus](https://prometheus.io). + +Settings: + +- `endpoint` – HTTP endpoint for scraping metrics by prometheus server. Start from ‘/’. +- `port` – Port for `endpoint`. +- `metrics` – Flag that sets to expose metrics from the [system.metrics](../../operations/system-tables/metrics.md#system_tables-metrics) table. +- `events` – Flag that sets to expose metrics from the [system.events](../../operations/system-tables/events.md#system_tables-events) table. +- `asynchronous_metrics` – Flag that sets to expose current metrics values from the [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) table. + +**Example** + +``` xml + + /metrics + 8001 + true + true + true + +``` + +## query_log {#server_configuration_parameters-query-log} + +Setting for logging queries received with the [log_queries=1](../../operations/settings/settings.md) setting. + +Queries are logged in the [system.query_log](../../operations/system-tables/query_log.md#system_tables-query_log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). + +Use the following parameters to configure logging: + +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. + +If the table does not exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. + +**Example** + +``` xml + + system + query_log
+ Engine = MergeTree PARTITION BY event_date ORDER BY event_time TTL event_date + INTERVAL 30 day + 7500 +
+``` + +## query_thread_log {#server_configuration_parameters-query_thread_log} + +Setting for logging threads of queries received with the [log_query_threads=1](../../operations/settings/settings.md#settings-log-query-threads) setting. + +Queries are logged in the [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). + +Use the following parameters to configure logging: + +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. + +If the table does not exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. + +**Example** + +``` xml + + system + query_thread_log
+ toMonday(event_date) + 7500 +
+``` + +## query_views_log {#server_configuration_parameters-query_views_log} + +Setting for logging views (live, materialized etc) dependant of queries received with the [log_query_views=1](../../operations/settings/settings.md#settings-log-query-views) setting. + +Queries are logged in the [system.query_views_log](../../operations/system-tables/query_views_log.md#system_tables-query_views_log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). + +Use the following parameters to configure logging: + +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. + +If the table does not exist, ClickHouse will create it. If the structure of the query views log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. + +**Example** + +``` xml + + system + query_views_log
+ toYYYYMM(event_date) + 7500 +
+``` + +## text_log {#server_configuration_parameters-text_log} + +Settings for the [text_log](../../operations/system-tables/text_log.md#system_tables-text_log) system table for logging text messages. + +Parameters: + +- `level` — Maximum Message Level (by default `Trace`) which will be stored in a table. +- `database` — Database name. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. + +**Example** +```xml + + + notice + system + text_log
+ 7500 + + Engine = MergeTree PARTITION BY event_date ORDER BY event_time TTL event_date + INTERVAL 30 day +
+
+``` + + +## trace_log {#server_configuration_parameters-trace_log} + +Settings for the [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log) system table operation. + +Parameters: + +- `database` — Database for storing a table. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined. +- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` defined. +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. + +The default server configuration file `config.xml` contains the following settings section: + +``` xml + + system + trace_log
+ toYYYYMM(event_date) + 7500 +
+``` + +## query_masking_rules {#query-masking-rules} + +Regexp-based rules, which will be applied to queries as well as all log messages before storing them in server logs, +`system.query_log`, `system.text_log`, `system.processes` tables, and in logs sent to the client. That allows preventing +sensitive data leakage from SQL queries (like names, emails, personal identifiers or credit card numbers) to logs. + +**Example** + +``` xml + + + hide SSN + (^|\D)\d{3}-\d{2}-\d{4}($|\D) + 000-00-0000 + + +``` + +Config fields: +- `name` - name for the rule (optional) +- `regexp` - RE2 compatible regular expression (mandatory) +- `replace` - substitution string for sensitive data (optional, by default - six asterisks) + +The masking rules are applied to the whole query (to prevent leaks of sensitive data from malformed / non-parsable queries). + +`system.events` table have counter `QueryMaskingRulesMatch` which have an overall number of query masking rules matches. + +For distributed queries each server have to be configured separately, otherwise, subqueries passed to other +nodes will be stored without masking. + +## remote_servers {#server-settings-remote-servers} + +Configuration of clusters used by the [Distributed](../../engines/table-engines/special/distributed.md) table engine and by the `cluster` table function. + +**Example** + +``` xml + +``` + +For the value of the `incl` attribute, see the section “[Configuration files](../../operations/configuration-files.md#configuration_files)”. + +**See Also** + +- [skip_unavailable_shards](../../operations/settings/settings.md#settings-skip_unavailable_shards) + +## timezone {#server_configuration_parameters-timezone} + +The server’s time zone. + +Specified as an IANA identifier for the UTC timezone or geographic location (for example, Africa/Abidjan). + +The time zone is necessary for conversions between String and DateTime formats when DateTime fields are output to text format (printed on the screen or in a file), and when getting DateTime from a string. Besides, the time zone is used in functions that work with the time and date if they didn’t receive the time zone in the input parameters. + +**Example** + +``` xml +Asia/Istanbul +``` + +## tcp_port {#server_configuration_parameters-tcp_port} + +Port for communicating with clients over the TCP protocol. + +**Example** + +``` xml +9000 +``` + +## tcp_port_secure {#server_configuration_parameters-tcp_port_secure} + +TCP port for secure communication with clients. Use it with [OpenSSL](#server_configuration_parameters-openssl) settings. + +**Possible values** + +Positive integer. + +**Default value** + +``` xml +9440 +``` + +## mysql_port {#server_configuration_parameters-mysql_port} + +Port for communicating with clients over MySQL protocol. + +**Possible values** + +Positive integer. + +Example + +``` xml +9004 +``` + +## postgresql_port {#server_configuration_parameters-postgresql_port} + +Port for communicating with clients over PostgreSQL protocol. + +**Possible values** + +Positive integer. + +Example + +``` xml +9005 +``` + +## tmp_path {#tmp-path} + +Path to temporary data for processing large queries. + +:::note +The trailing slash is mandatory. +::: + +**Example** + +``` xml +/var/lib/clickhouse/tmp/ +``` + +## tmp_policy {#tmp-policy} + +Policy from [storage_configuration](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) to store temporary files. + +If not set, [tmp_path](#tmp-path) is used, otherwise it is ignored. 
+ +:::note +- `move_factor` is ignored. +- `keep_free_space_bytes` is ignored. +- `max_data_part_size_bytes` is ignored. +- Уou must have exactly one volume in that policy. +::: + +## uncompressed_cache_size {#server-settings-uncompressed_cache_size} + +Cache size (in bytes) for uncompressed data used by table engines from the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). + +There is one shared cache for the server. Memory is allocated on demand. The cache is used if the option [use_uncompressed_cache](../../operations/settings/settings.md#setting-use_uncompressed_cache) is enabled. + +The uncompressed cache is advantageous for very short queries in individual cases. + +**Example** + +``` xml +8589934592 +``` + +## user_files_path {#server_configuration_parameters-user_files_path} + +The directory with user files. Used in the table function [file()](../../sql-reference/table-functions/file.md). + +**Example** + +``` xml +/var/lib/clickhouse/user_files/ +``` + +## users_config {#users-config} + +Path to the file that contains: + +- User configurations. +- Access rights. +- Settings profiles. +- Quota settings. + +**Example** + +``` xml +users.xml +``` + +## zookeeper {#server-settings_zookeeper} + +Contains settings that allow ClickHouse to interact with a [ZooKeeper](http://zookeeper.apache.org/) cluster. + +ClickHouse uses ZooKeeper for storing metadata of replicas when using replicated tables. If replicated tables are not used, this section of parameters can be omitted. + +This section contains the following parameters: + +- `node` — ZooKeeper endpoint. You can set multiple endpoints. + + For example: + + + +``` xml + + example_host + 2181 + +``` + + The `index` attribute specifies the node order when trying to connect to the ZooKeeper cluster. + +- `session_timeout_ms` — Maximum timeout for the client session in milliseconds. +- `operation_timeout_ms` — Maximum timeout for one operation in milliseconds. +- `root` — The [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) that is used as the root for znodes used by the ClickHouse server. Optional. +- `identity` — User and password, that can be required by ZooKeeper to give access to requested znodes. Optional. + +**Example configuration** + +``` xml + + + example1 + 2181 + + + example2 + 2181 + + 30000 + 10000 + + /path/to/zookeeper/node + + user:password + +``` + +**See Also** + +- [Replication](../../engines/table-engines/mergetree-family/replication.md) +- [ZooKeeper Programmer’s Guide](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) +- [Optional secured communication between ClickHouse and Zookeeper](../ssl-zookeeper.md#secured-communication-with-zookeeper) + +## use_minimalistic_part_header_in_zookeeper {#server-settings-use_minimalistic_part_header_in_zookeeper} + +Storage method for data part headers in ZooKeeper. + +This setting only applies to the `MergeTree` family. It can be specified: + +- Globally in the [merge_tree](#server_configuration_parameters-merge_tree) section of the `config.xml` file. + + ClickHouse uses the setting for all the tables on the server. You can change the setting at any time. Existing tables change their behaviour when the setting changes. + +- For each table. + + When creating a table, specify the corresponding [engine setting](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table). 
The behaviour of an existing table with this setting does not change, even if the global setting changes. + +**Possible values** + +- 0 — Functionality is turned off. +- 1 — Functionality is turned on. + +If `use_minimalistic_part_header_in_zookeeper = 1`, then [replicated](../../engines/table-engines/mergetree-family/replication.md) tables store the headers of the data parts compactly using a single `znode`. If the table contains many columns, this storage method significantly reduces the volume of the data stored in Zookeeper. + +:::note +After applying `use_minimalistic_part_header_in_zookeeper = 1`, you can’t downgrade the ClickHouse server to a version that does not support this setting. Be careful when upgrading ClickHouse on servers in a cluster. Don’t upgrade all the servers at once. It is safer to test new versions of ClickHouse in a test environment, or on just a few servers of a cluster. + +Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. +::: + +**Default value:** 0. + +## disable_internal_dns_cache {#server-settings-disable-internal-dns-cache} + +Disables the internal DNS cache. Recommended for operating ClickHouse in systems +with frequently changing infrastructure such as Kubernetes. + +**Default value:** 0. + +## dns_cache_update_period {#server-settings-dns-cache-update-period} + +The period of updating IP addresses stored in the ClickHouse internal DNS cache (in seconds). +The update is performed asynchronously, in a separate system thread. + +**Default value**: 15. + +**See also** + +- [background_schedule_pool_size](../../operations/settings/settings.md#background_schedule_pool_size) + +## distributed_ddl {#server-settings-distributed_ddl} + +Manage executing [distributed ddl queries](../../sql-reference/distributed-ddl.md) (CREATE, DROP, ALTER, RENAME) on cluster. +Works only if [ZooKeeper](#server-settings_zookeeper) is enabled. + +**Example** + +```xml + + + /clickhouse/task_queue/ddl + + + default + + + 1 + + + + + 604800 + + + 60 + + + 1000 + +``` + +## access_control_path {#access_control_path} + +Path to a folder where a ClickHouse server stores user and role configurations created by SQL commands. + +Default value: `/var/lib/clickhouse/access/`. + +**See also** + +- [Access Control and Account Management](../../operations/access-rights.md#access-control) + +## user_directories {#user_directories} + +Section of the configuration file that contains settings: +- Path to configuration file with predefined users. +- Path to folder where users created by SQL commands are stored. +- ZooKeeper node path where users created by SQL commands are stored and replicated (experimental). + +If this section is specified, the path from [users_config](../../operations/server-configuration-parameters/settings.md#users-config) and [access_control_path](../../operations/server-configuration-parameters/settings.md#access_control_path) won't be used. + +The `user_directories` section can contain any number of items, the order of the items means their precedence (the higher the item the higher the precedence). 
+ +**Examples** + +``` xml + + + /etc/clickhouse-server/users.xml + + + /var/lib/clickhouse/access/ + + +``` + +Users, roles, row policies, quotas, and profiles can be also stored in ZooKeeper: + +``` xml + + + /etc/clickhouse-server/users.xml + + + /clickhouse/access/ + + +``` + +You can also define sections `memory` — means storing information only in memory, without writing to disk, and `ldap` — means storing information on an LDAP server. + +To add an LDAP server as a remote user directory of users that are not defined locally, define a single `ldap` section with a following parameters: +- `server` — one of LDAP server names defined in `ldap_servers` config section. This parameter is mandatory and cannot be empty. +- `roles` — section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. If no roles are specified, user will not be able to perform any actions after authentication. If any of the listed roles is not defined locally at the time of authentication, the authentication attempt will fail as if the provided password was incorrect. + +**Example** + +``` xml + + my_ldap_server + + + + + +``` + +## total_memory_profiler_step {#total-memory-profiler-step} + +Sets the memory size (in bytes) for a stack trace at every peak allocation step. The data is stored in the [system.trace_log](../../operations/system-tables/trace_log.md) system table with `query_id` equal to an empty string. + +Possible values: + +- Positive integer. + +Default value: `4194304`. + +## total_memory_tracker_sample_probability {#total-memory-tracker-sample-probability} + +Allows to collect random allocations and deallocations and writes them in the [system.trace_log](../../operations/system-tables/trace_log.md) system table with `trace_type` equal to a `MemorySample` with the specified probability. The probability is for every allocation or deallocations, regardless of the size of the allocation. Note that sampling happens only when the amount of untracked memory exceeds the untracked memory limit (default value is `4` MiB). It can be lowered if [total_memory_profiler_step](#total-memory-profiler-step) is lowered. You can set `total_memory_profiler_step` equal to `1` for extra fine-grained sampling. + +Possible values: + +- Positive integer. +- 0 — Writing of random allocations and deallocations in the `system.trace_log` system table is disabled. + +Default value: `0`. + +## mmap_cache_size {#mmap-cache-size} + +Sets the cache size (in bytes) for mapped files. This setting allows to avoid frequent open/[mmap/munmap](https://en.wikipedia.org/wiki/Mmap)/close calls (which are very expensive due to consequent page faults) and to reuse mappings from several threads and queries. The setting value is the number of mapped regions (usually equal to the number of mapped files). 
The amount of data in mapped files can be monitored in [system.metrics](../../operations/system-tables/metrics.md), [system.metric_log](../../operations/system-tables/metric_log.md) system tables by the `MMappedFiles` and `MMappedFileBytes` metrics, in [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md), [system.asynchronous_metrics_log](../../operations/system-tables/asynchronous_metric_log.md) by the `MMapCacheCells` metric, and also in [system.events](../../operations/system-tables/events.md), [system.processes](../../operations/system-tables/processes.md), [system.query_log](../../operations/system-tables/query_log.md), [system.query_thread_log](../../operations/system-tables/query_thread_log.md), [system.query_views_log](../../operations/system-tables/query_views_log.md) by the `CreatedReadBufferMMap`, `CreatedReadBufferMMapFailed`, `MMappedFileCacheHits`, `MMappedFileCacheMisses` events. Note that the amount of data in mapped files does not consume memory directly and is not accounted in query or server memory usage — because this memory can be discarded similar to OS page cache. The cache is dropped (the files are closed) automatically on the removal of old parts in tables of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family, also it can be dropped manually by the `SYSTEM DROP MMAP CACHE` query. + +Possible values: + +- Positive integer. + +Default value: `1000`. + +## compiled_expression_cache_size {#compiled-expression-cache-size} + +Sets the cache size (in bytes) for [compiled expressions](../../operations/caches.md). + +Possible values: + +- Positive integer. + +Default value: `134217728`. + +## compiled_expression_cache_elements_size {#compiled_expression_cache_elements_size} + +Sets the cache size (in elements) for [compiled expressions](../../operations/caches.md). + +Possible values: + +- Positive integer. + +Default value: `10000`. + diff --git a/docs/en/reference/operations/settings/constraints-on-settings.md b/docs/en/reference/operations/settings/constraints-on-settings.md new file mode 100644 index 00000000000..5adde60a460 --- /dev/null +++ b/docs/en/reference/operations/settings/constraints-on-settings.md @@ -0,0 +1,73 @@ +--- +sidebar_position: 62 +sidebar_label: Constraints on Settings +--- + +# Constraints on Settings {#constraints-on-settings} + +The constraints on settings can be defined in the `profiles` section of the `user.xml` configuration file and prohibit users from changing some of the settings with the `SET` query. +The constraints are defined as the following: + +``` xml + + + + + lower_boundary + + + upper_boundary + + + lower_boundary + upper_boundary + + + + + + + +``` + +If the user tries to violate the constraints an exception is thrown and the setting isn’t changed. +There are supported three types of constraints: `min`, `max`, `readonly`. The `min` and `max` constraints specify upper and lower boundaries for a numeric setting and can be used in combination. The `readonly` constraint specifies that the user cannot change the corresponding setting at all. + +**Example:** Let `users.xml` includes lines: + +``` xml + + + 10000000000 + 0 + ... + + + 5000000000 + 20000000000 + + + + + + + +``` + +The following queries all throw exceptions: + +``` sql +SET max_memory_usage=20000000001; +SET max_memory_usage=4999999999; +SET force_index_by_date=1; +``` + +``` text +Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. 
+Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. +Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. +``` + +**Note:** the `default` profile has special handling: all the constraints defined for the `default` profile become the default constraints, so they restrict all the users until they’re overridden explicitly for these users. + +[Original article](https://clickhouse.com/docs/en/operations/settings/constraints_on_settings/) diff --git a/docs/en/reference/operations/settings/index.md b/docs/en/reference/operations/settings/index.md new file mode 100644 index 00000000000..c371bb0c41a --- /dev/null +++ b/docs/en/reference/operations/settings/index.md @@ -0,0 +1,58 @@ +--- +sidebar_label: Settings +sidebar_position: 52 +slug: index +--- + +# Settings + +There are multiple ways to make all the settings described in this section of documentation. + +Settings are configured in layers, so each subsequent layer redefines the previous settings. + +Ways to configure settings, in order of priority: + +- Settings in the `users.xml` server configuration file. + + Set in the element ``. + +- Session settings. + + Send `SET setting=value` from the ClickHouse console client in interactive mode. + Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you need to specify the `session_id` HTTP parameter. + +- Query settings. + + - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`. + - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`). + - Make settings in the [SETTINGS](../../sql-reference/statements/select/index.md#settings-in-select) clause of the SELECT query. The setting value is applied only to that query and is reset to default or previous value after the query is executed. + +Settings that can only be made in the server config file are not covered in this section. + +## Custom Settings {#custom_settings} + +In addition to the common [settings](../../operations/settings/settings.md), users can define custom settings. + +A custom setting name must begin with one of predefined prefixes. The list of these prefixes must be declared in the [custom_settings_prefixes](../../operations/server-configuration-parameters/settings.md#custom_settings_prefixes) parameter in the server configuration file. + +```xml +custom_ +``` + +To define a custom setting use `SET` command: + +```sql +SET custom_a = 123; +``` + +To get the current value of a custom setting use `getSetting()` function: + +```sql +SELECT getSetting('custom_a'); +``` + +**See Also** + +- [Server Configuration Settings](../../operations/server-configuration-parameters/settings.md) + +[Original article](https://clickhouse.com/docs/en/operations/settings/) diff --git a/docs/en/reference/operations/settings/merge-tree-settings.md b/docs/en/reference/operations/settings/merge-tree-settings.md new file mode 100644 index 00000000000..27a0578cce2 --- /dev/null +++ b/docs/en/reference/operations/settings/merge-tree-settings.md @@ -0,0 +1,383 @@ +# MergeTree tables settings {#merge-tree-settings} + +The values of `merge_tree` settings (for all MergeTree tables) can be viewed in the table `system.merge_tree_settings`, they can be overridden in `config.xml` in the `merge_tree` section, or set in the `SETTINGS` section of each table. 
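+
+For example, the current value of a particular MergeTree-level setting, and whether it has been changed from the default, can be checked with a query like the following (an illustrative sketch):
+
+``` sql
+SELECT name, value, changed
+FROM system.merge_tree_settings
+WHERE name = 'max_suspicious_broken_parts';
+```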
+
+Override example in `config.xml`:
+
+``` xml
+<merge_tree>
+    <max_suspicious_broken_parts>5</max_suspicious_broken_parts>
+</merge_tree>
+```
+
+An example to set in `SETTINGS` for a particular table:
+
+``` sql
+CREATE TABLE foo
+(
+    `A` Int64
+)
+ENGINE = MergeTree
+ORDER BY tuple()
+SETTINGS max_suspicious_broken_parts = 500;
+```
+
+An example of changing the settings for a specific table with the `ALTER TABLE ... MODIFY SETTING` command:
+
+``` sql
+ALTER TABLE foo
+    MODIFY SETTING max_suspicious_broken_parts = 100;
+
+-- reset to default (use value from system.merge_tree_settings)
+ALTER TABLE foo
+    RESET SETTING max_suspicious_broken_parts;
+```
+
+## parts_to_throw_insert {#parts-to-throw-insert}
+
+If the number of active parts in a single partition exceeds the `parts_to_throw_insert` value, `INSERT` is interrupted with the `Too many parts (N). Merges are processing significantly slower than inserts` exception.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 300.
+
+To achieve maximum performance of `SELECT` queries, it is necessary to minimize the number of parts processed; see [Merge Tree](../../development/architecture.md#merge-tree).
+
+You can set a larger value (600 or even 1200); this reduces the probability of the `Too many parts` error, but `SELECT` performance might degrade. Also, in case of a merge issue (for example, due to insufficient disk space) you will notice it later than you would with the original 300.
+
+## parts_to_delay_insert {#parts-to-delay-insert}
+
+If the number of active parts in a single partition exceeds the `parts_to_delay_insert` value, an `INSERT` is artificially slowed down.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 150.
+
+ClickHouse artificially executes `INSERT` longer (adds ‘sleep’) so that the background merge process can merge parts faster than they are added.
+
+## inactive_parts_to_throw_insert {#inactive-parts-to-throw-insert}
+
+If the number of inactive parts in a single partition exceeds the `inactive_parts_to_throw_insert` value, `INSERT` is interrupted with the "Too many inactive parts (N). Parts cleaning are processing significantly slower than inserts" exception.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 0 (unlimited).
+
+## inactive_parts_to_delay_insert {#inactive-parts-to-delay-insert}
+
+If the number of inactive parts in a single partition of the table reaches at least the `inactive_parts_to_delay_insert` value, an `INSERT` is artificially slowed down. This is useful when a server fails to clean up parts quickly enough.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 0 (unlimited).
+
+## max_delay_to_insert {#max-delay-to-insert}
+
+The value in seconds that is used to calculate the `INSERT` delay if the number of active parts in a single partition exceeds the [parts_to_delay_insert](#parts-to-delay-insert) value.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 1.
+
+The delay (in milliseconds) for `INSERT` is calculated by the formula:
+
+```code
+max_k = parts_to_throw_insert - parts_to_delay_insert
+k = 1 + parts_count_in_partition - parts_to_delay_insert
+delay_milliseconds = pow(max_delay_to_insert * 1000, k / max_k)
+```
+
+For example, if a partition has 299 active parts and `parts_to_throw_insert = 300`, `parts_to_delay_insert = 150`, `max_delay_to_insert = 1`, `INSERT` is delayed for `pow( 1 * 1000, (1 + 299 - 150) / (300 - 150) ) = 1000` milliseconds.
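+
+As a quick sanity check, the delay for this example can be evaluated directly in ClickHouse (an illustrative sketch; the numbers are simply those used above):
+
+``` sql
+-- parts_to_throw_insert = 300, parts_to_delay_insert = 150, max_delay_to_insert = 1,
+-- 299 active parts in the partition
+SELECT pow(1 * 1000, (1 + 299 - 150) / (300 - 150)) AS delay_milliseconds;
+-- returns 1000
+```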
+
+## max_parts_in_total {#max-parts-in-total}
+
+If the total number of active parts in all partitions of a table exceeds the `max_parts_in_total` value, `INSERT` is interrupted with the `Too many parts (N)` exception.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 100000.
+
+A large number of parts in a table reduces the performance of ClickHouse queries and increases ClickHouse boot time. Most often this is a consequence of an incorrect design (a mistake when choosing a partitioning strategy, such as partitions that are too small).
+
+## replicated_deduplication_window {#replicated-deduplication-window}
+
+The number of most recently inserted blocks for which ZooKeeper stores hash sums to check for duplicates.
+
+Possible values:
+
+- Any positive integer.
+- 0 (disable deduplication).
+
+Default value: 100.
+
+The `INSERT` command creates one or more blocks (parts). When inserting into Replicated tables, ClickHouse writes the hash sums of the created parts into ZooKeeper for [insert deduplication](../../engines/table-engines/mergetree-family/replication/). Hash sums are stored only for the most recent `replicated_deduplication_window` blocks; the oldest hash sums are removed from ZooKeeper.
+A large value of `replicated_deduplication_window` slows down `INSERT` queries because more entries need to be compared.
+The hash sum is calculated from the composition of the field names and types and the data of the inserted part (stream of bytes).
+
+## non_replicated_deduplication_window {#non-replicated-deduplication-window}
+
+The number of most recently inserted blocks in a non-replicated [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table for which hash sums are stored to check for duplicates.
+
+Possible values:
+
+- Any positive integer.
+- 0 (disable deduplication).
+
+Default value: 0.
+
+A deduplication mechanism similar to that of replicated tables is used (see the [replicated_deduplication_window](#replicated-deduplication-window) setting), except that the hash sums of the created parts are written to a local file on disk. A small sketch illustrating this behaviour follows the fetch timeout settings below.
+
+## replicated_deduplication_window_seconds {#replicated-deduplication-window-seconds}
+
+The number of seconds after which the hash sums of the inserted blocks are removed from ZooKeeper.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: 604800 (1 week).
+
+Similar to [replicated_deduplication_window](#replicated-deduplication-window), `replicated_deduplication_window_seconds` specifies how long to store hash sums of blocks for insert deduplication. Hash sums older than `replicated_deduplication_window_seconds` are removed from ZooKeeper, even if fewer than `replicated_deduplication_window` of them are stored.
+
+## replicated_fetches_http_connection_timeout {#replicated_fetches_http_connection_timeout}
+
+HTTP connection timeout (in seconds) for part fetch requests. Inherited from the default profile [http_connection_timeout](./settings.md#http_connection_timeout) if not set explicitly.
+
+Possible values:
+
+- Any positive integer.
+- 0 - Use the value of `http_connection_timeout`.
+
+Default value: 0.
+
+## replicated_fetches_http_send_timeout {#replicated_fetches_http_send_timeout}
+
+HTTP send timeout (in seconds) for part fetch requests. Inherited from the default profile [http_send_timeout](./settings.md#http_send_timeout) if not set explicitly.
+
+Possible values:
+
+- Any positive integer.
+- 0 - Use the value of `http_send_timeout`.
+
+Default value: 0.
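+
+For example, with block-level deduplication enabled on a non-replicated table (see `non_replicated_deduplication_window` above), re-inserting an identical block is effectively a no-op. This is only an illustrative sketch; the table name is hypothetical:
+
+``` sql
+CREATE TABLE dedup_demo
+(
+    `x` UInt64
+)
+ENGINE = MergeTree
+ORDER BY x
+SETTINGS non_replicated_deduplication_window = 100;
+
+INSERT INTO dedup_demo VALUES (1), (2);
+INSERT INTO dedup_demo VALUES (1), (2); -- identical block, deduplicated
+
+SELECT count() FROM dedup_demo; -- 2 rows, not 4
+```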
+ +## replicated_fetches_http_receive_timeout {#replicated_fetches_http_receive_timeout} + +HTTP receive timeout (in seconds) for fetch part requests. Inherited from default profile [http_receive_timeout](./settings.md#http_receive_timeout) if not set explicitly. + +Possible values: + +- Any positive integer. +- 0 - Use value of `http_receive_timeout`. + +Default value: 0. + +## max_replicated_fetches_network_bandwidth {#max_replicated_fetches_network_bandwidth} + +Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) fetches. This setting is applied to a particular table, unlike the [max_replicated_fetches_network_bandwidth_for_server](settings.md#max_replicated_fetches_network_bandwidth_for_server) setting, which is applied to the server. + +You can limit both server network and network for a particular table, but for this the value of the table-level setting should be less than server-level one. Otherwise the server considers only the `max_replicated_fetches_network_bandwidth_for_server` setting. + +The setting isn't followed perfectly accurately. + +Possible values: + +- Positive integer. +- 0 — Unlimited. + +Default value: `0`. + +**Usage** + +Could be used for throttling speed when replicating data to add or replace new nodes. + +## max_replicated_sends_network_bandwidth {#max_replicated_sends_network_bandwidth} + +Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) sends. This setting is applied to a particular table, unlike the [max_replicated_sends_network_bandwidth_for_server](settings.md#max_replicated_sends_network_bandwidth_for_server) setting, which is applied to the server. + +You can limit both server network and network for a particular table, but for this the value of the table-level setting should be less than server-level one. Otherwise the server considers only the `max_replicated_sends_network_bandwidth_for_server` setting. + +The setting isn't followed perfectly accurately. + +Possible values: + +- Positive integer. +- 0 — Unlimited. + +Default value: `0`. + +**Usage** + +Could be used for throttling speed when replicating data to add or replace new nodes. + +## old_parts_lifetime {#old-parts-lifetime} + +The time (in seconds) of storing inactive parts to protect against data loss during spontaneous server reboots. + +Possible values: + +- Any positive integer. + +Default value: 480. + +After merging several parts into a new part, ClickHouse marks the original parts as inactive and deletes them only after `old_parts_lifetime` seconds. +Inactive parts are removed if they are not used by current queries, i.e. if the `refcount` of the part is zero. + +`fsync` is not called for new parts, so for some time new parts exist only in the server's RAM (OS cache). If the server is rebooted spontaneously, new parts can be lost or damaged. +To protect data inactive parts are not deleted immediately. + +During startup ClickHouse checks the integrity of the parts. +If the merged part is damaged ClickHouse returns the inactive parts to the active list, and later merges them again. Then the damaged part is renamed (the `broken_` prefix is added) and moved to the `detached` folder. +If the merged part is not damaged, then the original inactive parts are renamed (the `ignored_` prefix is added) and moved to the `detached` folder. 
+ +The default `dirty_expire_centisecs` value (a Linux kernel setting) is 30 seconds (the maximum time that written data is stored only in RAM), but under heavy loads on the disk system data can be written much later. Experimentally, a value of 480 seconds was chosen for `old_parts_lifetime`, during which a new part is guaranteed to be written to disk. + +## max_bytes_to_merge_at_max_space_in_pool {#max-bytes-to-merge-at-max-space-in-pool} + +The maximum total parts size (in bytes) to be merged into one part, if there are enough resources available. +`max_bytes_to_merge_at_max_space_in_pool` -- roughly corresponds to the maximum possible part size created by an automatic background merge. + +Possible values: + +- Any positive integer. + +Default value: 161061273600 (150 GB). + +The merge scheduler periodically analyzes the sizes and number of parts in partitions, and if there is enough free resources in the pool, it starts background merges. Merges occur until the total size of the source parts is larger than `max_bytes_to_merge_at_max_space_in_pool`. + +Merges initiated by [OPTIMIZE FINAL](../../sql-reference/statements/optimize.md) ignore `max_bytes_to_merge_at_max_space_in_pool` and merge parts only taking into account available resources (free disk's space) until one part remains in the partition. + +## max_bytes_to_merge_at_min_space_in_pool {#max-bytes-to-merge-at-min-space-in-pool} + +The maximum total part size (in bytes) to be merged into one part, with the minimum available resources in the background pool. + +Possible values: + +- Any positive integer. + +Default value: 1048576 (1 MB) + +`max_bytes_to_merge_at_min_space_in_pool` defines the maximum total size of parts which can be merged despite the lack of available disk space (in pool). This is necessary to reduce the number of small parts and the chance of `Too many parts` errors. +Merges book disk space by doubling the total merged parts sizes. Thus, with a small amount of free disk space, a situation may happen that there is free space, but this space is already booked by ongoing large merges, so other merges unable to start, and the number of small parts grows with every insert. + +## merge_max_block_size {#merge-max-block-size} + +The number of rows that are read from the merged parts into memory. + +Possible values: + +- Any positive integer. + +Default value: 8192 + +Merge reads rows from parts in blocks of `merge_max_block_size` rows, then merges and writes the result into a new part. The read block is placed in RAM, so `merge_max_block_size` affects the size of the RAM required for the merge. Thus, merges can consume a large amount of RAM for tables with very wide rows (if the average row size is 100kb, then when merging 10 parts, (100kb * 10 * 8192) = ~ 8GB of RAM). By decreasing `merge_max_block_size`, you can reduce the amount of RAM required for a merge but slow down a merge. + +## max_part_loading_threads {#max-part-loading-threads} + +The maximum number of threads that read parts when ClickHouse starts. + +Possible values: + +- Any positive integer. + +Default value: auto (number of CPU cores). + +During startup ClickHouse reads all parts of all tables (reads files with metadata of parts) to build a list of all parts in memory. In some systems with a large number of parts this process can take a long time, and this time might be shortened by increasing `max_part_loading_threads` (if this process is not CPU and disk I/O bound). 
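+
+Since `max_part_loading_threads` is a MergeTree-level setting, it can also be raised for an individual table. A hedged sketch (the table name is hypothetical; the new value only matters the next time parts are loaded, for example on server restart):
+
+``` sql
+ALTER TABLE big_table
+    MODIFY SETTING max_part_loading_threads = 16;
+```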
+ +## max_partitions_to_read {#max-partitions-to-read} + +Limits the maximum number of partitions that can be accessed in one query. + +The setting value specified when the table is created can be overridden via query-level setting. + +Possible values: + +- Any positive integer. + +Default value: -1 (unlimited). + +## allow_floating_point_partition_key {#allow_floating_point_partition_key} + +Enables to allow floating-point number as a partition key. + +Possible values: + +- 0 — Floating-point partition key not allowed. +- 1 — Floating-point partition key allowed. + +Default value: `0`. + +## check_sample_column_is_correct {#check_sample_column_is_correct} + +Enables the check at table creation, that the data type of a column for sampling or sampling expression is correct. The data type must be one of unsigned [integer types](../../sql-reference/data-types/int-uint.md): `UInt8`, `UInt16`, `UInt32`, `UInt64`. + +Possible values: + +- true — The check is enabled. +- false — The check is disabled at table creation. + +Default value: `true`. + +By default, the ClickHouse server checks at table creation the data type of a column for sampling or sampling expression. If you already have tables with incorrect sampling expression and do not want the server to raise an exception during startup, set `check_sample_column_is_correct` to `false`. + +## min_bytes_to_rebalance_partition_over_jbod {#min-bytes-to-rebalance-partition-over-jbod} + +Sets minimal amount of bytes to enable balancing when distributing new big parts over volume disks [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures). + +Possible values: + +- Positive integer. +- 0 — Balancing is disabled. + +Default value: `0`. + +**Usage** + +The value of the `min_bytes_to_rebalance_partition_over_jbod` setting should not be less than the value of the [max_bytes_to_merge_at_max_space_in_pool](../../operations/settings/merge-tree-settings.md#max-bytes-to-merge-at-max-space-in-pool) / 1024. Otherwise, ClickHouse throws an exception. + +## detach_not_byte_identical_parts {#detach_not_byte_identical_parts} + +Enables or disables detaching a data part on a replica after a merge or a mutation, if it is not byte-identical to data parts on other replicas. If disabled, the data part is removed. Activate this setting if you want to analyze such parts later. + +The setting is applicable to `MergeTree` tables with enabled [data replication](../../engines/table-engines/mergetree-family/replication.md). + +Possible values: + +- 0 — Parts are removed. +- 1 — Parts are detached. + +Default value: `0`. + +## merge_tree_clear_old_temporary_directories_interval_seconds {#setting-merge-tree-clear-old-temporary-directories-interval-seconds} + +Sets the interval in seconds for ClickHouse to execute the cleanup of old temporary directories. + +Possible values: + +- Any positive integer. + +Default value: `60` seconds. + +## merge_tree_clear_old_parts_interval_seconds {#setting-merge-tree-clear-old-parts-interval-seconds} + +Sets the interval in seconds for ClickHouse to execute the cleanup of old parts, WALs, and mutations. + +Possible values: + +- Any positive integer. + +Default value: `1` second. 
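+
+As noted under `max_partitions_to_read`, the table-level value can be overridden per query; a minimal sketch (the table and column names are hypothetical):
+
+``` sql
+SELECT count()
+FROM events
+WHERE event_date >= today() - 7
+SETTINGS max_partitions_to_read = 10;
+```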
+ diff --git a/docs/en/reference/operations/settings/permissions-for-queries.md b/docs/en/reference/operations/settings/permissions-for-queries.md new file mode 100644 index 00000000000..ff63f524b7d --- /dev/null +++ b/docs/en/reference/operations/settings/permissions-for-queries.md @@ -0,0 +1,59 @@ +--- +sidebar_position: 58 +sidebar_label: Permissions for Queries +--- + +# Permissions for Queries {#permissions_for_queries} + +Queries in ClickHouse can be divided into several types: + +1. Read data queries: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. +2. Write data queries: `INSERT`, `OPTIMIZE`. +3. Change settings query: `SET`, `USE`. +4. [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries: `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. +5. `KILL QUERY`. + +The following settings regulate user permissions by the type of query: + +- [readonly](#settings_readonly) — Restricts permissions for all types of queries except DDL queries. +- [allow_ddl](#settings_allow_ddl) — Restricts permissions for DDL queries. + +`KILL QUERY` can be performed with any settings. + +## readonly {#settings_readonly} + +Restricts permissions for reading data, write data and change settings queries. + +See how the queries are divided into types [above](#permissions_for_queries). + +Possible values: + +- 0 — All queries are allowed. +- 1 — Only read data queries are allowed. +- 2 — Read data and change settings queries are allowed. + +After setting `readonly = 1`, the user can’t change `readonly` and `allow_ddl` settings in the current session. + +When using the `GET` method in the [HTTP interface](../../interfaces/http.md), `readonly = 1` is set automatically. To modify data, use the `POST` method. + +Setting `readonly = 1` prohibit the user from changing all the settings. There is a way to prohibit the user +from changing only specific settings, for details see [constraints on settings](../../operations/settings/constraints-on-settings.md). + +Default value: 0 + +## allow_ddl {#settings_allow_ddl} + +Allows or denies [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries. + +See how the queries are divided into types [above](#permissions_for_queries). + +Possible values: + +- 0 — DDL queries are not allowed. +- 1 — DDL queries are allowed. + +You can’t execute `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. + +Default value: 1 + +[Original article](https://clickhouse.com/docs/en/operations/settings/permissions_for_queries/) diff --git a/docs/en/reference/operations/settings/query-complexity.md b/docs/en/reference/operations/settings/query-complexity.md new file mode 100644 index 00000000000..c0c77bc809a --- /dev/null +++ b/docs/en/reference/operations/settings/query-complexity.md @@ -0,0 +1,315 @@ +--- +sidebar_position: 59 +sidebar_label: Restrictions on Query Complexity +--- + +# Restrictions on Query Complexity {#restrictions-on-query-complexity} + +Restrictions on query complexity are part of the settings. +They are used to provide safer execution from the user interface. +Almost all the restrictions only apply to `SELECT`. For distributed query processing, restrictions are applied on each server separately. + +ClickHouse checks the restrictions for data parts, not for each row. It means that you can exceed the value of restriction with the size of the data part. + +Restrictions on the “maximum amount of something” can take the value 0, which means “unrestricted”. 
+Most restrictions also have an ‘overflow_mode’ setting, meaning what to do when the limit is exceeded. +It can take one of two values: `throw` or `break`. Restrictions on aggregation (group_by_overflow_mode) also have the value `any`. + +`throw` – Throw an exception (default). + +`break` – Stop executing the query and return the partial result, as if the source data ran out. + +`any (only for group_by_overflow_mode)` – Continuing aggregation for the keys that got into the set, but do not add new keys to the set. + +## max_memory_usage {#settings_max_memory_usage} + +The maximum amount of RAM to use for running a query on a single server. + +In the default configuration file, the maximum is 10 GB. + +The setting does not consider the volume of available memory or the total volume of memory on the machine. +The restriction applies to a single query within a single server. +You can use `SHOW PROCESSLIST` to see the current memory consumption for each query. +Besides, the peak memory consumption is tracked for each query and written to the log. + +Memory usage is not monitored for the states of certain aggregate functions. + +Memory usage is not fully tracked for states of the aggregate functions `min`, `max`, `any`, `anyLast`, `argMin`, `argMax` from `String` and `Array` arguments. + +Memory consumption is also restricted by the parameters `max_memory_usage_for_user` and [max_server_memory_usage](../../operations/server-configuration-parameters/settings.md#max_server_memory_usage). + +## max_memory_usage_for_user {#max-memory-usage-for-user} + +The maximum amount of RAM to use for running a user’s queries on a single server. + +Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Core/Settings.h#L288). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). + +See also the description of [max_memory_usage](#settings_max_memory_usage). + +## max_rows_to_read {#max-rows-to-read} + +The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little. + +A maximum number of rows that can be read from a table when running a query. + +## max_bytes_to_read {#max-bytes-to-read} + +A maximum number of bytes (uncompressed data) that can be read from a table when running a query. + +## read_overflow_mode {#read-overflow-mode} + +What to do when the volume of data read exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. + +## max_rows_to_read_leaf {#max-rows-to-read-leaf} + +The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little. + +A maximum number of rows that can be read from a local table on a leaf node when running a distributed query. While +distributed queries can issue a multiple sub-queries to each shard (leaf) - this limit will be checked only on the read +stage on the leaf nodes and ignored on results merging stage on the root node. For example, cluster consists of 2 shards +and each shard contains a table with 100 rows. Then distributed query which suppose to read all the data from both +tables with setting `max_rows_to_read=150` will fail as in total it will be 200 rows. While query +with `max_rows_to_read_leaf=150` will succeed since leaf nodes will read 100 rows at max. + +## max_bytes_to_read_leaf {#max-bytes-to-read-leaf} + +A maximum number of bytes (uncompressed data) that can be read from a local table on a leaf node when running +a distributed query. 
While distributed queries can issue a multiple sub-queries to each shard (leaf) - this limit will +be checked only on the read stage on the leaf nodes and ignored on results merging stage on the root node. +For example, cluster consists of 2 shards and each shard contains a table with 100 bytes of data. +Then distributed query which suppose to read all the data from both tables with setting `max_bytes_to_read=150` will fail +as in total it will be 200 bytes. While query with `max_bytes_to_read_leaf=150` will succeed since leaf nodes will read +100 bytes at max. + +## read_overflow_mode_leaf {#read-overflow-mode-leaf} + +What to do when the volume of data read exceeds one of the leaf limits: ‘throw’ or ‘break’. By default, throw. + +## max_rows_to_group_by {#settings-max-rows-to-group-by} + +A maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating. + +## group_by_overflow_mode {#group-by-overflow-mode} + +What to do when the number of unique keys for aggregation exceeds the limit: ‘throw’, ‘break’, or ‘any’. By default, throw. +Using the ‘any’ value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. + +## max_bytes_before_external_group_by {#settings-max_bytes_before_external_group_by} + +Enables or disables execution of `GROUP BY` clauses in external memory. See [GROUP BY in external memory](../../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory). + +Possible values: + +- Maximum volume of RAM (in bytes) that can be used by the single [GROUP BY](../../sql-reference/statements/select/group-by.md#select-group-by-clause) operation. +- 0 — `GROUP BY` in external memory disabled. + +Default value: 0. + +## max_rows_to_sort {#max-rows-to-sort} + +A maximum number of rows before sorting. This allows you to limit memory consumption when sorting. + +## max_bytes_to_sort {#max-bytes-to-sort} + +A maximum number of bytes before sorting. + +## sort_overflow_mode {#sort-overflow-mode} + +What to do if the number of rows received before sorting exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. + +## max_result_rows {#setting-max_result_rows} + +Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. + +## max_result_bytes {#max-result-bytes} + +Limit on the number of bytes in the result. The same as the previous setting. + +## result_overflow_mode {#result-overflow-mode} + +What to do if the volume of the result exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. + +Using ‘break’ is similar to using LIMIT. `Break` interrupts execution only at the block level. This means that amount of returned rows is greater than [max_result_rows](#setting-max_result_rows), multiple of [max_block_size](../../operations/settings/settings.md#setting-max_block_size) and depends on [max_threads](../../operations/settings/settings.md#settings-max_threads). + +Example: + +``` sql +SET max_threads = 3, max_block_size = 3333; +SET max_result_rows = 3334, result_overflow_mode = 'break'; + +SELECT * +FROM numbers_mt(100000) +FORMAT Null; +``` + +Result: + +``` text +6666 rows in set. ... +``` + +## max_execution_time {#max-execution-time} + +Maximum query execution time in seconds. +At this time, it is not checked for one of the sorting stages, or when merging and finalizing aggregate functions. 
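+
+A minimal sketch of how the limit behaves (the exact error text and code may differ between versions):
+
+``` sql
+SET max_execution_time = 1;
+
+SELECT count()
+FROM system.numbers; -- an unbounded scan; cancelled after about 1 second with a "Timeout exceeded" error
+```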
+ +## timeout_overflow_mode {#timeout-overflow-mode} + +What to do if the query is run longer than ‘max_execution_time’: ‘throw’ or ‘break’. By default, throw. + +## min_execution_speed {#min-execution-speed} + +Minimal execution speed in rows per second. Checked on every data block when ‘timeout_before_checking_execution_speed’ expires. If the execution speed is lower, an exception is thrown. + +## min_execution_speed_bytes {#min-execution-speed-bytes} + +A minimum number of execution bytes per second. Checked on every data block when ‘timeout_before_checking_execution_speed’ expires. If the execution speed is lower, an exception is thrown. + +## max_execution_speed {#max-execution-speed} + +A maximum number of execution rows per second. Checked on every data block when ‘timeout_before_checking_execution_speed’ expires. If the execution speed is high, the execution speed will be reduced. + +## max_execution_speed_bytes {#max-execution-speed-bytes} + +A maximum number of execution bytes per second. Checked on every data block when ‘timeout_before_checking_execution_speed’ expires. If the execution speed is high, the execution speed will be reduced. + +## timeout_before_checking_execution_speed {#timeout-before-checking-execution-speed} + +Checks that execution speed is not too slow (no less than ‘min_execution_speed’), after the specified time in seconds has expired. + +## max_columns_to_read {#max-columns-to-read} + +A maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception. + +## max_temporary_columns {#max-temporary-columns} + +A maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception. + +## max_temporary_non_const_columns {#max-temporary-non-const-columns} + +The same thing as ‘max_temporary_columns’, but without counting constant columns. +Note that constant columns are formed fairly often when running a query, but they require approximately zero computing resources. + +## max_subquery_depth {#max-subquery-depth} + +Maximum nesting depth of subqueries. If subqueries are deeper, an exception is thrown. By default, 100. + +## max_pipeline_depth {#max-pipeline-depth} + +Maximum pipeline depth. Corresponds to the number of transformations that each data block goes through during query processing. Counted within the limits of a single server. If the pipeline depth is greater, an exception is thrown. By default, 1000. + +## max_ast_depth {#max-ast-depth} + +Maximum nesting depth of a query syntactic tree. If exceeded, an exception is thrown. +At this time, it isn’t checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. + +## max_ast_elements {#max-ast-elements} + +A maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown. +In the same way as the previous setting, it is checked only after parsing the query. By default, 50,000. + +## max_rows_in_set {#max-rows-in-set} + +A maximum number of rows for a data set in the IN clause created from a subquery. + +## max_bytes_in_set {#max-bytes-in-set} + +A maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery. 
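+
+For example, a sketch of how these limits interact with an `IN` subquery (the limit is deliberately small here):
+
+``` sql
+SET max_rows_in_set = 100;
+
+SELECT count()
+FROM numbers(1000)
+WHERE number IN
+(
+    SELECT number
+    FROM numbers(1000)
+); -- fails: the set built from the subquery exceeds 100 rows
+```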
+ +## set_overflow_mode {#set-overflow-mode} + +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. + +## max_rows_in_distinct {#max-rows-in-distinct} + +A maximum number of different rows when using DISTINCT. + +## max_bytes_in_distinct {#max-bytes-in-distinct} + +A maximum number of bytes used by a hash table when using DISTINCT. + +## distinct_overflow_mode {#distinct-overflow-mode} + +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. + +## max_rows_to_transfer {#max-rows-to-transfer} + +A maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. + +## max_bytes_to_transfer {#max-bytes-to-transfer} + +A maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. + +## transfer_overflow_mode {#transfer-overflow-mode} + +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. + +## max_rows_in_join {#settings-max_rows_in_join} + +Limits the number of rows in the hash table that is used when joining tables. + +This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and the [Join](../../engines/table-engines/special/join.md) table engine. + +If a query contains multiple joins, ClickHouse checks this setting for every intermediate result. + +ClickHouse can proceed with different actions when the limit is reached. Use the [join_overflow_mode](#settings-join_overflow_mode) setting to choose the action. + +Possible values: + +- Positive integer. +- 0 — Unlimited number of rows. + +Default value: 0. + +## max_bytes_in_join {#settings-max_bytes_in_join} + +Limits the size in bytes of the hash table used when joining tables. + +This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md). + +If the query contains joins, ClickHouse checks this setting for every intermediate result. + +ClickHouse can proceed with different actions when the limit is reached. Use [join_overflow_mode](#settings-join_overflow_mode) settings to choose the action. + +Possible values: + +- Positive integer. +- 0 — Memory control is disabled. + +Default value: 0. + +## join_overflow_mode {#settings-join_overflow_mode} + +Defines what action ClickHouse performs when any of the following join limits is reached: + +- [max_bytes_in_join](#settings-max_bytes_in_join) +- [max_rows_in_join](#settings-max_rows_in_join) + +Possible values: + +- `THROW` — ClickHouse throws an exception and breaks operation. +- `BREAK` — ClickHouse breaks operation and does not throw an exception. + +Default value: `THROW`. + +**See Also** + +- [JOIN clause](../../sql-reference/statements/select/join.md#select-join) +- [Join table engine](../../engines/table-engines/special/join.md) + +## max_partitions_per_insert_block {#max-partitions-per-insert-block} + +Limits the maximum number of partitions in a single inserted block. + +- Positive integer. +- 0 — Unlimited number of partitions. + +Default value: 100. + +**Details** + +When inserting data, ClickHouse calculates the number of partitions in the inserted block. 
If the number of partitions is more than `max_partitions_per_insert_block`, ClickHouse throws an exception with the following text: + +> “Too many partitions for single INSERT block (more than” + toString(max_parts) + “). The limit is controlled by ‘max_partitions_per_insert_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).” + +[Original article](https://clickhouse.com/docs/en/operations/settings/query_complexity/) diff --git a/docs/en/reference/operations/settings/settings-profiles.md b/docs/en/reference/operations/settings/settings-profiles.md new file mode 100644 index 00000000000..b8e1e3c21c4 --- /dev/null +++ b/docs/en/reference/operations/settings/settings-profiles.md @@ -0,0 +1,80 @@ +--- +sidebar_position: 61 +sidebar_label: Settings Profiles +--- + +# Settings Profiles {#settings-profiles} + +A settings profile is a collection of settings grouped under the same name. + +:::note +ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing settings profiles. We recommend using it. +::: + +The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. + +Settings profiles can inherit from each other. To use inheritance, indicate one or multiple `profile` settings before the other settings that are listed in the profile. In case when one setting is defined in different profiles, the latest defined is used. + +To apply all the settings in a profile, set the `profile` setting. + +Example: + +Install the `web` profile. + +``` sql +SET profile = 'web' +``` + +Settings profiles are declared in the user config file. This is usually `users.xml`. + +Example: + +``` xml + + + + + + 8 + + + + + 1000000000 + 100000000000 + + 1000000 + any + + 1000000 + 1000000000 + + 100000 + 100000000 + break + + 600 + 1000000 + 15 + + 25 + 100 + 50 + + 2 + 25 + 50 + 100 + + 1 + + +``` + +The example specifies two profiles: `default` and `web`. + +The `default` profile has a special purpose: it must always be present and is applied when starting the server. In other words, the `default` profile contains default settings. + +The `web` profile is a regular profile that can be set using the `SET` query or using a URL parameter in an HTTP query. + +[Original article](https://clickhouse.com/docs/en/operations/settings/settings_profiles/) diff --git a/docs/en/reference/operations/settings/settings-users.md b/docs/en/reference/operations/settings/settings-users.md new file mode 100644 index 00000000000..6a020be2afc --- /dev/null +++ b/docs/en/reference/operations/settings/settings-users.md @@ -0,0 +1,164 @@ +--- +sidebar_position: 63 +sidebar_label: User Settings +--- + +# User Settings {#user-settings} + +The `users` section of the `user.xml` configuration file contains user settings. + +:::note +ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing users. We recommend using it. 
+::: + +Structure of the `users` section: + +``` xml + + + + + + + + 0|1 + + + + + profile_name + + default + default + + + + expression + + + + + + +``` + +### user_name/password {#user-namepassword} + +Password can be specified in plaintext or in SHA256 (hex format). + +- To assign a password in plaintext (**not recommended**), place it in a `password` element. + + For example, `qwerty`. The password can be left blank. + + + +- To assign a password using its SHA256 hash, place it in a `password_sha256_hex` element. + + For example, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. + + Example of how to generate a password from shell: + + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' + + The first line of the result is the password. The second line is the corresponding SHA256 hash. + + + +- For compatibility with MySQL clients, password can be specified in double SHA1 hash. Place it in `password_double_sha1_hex` element. + + For example, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. + + Example of how to generate a password from shell: + + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' + + The first line of the result is the password. The second line is the corresponding double SHA1 hash. + +### access_management {#access_management-user-setting} + +This setting enables or disables using of SQL-driven [access control and account management](../../operations/access-rights.md#access-control) for the user. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +### user_name/networks {#user-namenetworks} + +List of networks from which the user can connect to the ClickHouse server. + +Each element of the list can have one of the following forms: + +- `` — IP address or network mask. + + Examples: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. + +- `` — Hostname. + + Example: `example01.host.ru`. + + To check access, a DNS query is performed, and all returned IP addresses are compared to the peer address. + +- `` — Regular expression for hostnames. + + Example, `^example\d\d-\d\d-\d\.host\.ru$` + + To check access, a [DNS PTR query](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) is performed for the peer address and then the specified regexp is applied. Then, another DNS query is performed for the results of the PTR query and all the received addresses are compared to the peer address. We strongly recommend that regexp ends with $. + +All results of DNS requests are cached until the server restarts. + +**Examples** + +To open access for user from any network, specify: + +``` xml +::/0 +``` + +:::warning +It’s insecure to open access from any network unless you have a firewall properly configured or the server is not directly connected to Internet. +::: + +To open access only from localhost, specify: + +``` xml +::1 +127.0.0.1 +``` + +### user_name/profile {#user-nameprofile} + +You can assign a settings profile for the user. Settings profiles are configured in a separate section of the `users.xml` file. For more information, see [Profiles of Settings](../../operations/settings/settings-profiles.md). + +### user_name/quota {#user-namequota} + +Quotas allow you to track or limit resource usage over a period of time. Quotas are configured in the `quotas` +section of the `users.xml` configuration file. 
+ +You can assign a quotas set for the user. For a detailed description of quotas configuration, see [Quotas](../../operations/quotas.md#quotas). + +### user_name/databases {#user-namedatabases} + +In this section, you can limit rows that are returned by ClickHouse for `SELECT` queries made by the current user, thus implementing basic row-level security. + +**Example** + +The following configuration forces that user `user1` can only see the rows of `table1` as the result of `SELECT` queries, where the value of the `id` field is 1000. + +``` xml + + + + + id = 1000 + + + + +``` + +The `filter` can be any expression resulting in a [UInt8](../../sql-reference/data-types/int-uint.md)-type value. It usually contains comparisons and logical operators. Rows from `database_name.table1` where filter results to 0 are not returned for this user. The filtering is incompatible with `PREWHERE` operations and disables `WHERE→PREWHERE` optimization. + +[Original article](https://clickhouse.com/docs/en/operations/settings/settings_users/) diff --git a/docs/en/reference/operations/settings/settings.md b/docs/en/reference/operations/settings/settings.md new file mode 100644 index 00000000000..30d7dd98ee7 --- /dev/null +++ b/docs/en/reference/operations/settings/settings.md @@ -0,0 +1,4232 @@ +# Settings {#settings} + +## distributed_product_mode {#distributed-product-mode} + +Changes the behaviour of [distributed subqueries](../../sql-reference/operators/in.md). + +ClickHouse applies this setting when the query contains the product of distributed tables, i.e. when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. + +Restrictions: + +- Only applied for IN and JOIN subqueries. +- Only if the FROM section uses a distributed table containing more than one shard. +- If the subquery concerns a distributed table containing more than one shard. +- Not used for a table-valued [remote](../../sql-reference/table-functions/remote.md) function. + +Possible values: + +- `deny` — Default value. Prohibits using these types of subqueries (returns the “Double-distributed in/JOIN subqueries is denied” exception). +- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` +- `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — Allows the use of these types of subqueries. + +## prefer_global_in_and_join {#prefer-global-in-and-join} + +Enables the replacement of `IN`/`JOIN` operators with `GLOBAL IN`/`GLOBAL JOIN`. + +Possible values: + +- 0 — Disabled. `IN`/`JOIN` operators are not replaced with `GLOBAL IN`/`GLOBAL JOIN`. +- 1 — Enabled. `IN`/`JOIN` operators are replaced with `GLOBAL IN`/`GLOBAL JOIN`. + +Default value: `0`. + +**Usage** + +Although `SET distributed_product_mode=global` can change the queries behavior for the distributed tables, it's not suitable for local tables or tables from external resources. Here is when the `prefer_global_in_and_join` setting comes into play. + +For example, we have query serving nodes that contain local tables, which are not suitable for distribution. We need to scatter their data on the fly during distributed processing with the `GLOBAL` keyword — `GLOBAL IN`/`GLOBAL JOIN`. + +Another use case of `prefer_global_in_and_join` is accessing tables created by external engines. This setting helps to reduce the number of calls to external sources while joining such tables: only one call per query. 
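+
+A hedged sketch of the effect (the table names are hypothetical): with the setting enabled, an ordinary `JOIN` with a local table behaves as if `GLOBAL JOIN` had been written, so the local data is sent to the remote servers once per query:
+
+``` sql
+SET prefer_global_in_and_join = 1;
+
+SELECT d.id, l.name
+FROM distributed_table AS d
+INNER JOIN local_lookup_table AS l ON d.id = l.id;
+```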
+ +**See also:** + +- [Distributed subqueries](../../sql-reference/operators/in.md#select-distributed-subqueries) for more information on how to use `GLOBAL IN`/`GLOBAL JOIN` + +## enable_optimize_predicate_expression {#enable-optimize-predicate-expression} + +Turns on predicate pushdown in `SELECT` queries. + +Predicate pushdown may significantly reduce network traffic for distributed queries. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +Usage + +Consider the following queries: + +1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` +2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` + +If `enable_optimize_predicate_expression = 1`, then the execution time of these queries is equal because ClickHouse applies `WHERE` to the subquery when processing it. + +If `enable_optimize_predicate_expression = 0`, then the execution time of the second query is much longer because the `WHERE` clause applies to all the data after the subquery finishes. + +## fallback_to_stale_replicas_for_distributed_queries {#settings-fallback_to_stale_replicas_for_distributed_queries} + +Forces a query to an out-of-date replica if updated data is not available. See [Replication](../../engines/table-engines/mergetree-family/replication.md). + +ClickHouse selects the most relevant from the outdated replicas of the table. + +Used when performing `SELECT` from a distributed table that points to replicated tables. + +By default, 1 (enabled). + +## force_index_by_date {#settings-force_index_by_date} + +Disables query execution if the index can’t be used by date. + +Works with tables in the MergeTree family. + +If `force_index_by_date=1`, ClickHouse checks whether the query has a date key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For example, the condition `Date != ' 2000-01-01 '` is acceptable even when it matches all the data in the table (i.e., running the query requires a full scan). For more information about ranges of data in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). + +## force_primary_key {#force-primary-key} + +Disables query execution if indexing by the primary key is not possible. + +Works with tables in the MergeTree family. + +If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). + +## use_skip_indexes {#settings-use_skip_indexes} + +Use data skipping indexes during query execution. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +## force_data_skipping_indices {#settings-force_data_skipping_indices} + +Disables query execution if passed data skipping indices wasn't used. 
+ +Consider the following example: + +```sql +CREATE TABLE data +( + key Int, + d1 Int, + d1_null Nullable(Int), + INDEX d1_idx d1 TYPE minmax GRANULARITY 1, + INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +SELECT * FROM data_01515; +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error. +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- query will produce INDEX_NOT_USED error. +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; -- Ok. +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`'; -- Ok (example of full featured parser). +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- query will produce INDEX_NOT_USED error, since d1_null_idx is not used. +SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok. +``` + +Works with tables in the MergeTree family. + +## format_schema {#format-schema} + +This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. + +## fsync_metadata {#fsync-metadata} + +Enables or disables [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) when writing `.sql` files. Enabled by default. + +It makes sense to disable it if the server has millions of tiny tables that are constantly being created and destroyed. + +## function_range_max_elements_in_block {#settings-function_range_max_elements_in_block} + +Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block). + +Possible values: + +- Positive integer. + +Default value: `500,000,000`. + +**See Also** + +- [max_block_size](#setting-max_block_size) +- [min_insert_block_size_rows](#min-insert-block-size-rows) + +## enable_http_compression {#settings-enable_http_compression} + +Enables or disables data compression in the response to an HTTP request. + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +## http_zlib_compression_level {#settings-http_zlib_compression_level} + +Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#settings-enable_http_compression). + +Possible values: Numbers from 1 to 9. + +Default value: 3. + +## http_native_compression_disable_checksumming_on_decompress {#settings-http_native_compression_disable_checksumming_on_decompress} + +Enables or disables checksum verification when decompressing the HTTP POST data from the client. Used only for ClickHouse native compression format (not used with `gzip` or `deflate`). + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +## http_max_uri_size {#http-max-uri-size} + +Sets the maximum URI length of an HTTP request. + +Possible values: + +- Positive integer. + +Default value: 1048576. 
+ +## table_function_remote_max_addresses {#table_function_remote_max_addresses} + +Sets the maximum number of addresses generated from patterns for the [remote](../../sql-reference/table-functions/remote.md) function. + +Possible values: + +- Positive integer. + +Default value: `1000`. + +## glob_expansion_max_elements {#glob_expansion_max_elements } + +Sets the maximum number of addresses generated from patterns for external storages and table functions (like [url](../../sql-reference/table-functions/url.md)) except the `remote` function. + +Possible values: + +- Positive integer. + +Default value: `1000`. + +## send_progress_in_http_headers {#settings-send_progress_in_http_headers} + +Enables or disables `X-ClickHouse-Progress` HTTP response headers in `clickhouse-server` responses. + +For more information, read the [HTTP interface description](../../interfaces/http.md). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +## max_http_get_redirects {#setting-max_http_get_redirects} + +Limits the maximum number of HTTP GET redirect hops for [URL](../../engines/table-engines/special/url.md)-engine tables. The setting applies to both types of tables: those created by the [CREATE TABLE](../../sql-reference/statements/create/table.md) query and by the [url](../../sql-reference/table-functions/url.md) table function. + +Possible values: + +- Any positive integer number of hops. +- 0 — No hops allowed. + +Default value: 0. + +## input_format_allow_errors_num {#settings-input_format_allow_errors_num} + +Sets the maximum number of acceptable errors when reading from text formats (CSV, TSV, etc.). + +The default value is 0. + +Always pair it with `input_format_allow_errors_ratio`. + +If an error occurred while reading rows but the error counter is still less than `input_format_allow_errors_num`, ClickHouse ignores the row and moves on to the next one. + +If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. + +## input_format_allow_errors_ratio {#settings-input_format_allow_errors_ratio} + +Sets the maximum percentage of errors allowed when reading from text formats (CSV, TSV, etc.). +The percentage of errors is set as a floating-point number between 0 and 1. + +The default value is 0. + +Always pair it with `input_format_allow_errors_num`. + +If an error occurred while reading rows but the error counter is still less than `input_format_allow_errors_ratio`, ClickHouse ignores the row and moves on to the next one. + +If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. + +## input_format_parquet_import_nested {#input_format_parquet_import_nested} + +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. + +Possible values: + +- 0 — Data can not be inserted into `Nested` columns as an array of structs. +- 1 — Data can be inserted into `Nested` columns as an array of structs. + +Default value: `0`. + +## input_format_arrow_import_nested {#input_format_arrow_import_nested} + +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format. 
+ +Possible values: + +- 0 — Data can not be inserted into `Nested` columns as an array of structs. +- 1 — Data can be inserted into `Nested` columns as an array of structs. + +Default value: `0`. + +## input_format_orc_import_nested {#input_format_orc_import_nested} + +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [ORC](../../interfaces/formats.md#data-format-orc) input format. + +Possible values: + +- 0 — Data can not be inserted into `Nested` columns as an array of structs. +- 1 — Data can be inserted into `Nested` columns as an array of structs. + +Default value: `0`. + +## input_format_values_interpret_expressions {#settings-input_format_values_interpret_expressions} + +Enables or disables the full SQL parser if the fast stream parser can’t parse the data. This setting is used only for the [Values](../../interfaces/formats.md#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../sql-reference/syntax.md) section. + +Possible values: + +- 0 — Disabled. + + In this case, you must provide formatted data. See the [Formats](../../interfaces/formats.md) section. + +- 1 — Enabled. + + In this case, you can use an SQL expression as a value, but data insertion is much slower this way. If you insert only formatted data, then ClickHouse behaves as if the setting value is 0. + +Default value: 1. + +Example of Use + +Insert the [DateTime](../../sql-reference/data-types/datetime.md) type value with the different settings. + +``` sql +SET input_format_values_interpret_expressions = 0; +INSERT INTO datetime_t VALUES (now()) +``` + +``` text +Exception on client: +Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) +``` + +``` sql +SET input_format_values_interpret_expressions = 1; +INSERT INTO datetime_t VALUES (now()) +``` + +``` text +Ok. +``` + +The last query is equivalent to the following: + +``` sql +SET input_format_values_interpret_expressions = 0; +INSERT INTO datetime_t SELECT now() +``` + +``` text +Ok. +``` + +## input_format_values_deduce_templates_of_expressions {#settings-input_format_values_deduce_templates_of_expressions} + +Enables or disables template deduction for SQL expressions in [Values](../../interfaces/formats.md#data-format-values) format. It allows parsing and interpreting expressions in `Values` much faster if expressions in consecutive rows have the same structure. ClickHouse tries to deduce the template of an expression, parse the following rows using this template and evaluate the expression on a batch of successfully parsed rows. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +For the following query: + +``` sql +INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (upper('Values')), ... +``` + +- If `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=0`, expressions are interpreted separately for each row (this is very slow for large number of rows). +- If `input_format_values_interpret_expressions=0` and `format_values_deduce_templates_of_expressions=1`, expressions in the first, second and third rows are parsed using template `lower(String)` and interpreted together, expression in the forth row is parsed with another template (`upper(String)`). 
+- If `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=1`, the same as in previous case, but also allows fallback to interpreting expressions separately if it’s not possible to deduce template. + +## input_format_values_accurate_types_of_literals {#settings-input-format-values-accurate-types-of-literals} + +This setting is used only when `input_format_values_deduce_templates_of_expressions = 1`. Expressions for some column may have the same structure, but contain numeric literals of different types, e.g. + +``` sql +(..., abs(0), ...), -- UInt64 literal +(..., abs(3.141592654), ...), -- Float64 literal +(..., abs(-1), ...), -- Int64 literal +``` + +Possible values: + +- 0 — Disabled. + + In this case, ClickHouse may use a more general type for some literals (e.g., `Float64` or `Int64` instead of `UInt64` for `42`), but it may cause overflow and precision issues. + +- 1 — Enabled. + + In this case, ClickHouse checks the actual type of literal and uses an expression template of the corresponding type. In some cases, it may significantly slow down expression evaluation in `Values`. + +Default value: 1. + +## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields} + +When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv), [TabSeparated](../../interfaces/formats.md#tabseparated) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. + +:::note +When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. +::: + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +## input_format_tsv_empty_as_default {#settings-input-format-tsv-empty-as-default} + +When enabled, replace empty input fields in TSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Disabled by default. + +## input_format_csv_empty_as_default {#settings-input-format-csv-empty-as-default} + +When enabled, replace empty input fields in CSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. + +Enabled by default. + +## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number} + +When enabled, always treat enum values as enum ids for TSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing. + +Possible values: + +- 0 — Enum values are parsed as values or as enum IDs. +- 1 — Enum values are parsed only as enum IDs. + +Default value: 0. + +**Example** + +Consider the table: + +```sql +CREATE TABLE table_with_enum_column_for_tsv_insert (Id Int32,Value Enum('first' = 1, 'second' = 2)) ENGINE=Memory(); +``` + +When the `input_format_tsv_enum_as_number` setting is enabled: + +Query: + +```sql +SET input_format_tsv_enum_as_number = 1; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; +SELECT * FROM table_with_enum_column_for_tsv_insert; +``` + +Result: + +```text +┌──Id─┬─Value──┐ +│ 102 │ second │ +└─────┴────────┘ +``` + +Query: + +```sql +SET input_format_tsv_enum_as_number = 1; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first'; +``` + +throws an exception. 
+ +When the `input_format_tsv_enum_as_number` setting is disabled: + +Query: + +```sql +SET input_format_tsv_enum_as_number = 0; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; +INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 103 'first'; +SELECT * FROM table_with_enum_column_for_tsv_insert; +``` + +Result: + +```text +┌──Id─┬─Value──┐ +│ 102 │ second │ +└─────┴────────┘ +┌──Id─┬─Value──┐ +│ 103 │ first │ +└─────┴────────┘ +``` + +## input_format_null_as_default {#settings-input-format-null-as-default} + +Enables or disables the initialization of [NULL](../../sql-reference/syntax.md#null-literal) fields with [default values](../../sql-reference/statements/create/table.md#create-default-values), if data type of these fields is not [nullable](../../sql-reference/data-types/nullable.md#data_type-nullable). +If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. + +This setting is applicable to [INSERT ... VALUES](../../sql-reference/statements/insert-into.md) queries for text input formats. + +Possible values: + +- 0 — Inserting `NULL` into a not nullable column causes an exception. +- 1 — `NULL` fields are initialized with default column values. + +Default value: `1`. + +## insert_null_as_default {#insert_null_as_default} + +Enables or disables the insertion of [default values](../../sql-reference/statements/create/table.md#create-default-values) instead of [NULL](../../sql-reference/syntax.md#null-literal) into columns with not [nullable](../../sql-reference/data-types/nullable.md#data_type-nullable) data type. +If column type is not nullable and this setting is disabled, then inserting `NULL` causes an exception. If column type is nullable, then `NULL` values are inserted as is, regardless of this setting. + +This setting is applicable to [INSERT ... SELECT](../../sql-reference/statements/insert-into.md#insert_query_insert-select) queries. Note that `SELECT` subqueries may be concatenated with `UNION ALL` clause. + +Possible values: + +- 0 — Inserting `NULL` into a not nullable column causes an exception. +- 1 — Default column value is inserted instead of `NULL`. + +Default value: `1`. + +## input_format_skip_unknown_fields {#settings-input-format-skip-unknown-fields} + +Enables or disables skipping insertion of extra data. + +When writing data, ClickHouse throws an exception if input data contain columns that do not exist in the target table. If skipping is enabled, ClickHouse does not insert extra data and does not throw an exception. + +Supported formats: + +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [TSKV](../../interfaces/formats.md#tskv) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +## input_format_import_nested_json {#settings-input_format_import_nested_json} + +Enables or disables the insertion of JSON data with nested objects. + +Supported formats: + +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +See also: + +- [Usage of Nested Structures](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. 
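+
+As a brief sketch of how this works in practice (the table and column names below are illustrative), a nested JSON object can be inserted into a `Nested` column once the setting is enabled:
+
+```sql
+CREATE TABLE nested_json_demo (n Nested(s String, i Int32)) ENGINE = Memory;
+
+SET input_format_import_nested_json = 1;
+INSERT INTO nested_json_demo FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}}
+```
+
+After the insert, `SELECT * FROM nested_json_demo` should return the arrays `['abc','def']` and `[1,23]` in the `n.s` and `n.i` columns.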
+
+## input_format_with_names_use_header {#settings-input-format-with-names-use-header}
+
+Enables or disables checking the column order when inserting data.
+
+To improve insert performance, we recommend disabling this check if you are sure that the column order of the input data is the same as in the target table.
+
+Supported formats:
+
+- [CSVWithNames](../../interfaces/formats.md#csvwithnames)
+- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes)
+- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames)
+- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes)
+- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames)
+- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes)
+- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames)
+- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes)
+- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames-rowbinarywithnames)
+- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes)
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: 1.
+
+## input_format_with_types_use_header {#settings-input-format-with-types-use-header}
+
+Controls whether the format parser should check that the data types of the input data match the data types of the target table.
+
+Supported formats:
+
+- [CSVWithNames](../../interfaces/formats.md#csvwithnames)
+- [CSVWithNamesAndTypes](../../interfaces/formats.md#csvwithnamesandtypes)
+- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames)
+- [TabSeparatedWithNamesAndTypes](../../interfaces/formats.md#tabseparatedwithnamesandtypes)
+- [JSONCompactEachRowWithNames](../../interfaces/formats.md#jsoncompacteachrowwithnames)
+- [JSONCompactEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompacteachrowwithnamesandtypes)
+- [JSONCompactStringsEachRowWithNames](../../interfaces/formats.md#jsoncompactstringseachrowwithnames)
+- [JSONCompactStringsEachRowWithNamesAndTypes](../../interfaces/formats.md#jsoncompactstringseachrowwithnamesandtypes)
+- [RowBinaryWithNames](../../interfaces/formats.md#rowbinarywithnames-rowbinarywithnames)
+- [RowBinaryWithNamesAndTypes](../../interfaces/formats.md#rowbinarywithnamesandtypes-rowbinarywithnamesandtypes)
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: 1.
+
+## date_time_input_format {#settings-date_time_input_format}
+
+Allows choosing a parser of the text representation of date and time.
+
+The setting does not apply to [date and time functions](../../sql-reference/functions/date-time-functions.md).
+
+Possible values:
+
+- `'best_effort'` — Enables extended parsing.
+
+    ClickHouse can parse the basic `YYYY-MM-DD HH:MM:SS` format and all [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) date and time formats. For example, `'2018-06-08T01:02:03.000Z'`.
+
+- `'basic'` — Use the basic parser.
+
+    ClickHouse can parse only the basic `YYYY-MM-DD HH:MM:SS` or `YYYY-MM-DD` format. For example, `2019-08-20 10:18:56` or `2019-08-20`.
+
+Default value: `'basic'`.
+ +See also: + +- [DateTime data type.](../../sql-reference/data-types/datetime.md) +- [Functions for working with dates and times.](../../sql-reference/functions/date-time-functions.md) + +## date_time_output_format {#settings-date_time_output_format} + +Allows choosing different output formats of the text representation of date and time. + +Possible values: + +- `simple` - Simple output format. + + ClickHouse output date and time `YYYY-MM-DD hh:mm:ss` format. For example, `2019-08-20 10:18:56`. The calculation is performed according to the data type's time zone (if present) or server time zone. + +- `iso` - ISO output format. + + ClickHouse output date and time in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DDThh:mm:ssZ` format. For example, `2019-08-20T10:18:56Z`. Note that output is in UTC (`Z` means UTC). + +- `unix_timestamp` - Unix timestamp output format. + + ClickHouse output date and time in [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time) format. For example `1566285536`. + +Default value: `simple`. + +See also: + +- [DateTime data type.](../../sql-reference/data-types/datetime.md) +- [Functions for working with dates and times.](../../sql-reference/functions/date-time-functions.md) + +## join_default_strictness {#settings-join_default_strictness} + +Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md#select-join). + +Possible values: + +- `ALL` — If the right table has several matching rows, ClickHouse creates a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) from matching rows. This is the normal `JOIN` behaviour from standard SQL. +- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` and `ALL` are the same. +- `ASOF` — For joining sequences with an uncertain match. +- `Empty string` — If `ALL` or `ANY` is not specified in the query, ClickHouse throws an exception. + +Default value: `ALL`. + +## join_algorithm {#settings-join_algorithm} + +Specifies [JOIN](../../sql-reference/statements/select/join.md) algorithm. + +Possible values: + +- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. +- `partial_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) is used. +- `prefer_partial_merge` — ClickHouse always tries to use `merge` join if possible. +- `auto` — ClickHouse tries to change `hash` join to `merge` join on the fly to avoid out of memory. + +Default value: `hash`. + +When using `hash` algorithm the right part of `JOIN` is uploaded into RAM. + +When using `partial_merge` algorithm ClickHouse sorts the data and dumps it to the disk. The `merge` algorithm in ClickHouse differs a bit from the classic realization. First ClickHouse sorts the right table by [join key](../../sql-reference/statements/select/join.md#select-join) in blocks and creates min-max index for sorted blocks. Then it sorts parts of left table by `join key` and joins them over right table. The min-max index is also used to skip unneeded right table blocks. + +## join_any_take_last_row {#settings-join_any_take_last_row} + +Changes behaviour of join operations with `ANY` strictness. + +:::warning +This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables. +::: + +Possible values: + +- 0 — If the right table has more than one matching row, only the first one found is joined. 
+- 1 — If the right table has more than one matching row, only the last one found is joined. + +Default value: 0. + +See also: + +- [JOIN clause](../../sql-reference/statements/select/join.md#select-join) +- [Join table engine](../../engines/table-engines/special/join.md) +- [join_default_strictness](#settings-join_default_strictness) + +## join_use_nulls {#join_use_nulls} + +Sets the type of [JOIN](../../sql-reference/statements/select/join.md) behaviour. When merging tables, empty cells may appear. ClickHouse fills them differently based on this setting. + +Possible values: + +- 0 — The empty cells are filled with the default value of the corresponding field type. +- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable), and empty cells are filled with [NULL](../../sql-reference/syntax.md). + +Default value: 0. + +## partial_merge_join_optimizations {#partial_merge_join_optimizations} + +Disables optimizations in partial merge join algorithm for [JOIN](../../sql-reference/statements/select/join.md) queries. + +By default, this setting enables improvements that could lead to wrong results. If you see suspicious results in your queries, disable optimizations by this setting. Optimizations can be different in different versions of the ClickHouse server. + +Possible values: + +- 0 — Optimizations disabled. +- 1 — Optimizations enabled. + +Default value: 1. + +## partial_merge_join_rows_in_right_blocks {#partial_merge_join_rows_in_right_blocks} + +Limits sizes of right-hand join data blocks in partial merge join algorithm for [JOIN](../../sql-reference/statements/select/join.md) queries. + +ClickHouse server: + +1. Splits right-hand join data into blocks with up to the specified number of rows. +2. Indexes each block with its minimum and maximum values. +3. Unloads prepared blocks to disk if it is possible. + +Possible values: + +- Any positive integer. Recommended range of values: \[1000, 100000\]. + +Default value: 65536. + +## join_on_disk_max_files_to_merge {#join_on_disk_max_files_to_merge} + +Limits the number of files allowed for parallel sorting in MergeJoin operations when they are executed on disk. + +The bigger the value of the setting, the more RAM used and the less disk I/O needed. + +Possible values: + +- Any positive integer, starting from 2. + +Default value: 64. + +## any_join_distinct_right_table_keys {#any_join_distinct_right_table_keys} + +Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations. + +:::warning +Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour. +::: + +When the legacy behaviour enabled: + +- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping. +- Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do. + +When the legacy behaviour disabled: + +- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations. +- Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables. + +Possible values: + +- 0 — Legacy behaviour is disabled. +- 1 — Legacy behaviour is enabled. + +Default value: 0. 
+ +See also: + +- [JOIN strictness](../../sql-reference/statements/select/join.md#join-settings) + +## temporary_files_codec {#temporary_files_codec} + +Sets compression codec for temporary files used in sorting and joining operations on disk. + +Possible values: + +- LZ4 — [LZ4](https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)) compression is applied. +- NONE — No compression is applied. + +Default value: LZ4. + +## max_block_size {#setting-max_block_size} + +In ClickHouse, data is processed by blocks (sets of column parts). The internal processing cycles for a single block are efficient enough, but there are noticeable expenditures on each block. The `max_block_size` setting is a recommendation for what size of the block (in a count of rows) to load from tables. The block size shouldn’t be too small, so that the expenditures on each block are still noticeable, but not too large so that the query with LIMIT that is completed after the first block is processed quickly. The goal is to avoid consuming too much memory when extracting a large number of columns in multiple threads and to preserve at least some cache locality. + +Default value: 65,536. + +Blocks the size of `max_block_size` are not always loaded from the table. If it is obvious that less data needs to be retrieved, a smaller block is processed. + +## preferred_block_size_bytes {#preferred-block-size-bytes} + +Used for the same purpose as `max_block_size`, but it sets the recommended block size in bytes by adapting it to the number of rows in the block. +However, the block size cannot be more than `max_block_size` rows. +By default: 1,000,000. It only works when reading from MergeTree engines. + +## merge_tree_min_rows_for_concurrent_read {#setting-merge-tree-min-rows-for-concurrent-read} + +If the number of rows to be read from a file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. + +Possible values: + +- Positive integer. + +Default value: `163840`. + +## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem} + +The minimum number of lines to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +Default value: `163840`. + +## merge_tree_min_bytes_for_concurrent_read {#setting-merge-tree-min-bytes-for-concurrent-read} + +If the number of bytes to read from one file of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine table exceeds `merge_tree_min_bytes_for_concurrent_read`, then ClickHouse tries to concurrently read from this file in several threads. + +Possible value: + +- Positive integer. + +Default value: `251658240`. + +## merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem {#merge-tree-min-bytes-for-concurrent-read-for-remote-filesystem} + +The minimum number of bytes to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem. + +Possible values: + +- Positive integer. + +Default value: `251658240`. 
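+
+As a hedged sketch of how these thresholds can be adjusted per query (the table name `hits` is illustrative), lowering the row threshold encourages ClickHouse to read even a relatively small file with several threads:
+
+```sql
+SELECT count()
+FROM hits
+SETTINGS merge_tree_min_rows_for_concurrent_read = 20480, max_threads = 8;
+```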
+ +## merge_tree_min_rows_for_seek {#setting-merge-tree-min-rows-for-seek} + +If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file but reads the data sequentially. + +Possible values: + +- Any positive integer. + +Default value: 0. + +## merge_tree_min_bytes_for_seek {#setting-merge-tree-min-bytes-for-seek} + +If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` bytes, then ClickHouse sequentially reads a range of file that contains both blocks, thus avoiding extra seek. + +Possible values: + +- Any positive integer. + +Default value: 0. + +## merge_tree_coarse_index_granularity {#setting-merge-tree-coarse-index-granularity} + +When searching for data, ClickHouse checks the data marks in the index file. If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. + +Possible values: + +- Any positive even integer. + +Default value: 8. + +## merge_tree_max_rows_to_use_cache {#setting-merge-tree-max-rows-to-use-cache} + +If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +Possible values: + +- Any positive integer. + +Default value: 128 ✕ 8192. + +## merge_tree_max_bytes_to_use_cache {#setting-merge-tree-max-bytes-to-use-cache} + +If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it does not use the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +Possible values: + +- Any positive integer. + +Default value: 2013265920. + +## min_bytes_to_use_direct_io {#settings-min-bytes-to-use-direct-io} + +The minimum data volume required for using direct I/O access to the storage disk. + +ClickHouse uses this setting when reading data from tables. If the total storage volume of all the data to be read exceeds `min_bytes_to_use_direct_io` bytes, then ClickHouse reads the data from the storage disk with the `O_DIRECT` option. + +Possible values: + +- 0 — Direct I/O is disabled. +- Positive integer. + +Default value: 0. + +## network_compression_method {#network_compression_method} + +Sets the method of data compression that is used for communication between servers and between server and [clickhouse-client](../../interfaces/cli.md). + +Possible values: + +- `LZ4` — sets LZ4 compression method. +- `ZSTD` — sets ZSTD compression method. + +Default value: `LZ4`. 
+ +**See Also** + +- [network_zstd_compression_level](#network_zstd_compression_level) + +## network_zstd_compression_level {#network_zstd_compression_level} + +Adjusts the level of ZSTD compression. Used only when [network_compression_method](#network_compression_method) is set to `ZSTD`. + +Possible values: + +- Positive integer from 1 to 15. + +Default value: `1`. + +## log_queries {#settings-log-queries} + +Setting up query logging. + +Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) server configuration parameter. + +Example: + +``` text +log_queries=1 +``` + +## log_queries_min_query_duration_ms {#settings-log-queries-min-query-duration-ms} + +If enabled (non-zero), queries faster then the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables: + +- `system.query_log` +- `system.query_thread_log` + +Only the queries with the following type will get to the log: + +- `QUERY_FINISH` +- `EXCEPTION_WHILE_PROCESSING` + +- Type: milliseconds +- Default value: 0 (any query) + +## log_queries_min_type {#settings-log-queries-min-type} + +`query_log` minimal type to log. + +Possible values: +- `QUERY_START` (`=1`) +- `QUERY_FINISH` (`=2`) +- `EXCEPTION_BEFORE_START` (`=3`) +- `EXCEPTION_WHILE_PROCESSING` (`=4`) + +Default value: `QUERY_START`. + +Can be used to limit which entities will go to `query_log`, say you are interested only in errors, then you can use `EXCEPTION_WHILE_PROCESSING`: + +``` text +log_queries_min_type='EXCEPTION_WHILE_PROCESSING' +``` + +## log_query_threads {#settings-log-query-threads} + +Setting up query threads logging. + +Query threads log into [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting have effect only when [log_queries](#settings-log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_thread_log) server configuration parameter. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: `1`. + +**Example** + +``` text +log_query_threads=1 +``` + +## log_query_views {#settings-log-query-views} + +Setting up query views logging. + +When a query run by ClickHouse with this setup on has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_views_log) server configuration parameter. + +Example: + +``` text +log_query_views=1 +``` + +## log_formatted_queries {#settings-log-formatted-queries} + +Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table. + +Possible values: + +- 0 — Formatted queries are not logged in the system table. +- 1 — Formatted queries are logged in the system table. + +Default value: `0`. + +## log_comment {#settings-log-comment} + +Specifies the value for the `log_comment` field of the [system.query_log](../system-tables/query_log.md) table and comment text for the server log. + +It can be used to improve the readability of server logs. 
Additionally, it helps to select queries related to the test from the `system.query_log` after running [clickhouse-test](../../development/tests.md). + +Possible values: + +- Any string no longer than [max_query_size](#settings-max_query_size). If length is exceeded, the server throws an exception. + +Default value: empty string. + +**Example** + +Query: + +``` sql +SET log_comment = 'log_comment test', log_queries = 1; +SELECT 1; +SYSTEM FLUSH LOGS; +SELECT type, query FROM system.query_log WHERE log_comment = 'log_comment test' AND event_date >= yesterday() ORDER BY event_time DESC LIMIT 2; +``` + +Result: + +``` text +┌─type────────┬─query─────┐ +│ QueryStart │ SELECT 1; │ +│ QueryFinish │ SELECT 1; │ +└─────────────┴───────────┘ +``` + +## max_insert_block_size {#settings-max_insert_block_size} + +The size of blocks (in a count of rows) to form for insertion into a table. +This setting only applies in cases when the server forms the blocks. +For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. +But when using clickhouse-client, the client parses the data itself, and the ‘max_insert_block_size’ setting on the server does not affect the size of the inserted blocks. +The setting also does not have a purpose when using INSERT SELECT, since data is inserted using the same blocks that are formed after SELECT. + +Default value: 1,048,576. + +The default is slightly more than `max_block_size`. The reason for this is because certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM. + +## min_insert_block_size_rows {#min-insert-block-size-rows} + +Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. + +Default value: 1048576. + +## min_insert_block_size_bytes {#min-insert-block-size-bytes} + +Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. + +Default value: 268435456. + +## max_replica_delay_for_distributed_queries {#settings-max_replica_delay_for_distributed_queries} + +Disables lagging replicas for distributed queries. See [Replication](../../engines/table-engines/mergetree-family/replication.md). + +Sets the time in seconds. If a replica lags more than the set value, this replica is not used. + +Default value: 300. + +Used when performing `SELECT` from a distributed table that points to replicated tables. + +## max_threads {#settings-max_threads} + +The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the ‘max_distributed_connections’ parameter). + +This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. +For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least ‘max_threads’ number of threads, then ‘max_threads’ are used. + +Default value: the number of physical CPU cores. + +For queries that are completed quickly because of a LIMIT, you can set a lower ‘max_threads’. 
For example, if the necessary number of entries are located in every block and max_threads = 8, then 8 blocks are retrieved, although it would have been enough to read just one. + +The smaller the `max_threads` value, the less memory is consumed. + +## max_insert_threads {#settings-max-insert-threads} + +The maximum number of threads to execute the `INSERT SELECT` query. + +Possible values: + +- 0 (or 1) — `INSERT SELECT` no parallel execution. +- Positive integer. Bigger than 1. + +Default value: 0. + +Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#settings-max_threads) setting. +Higher values will lead to higher memory usage. + +## max_compress_block_size {#max-compress-block-size} + +The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. + +:::warning +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: + +Don’t confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). + +## min_compress_block_size {#min-compress-block-size} + +For [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least `min_compress_block_size`. By default, 65,536. + +The actual size of the block, if the uncompressed data is less than `max_compress_block_size`, is no less than this value and no less than the volume of data for one mark. + +Let’s look at an example. Assume that `index_granularity` was set to 8192 during table creation. + +We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min_compress_block_size = 65,536, a compressed block will be formed for every two marks. + +We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won’t be decompressed. + +:::warning +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: + +## max_query_size {#settings-max_query_size} + +The maximum part of a query that can be taken to RAM for parsing with the SQL parser. +The INSERT query also contains data for INSERT that is processed by a separate stream parser (that consumes O(1) RAM), which is not included in this restriction. + +Default value: 256 KiB. + +## max_parser_depth {#max_parser_depth} + +Limits maximum recursion depth in the recursive descent parser. Allows controlling the stack size. + +Possible values: + +- Positive integer. +- 0 — Recursion depth is unlimited. + +Default value: 1000. + +## interactive_delay {#interactive-delay} + +The interval in microseconds for checking whether request execution has been cancelled and sending the progress. + +Default value: 100,000 (checks for cancelling and sends the progress ten times per second). 
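+
+For instance, the `max_insert_threads` setting described above can be combined with `max_threads` in a single `INSERT SELECT`. The table names below are illustrative, and the extra threads only help when the `SELECT` part itself runs in parallel:
+
+```sql
+INSERT INTO target_table
+SELECT *
+FROM source_table
+SETTINGS max_insert_threads = 4, max_threads = 4;
+```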
+ +## connect_timeout, receive_timeout, send_timeout {#connect-timeout-receive-timeout-send-timeout} + +Timeouts in seconds on the socket used for communicating with the client. + +Default value: 10, 300, 300. + +## cancel_http_readonly_queries_on_client_close {#cancel-http-readonly-queries-on-client-close} + +Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. + +Default value: 0 + +## poll_interval {#poll-interval} + +Lock in a wait loop for the specified number of seconds. + +Default value: 10. + +## max_distributed_connections {#max-distributed-connections} + +The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +Default value: 1024. + +The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. + +## distributed_connections_pool_size {#distributed-connections-pool-size} + +The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. + +Default value: 1024. + +## max_distributed_depth {#max-distributed-depth} + +Limits the maximum depth of recursive queries for [Distributed](../../engines/table-engines/special/distributed.md) tables. + +If the value is exceeded, the server throws an exception. + +Possible values: + +- Positive integer. +- 0 — Unlimited depth. + +Default value: `5`. + +## max_replicated_fetches_network_bandwidth_for_server {#max_replicated_fetches_network_bandwidth_for_server} + +Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) fetches for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_fetches_network_bandwidth](../../operations/settings/merge-tree-settings.md#max_replicated_fetches_network_bandwidth) setting. + +The setting isn't followed perfectly accurately. + +Possible values: + +- Positive integer. +- 0 — Unlimited. + +Default value: `0`. + +**Usage** + +Could be used for throttling speed when replicating the data to add or replace new nodes. + +:::note +60000000 bytes/s approximatly corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). +::: + +## max_replicated_sends_network_bandwidth_for_server {#max_replicated_sends_network_bandwidth_for_server} + +Limits the maximum speed of data exchange over the network in bytes per second for [replicated](../../engines/table-engines/mergetree-family/replication.md) sends for the server. Only has meaning at server startup. You can also limit the speed for a particular table with [max_replicated_sends_network_bandwidth](../../operations/settings/merge-tree-settings.md#max_replicated_sends_network_bandwidth) setting. + +The setting isn't followed perfectly accurately. + +Possible values: + +- Positive integer. +- 0 — Unlimited. + +Default value: `0`. + +**Usage** + +Could be used for throttling speed when replicating the data to add or replace new nodes. + +:::note +60000000 bytes/s approximatly corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). 
+::: + +## connect_timeout_with_failover_ms {#connect-timeout-with-failover-ms} + +The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the ‘shard’ and ‘replica’ sections are used in the cluster definition. +If unsuccessful, several attempts are made to connect to various replicas. + +Default value: 50. + +## connection_pool_max_wait_ms {#connection-pool-max-wait-ms} + +The wait time in milliseconds for a connection when the connection pool is full. + +Possible values: + +- Positive integer. +- 0 — Infinite timeout. + +Default value: 0. + +## connections_with_failover_max_tries {#connections-with-failover-max-tries} + +The maximum number of connection attempts with each replica for the Distributed table engine. + +Default value: 3. + +## extremes {#extremes} + +Whether to count extreme values (the minimums and maximums in columns of a query result). Accepts 0 or 1. By default, 0 (disabled). +For more information, see the section “Extreme values”. + +## kafka_max_wait_ms {#kafka-max-wait-ms} + +The wait time in milliseconds for reading messages from [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) before retry. + +Possible values: + +- Positive integer. +- 0 — Infinite timeout. + +Default value: 5000. + +See also: + +- [Apache Kafka](https://kafka.apache.org/) + +## use_uncompressed_cache {#setting-use_uncompressed_cache} + +Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). +Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../../operations/server-configuration-parameters/settings.md#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. + +For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically to save space for truly small queries. This means that you can keep the ‘use_uncompressed_cache’ setting always set to 1. + +## replace_running_query {#replace-running-query} + +When using the HTTP interface, the ‘query_id’ parameter can be passed. This is any string that serves as the query identifier. +If a query from the same user with the same ‘query_id’ already exists at this time, the behaviour depends on the ‘replace_running_query’ parameter. + +`0` (default) – Throw an exception (do not allow the query to run if a query with the same ‘query_id’ is already running). + +`1` – Cancel the old query and start running the new one. + +Set this parameter to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn’t finished yet, it should be cancelled. + +## replace_running_query_max_wait_ms {#replace-running-query-max-wait-ms} + +The wait time for running the query with the same `query_id` to finish, when the [replace_running_query](#replace-running-query) setting is active. + +Possible values: + +- Positive integer. +- 0 — Throwing an exception that does not allow to run a new query if the server already executes a query with the same `query_id`. + +Default value: 5000. 
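+
+As a small illustration of the `extremes` setting described above, formats such as `JSON` and `Pretty*` add an extra `extremes` block with the per-column minimum and maximum values of the result:
+
+```sql
+SELECT number FROM numbers(10) SETTINGS extremes = 1 FORMAT JSON;
+```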
+ +## stream_flush_interval_ms {#stream-flush-interval-ms} + +Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#settings-max_insert_block_size) rows. + +The default value is 7500. + +The smaller the value, the more often data is flushed into the table. Setting the value too low leads to poor performance. + +## load_balancing {#settings-load_balancing} + +Specifies the algorithm of replicas selection that is used for distributed query processing. + +ClickHouse supports the following algorithms of choosing replicas: + +- [Random](#load_balancing-random) (by default) +- [Nearest hostname](#load_balancing-nearest_hostname) +- [In order](#load_balancing-in_order) +- [First or random](#load_balancing-first_or_random) +- [Round robin](#load_balancing-round_robin) + +See also: + +- [distributed_replica_max_ignored_errors](#settings-distributed_replica_max_ignored_errors) + +### Random (by Default) {#load_balancing-random} + +``` sql +load_balancing = random +``` + +The number of errors is counted for each replica. The query is sent to the replica with the fewest errors, and if there are several of these, to anyone of them. +Disadvantages: Server proximity is not accounted for; if the replicas have different data, you will also get different data. + +### Nearest Hostname {#load_balancing-nearest_hostname} + +``` sql +load_balancing = nearest_hostname +``` + +The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a hostname that is most similar to the server’s hostname in the config file (for the number of different characters in identical positions, up to the minimum length of both hostnames). + +For instance, example01-01-1 and example01-01-2 are different in one position, while example01-01-1 and example01-02-2 differ in two places. +This method might seem primitive, but it does not require external data about network topology, and it does not compare IP addresses, which would be complicated for our IPv6 addresses. + +Thus, if there are equivalent replicas, the closest one by name is preferred. +We can also assume that when sending a query to the same server, in the absence of failures, a distributed query will also go to the same servers. So even if different data is placed on the replicas, the query will return mostly the same results. + +### In Order {#load_balancing-in_order} + +``` sql +load_balancing = in_order +``` + +Replicas with the same number of errors are accessed in the same order as they are specified in the configuration. +This method is appropriate when you know exactly which replica is preferable. + +### First or Random {#load_balancing-first_or_random} + +``` sql +load_balancing = first_or_random +``` + +This algorithm chooses the first replica in the set or a random replica if the first is unavailable. It’s effective in cross-replication topology setups, but useless in other configurations. + +The `first_or_random` algorithm solves the problem of the `in_order` algorithm. With `in_order`, if one replica goes down, the next one gets a double load while the remaining replicas handle the usual amount of traffic. 
When using the `first_or_random` algorithm, the load is evenly distributed among replicas that are still available.
+
+It's possible to explicitly define what the first replica is by using the setting `load_balancing_first_offset`. This gives more control to rebalance query workloads among replicas.
+
+### Round Robin {#load_balancing-round_robin}
+
+``` sql
+load_balancing = round_robin
+```
+
+This algorithm uses a round-robin policy across replicas with the same number of errors (only queries with the `round_robin` policy are counted).
+
+## prefer_localhost_replica {#settings-prefer-localhost-replica}
+
+Enables or disables preferential use of the localhost replica when processing distributed queries.
+
+Possible values:
+
+- 1 — ClickHouse always sends a query to the localhost replica if it exists.
+- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#settings-load_balancing) setting.
+
+Default value: 1.
+
+:::warning
+Disable this setting if you use [max_parallel_replicas](#settings-max_parallel_replicas).
+:::
+
+## totals_mode {#totals-mode}
+
+How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = ‘any’ are present.
+See the section “WITH TOTALS modifier”.
+
+## totals_auto_threshold {#totals-auto-threshold}
+
+The threshold for `totals_mode = 'auto'`.
+See the section “WITH TOTALS modifier”.
+
+## max_parallel_replicas {#settings-max_parallel_replicas}
+
+The maximum number of replicas for each shard when executing a query.
+
+Possible values:
+
+- Positive integer.
+
+Default value: `1`.
+
+**Additional Info**
+
+This setting is useful for replicated tables with a sampling key. A query may be processed faster if it is executed on several servers in parallel. But the query performance may degrade in the following cases:
+
+- The position of the sampling key in the partitioning key does not allow efficient range scans.
+- Adding a sampling key to the table makes filtering by other columns less efficient.
+- The sampling key is an expression that is expensive to calculate.
+- The cluster latency distribution has a long tail, so querying more servers increases the overall query latency.
+
+:::warning
+This setting will produce incorrect results when joins or subqueries are involved and not all tables meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md#max_parallel_replica-subqueries) for more details.
+:::
+
+## compile_expressions {#compile-expressions}
+
+Enables or disables compilation of frequently used simple functions and operators to native code with LLVM at runtime.
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: `1`.
+
+## min_count_to_compile_expression {#min-count-to-compile-expression}
+
+The minimum number of times the same expression must be executed before it is compiled.
+
+Default value: `3`.
+
+## compile_aggregate_expressions {#compile_aggregate_expressions}
+
+Enables or disables JIT-compilation of aggregate functions to native code. Enabling this setting can improve performance.
+
+Possible values:
+
+- 0 — Aggregation is done without JIT compilation.
+- 1 — Aggregation is done using JIT compilation.
+
+Default value: `1`.
+ +**See Also** + +- [min_count_to_compile_aggregate_expression](#min_count_to_compile_aggregate_expression) + +## min_count_to_compile_aggregate_expression {#min_count_to_compile_aggregate_expression} + +The minimum number of identical aggregate expressions to start JIT-compilation. Works only if the [compile_aggregate_expressions](#compile_aggregate_expressions) setting is enabled. + +Possible values: + +- Positive integer. +- 0 — Identical aggregate expressions are always JIT-compiled. + +Default value: `3`. + +## output_format_json_quote_64bit_integers {#session_settings-output_format_json_quote_64bit_integers} + +Controls quoting of 64-bit or bigger [integers](../../sql-reference/data-types/int-uint.md) (like `UInt64` or `Int128`) when they are output in a [JSON](../../interfaces/formats.md#json) format. +Such integers are enclosed in quotes by default. This behavior is compatible with most JavaScript implementations. + +Possible values: + +- 0 — Integers are output without quotes. +- 1 — Integers are enclosed in quotes. + +Default value: 1. + +## output_format_json_quote_denormals {#settings-output_format_json_quote_denormals} + +Enables `+nan`, `-nan`, `+inf`, `-inf` outputs in [JSON](../../interfaces/formats.md#json) output format. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +**Example** + +Consider the following table `account_orders`: + +```text +┌─id─┬─name───┬─duration─┬─period─┬─area─┐ +│ 1 │ Andrew │ 20 │ 0 │ 400 │ +│ 2 │ John │ 40 │ 0 │ 0 │ +│ 3 │ Bob │ 15 │ 0 │ -100 │ +└────┴────────┴──────────┴────────┴──────┘ +``` + +When `output_format_json_quote_denormals = 0`, the query returns `null` values in output: + +```sql +SELECT area/period FROM account_orders FORMAT JSON; +``` + +```json +{ + "meta": + [ + { + "name": "divide(area, period)", + "type": "Float64" + } + ], + + "data": + [ + { + "divide(area, period)": null + }, + { + "divide(area, period)": null + }, + { + "divide(area, period)": null + } + ], + + "rows": 3, + + "statistics": + { + "elapsed": 0.003648093, + "rows_read": 3, + "bytes_read": 24 + } +} +``` + +When `output_format_json_quote_denormals = 1`, the query returns: + +```json +{ + "meta": + [ + { + "name": "divide(area, period)", + "type": "Float64" + } + ], + + "data": + [ + { + "divide(area, period)": "inf" + }, + { + "divide(area, period)": "-nan" + }, + { + "divide(area, period)": "-inf" + } + ], + + "rows": 3, + + "statistics": + { + "elapsed": 0.000070241, + "rows_read": 3, + "bytes_read": 24 + } +} +``` + +## format_csv_delimiter {#settings-format_csv_delimiter} + +The character is interpreted as a delimiter in the CSV data. By default, the delimiter is `,`. + +## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number} + +When enabled, always treat enum values as enum ids for CSV input format. It's recommended to enable this setting if data contains only enum ids to optimize enum parsing. + +Possible values: + +- 0 — Enum values are parsed as values or as enum IDs. +- 1 — Enum values are parsed only as enum IDs. + +Default value: 0. 
+ +**Examples** + +Consider the table: + +```sql +CREATE TABLE table_with_enum_column_for_csv_insert (Id Int32,Value Enum('first' = 1, 'second' = 2)) ENGINE=Memory(); +``` + +When the `input_format_csv_enum_as_number` setting is enabled: + +Query: + +```sql +SET input_format_csv_enum_as_number = 1; +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2 +``` + +Result: + +```text +┌──Id─┬─Value──┐ +│ 102 │ second │ +└─────┴────────┘ +``` + +Query: + +```sql +SET input_format_csv_enum_as_number = 1; +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first' +``` + +throws an exception. + +When the `input_format_csv_enum_as_number` setting is disabled: + +Query: + +```sql +SET input_format_csv_enum_as_number = 0; +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 102,2 +INSERT INTO table_with_enum_column_for_csv_insert FORMAT CSV 103,'first' +SELECT * FROM table_with_enum_column_for_csv_insert; +``` + +Result: + +```text +┌──Id─┬─Value──┐ +│ 102 │ second │ +└─────┴────────┘ +┌──Id─┬─Value─┐ +│ 103 │ first │ +└─────┴───────┘ +``` + +## output_format_csv_crlf_end_of_line {#settings-output-format-csv-crlf-end-of-line} + +Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). + +## output_format_tsv_crlf_end_of_line {#settings-output-format-tsv-crlf-end-of-line} + +Use DOC/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). + +## insert_quorum {#settings-insert_quorum} + +Enables the quorum writes. + +- If `insert_quorum < 2`, the quorum writes are disabled. +- If `insert_quorum >= 2`, the quorum writes are enabled. + +Default value: 0. + +Quorum writes + +`INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written. + +When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#settings-select_sequential_consistency). + +ClickHouse generates an exception: + +- If the number of available replicas at the time of the query is less than the `insert_quorum`. +- When `insert_quorum_parallel` is disabled and an attempt to write data is made when the previous block has not yet been inserted in `insert_quorum` of replicas. This situation may occur if the user tries to perform another `INSERT` query to the same table before the previous one with `insert_quorum` is completed. + +See also: + +- [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## insert_quorum_timeout {#settings-insert_quorum_timeout} + +Write to a quorum timeout in milliseconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica. + +Default value: 600 000 milliseconds (ten minutes). 
+ +See also: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## insert_quorum_parallel {#settings-insert_quorum_parallel} + +Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +See also: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [select_sequential_consistency](#settings-select_sequential_consistency) + +## select_sequential_consistency {#settings-select_sequential_consistency} + +Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +Usage + +When sequential consistency is enabled, ClickHouse allows the client to execute the `SELECT` query only for those replicas that contain data from all previous `INSERT` queries executed with `insert_quorum`. If the client refers to a partial replica, ClickHouse will generate an exception. The SELECT query will not include data that has not yet been written to the quorum of replicas. + +When `insert_quorum_parallel` is enabled (the default), then `select_sequential_consistency` does not work. This is because parallel `INSERT` queries can be written to different sets of quorum replicas so there is no guarantee a single replica will have received all writes. + +See also: + +- [insert_quorum](#settings-insert_quorum) +- [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert_quorum_parallel](#settings-insert_quorum_parallel) + +## insert_deduplicate {#settings-insert-deduplicate} + +Enables or disables block deduplication of `INSERT` (for Replicated\* tables). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). + +## deduplicate_blocks_in_dependent_materialized_views {#settings-deduplicate-blocks-in-dependent-materialized-views} + +Enables or disables the deduplication check for materialized views that receive data from Replicated\* tables. + +Possible values: + + 0 — Disabled. + 1 — Enabled. + +Default value: 0. + +Usage + +By default, deduplication is not performed for materialized views but is done upstream, in the source table. +If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behaviour exists to enable the insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table. +At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with Zookeeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. 
The setting `deduplicate_blocks_in_dependent_materialized_views` allows changing this behaviour. On retry, a materialized view will receive the repeated insert and will perform a deduplication check by itself,
+ignoring the check result for the source table, and will insert the rows lost because of the first failure.
+
+## insert_deduplication_token {#insert_deduplication_token}
+
+The setting allows a user to provide their own deduplication semantics in MergeTree/ReplicatedMergeTree.
+For example, by providing a unique value for the setting in each INSERT statement,
+the user can avoid the same inserted data being deduplicated.
+
+Possible values:
+
+- Any string.
+
+Default value: empty string (disabled).
+
+`insert_deduplication_token` is used for deduplication _only_ when not empty.
+
+Example:
+
+```sql
+CREATE TABLE test_table
+( A Int64 )
+ENGINE = MergeTree
+ORDER BY A
+SETTINGS non_replicated_deduplication_window = 100;
+
+INSERT INTO test_table Values SETTINGS insert_deduplication_token = 'test' (1);
+
+-- the next insert won't be deduplicated because insert_deduplication_token is different
+INSERT INTO test_table Values SETTINGS insert_deduplication_token = 'test1' (1);
+
+-- the next insert will be deduplicated because insert_deduplication_token
+-- is the same as one of the previous
+INSERT INTO test_table Values SETTINGS insert_deduplication_token = 'test' (2);
+
+SELECT * FROM test_table;
+
+┌─A─┐
+│ 1 │
+└───┘
+┌─A─┐
+│ 1 │
+└───┘
+```
+
+## max_network_bytes {#settings-max-network-bytes}
+
+Limits the data volume (in bytes) that is received or transmitted over the network when executing a query. This setting applies to every individual query.
+
+Possible values:
+
+- Positive integer.
+- 0 — Data volume control is disabled.
+
+Default value: 0.
+
+## max_network_bandwidth {#settings-max-network-bandwidth}
+
+Limits the speed of the data exchange over the network in bytes per second. This setting applies to every query.
+
+Possible values:
+
+- Positive integer.
+- 0 — Bandwidth control is disabled.
+
+Default value: 0.
+
+## max_network_bandwidth_for_user {#settings-max-network-bandwidth-for-user}
+
+Limits the speed of the data exchange over the network in bytes per second. This setting applies to all concurrently running queries performed by a single user.
+
+Possible values:
+
+- Positive integer.
+- 0 — Control of the data speed is disabled.
+
+Default value: 0.
+
+## max_network_bandwidth_for_all_users {#settings-max-network-bandwidth-for-all-users}
+
+Limits the speed at which data is exchanged over the network in bytes per second. This setting applies to all concurrently running queries on the server.
+
+Possible values:
+
+- Positive integer.
+- 0 — Control of the data speed is disabled.
+
+Default value: 0.
+
+## count_distinct_implementation {#settings-count_distinct_implementation}
+
+Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) construction.
+
+Possible values:
+
+- [uniq](../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)
+- [uniqCombined](../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)
+- [uniqCombined64](../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64)
+- [uniqHLL12](../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12)
+- [uniqExact](../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)
+
+Default value: `uniqExact`.
+
+## skip_unavailable_shards {#settings-skip_unavailable_shards}
+
+Enables or disables silent skipping of unavailable shards.
+
+A shard is considered unavailable if all its replicas are unavailable. A replica is unavailable in the following cases:
+
+- ClickHouse can’t connect to the replica for any reason.
+
+    When connecting to a replica, ClickHouse performs several attempts. If all these attempts fail, the replica is considered unavailable.
+
+- The replica can’t be resolved through DNS.
+
+    If the replica’s hostname can’t be resolved through DNS, it can indicate the following situations:
+
+    - The replica’s host has no DNS record. This can occur in systems with dynamic DNS, for example, [Kubernetes](https://kubernetes.io), where nodes can be unresolvable during downtime, and this is not an error.
+
+    - A configuration error: the ClickHouse configuration file contains a wrong hostname.
+
+Possible values:
+
+- 1 — Skipping enabled.
+
+    If a shard is unavailable, ClickHouse returns a result based on partial data and does not report node availability issues.
+
+- 0 — Skipping disabled.
+
+    If a shard is unavailable, ClickHouse throws an exception.
+
+Default value: 0.
+
+## distributed_group_by_no_merge {#distributed-group-by-no-merge}
+
+Do not merge aggregation states from different servers for distributed query processing. You can use this when you are certain that there are different keys on different shards.
+
+Possible values:
+
+- `0` — Disabled (final query processing is done on the initiator node).
+- `1` — Do not merge aggregation states from different servers for distributed query processing (the query is completely processed on the shard, the initiator only proxies the data). Can be used when you are certain that there are different keys on different shards.
+- `2` — Same as `1`, but applies `ORDER BY` and `LIMIT` on the initiator (this is not possible when the query is processed completely on the remote node, as with `distributed_group_by_no_merge=1`). Can be used for queries with `ORDER BY` and/or `LIMIT`.
+
+Default value: `0`.
+
+**Example**
+
+```sql
+SELECT *
+FROM remote('127.0.0.{2,3}', system.one)
+GROUP BY dummy
+LIMIT 1
+SETTINGS distributed_group_by_no_merge = 1
+FORMAT PrettyCompactMonoBlock
+
+┌─dummy─┐
+│     0 │
+│     0 │
+└───────┘
+```
+
+```sql
+SELECT *
+FROM remote('127.0.0.{2,3}', system.one)
+GROUP BY dummy
+LIMIT 1
+SETTINGS distributed_group_by_no_merge = 2
+FORMAT PrettyCompactMonoBlock
+
+┌─dummy─┐
+│     0 │
+└───────┘
+```
+
+## distributed_push_down_limit {#distributed-push-down-limit}
+
+Enables or disables applying [LIMIT](#limit) on each shard separately.
+
+This allows you to avoid:
+- Sending extra rows over the network;
+- Processing rows behind the limit on the initiator.
+
+Starting from version 21.9 you cannot get inaccurate results anymore, since `distributed_push_down_limit` changes query execution only if at least one of the following conditions is met:
+- [distributed_group_by_no_merge](#distributed-group-by-no-merge) > 0.
+- The query **does not have** `GROUP BY`/`DISTINCT`/`LIMIT BY`, but it has `ORDER BY`/`LIMIT`.
+- The query **has** `GROUP BY`/`DISTINCT`/`LIMIT BY` with `ORDER BY`/`LIMIT` and:
+    - [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled.
+    - [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) is enabled.
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: `1`.
+
+See also:
+
+- [distributed_group_by_no_merge](#distributed-group-by-no-merge)
+- [optimize_skip_unused_shards](#optimize-skip-unused-shards)
+- [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key)
+
+## optimize_skip_unused_shards_limit {#optimize-skip-unused-shards-limit}
+
+The limit on the number of sharding key values; turns off `optimize_skip_unused_shards` if the limit is reached.
+
+Too many values may require a significant amount of processing, while the benefit is doubtful, since if you have a huge number of values in `IN (...)`, the query will most likely be sent to all shards anyway.
+
+Default value: 1000.
+
+## optimize_skip_unused_shards {#optimize-skip-unused-shards}
+
+Enables or disables skipping of unused shards for [SELECT](../../sql-reference/statements/select/index.md) queries that have a sharding key condition in `WHERE/PREWHERE` (assuming that the data is distributed by the sharding key, otherwise the query yields an incorrect result).
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: 0.
+
+## optimize_skip_unused_shards_rewrite_in {#optimize-skip-unused-shards-rewrite-in}
+
+Rewrites `IN` in queries sent to remote shards to exclude values that do not belong to the shard (requires `optimize_skip_unused_shards`).
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: 1 (since it requires `optimize_skip_unused_shards` anyway, which is `0` by default).
+
+## allow_nondeterministic_optimize_skip_unused_shards {#allow-nondeterministic-optimize-skip-unused-shards}
+
+Allows nondeterministic functions (like `rand` or `dictGet`, since the latter has some caveats with updates) in the sharding key.
+
+Possible values:
+
+- 0 — Disallowed.
+- 1 — Allowed.
+
+Default value: 0.
+
+## optimize_skip_unused_shards_nesting {#optimize-skip-unused-shards-nesting}
+
+Controls [`optimize_skip_unused_shards`](#optimize-skip-unused-shards) (and hence still requires [`optimize_skip_unused_shards`](#optimize-skip-unused-shards)) depending on the nesting level of the distributed query (the case when you have a `Distributed` table that looks into another `Distributed` table).
+
+Possible values:
+
+- 0 — Disabled, `optimize_skip_unused_shards` always works.
+- 1 — Enables `optimize_skip_unused_shards` only for the first level.
+- 2 — Enables `optimize_skip_unused_shards` up to the second level.
+
+Default value: 0.
+
+## force_optimize_skip_unused_shards {#force-optimize-skip-unused-shards}
+
+Enables or disables query execution if [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled and skipping of unused shards is not possible. If the skipping is not possible and the setting is enabled, an exception will be thrown.
+
+Possible values:
+
+- 0 — Disabled. ClickHouse does not throw an exception.
+- 1 — Enabled. Query execution is disabled only if the table has a sharding key.
+- 2 — Enabled. Query execution is disabled regardless of whether a sharding key is defined for the table.
+ +Default value: 0 + +## force_optimize_skip_unused_shards_nesting {#settings-force_optimize_skip_unused_shards_nesting} + +Controls [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards) (hence still requires [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table). + +Possible values: + +- 0 - Disabled, `force_optimize_skip_unused_shards` works always. +- 1 — Enables `force_optimize_skip_unused_shards` only for the first level. +- 2 — Enables `force_optimize_skip_unused_shards` up to the second level. + +Default value: 0 + +## optimize_distributed_group_by_sharding_key {#optimize-distributed-group-by-sharding-key} + +Optimize `GROUP BY sharding_key` queries, by avoiding costly aggregation on the initiator server (which will reduce memory usage for the query on the initiator server). + +The following types of queries are supported (and all combinations of them): + +- `SELECT DISTINCT [..., ]sharding_key[, ...] FROM dist` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...]` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...] ORDER BY x` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...] LIMIT 1` +- `SELECT ... FROM dist GROUP BY sharding_key[, ...] LIMIT 1 BY x` + +The following types of queries are not supported (support for some of them may be added later): + +- `SELECT ... GROUP BY sharding_key[, ...] WITH TOTALS` +- `SELECT ... GROUP BY sharding_key[, ...] WITH ROLLUP` +- `SELECT ... GROUP BY sharding_key[, ...] WITH CUBE` +- `SELECT ... GROUP BY sharding_key[, ...] SETTINGS extremes=1` + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0 + +See also: + +- [distributed_group_by_no_merge](#distributed-group-by-no-merge) +- [distributed_push_down_limit](#distributed-push-down-limit) +- [optimize_skip_unused_shards](#optimize-skip-unused-shards) + +:::note +Right now it requires `optimize_skip_unused_shards` (the reason behind this is that one day it may be enabled by default, and it will work correctly only if data was inserted via Distributed table, i.e. data is distributed according to sharding_key). +::: + +## optimize_throw_if_noop {#setting-optimize_throw_if_noop} + +Enables or disables throwing an exception if an [OPTIMIZE](../../sql-reference/statements/misc.md#misc_operations-optimize) query didn’t perform a merge. + +By default, `OPTIMIZE` returns successfully even if it didn’t do anything. This setting lets you differentiate these situations and get the reason in an exception message. + +Possible values: + +- 1 — Throwing an exception is enabled. +- 0 — Throwing an exception is disabled. + +Default value: 0. + +## optimize_functions_to_subcolumns {#optimize-functions-to-subcolumns} + +Enables or disables optimization by transforming some functions to reading subcolumns. This reduces the amount of data to read. + +These functions can be transformed: + +- [length](../../sql-reference/functions/array-functions.md#array_functions-length) to read the [size0](../../sql-reference/data-types/array.md#array-size) subcolumn. +- [empty](../../sql-reference/functions/array-functions.md#function-empty) to read the [size0](../../sql-reference/data-types/array.md#array-size) subcolumn. +- [notEmpty](../../sql-reference/functions/array-functions.md#function-notempty) to read the [size0](../../sql-reference/data-types/array.md#array-size) subcolumn. 
+- [isNull](../../sql-reference/operators/index.md#operator-is-null) to read the [null](../../sql-reference/data-types/nullable.md#finding-null) subcolumn. +- [isNotNull](../../sql-reference/operators/index.md#is-not-null) to read the [null](../../sql-reference/data-types/nullable.md#finding-null) subcolumn. +- [count](../../sql-reference/aggregate-functions/reference/count.md) to read the [null](../../sql-reference/data-types/nullable.md#finding-null) subcolumn. +- [mapKeys](../../sql-reference/functions/tuple-map-functions.md#mapkeys) to read the [keys](../../sql-reference/data-types/map.md#map-subcolumns) subcolumn. +- [mapValues](../../sql-reference/functions/tuple-map-functions.md#mapvalues) to read the [values](../../sql-reference/data-types/map.md#map-subcolumns) subcolumn. + +Possible values: + +- 0 — Optimization disabled. +- 1 — Optimization enabled. + +Default value: `0`. + +## optimize_trivial_count_query {#optimize-trivial-count-query} + +Enables or disables the optimization to trivial query `SELECT count() FROM table` using metadata from MergeTree. If you need to use row-level security, disable this setting. + +Possible values: + + - 0 — Optimization disabled. + - 1 — Optimization enabled. + +Default value: `1`. + +See also: + +- [optimize_functions_to_subcolumns](#optimize-functions-to-subcolumns) + +## distributed_replica_error_half_life {#settings-distributed_replica_error_half_life} + +- Type: seconds +- Default value: 60 seconds + +Controls how fast errors in distributed tables are zeroed. If a replica is unavailable for some time, accumulates 5 errors, and distributed_replica_error_half_life is set to 1 second, then the replica is considered normal 3 seconds after the last error. + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap](#settings-distributed_replica_error_cap) +- [distributed_replica_max_ignored_errors](#settings-distributed_replica_max_ignored_errors) + +## distributed_replica_error_cap {#settings-distributed_replica_error_cap} + +- Type: unsigned int +- Default value: 1000 + +The error count of each replica is capped at this value, preventing a single replica from accumulating too many errors. + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_half_life](#settings-distributed_replica_error_half_life) +- [distributed_replica_max_ignored_errors](#settings-distributed_replica_max_ignored_errors) + +## distributed_replica_max_ignored_errors {#settings-distributed_replica_max_ignored_errors} + +- Type: unsigned int +- Default value: 0 + +The number of errors that will be ignored while choosing replicas (according to `load_balancing` algorithm). + +See also: + +- [load_balancing](#load_balancing-round_robin) +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap](#settings-distributed_replica_error_cap) +- [distributed_replica_error_half_life](#settings-distributed_replica_error_half_life) + +## distributed_directory_monitor_sleep_time_ms {#distributed_directory_monitor_sleep_time_ms} + +Base interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. The actual interval grows exponentially in the event of errors. + +Possible values: + +- A positive integer number of milliseconds. 
+
+Default value: 100 milliseconds.
+
+## distributed_directory_monitor_max_sleep_time_ms {#distributed_directory_monitor_max_sleep_time_ms}
+
+Maximum interval for the [Distributed](../../engines/table-engines/special/distributed.md) table engine to send data. Limits exponential growth of the interval set in the [distributed_directory_monitor_sleep_time_ms](#distributed_directory_monitor_sleep_time_ms) setting.
+
+Possible values:
+
+- A positive integer number of milliseconds.
+
+Default value: 30000 milliseconds (30 seconds).
+
+## distributed_directory_monitor_batch_inserts {#distributed_directory_monitor_batch_inserts}
+
+Enables/disables sending inserted data in batches.
+
+When batch sending is enabled, the [Distributed](../../engines/table-engines/special/distributed.md) table engine tries to send multiple files of inserted data in one operation instead of sending them separately. Batch sending improves cluster performance by better utilizing server and network resources.
+
+Possible values:
+
+- 1 — Enabled.
+- 0 — Disabled.
+
+Default value: 0.
+
+## distributed_directory_monitor_split_batch_on_failure {#distributed_directory_monitor_split_batch_on_failure}
+
+Enables/disables splitting batches on failures.
+
+Sometimes sending a particular batch to the remote shard may fail because of a complex pipeline after it (e.g. a `MATERIALIZED VIEW` with `GROUP BY`) hitting a `Memory limit exceeded` or similar error. In this case, retrying will not help (and this will leave distributed sends for the table stuck), but sending the files from that batch one by one may allow the INSERT to succeed.
+
+So setting this setting to `1` disables batching for such batches (i.e. it temporarily disables `distributed_directory_monitor_batch_inserts` for failed batches).
+
+Possible values:
+
+- 1 — Enabled.
+- 0 — Disabled.
+
+Default value: 0.
+
+:::note
+This setting also affects broken batches (that may appear because of abnormal server (machine) termination when there is no `fsync_after_insert`/`fsync_directories` for the [Distributed](../../engines/table-engines/special/distributed.md) table engine).
+:::
+
+:::warning
+You should not rely on automatic batch splitting, since this may hurt performance.
+:::
+
+## os_thread_priority {#setting-os-thread-priority}
+
+Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core.
+
+:::warning
+To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments do not allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start.
+:::
+
+Possible values:
+
+- You can set values in the range `[-20, 19]`.
+
+Lower values mean higher priority. Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long-running non-interactive queries because this allows them to quickly give up resources in favour of short interactive queries when they arrive.
+
+Default value: 0.
+
+## query_profiler_real_time_period_ns {#query_profiler_real_time_period_ns}
+
+Sets the period for a real clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). The real clock timer counts wall-clock time.
+
+Possible values:
+
+- A positive integer number of nanoseconds.
+
+    Recommended values:
+
+        - 10000000 (100 times a second) nanoseconds and less for single queries.
+        - 1000000000 (once a second) for cluster-wide profiling.
+
+- 0 for turning off the timer.
+
+Type: [UInt64](../../sql-reference/data-types/int-uint.md).
+
+Default value: 1000000000 nanoseconds (once a second).
+
+See also:
+
+- System table [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log)
+
+## query_profiler_cpu_time_period_ns {#query_profiler_cpu_time_period_ns}
+
+Sets the period for a CPU clock timer of the [query profiler](../../operations/optimizing-performance/sampling-query-profiler.md). This timer counts only CPU time.
+
+Possible values:
+
+- A positive integer number of nanoseconds.
+
+    Recommended values:
+
+        - 10000000 (100 times a second) nanoseconds and more for single queries.
+        - 1000000000 (once a second) for cluster-wide profiling.
+
+- 0 for turning off the timer.
+
+Type: [UInt64](../../sql-reference/data-types/int-uint.md).
+
+Default value: 1000000000 nanoseconds.
+
+See also:
+
+- System table [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log)
+
+## allow_introspection_functions {#settings-allow_introspection_functions}
+
+Enables or disables [introspection functions](../../sql-reference/functions/introspection.md) for query profiling.
+
+Possible values:
+
+- 1 — Introspection functions enabled.
+- 0 — Introspection functions disabled.
+
+Default value: 0.
+
+**See Also**
+
+- [Sampling Query Profiler](../../operations/optimizing-performance/sampling-query-profiler.md)
+- System table [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log)
+
+## input_format_parallel_parsing {#input-format-parallel-parsing}
+
+Enables or disables order-preserving parallel parsing of data formats. Supported only for the [TSV](../../interfaces/formats.md#tabseparated), [TSKV](../../interfaces/formats.md#tskv), [CSV](../../interfaces/formats.md#csv) and [JSONEachRow](../../interfaces/formats.md#jsoneachrow) formats.
+
+Possible values:
+
+- 1 — Enabled.
+- 0 — Disabled.
+
+Default value: `1`.
+
+## output_format_parallel_formatting {#output-format-parallel-formatting}
+
+Enables or disables parallel formatting of data formats. Supported only for the [TSV](../../interfaces/formats.md#tabseparated), [TSKV](../../interfaces/formats.md#tskv), [CSV](../../interfaces/formats.md#csv) and [JSONEachRow](../../interfaces/formats.md#jsoneachrow) formats.
+
+Possible values:
+
+- 1 — Enabled.
+- 0 — Disabled.
+
+Default value: `1`.
+
+## min_chunk_bytes_for_parallel_parsing {#min-chunk-bytes-for-parallel-parsing}
+
+- Type: unsigned int
+- Default value: 1 MiB
+
+The minimum chunk size in bytes that each thread will parse in parallel.
+
+## output_format_avro_codec {#settings-output_format_avro_codec}
+
+Sets the compression codec used for the output Avro file.
+
+Type: string
+
+Possible values:
+
+- `null` — No compression
+- `deflate` — Compress with Deflate (zlib)
+- `snappy` — Compress with [Snappy](https://google.github.io/snappy/)
+
+Default value: `snappy` (if available) or `deflate`.
+
+## output_format_avro_sync_interval {#settings-output_format_avro_sync_interval}
+
+Sets the minimum data size (in bytes) between synchronization markers for the output Avro file.
+ +Type: unsigned int + +Possible values: 32 (32 bytes) - 1073741824 (1 GiB) + +Default value: 32768 (32 KiB) + +## output_format_avro_string_column_pattern {#output_format_avro_string_column_pattern} + +Regexp of column names of type String to output as Avro `string` (default is `bytes`). +RE2 syntax is supported. + +Type: string + +## format_avro_schema_registry_url {#format_avro_schema_registry_url} + +Sets [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html) URL to use with [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) format. + +Default value: `Empty`. + +## input_format_avro_allow_missing_fields {#input_format_avro_allow_missing_fields} + +Enables using fields that are not specified in [Avro](../../interfaces/formats.md#data-format-avro) or [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) format schema. When a field is not found in the schema, ClickHouse uses the default value instead of throwing an exception. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +## background_pool_size {#background_pool_size} + +Sets the number of threads performing background operations in table engines (for example, merges in [MergeTree engine](../../engines/table-engines/mergetree-family/index.md) tables). This setting is applied from the `default` profile at the ClickHouse server start and can’t be changed in a user session. By adjusting this setting, you manage CPU and disk load. Smaller pool size utilizes less CPU and disk resources, but background processes advance slower which might eventually impact query performance. + +Before changing it, please also take a look at related [MergeTree settings](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-merge_tree), such as `number_of_free_entries_in_pool_to_lower_max_size_of_merge` and `number_of_free_entries_in_pool_to_execute_mutation`. + +Possible values: + +- Any positive integer. + +Default value: 16. + +## merge_selecting_sleep_ms {#merge_selecting_sleep_ms} + +Sleep time for merge selecting when no part is selected. A lower setting triggers selecting tasks in `background_schedule_pool` frequently, which results in a large number of requests to Zookeeper in large-scale clusters. + +Possible values: + +- Any positive integer. + +Default value: `5000`. + +## parallel_distributed_insert_select {#parallel_distributed_insert_select} + +Enables parallel distributed `INSERT ... SELECT` query. + +If we execute `INSERT INTO distributed_table_a SELECT ... FROM distributed_table_b` queries and both tables use the same cluster, and both tables are either [replicated](../../engines/table-engines/mergetree-family/replication.md) or non-replicated, then this query is processed locally on every shard. + +Possible values: + +- 0 — Disabled. +- 1 — `SELECT` will be executed on each shard from the underlying table of the distributed engine. +- 2 — `SELECT` and `INSERT` will be executed on each shard from/to the underlying table of the distributed engine. + +Default value: 0. + +## insert_distributed_sync {#insert_distributed_sync} + +Enables or disables synchronous data insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table. + +By default, when inserting data into a `Distributed` table, the ClickHouse server sends data to cluster nodes in asynchronous mode. 
When `insert_distributed_sync=1`, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true). + +Possible values: + +- 0 — Data is inserted in asynchronous mode. +- 1 — Data is inserted in synchronous mode. + +Default value: `0`. + +**See Also** + +- [Distributed Table Engine](../../engines/table-engines/special/distributed.md#distributed) +- [Managing Distributed Tables](../../sql-reference/statements/system.md#query-language-system-distributed) + +## insert_distributed_one_random_shard {#insert_distributed_one_random_shard} + +Enables or disables random shard insertion into a [Distributed](../../engines/table-engines/special/distributed.md#distributed) table when there is no distributed key. + +By default, when inserting data into a `Distributed` table with more than one shard, the ClickHouse server will reject any insertion request if there is no distributed key. When `insert_distributed_one_random_shard = 1`, insertions are allowed and data is forwarded randomly among all shards. + +Possible values: + +- 0 — Insertion is rejected if there are multiple shards and no distributed key is given. +- 1 — Insertion is done randomly among all available shards when no distributed key is given. + +Default value: `0`. + +## insert_shard_id {#insert_shard_id} + +If not `0`, specifies the shard of [Distributed](../../engines/table-engines/special/distributed.md#distributed) table into which the data will be inserted synchronously. + +If `insert_shard_id` value is incorrect, the server will throw an exception. + +To get the number of shards on `requested_cluster`, you can check server config or use this query: + +``` sql +SELECT uniq(shard_num) FROM system.clusters WHERE cluster = 'requested_cluster'; +``` + +Possible values: + +- 0 — Disabled. +- Any number from `1` to `shards_num` of corresponding [Distributed](../../engines/table-engines/special/distributed.md#distributed) table. + +Default value: `0`. + +**Example** + +Query: + +```sql +CREATE TABLE x AS system.numbers ENGINE = MergeTree ORDER BY number; +CREATE TABLE x_dist AS x ENGINE = Distributed('test_cluster_two_shards_localhost', currentDatabase(), x); +INSERT INTO x_dist SELECT * FROM numbers(5) SETTINGS insert_shard_id = 1; +SELECT * FROM x_dist ORDER BY number ASC; +``` + +Result: + +``` text +┌─number─┐ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +│ 2 │ +│ 2 │ +│ 3 │ +│ 3 │ +│ 4 │ +│ 4 │ +└────────┘ +``` + +## use_compact_format_in_distributed_parts_names {#use_compact_format_in_distributed_parts_names} + +Uses compact format for storing blocks for async (`insert_distributed_sync`) INSERT into tables with `Distributed` engine. + +Possible values: + +- 0 — Uses `user[:password]@host:port#default_database` directory format. +- 1 — Uses `[shard{shard_index}[_replica{replica_index}]]` directory format. + +Default value: `1`. + +:::note +- with `use_compact_format_in_distributed_parts_names=0` changes from cluster definition will not be applied for async INSERT. +- with `use_compact_format_in_distributed_parts_names=1` changing the order of the nodes in the cluster definition, will change the `shard_index`/`replica_index` so be aware. +::: + +## background_buffer_flush_schedule_pool_size {#background_buffer_flush_schedule_pool_size} + +Sets the number of threads performing background flush in [Buffer](../../engines/table-engines/special/buffer.md)-engine tables. 
This setting is applied at the ClickHouse server start and can’t be changed in a user session. + +Possible values: + +- Any positive integer. + +Default value: 16. + +## background_move_pool_size {#background_move_pool_size} + +Sets the number of threads performing background moves of data parts for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)-engine tables. This setting is applied at the ClickHouse server start and can’t be changed in a user session. + +Possible values: + +- Any positive integer. + +Default value: 8. + +## background_schedule_pool_size {#background_schedule_pool_size} + +Sets the number of threads performing background tasks for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables, [Kafka](../../engines/table-engines/integrations/kafka.md) streaming, [DNS cache updates](../../operations/server-configuration-parameters/settings.md#server-settings-dns-cache-update-period). This setting is applied at ClickHouse server start and can’t be changed in a user session. + +Possible values: + +- Any positive integer. + +Default value: 128. + +## background_fetches_pool_size {#background_fetches_pool_size} + +Sets the number of threads performing background fetches for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. This setting is applied at the ClickHouse server start and can’t be changed in a user session. For production usage with frequent small insertions or slow ZooKeeper cluster is recommended to use default value. + +Possible values: + +- Any positive integer. + +Default value: 8. + +## always_fetch_merged_part {#always_fetch_merged_part} + +Prohibits data parts merging in [Replicated\*MergeTree](../../engines/table-engines/mergetree-family/replication.md)-engine tables. + +When merging is prohibited, the replica never merges parts and always downloads merged parts from other replicas. If there is no required data yet, the replica waits for it. CPU and disk load on the replica server decreases, but the network load on the cluster increases. This setting can be useful on servers with relatively weak CPUs or slow disks, such as servers for backups storage. + +Possible values: + +- 0 — `Replicated*MergeTree`-engine tables merge data parts at the replica. +- 1 — `Replicated*MergeTree`-engine tables do not merge data parts at the replica. The tables download merged data parts from other replicas. + +Default value: 0. + +**See Also** + +- [Data Replication](../../engines/table-engines/mergetree-family/replication.md) + +## background_distributed_schedule_pool_size {#background_distributed_schedule_pool_size} + +Sets the number of threads performing background tasks for [distributed](../../engines/table-engines/special/distributed.md) sends. This setting is applied at the ClickHouse server start and can’t be changed in a user session. + +Possible values: + +- Any positive integer. + +Default value: 16. + +## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size} + +Sets the number of threads performing background tasks for message streaming. This setting is applied at the ClickHouse server start and can’t be changed in a user session. + +Possible values: + +- Any positive integer. + +Default value: 16. + +**See Also** + +- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine. +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine. 
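+
+Since the background pool sizes above are applied at server start and cannot be changed in a user session, a quick sanity check is to read the values currently in effect back from `system.settings`. A minimal sketch (the `LIKE` pattern simply matches the pool-size settings described in this section):
+
+```sql
+-- show the background pool sizes the server is currently running with
+SELECT name, value
+FROM system.settings
+WHERE name LIKE 'background%pool_size';
+```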
+ +## validate_polygons {#validate_polygons} + +Enables or disables throwing an exception in the [pointInPolygon](../../sql-reference/functions/geo/index.md#pointinpolygon) function, if the polygon is self-intersecting or self-tangent. + +Possible values: + +- 0 — Throwing an exception is disabled. `pointInPolygon` accepts invalid polygons and returns possibly incorrect results for them. +- 1 — Throwing an exception is enabled. + +Default value: 1. + +## transform_null_in {#transform_null_in} + +Enables equality of [NULL](../../sql-reference/syntax.md#null-literal) values for [IN](../../sql-reference/operators/in.md) operator. + +By default, `NULL` values can’t be compared because `NULL` means undefined value. Thus, comparison `expr = NULL` must always return `false`. With this setting `NULL = NULL` returns `true` for `IN` operator. + +Possible values: + +- 0 — Comparison of `NULL` values in `IN` operator returns `false`. +- 1 — Comparison of `NULL` values in `IN` operator returns `true`. + +Default value: 0. + +**Example** + +Consider the `null_in` table: + +``` text +┌──idx─┬─────i─┐ +│ 1 │ 1 │ +│ 2 │ NULL │ +│ 3 │ 3 │ +└──────┴───────┘ +``` + +Query: + +``` sql +SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 0; +``` + +Result: + +``` text +┌──idx─┬────i─┐ +│ 1 │ 1 │ +└──────┴──────┘ +``` + +Query: + +``` sql +SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; +``` + +Result: + +``` text +┌──idx─┬─────i─┐ +│ 1 │ 1 │ +│ 2 │ NULL │ +└──────┴───────┘ +``` + +**See Also** + +- [NULL Processing in IN Operators](../../sql-reference/operators/in.md#in-null-processing) + +## low_cardinality_max_dictionary_size {#low_cardinality_max_dictionary_size} + +Sets a maximum size in rows of a shared global dictionary for the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type that can be written to a storage file system. This setting prevents issues with RAM in case of unlimited dictionary growth. All the data that can’t be encoded due to maximum dictionary size limitation ClickHouse writes in an ordinary method. + +Possible values: + +- Any positive integer. + +Default value: 8192. + +## low_cardinality_use_single_dictionary_for_part {#low_cardinality_use_single_dictionary_for_part} + +Turns on or turns off using of single dictionary for the data part. + +By default, the ClickHouse server monitors the size of dictionaries and if a dictionary overflows then the server starts to write the next one. To prohibit creating several dictionaries set `low_cardinality_use_single_dictionary_for_part = 1`. + +Possible values: + +- 1 — Creating several dictionaries for the data part is prohibited. +- 0 — Creating several dictionaries for the data part is not prohibited. + +Default value: 0. + +## low_cardinality_allow_in_native_format {#low_cardinality_allow_in_native_format} + +Allows or restricts using the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) data type with the [Native](../../interfaces/formats.md#native) format. + +If usage of `LowCardinality` is restricted, ClickHouse server converts `LowCardinality`-columns to ordinary ones for `SELECT` queries, and convert ordinary columns to `LowCardinality`-columns for `INSERT` queries. + +This setting is required mainly for third-party clients which do not support `LowCardinality` data type. + +Possible values: + +- 1 — Usage of `LowCardinality` is not restricted. +- 0 — Usage of `LowCardinality` is restricted. + +Default value: 1. 
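+
+**Example**
+
+A minimal sketch showing how the two dictionary settings above can be applied when writing `LowCardinality` data; the `lc_demo` table name is hypothetical:
+
+```sql
+-- cap the shared per-part dictionary at 4096 rows and do not create additional dictionaries for a part;
+-- values that cannot be encoded within this limit are written in the ordinary way, as described above
+SET low_cardinality_max_dictionary_size = 4096;
+SET low_cardinality_use_single_dictionary_for_part = 1;
+
+CREATE TABLE lc_demo (s LowCardinality(String)) ENGINE = MergeTree ORDER BY s;
+INSERT INTO lc_demo SELECT toString(number % 100000) FROM numbers(1000000);
+```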
+ +## allow_suspicious_low_cardinality_types {#allow_suspicious_low_cardinality_types} + +Allows or restricts using [LowCardinality](../../sql-reference/data-types/lowcardinality.md) with data types with fixed size of 8 bytes or less: numeric data types and `FixedString(8_bytes_or_less)`. + +For small fixed values using of `LowCardinality` is usually inefficient, because ClickHouse stores a numeric index for each row. As a result: + +- Disk space usage can rise. +- RAM consumption can be higher, depending on a dictionary size. +- Some functions can work slower due to extra coding/encoding operations. + +Merge times in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine tables can grow due to all the reasons described above. + +Possible values: + +- 1 — Usage of `LowCardinality` is not restricted. +- 0 — Usage of `LowCardinality` is restricted. + +Default value: 0. + +## min_insert_block_size_rows_for_materialized_views {#min-insert-block-size-rows-for-materialized-views} + +Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +Default value: 1048576. + +**See Also** + +- [min_insert_block_size_rows](#min-insert-block-size-rows) + +## min_insert_block_size_bytes_for_materialized_views {#min-insert-block-size-bytes-for-materialized-views} + +Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +Default value: 268435456. + +**See also** + +- [min_insert_block_size_bytes](#min-insert-block-size-bytes) + +## output_format_pretty_grid_charset {#output-format-pretty-grid-charset} + +Allows changing a charset which is used for printing grids borders. Available charsets are UTF-8, ASCII. + +**Example** + +``` text +SET output_format_pretty_grid_charset = 'UTF-8'; +SELECT * FROM a; +┌─a─┐ +│ 1 │ +└───┘ + +SET output_format_pretty_grid_charset = 'ASCII'; +SELECT * FROM a; ++-a-+ +| 1 | ++---+ +``` +## optimize_read_in_order {#optimize_read_in_order} + +Enables [ORDER BY](../../sql-reference/statements/select/order-by.md#optimize_read_in_order) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for reading data from [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +Possible values: + +- 0 — `ORDER BY` optimization is disabled. +- 1 — `ORDER BY` optimization is enabled. + +Default value: `1`. 
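+
+**Example**
+
+A minimal sketch of a query that can benefit from this optimization; the `events` table is hypothetical and assumed to be a MergeTree table with `ORDER BY (event_date, event_time)`:
+
+```sql
+-- with optimize_read_in_order = 1 (the default), the ORDER BY below can reuse the table's sorting order
+-- instead of sorting all matching rows before applying the LIMIT
+SELECT event_date, event_time
+FROM events
+ORDER BY event_date DESC, event_time DESC
+LIMIT 10;
+```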
+ +**See Also** + +- [ORDER BY Clause](../../sql-reference/statements/select/order-by.md#optimize_read_in_order) + +## optimize_aggregation_in_order {#optimize_aggregation_in_order} + +Enables [GROUP BY](../../sql-reference/statements/select/group-by.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries for aggregating data in corresponding order in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +Possible values: + +- 0 — `GROUP BY` optimization is disabled. +- 1 — `GROUP BY` optimization is enabled. + +Default value: `0`. + +**See Also** + +- [GROUP BY optimization](../../sql-reference/statements/select/group-by.md#aggregation-in-order) + +## mutations_sync {#mutations_sync} + +Allows to execute `ALTER TABLE ... UPDATE|DELETE` queries ([mutations](../../sql-reference/statements/alter/index.md#mutations)) synchronously. + +Possible values: + +- 0 - Mutations execute asynchronously. +- 1 - The query waits for all mutations to complete on the current server. +- 2 - The query waits for all mutations to complete on all replicas (if they exist). + +Default value: `0`. + +**See Also** + +- [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) +- [Mutations](../../sql-reference/statements/alter/index.md#mutations) + +## ttl_only_drop_parts {#ttl_only_drop_parts} + +Enables or disables complete dropping of data parts where all rows are expired in [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +When `ttl_only_drop_parts` is disabled (by default), the ClickHouse server only deletes expired rows according to their TTL. + +When `ttl_only_drop_parts` is enabled, the ClickHouse server drops a whole part when all rows in it are expired. + +Dropping whole parts instead of partial cleaning TTL-d rows allows having shorter `merge_with_ttl_timeout` times and lower impact on system performance. + +Possible values: + +- 0 — The complete dropping of data parts is disabled. +- 1 — The complete dropping of data parts is enabled. + +Default value: `0`. + +**See Also** + +- [CREATE TABLE query clauses and settings](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) (`merge_with_ttl_timeout` setting) +- [Table TTL](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) + +## lock_acquire_timeout {#lock_acquire_timeout} + +Defines how many seconds a locking request waits before failing. + +Locking timeout is used to protect from deadlocks while executing read/write operations with tables. When the timeout expires and the locking request fails, the ClickHouse server throws an exception "Locking attempt timed out! Possible deadlock avoided. Client should retry." with error code `DEADLOCK_AVOIDED`. + +Possible values: + +- Positive integer (in seconds). +- 0 — No locking timeout. + +Default value: `120` seconds. + +## cast_keep_nullable {#cast_keep_nullable} + +Enables or disables keeping of the `Nullable` data type in [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) operations. + +When the setting is enabled and the argument of `CAST` function is `Nullable`, the result is also transformed to `Nullable` type. When the setting is disabled, the result always has the destination type exactly. + +Possible values: + +- 0 — The `CAST` result has exactly the destination type specified. 
+- 1 — If the argument type is `Nullable`, the `CAST` result is transformed to `Nullable(DestinationDataType)`. + +Default value: `0`. + +**Examples** + +The following query results in the destination data type exactly: + +```sql +SET cast_keep_nullable = 0; +SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); +``` + +Result: + +```text +┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ +│ 0 │ Int32 │ +└───┴───────────────────────────────────────────────────┘ +``` + +The following query results in the `Nullable` modification on the destination data type: + +```sql +SET cast_keep_nullable = 1; +SELECT CAST(toNullable(toInt32(0)) AS Int32) as x, toTypeName(x); +``` + +Result: + +```text +┌─x─┬─toTypeName(CAST(toNullable(toInt32(0)), 'Int32'))─┐ +│ 0 │ Nullable(Int32) │ +└───┴───────────────────────────────────────────────────┘ +``` + +**See Also** + +- [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function + +## output_format_pretty_max_value_width {#output_format_pretty_max_value_width} + +Limits the width of value displayed in [Pretty](../../interfaces/formats.md#pretty) formats. If the value width exceeds the limit, the value is cut. + +Possible values: + +- Positive integer. +- 0 — The value is cut completely. + +Default value: `10000` symbols. + +**Examples** + +Query: +```sql +SET output_format_pretty_max_value_width = 10; +SELECT range(number) FROM system.numbers LIMIT 10 FORMAT PrettyCompactNoEscapes; +``` +Result: +```text +┌─range(number)─┐ +│ [] │ +│ [0] │ +│ [0,1] │ +│ [0,1,2] │ +│ [0,1,2,3] │ +│ [0,1,2,3,4⋯ │ +│ [0,1,2,3,4⋯ │ +│ [0,1,2,3,4⋯ │ +│ [0,1,2,3,4⋯ │ +│ [0,1,2,3,4⋯ │ +└───────────────┘ +``` + +Query with zero width: +```sql +SET output_format_pretty_max_value_width = 0; +SELECT range(number) FROM system.numbers LIMIT 5 FORMAT PrettyCompactNoEscapes; +``` +Result: +```text +┌─range(number)─┐ +│ ⋯ │ +│ ⋯ │ +│ ⋯ │ +│ ⋯ │ +│ ⋯ │ +└───────────────┘ +``` + +## output_format_pretty_row_numbers {#output_format_pretty_row_numbers} + +Adds row numbers to output in the [Pretty](../../interfaces/formats.md#pretty) format. + +Possible values: + +- 0 — Output without row numbers. +- 1 — Output with row numbers. + +Default value: `0`. + +**Example** + +Query: + +```sql +SET output_format_pretty_row_numbers = 1; +SELECT TOP 3 name, value FROM system.settings; +``` + +Result: +```text + ┌─name────────────────────┬─value───┐ +1. │ min_compress_block_size │ 65536 │ +2. │ max_compress_block_size │ 1048576 │ +3. │ max_block_size │ 65505 │ + └─────────────────────────┴─────────┘ +``` + +## system_events_show_zero_values {#system_events_show_zero_values} + +Allows to select zero-valued events from [`system.events`](../../operations/system-tables/events.md). + +Some monitoring systems require passing all the metrics values to them for each checkpoint, even if the metric value is zero. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. + +**Examples** + +Query + +```sql +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Result + +```text +Ok. +``` + +Query +```sql +SET system_events_show_zero_values = 1; +SELECT * FROM system.events WHERE event='QueryMemoryLimitExceeded'; +``` + +Result + +```text +┌─event────────────────────┬─value─┬─description───────────────────────────────────────────┐ +│ QueryMemoryLimitExceeded │ 0 │ Number of times when memory limit exceeded for query. 
│ +└──────────────────────────┴───────┴───────────────────────────────────────────────────────┘ +``` + +## persistent {#persistent} + +Disables persistency for the [Set](../../engines/table-engines/special/set.md#set) and [Join](../../engines/table-engines/special/join.md#join) table engines. + +Reduces the I/O overhead. Suitable for scenarios that pursue performance and do not require persistence. + +Possible values: + +- 1 — Enabled. +- 0 — Disabled. + +Default value: `1`. + +## format_csv_null_representation {#format_csv_null_representation} + +Defines the representation of `NULL` for [CSV](../../interfaces/formats.md#csv) output and input formats. User can set any string as a value, for example, `My NULL`. + +Default value: `\N`. + +**Examples** + +Query + +```sql +SELECT * from csv_custom_null FORMAT CSV; +``` + +Result + +```text +788 +\N +\N +``` + +Query + +```sql +SET format_csv_null_representation = 'My NULL'; +SELECT * FROM csv_custom_null FORMAT CSV; +``` + +Result + +```text +788 +My NULL +My NULL +``` + +## format_tsv_null_representation {#format_tsv_null_representation} + +Defines the representation of `NULL` for [TSV](../../interfaces/formats.md#tabseparated) output and input formats. User can set any string as a value, for example, `My NULL`. + +Default value: `\N`. + +**Examples** + +Query + +```sql +SELECT * FROM tsv_custom_null FORMAT TSV; +``` + +Result + +```text +788 +\N +\N +``` + +Query + +```sql +SET format_tsv_null_representation = 'My NULL'; +SELECT * FROM tsv_custom_null FORMAT TSV; +``` + +Result + +```text +788 +My NULL +My NULL +``` + +## output_format_json_array_of_rows {#output-format-json-array-of-rows} + +Enables the ability to output all rows as a JSON array in the [JSONEachRow](../../interfaces/formats.md#jsoneachrow) format. + +Possible values: + +- 1 — ClickHouse outputs all rows as an array, each row in the `JSONEachRow` format. +- 0 — ClickHouse outputs each row separately in the `JSONEachRow` format. + +Default value: `0`. + +**Example of a query with the enabled setting** + +Query: + +```sql +SET output_format_json_array_of_rows = 1; +SELECT number FROM numbers(3) FORMAT JSONEachRow; +``` + +Result: + +```text +[ +{"number":"0"}, +{"number":"1"}, +{"number":"2"} +] +``` + +**Example of a query with the disabled setting** + +Query: + +```sql +SET output_format_json_array_of_rows = 0; +SELECT number FROM numbers(3) FORMAT JSONEachRow; +``` + +Result: + +```text +{"number":"0"} +{"number":"1"} +{"number":"2"} +``` + +## allow_nullable_key {#allow-nullable-key} + +Allows using of the [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable)-typed values in a sorting and a primary key for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) tables. + +Possible values: + +- 1 — `Nullable`-type expressions are allowed in keys. +- 0 — `Nullable`-type expressions are not allowed in keys. + +Default value: `0`. + +:::warning +Nullable primary key usually indicates bad design. It is forbidden in almost all main stream DBMS. The feature is mainly for [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) and is not heavily tested. Use with care. +::: + +:::warning +Do not enable this feature in version `<= 21.8`. It's not properly implemented and may lead to server crash. 
+::: + +## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} + +Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility. +It is implemented via query rewrite (similar to [count_distinct_implementation](#settings-count_distinct_implementation) setting) to get consistent results for distributed queries. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 0. + +**Example** + +Consider the following query with aggregate functions: +```sql +SELECT SUM(-1), MAX(0) FROM system.one WHERE 0; +``` + +With `aggregate_functions_null_for_empty = 0` it would produce: +```text +┌─SUM(-1)─┬─MAX(0)─┐ +│ 0 │ 0 │ +└─────────┴────────┘ +``` + +With `aggregate_functions_null_for_empty = 1` the result would be: +```text +┌─SUMOrNull(-1)─┬─MAXOrNull(0)─┐ +│ NULL │ NULL │ +└───────────────┴──────────────┘ +``` + +## union_default_mode {#union-default-mode} + +Sets a mode for combining `SELECT` query results. The setting is only used when shared with [UNION](../../sql-reference/statements/select/union.md) without explicitly specifying the `UNION ALL` or `UNION DISTINCT`. + +Possible values: + +- `'DISTINCT'` — ClickHouse outputs rows as a result of combining queries removing duplicate rows. +- `'ALL'` — ClickHouse outputs all rows as a result of combining queries including duplicate rows. +- `''` — ClickHouse generates an exception when used with `UNION`. + +Default value: `''`. + +See examples in [UNION](../../sql-reference/statements/select/union.md). + +## data_type_default_nullable {#data_type_default_nullable} + +Allows data types without explicit modifiers [NULL or NOT NULL](../../sql-reference/statements/create/table.md#null-modifiers) in column definition will be [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable). + +Possible values: + +- 1 — The data types in column definitions are set to `Nullable` by default. +- 0 — The data types in column definitions are set to not `Nullable` by default. + +Default value: `0`. + +## execute_merges_on_single_replica_time_threshold {#execute-merges-on-single-replica-time-threshold} + +Enables special logic to perform merges on replicas. + +Possible values: + +- Positive integer (in seconds). +- 0 — Special merges logic is not used. Merges happen in the usual way on all the replicas. + +Default value: `0`. + +**Usage** + +Selects one replica to perform the merge on. Sets the time threshold from the start of the merge. Other replicas wait for the merge to finish, then download the result. If the time threshold passes and the selected replica does not perform the merge, then the merge is performed on other replicas as usual. + +High values for that threshold may lead to replication delays. + +It can be useful when merges are CPU bounded not IO bounded (performing heavy data compression, calculating aggregate functions or default expressions that require a large amount of calculations, or just very high number of tiny merges). + +## max_final_threads {#max-final-threads} + +Sets the maximum number of parallel threads for the `SELECT` query data read phase with the [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. + +Possible values: + +- Positive integer. +- 0 or 1 — Disabled. `SELECT` queries are executed in a single thread. + +Default value: `16`. 
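+
+**Example**
+
+A minimal sketch; the `visits_final` table is hypothetical and assumed to use an engine for which `FINAL` is meaningful (for example, `ReplacingMergeTree`):
+
+```sql
+-- allow up to 8 parallel threads for the read phase of a SELECT ... FINAL query
+SELECT count()
+FROM visits_final FINAL
+SETTINGS max_final_threads = 8;
+```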
+ +## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability} + +Sets the probability that the ClickHouse can start a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied). + +Possible values: + +- 0 — The trace for all executed queries is disabled (if no parent trace context is supplied). +- Positive floating-point number in the range [0..1]. For example, if the setting value is `0,5`, ClickHouse can start a trace on average for half of the queries. +- 1 — The trace for all executed queries is enabled. + +Default value: `0`. + +## optimize_on_insert {#optimize-on-insert} + +Enables or disables data transformation before the insertion, as if merge was done on this block (according to table engine). + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: 1. + +**Example** + +The difference between enabled and disabled: + +Query: + +```sql +SET optimize_on_insert = 1; + +CREATE TABLE test1 (`FirstTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY FirstTable; + +INSERT INTO test1 SELECT number % 2 FROM numbers(5); + +SELECT * FROM test1; + +SET optimize_on_insert = 0; + +CREATE TABLE test2 (`SecondTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY SecondTable; + +INSERT INTO test2 SELECT number % 2 FROM numbers(5); + +SELECT * FROM test2; +``` + +Result: + +``` text +┌─FirstTable─┐ +│ 0 │ +│ 1 │ +└────────────┘ + +┌─SecondTable─┐ +│ 0 │ +│ 0 │ +│ 0 │ +│ 1 │ +│ 1 │ +└─────────────┘ +``` + +Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializedMySQL](../../engines/database-engines/materialized-mysql.md) behaviour. + +## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} + +Allows to select data from a file engine table without file. + +Possible values: +- 0 — `SELECT` throws exception. +- 1 — `SELECT` returns empty result. + +Default value: `0`. + +## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} + +Enables or disables truncate before insert in [File](../../engines/table-engines/special/file.md) engine tables. + +Possible values: +- 0 — `INSERT` query appends new data to the end of the file. +- 1 — `INSERT` replaces existing content of the file with the new data. + +Default value: `0`. + +## allow_experimental_geo_types {#allow-experimental-geo-types} + +Allows working with experimental [geo data types](../../sql-reference/data-types/geo.md). + +Possible values: + +- 0 — Working with geo data types is disabled. +- 1 — Working with geo data types is enabled. + +Default value: `0`. + +## database_atomic_wait_for_drop_and_detach_synchronously {#database_atomic_wait_for_drop_and_detach_synchronously} + +Adds a modifier `SYNC` to all `DROP` and `DETACH` queries. + +Possible values: + +- 0 — Queries will be executed with delay. +- 1 — Queries will be executed without delay. + +Default value: `0`. + +## show_table_uuid_in_table_create_query_if_not_nil {#show_table_uuid_in_table_create_query_if_not_nil} + +Sets the `SHOW TABLE` query display. + +Possible values: + +- 0 — The query will be displayed without table UUID. +- 1 — The query will be displayed with table UUID. + +Default value: `0`. + +## allow_experimental_live_view {#allow-experimental-live-view} + +Allows creation of experimental [live views](../../sql-reference/statements/create/view.md#live-view). + +Possible values: + +- 0 — Working with live views is disabled. +- 1 — Working with live views is enabled. + +Default value: `0`. 
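+
+A minimal sketch of enabling the setting and creating a live view, assuming a hypothetical source table `hits_src`:
+
+```sql
+SET allow_experimental_live_view = 1;
+
+-- Hypothetical source table; the live view result is recalculated as new data arrives.
+CREATE TABLE hits_src (id UInt64) ENGINE = MergeTree ORDER BY id;
+
+CREATE LIVE VIEW lv_hits AS SELECT count() FROM hits_src;
+
+SELECT * FROM lv_hits;
+```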
+ +## live_view_heartbeat_interval {#live-view-heartbeat-interval} + +Sets the heartbeat interval in seconds to indicate [live view](../../sql-reference/statements/create/view.md#live-view) is alive . + +Default value: `15`. + +## max_live_view_insert_blocks_before_refresh {#max-live-view-insert-blocks-before-refresh} + +Sets the maximum number of inserted blocks after which mergeable blocks are dropped and query for [live view](../../sql-reference/statements/create/view.md#live-view) is re-executed. + +Default value: `64`. + +## temporary_live_view_timeout {#temporary-live-view-timeout} + +Sets the interval in seconds after which [live view](../../sql-reference/statements/create/view.md#live-view) with timeout is deleted. + +Default value: `5`. + +## periodic_live_view_refresh {#periodic-live-view-refresh} + +Sets the interval in seconds after which periodically refreshed [live view](../../sql-reference/statements/create/view.md#live-view) is forced to refresh. + +Default value: `60`. + +## http_connection_timeout {#http_connection_timeout} + +HTTP connection timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). + +Default value: 1. + +## http_send_timeout {#http_send_timeout} + +HTTP send timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). + +Default value: 1800. + +## http_receive_timeout {#http_receive_timeout} + +HTTP receive timeout (in seconds). + +Possible values: + +- Any positive integer. +- 0 - Disabled (infinite timeout). + +Default value: 1800. + +## check_query_single_value_result {#check_query_single_value_result} + +Defines the level of detail for the [CHECK TABLE](../../sql-reference/statements/check-table.md#checking-mergetree-tables) query result for `MergeTree` family engines . + +Possible values: + +- 0 — the query shows a check status for every individual data part of a table. +- 1 — the query shows the general table check status. + +Default value: `0`. + +## prefer_column_name_to_alias {#prefer-column-name-to-alias} + +Enables or disables using the original column names instead of aliases in query expressions and clauses. It especially matters when alias is the same as the column name, see [Expression Aliases](../../sql-reference/syntax.md#notes-on-usage). Enable this setting to make aliases syntax rules in ClickHouse more compatible with most other database engines. + +Possible values: + +- 0 — The column name is substituted with the alias. +- 1 — The column name is not substituted with the alias. + +Default value: `0`. + +**Example** + +The difference between enabled and disabled: + +Query: + +```sql +SET prefer_column_name_to_alias = 0; +SELECT avg(number) AS number, max(number) FROM numbers(10); +``` + +Result: + +```text +Received exception from server (version 21.5.1): +Code: 184. DB::Exception: Received from localhost:9000. DB::Exception: Aggregate function avg(number) is found inside another aggregate function in query: While processing avg(number) AS number. +``` + +Query: + +```sql +SET prefer_column_name_to_alias = 1; +SELECT avg(number) AS number, max(number) FROM numbers(10); +``` + +Result: + +```text +┌─number─┬─max(number)─┐ +│ 4.5 │ 9 │ +└────────┴─────────────┘ +``` + +## limit {#limit} + +Sets the maximum number of rows to get from the query result. It adjusts the value set by the [LIMIT](../../sql-reference/statements/select/limit.md#limit-clause) clause, so that the limit, specified in the query, cannot exceed the limit, set by this setting. 
+ +Possible values: + +- 0 — The number of rows is not limited. +- Positive integer. + +Default value: `0`. + +## offset {#offset} + +Sets the number of rows to skip before starting to return rows from the query. It adjusts the offset set by the [OFFSET](../../sql-reference/statements/select/offset.md#offset-fetch) clause, so that these two values are summarized. + +Possible values: + +- 0 — No rows are skipped . +- Positive integer. + +Default value: `0`. + +**Example** + +Input table: + +``` sql +CREATE TABLE test (i UInt64) ENGINE = MergeTree() ORDER BY i; +INSERT INTO test SELECT number FROM numbers(500); +``` + +Query: + +``` sql +SET limit = 5; +SET offset = 7; +SELECT * FROM test LIMIT 10 OFFSET 100; +``` +Result: + +``` text +┌───i─┐ +│ 107 │ +│ 108 │ +│ 109 │ +└─────┘ +``` + +## optimize_syntax_fuse_functions {#optimize_syntax_fuse_functions} + +Enables to fuse aggregate functions with identical argument. It rewrites query contains at least two aggregate functions from [sum](../../sql-reference/aggregate-functions/reference/sum.md#agg_function-sum), [count](../../sql-reference/aggregate-functions/reference/count.md#agg_function-count) or [avg](../../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) with identical argument to [sumCount](../../sql-reference/aggregate-functions/reference/sumcount.md#agg_function-sumCount). + +Possible values: + +- 0 — Functions with identical argument are not fused. +- 1 — Functions with identical argument are fused. + +Default value: `0`. + +**Example** + +Query: + +``` sql +CREATE TABLE fuse_tbl(a Int8, b Int8) Engine = Log; +SET optimize_syntax_fuse_functions = 1; +EXPLAIN SYNTAX SELECT sum(a), sum(b), count(b), avg(b) from fuse_tbl FORMAT TSV; +``` + +Result: + +``` text +SELECT + sum(a), + sumCount(b).1, + sumCount(b).2, + (sumCount(b).1) / (sumCount(b).2) +FROM fuse_tbl +``` + +## allow_experimental_database_replicated {#allow_experimental_database_replicated} + +Enables to create databases with [Replicated](../../engines/database-engines/replicated.md) engine. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. + +## database_replicated_initial_query_timeout_sec {#database_replicated_initial_query_timeout_sec} + +Sets how long initial DDL query should wait for Replicated database to precess previous DDL queue entries in seconds. + +Possible values: + +- Positive integer. +- 0 — Unlimited. + +Default value: `300`. + +## distributed_ddl_task_timeout {#distributed_ddl_task_timeout} + +Sets timeout for DDL query responses from all hosts in cluster. If a DDL request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. + +Possible values: + +- Positive integer. +- 0 — Async mode. +- Negative integer — infinite timeout. + +Default value: `180`. + +## distributed_ddl_output_mode {#distributed_ddl_output_mode} + +Sets format of distributed DDL query result. + +Possible values: + +- `throw` — Returns result set with query execution status for all hosts where query is finished. If query has failed on some hosts, then it will rethrow the first exception. If query is not finished yet on some hosts and [distributed_ddl_task_timeout](#distributed_ddl_task_timeout) exceeded, then it throws `TIMEOUT_EXCEEDED` exception. +- `none` — Is similar to throw, but distributed DDL query returns no result set. 
+- `null_status_on_timeout` — Returns `NULL` as execution status in some rows of result set instead of throwing `TIMEOUT_EXCEEDED` if query is not finished on the corresponding hosts. +- `never_throw` — Do not throw `TIMEOUT_EXCEEDED` and do not rethrow exceptions if query has failed on some hosts. + +Default value: `throw`. + +## flatten_nested {#flatten-nested} + +Sets the data format of a [nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns. + +Possible values: + +- 1 — Nested column is flattened to separate arrays. +- 0 — Nested column stays a single array of tuples. + +Default value: `1`. + +**Usage** + +If the setting is set to `0`, it is possible to use an arbitrary level of nesting. + +**Examples** + +Query: + +``` sql +SET flatten_nested = 1; +CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); + +SHOW CREATE TABLE t_nest; +``` + +Result: + +``` text +┌─statement───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_nest +( + `n.a` Array(UInt32), + `n.b` Array(UInt32) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS index_granularity = 8192 │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SET flatten_nested = 0; + +CREATE TABLE t_nest (`n` Nested(a UInt32, b UInt32)) ENGINE = MergeTree ORDER BY tuple(); + +SHOW CREATE TABLE t_nest; +``` + +Result: + +``` text +┌─statement──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_nest +( + `n` Nested(a UInt32, b UInt32) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS index_granularity = 8192 │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## external_table_functions_use_nulls {#external-table-functions-use-nulls} + +Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md)] table functions use Nullable columns. + +Possible values: + +- 0 — The table function explicitly uses Nullable columns. +- 1 — The table function implicitly uses Nullable columns. + +Default value: `1`. + +**Usage** + +If the setting is set to `0`, the table function does not make Nullable columns and inserts default values instead of NULL. This is also applicable for NULL values inside arrays. + +## output_format_arrow_low_cardinality_as_dictionary {#output-format-arrow-low-cardinality-as-dictionary} + +Allows to convert the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) type to the `DICTIONARY` type of the [Arrow](../../interfaces/formats.md#data-format-arrow) format for `SELECT` queries. + +Possible values: + +- 0 — The `LowCardinality` type is not converted to the `DICTIONARY` type. +- 1 — The `LowCardinality` type is converted to the `DICTIONARY` type. + +Default value: `0`. + +## allow_experimental_projection_optimization {#allow-experimental-projection-optimization} + +Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md#projections) optimization when processing `SELECT` queries. + +Possible values: + +- 0 — Projection optimization disabled. +- 1 — Projection optimization enabled. 
+ +Default value: `0`. + +## force_optimize_projection {#force-optimize-projection} + +Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md#projections) in `SELECT` queries, when projection optimization is enabled (see [allow_experimental_projection_optimization](#allow-experimental-projection-optimization) setting). + +Possible values: + +- 0 — Projection optimization is not obligatory. +- 1 — Projection optimization is obligatory. + +Default value: `0`. + +## replication_alter_partitions_sync {#replication-alter-partitions-sync} + +Allows to set up waiting for actions to be executed on replicas by [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. + +Possible values: + +- 0 — Do not wait. +- 1 — Wait for own execution. +- 2 — Wait for everyone. + +Default value: `1`. + +## replication_wait_for_inactive_replica_timeout {#replication-wait-for-inactive-replica-timeout} + +Specifies how long (in seconds) to wait for inactive replicas to execute [ALTER](../../sql-reference/statements/alter/index.md), [OPTIMIZE](../../sql-reference/statements/optimize.md) or [TRUNCATE](../../sql-reference/statements/truncate.md) queries. + +Possible values: + +- 0 — Do not wait. +- Negative integer — Wait for unlimited time. +- Positive integer — The number of seconds to wait. + +Default value: `120` seconds. + +## regexp_max_matches_per_row {#regexp-max-matches-per-row} + +Sets the maximum number of matches for a single regular expression per row. Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md#extractallgroups-horizontal) function. + +Possible values: + +- Positive integer. + +Default value: `1000`. + +## http_max_single_read_retries {#http-max-single-read-retries} + +Sets the maximum number of retries during a single HTTP read. + +Possible values: + +- Positive integer. + +Default value: `1024`. + +## log_queries_probability {#log-queries-probability} + +Allows a user to write to [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), and [query_views_log](../../operations/system-tables/query_views_log.md) system tables only a sample of queries selected randomly with the specified probability. It helps to reduce the load with a large volume of queries in a second. + +Possible values: + +- 0 — Queries are not logged in the system tables. +- Positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, about half of the queries are logged in the system tables. +- 1 — All queries are logged in the system tables. + +Default value: `1`. + +## short_circuit_function_evaluation {#short-circuit-function-evaluation} + +Allows calculating the [if](../../sql-reference/functions/conditional-functions.md#if), [multiIf](../../sql-reference/functions/conditional-functions.md#multiif), [and](../../sql-reference/functions/logical-functions.md#logical-and-function), and [or](../../sql-reference/functions/logical-functions.md#logical-or-function) functions according to a [short scheme](https://en.wikipedia.org/wiki/Short-circuit_evaluation). This helps optimize the execution of complex expressions in these functions and prevent possible exceptions (such as division by zero when it is not expected). 
+ +Possible values: + +- `enable` — Enables short-circuit function evaluation for functions that are suitable for it (can throw an exception or computationally heavy). +- `force_enable` — Enables short-circuit function evaluation for all functions. +- `disable` — Disables short-circuit function evaluation. + +Default value: `enable`. + +## max_hyperscan_regexp_length {#max-hyperscan-regexp-length} + +Defines the maximum length for each regular expression in the [hyperscan multi-match functions](../../sql-reference/functions/string-search-functions.md#multimatchanyhaystack-pattern1-pattern2-patternn). + +Possible values: + +- Positive integer. +- 0 - The length is not limited. + +Default value: `0`. + +**Example** + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 3; +``` + +Result: + +```text +┌─multiMatchAny('abcd', ['ab', 'bcd', 'c', 'd'])─┐ +│ 1 │ +└────────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bcd','c','d']) SETTINGS max_hyperscan_regexp_length = 2; +``` + +Result: + +```text +Exception: Regexp length too large. +``` + +**See Also** + +- [max_hyperscan_regexp_total_length](#max-hyperscan-regexp-total-length) + +## max_hyperscan_regexp_total_length {#max-hyperscan-regexp-total-length} + +Sets the maximum length total of all regular expressions in each [hyperscan multi-match function](../../sql-reference/functions/string-search-functions.md#multimatchanyhaystack-pattern1-pattern2-patternn). + +Possible values: + +- Positive integer. +- 0 - The length is not limited. + +Default value: `0`. + +**Example** + +Query: + +```sql +SELECT multiMatchAny('abcd', ['a','b','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; +``` + +Result: + +```text +┌─multiMatchAny('abcd', ['a', 'b', 'c', 'd'])─┐ +│ 1 │ +└─────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT multiMatchAny('abcd', ['ab','bc','c','d']) SETTINGS max_hyperscan_regexp_total_length = 5; +``` + +Result: + +```text +Exception: Total regexp lengths too large. +``` + +**See Also** + +- [max_hyperscan_regexp_length](#max-hyperscan-regexp-length) + +## enable_positional_arguments {#enable-positional-arguments} + +Enables or disables supporting positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md) statements. When you want to use column numbers instead of column names in these clauses, set `enable_positional_arguments = 1`. + +Possible values: + +- 0 — Positional arguments aren't supported. +- 1 — Positional arguments are supported: column numbers can use instead of column names. + +Default value: `0`. + +**Example** + +Query: + +```sql +CREATE TABLE positional_arguments(one Int, two Int, three Int) ENGINE=Memory(); + +INSERT INTO positional_arguments VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20); + +SET enable_positional_arguments = 1; + +SELECT * FROM positional_arguments ORDER BY 2,3; +``` + +Result: + +```text +┌─one─┬─two─┬─three─┐ +│ 30 │ 10 │ 20 │ +│ 20 │ 20 │ 10 │ +│ 10 │ 20 │ 30 │ +└─────┴─────┴───────┘ +``` + +## optimize_move_to_prewhere {#optimize_move_to_prewhere} + +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries. 
+ +Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. + +Possible values: + +- 0 — Automatic `PREWHERE` optimization is disabled. +- 1 — Automatic `PREWHERE` optimization is enabled. + +Default value: `1`. + +## optimize_move_to_prewhere_if_final {#optimize_move_to_prewhere_if_final} + +Enables or disables automatic [PREWHERE](../../sql-reference/statements/select/prewhere.md) optimization in [SELECT](../../sql-reference/statements/select/index.md) queries with [FINAL](../../sql-reference/statements/select/from.md#select-from-final) modifier. + +Works only for [*MergeTree](../../engines/table-engines/mergetree-family/index.md) tables. + +Possible values: + +- 0 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is disabled. +- 1 — Automatic `PREWHERE` optimization in `SELECT` queries with `FINAL` modifier is enabled. + +Default value: `0`. + +**See Also** + +- [optimize_move_to_prewhere](#optimize_move_to_prewhere) setting + +## describe_include_subcolumns {#describe_include_subcolumns} + +Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md#finding-null) or an [Array](../../sql-reference/data-types/array.md#array-size) data type. + +Possible values: + +- 0 — Subcolumns are not included in `DESCRIBE` queries. +- 1 — Subcolumns are included in `DESCRIBE` queries. + +Default value: `0`. + +**Example** + +See an example for the [DESCRIBE](../../sql-reference/statements/describe-table.md) statement. + +## async_insert {#async-insert} + +Enables or disables asynchronous inserts. This makes sense only for insertion over HTTP protocol. Note that deduplication isn't working for such inserts. + +If enabled, the data is combined into batches before the insertion into tables, so it is possible to do small and frequent insertions into ClickHouse (up to 15000 queries per second) without buffer tables. + +The data is inserted either after the [async_insert_max_data_size](#async-insert-max-data-size) is exceeded or after [async_insert_busy_timeout_ms](#async-insert-busy-timeout-ms) milliseconds since the first `INSERT` query. If the [async_insert_stale_timeout_ms](#async-insert-stale-timeout-ms) is set to a non-zero value, the data is inserted after `async_insert_stale_timeout_ms` milliseconds since the last query. + +If [wait_for_async_insert](#wait-for-async-insert) is enabled, every client will wait for the data to be processed and flushed to the table. Otherwise, the query would be processed almost instantly, even if the data is not inserted. + +Possible values: + +- 0 — Insertions are made synchronously, one after another. +- 1 — Multiple asynchronous insertions enabled. + +Default value: `0`. + +## async_insert_threads {#async-insert-threads} + +The maximum number of threads for background data parsing and insertion. + +Possible values: + +- Positive integer. +- 0 — Asynchronous insertions are disabled. + +Default value: `16`. + +## wait_for_async_insert {#wait-for-async-insert} + +Enables or disables waiting for processing of asynchronous insertion. If enabled, server will return `OK` only after the data is inserted. Otherwise, it will return `OK` even if the data wasn't inserted. + +Possible values: + +- 0 — Server returns `OK` even if the data is not yet inserted. 
+- 1 — Server returns `OK` only after the data is inserted. + +Default value: `1`. + +## wait_for_async_insert_timeout {#wait-for-async-insert-timeout} + +The timeout in seconds for waiting for processing of asynchronous insertion. + +Possible values: + +- Positive integer. +- 0 — Disabled. + +Default value: [lock_acquire_timeout](#lock_acquire_timeout). + +## async_insert_max_data_size {#async-insert-max-data-size} + +The maximum size of the unparsed data in bytes collected per query before being inserted. + +Possible values: + +- Positive integer. +- 0 — Asynchronous insertions are disabled. + +Default value: `1000000`. + +## async_insert_busy_timeout_ms {#async-insert-busy-timeout-ms} + +The maximum timeout in milliseconds since the first `INSERT` query before inserting collected data. + +Possible values: + +- Positive integer. +- 0 — Timeout disabled. + +Default value: `200`. + +## async_insert_stale_timeout_ms {#async-insert-stale-timeout-ms} + +The maximum timeout in milliseconds since the last `INSERT` query before dumping collected data. If enabled, the settings prolongs the [async_insert_busy_timeout_ms](#async-insert-busy-timeout-ms) with every `INSERT` query as long as [async_insert_max_data_size](#async-insert-max-data-size) is not exceeded. + +Possible values: + +- Positive integer. +- 0 — Timeout disabled. + +Default value: `0`. + +## alter_partition_verbose_result {#alter-partition-verbose-result} + +Enables or disables the display of information about the parts to which the manipulation operations with partitions and parts have been successfully applied. +Applicable to [ATTACH PARTITION|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) and to [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition). + +Possible values: + +- 0 — disable verbosity. +- 1 — enable verbosity. + +Default value: `0`. + +**Example** + +```sql +CREATE TABLE test(a Int64, d Date, s String) ENGINE = MergeTree PARTITION BY toYYYYMM(d) ORDER BY a; +INSERT INTO test VALUES(1, '2021-01-01', ''); +INSERT INTO test VALUES(1, '2021-01-01', ''); +ALTER TABLE test DETACH PARTITION ID '202101'; + +ALTER TABLE test ATTACH PARTITION ID '202101' SETTINGS alter_partition_verbose_result = 1; + +┌─command_type─────┬─partition_id─┬─part_name────┬─old_part_name─┐ +│ ATTACH PARTITION │ 202101 │ 202101_7_7_0 │ 202101_5_5_0 │ +│ ATTACH PARTITION │ 202101 │ 202101_8_8_0 │ 202101_6_6_0 │ +└──────────────────┴──────────────┴──────────────┴───────────────┘ + +ALTER TABLE test FREEZE SETTINGS alter_partition_verbose_result = 1; + +┌─command_type─┬─partition_id─┬─part_name────┬─backup_name─┬─backup_path───────────────────┬─part_backup_path────────────────────────────────────────────┐ +│ FREEZE ALL │ 202101 │ 202101_7_7_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_7_7_0 │ +│ FREEZE ALL │ 202101 │ 202101_8_8_0 │ 8 │ /var/lib/clickhouse/shadow/8/ │ /var/lib/clickhouse/shadow/8/data/default/test/202101_8_8_0 │ +└──────────────┴──────────────┴──────────────┴─────────────┴───────────────────────────────┴─────────────────────────────────────────────────────────────┘ +``` + +## format_capn_proto_enum_comparising_mode {#format-capn-proto-enum-comparising-mode} + +Determines how to map ClickHouse `Enum` data type and [CapnProto](../../interfaces/formats.md#capnproto) `Enum` data type from schema. + +Possible values: + +- `'by_values'` — Values in enums should be the same, names can be different. 
+- `'by_names'` — Names in enums should be the same, values can be different. +- `'by_name_case_insensitive'` — Names in enums should be the same case-insensitive, values can be different. + +Default value: `'by_values'`. + +## min_bytes_to_use_mmap_io {#min-bytes-to-use-mmap-io} + +This is an experimental setting. Sets the minimum amount of memory for reading large files without copying data from the kernel to userspace. Recommended threshold is about 64 MB, because [mmap/munmap](https://en.wikipedia.org/wiki/Mmap) is slow. It makes sense only for large files and helps only if data reside in the page cache. + +Possible values: + +- Positive integer. +- 0 — Big files read with only copying data from kernel to userspace. + +Default value: `0`. + +## format_custom_escaping_rule {#format-custom-escaping-rule} + +Sets the field escaping rule for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Possible values: + +- `'Escaped'` — Similarly to [TSV](../../interfaces/formats.md#tabseparated). +- `'Quoted'` — Similarly to [Values](../../interfaces/formats.md#data-format-values). +- `'CSV'` — Similarly to [CSV](../../interfaces/formats.md#csv). +- `'JSON'` — Similarly to [JSONEachRow](../../interfaces/formats.md#jsoneachrow). +- `'XML'` — Similarly to [XML](../../interfaces/formats.md#xml). +- `'Raw'` — Extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](../../interfaces/formats.md#tabseparatedraw). + +Default value: `'Escaped'`. + +## format_custom_field_delimiter {#format-custom-field-delimiter} + +Sets the character that is interpreted as a delimiter between the fields for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Default value: `'\t'`. + +## format_custom_row_before_delimiter {#format-custom-row-before-delimiter} + +Sets the character that is interpreted as a delimiter before the field of the first column for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Default value: `''`. + +## format_custom_row_after_delimiter {#format-custom-row-after-delimiter} + +Sets the character that is interpreted as a delimiter after the field of the last column for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Default value: `'\n'`. + +## format_custom_row_between_delimiter {#format-custom-row-between-delimiter} + +Sets the character that is interpreted as a delimiter between the rows for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Default value: `''`. + +## format_custom_result_before_delimiter {#format-custom-result-before-delimiter} + +Sets the character that is interpreted as a prefix before the result set for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Default value: `''`. + +## format_custom_result_after_delimiter {#format-custom-result-after-delimiter} + +Sets the character that is interpreted as a suffix after the result set for [CustomSeparated](../../interfaces/formats.md#format-customseparated) data format. + +Default value: `''`. + +## shutdown_wait_unfinished_queries + +Enables or disables waiting unfinished queries when shutdown server. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. The wait time equal shutdown_wait_unfinished config. + +Default value: 0. + +## shutdown_wait_unfinished + +The waiting time in seconds for currently handled connections when shutdown server. + +Default Value: 5. 
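+
+As a rough sketch of how the `format_custom_*` settings described above combine, the query below sets arbitrary delimiters (these values are illustrative choices, not defaults):
+
+```sql
+SET format_custom_escaping_rule = 'CSV',
+    format_custom_field_delimiter = ';',
+    format_custom_row_before_delimiter = '<',
+    format_custom_row_after_delimiter = '>',
+    format_custom_row_between_delimiter = '\n';
+
+-- Each row is rendered as <value;value> using the delimiters set above.
+SELECT number, concat('str_', toString(number)) FROM numbers(2) FORMAT CustomSeparated;
+```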
diff --git a/docs/en/reference/operations/ssl-zookeeper.md b/docs/en/reference/operations/ssl-zookeeper.md new file mode 100644 index 00000000000..d6043d521e7 --- /dev/null +++ b/docs/en/reference/operations/ssl-zookeeper.md @@ -0,0 +1,73 @@ +--- +sidebar_position: 45 +sidebar_label: Secured Communication with Zookeeper +--- + +# Optional secured communication between ClickHouse and Zookeeper {#secured-communication-with-zookeeper} + +You should specify `ssl.keyStore.location`, `ssl.keyStore.password` and `ssl.trustStore.location`, `ssl.trustStore.password` for communication with ClickHouse client over SSL. These options are available from Zookeeper version 3.5.2. + +You can add `zookeeper.crt` to trusted certificates. + +``` bash +sudo cp zookeeper.crt /usr/local/share/ca-certificates/zookeeper.crt +sudo update-ca-certificates +``` + +Client section in `config.xml` will look like: + +``` xml + + /etc/clickhouse-server/client.crt + /etc/clickhouse-server/client.key + true + true + sslv2,sslv3 + true + + RejectCertificateHandler + + +``` + +Add Zookeeper to ClickHouse config with some cluster and macros: + +``` xml + + + + localhost + 2281 + 1 + + + +``` + +Start `clickhouse-server`. In logs you should see: + +```text + ZooKeeper: initialized, hosts: secure://localhost:2281 +``` + +Prefix `secure://` indicates that connection is secured by SSL. + +To ensure traffic is encrypted run `tcpdump` on secured port: + +```bash +tcpdump -i any dst port 2281 -nnXS +``` + +And query in `clickhouse-client`: + +```sql +SELECT * FROM system.zookeeper WHERE path = '/'; +``` + +On unencrypted connection you will see in `tcpdump` output something like this: + +```text +..../zookeeper/quota. +``` + +On encrypted connection you should not see this. diff --git a/docs/en/reference/operations/storing-data.md b/docs/en/reference/operations/storing-data.md new file mode 100644 index 00000000000..2162ae066dd --- /dev/null +++ b/docs/en/reference/operations/storing-data.md @@ -0,0 +1,318 @@ +--- +sidebar_position: 68 +sidebar_label: External Disks for Storing Data +--- + +# External Disks for Storing Data {#external-disks} + +Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon S3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)). + +To work with data stored on `Amazon S3` disks use [S3](../engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](../engines/table-engines/integrations/hdfs.md) table engine. + +To load data from a web server with static files use a disk with type [web](#storing-data-on-webserver). + +## Zero-copy Replication {#zero-copy} + +ClickHouse supports zero-copy replication for `S3` and `HDFS` disks, which means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself. + +## Configuring HDFS {#configuring-hdfs} + +[MergeTree](../engines/table-engines/mergetree-family/mergetree.md) and [Log](../engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`. 
+
+Configuration markup:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <hdfs>
+                <type>hdfs</type>
+                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
+            </hdfs>
+        </disks>
+        <policies>
+            <hdfs>
+                <volumes>
+                    <main>
+                        <disk>hdfs</disk>
+                    </main>
+                </volumes>
+            </hdfs>
+        </policies>
+    </storage_configuration>
+
+    <merge_tree>
+        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
+    </merge_tree>
+</clickhouse>
+```
+
+Required parameters:
+
+- `endpoint` — HDFS endpoint URL in `path` format. Endpoint URL should contain a root path to store data.
+
+Optional parameters:
+
+- `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1 Mb`.
+
+## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system}
+
+You can encrypt the data stored on [S3](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one.
+
+Example of disk configuration:
+
+``` xml
+<disks>
+    <disk1>
+        <type>local</type>
+        <path>/path1/</path>
+    </disk1>
+    <disk2>
+        <type>encrypted</type>
+        <disk>disk1</disk>
+        <path>path2/</path>
+        <key>_16_ascii_chars_</key>
+    </disk2>
+</disks>
+```
+
+For example, when ClickHouse writes data from some table to a file `store/all_1_1_0/data.bin` to `disk1`, then in fact this file will be written to the physical disk along the path `/path1/store/all_1_1_0/data.bin`.
+
+When writing the same file to `disk2`, it will actually be written to the physical disk at the path `/path1/path2/store/all_1_1_0/data.bin` in encrypted mode.
+
+Required parameters:
+
+- `type` — `encrypted`. Otherwise the encrypted disk is not created.
+- `disk` — Type of disk for data storage.
+- `key` — The key for encryption and decryption. Type: [Uint64](../sql-reference/data-types/int-uint.md). You can use `key_hex` parameter to encrypt in hexadecimal form.
+  You can specify multiple keys using the `id` attribute (see example above).
+
+Optional parameters:
+
+- `path` — Path to the location on the disk where the data will be saved. If not specified, the data will be saved in the root directory.
+- `current_key_id` — The key used for encryption. All the specified keys can be used for decryption, and you can always switch to another key while maintaining access to previously encrypted data.
+- `algorithm` — [Algorithm](../sql-reference/statements/create/table.md#create-query-encryption-codecs) for encryption. Possible values: `AES_128_CTR`, `AES_192_CTR` or `AES_256_CTR`. Default value: `AES_128_CTR`. The key length depends on the algorithm: `AES_128_CTR` — 16 bytes, `AES_192_CTR` — 24 bytes, `AES_256_CTR` — 32 bytes.
+
+Example of disk configuration:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <disk_s3>
+                <type>s3</type>
+                ...
+            </disk_s3>
+            <disk_s3_encrypted>
+                <type>encrypted</type>
+                <disk>disk_s3</disk>
+                <algorithm>AES_128_CTR</algorithm>
+                <key_hex id="0">00112233445566778899aabbccddeeff</key_hex>
+                <key_hex id="1">ffeeddccbbaa99887766554433221100</key_hex>
+                <current_key_id>1</current_key_id>
+            </disk_s3_encrypted>
+        </disks>
+    </storage_configuration>
+</clickhouse>
+```
+
+## Storing Data on Web Server {#storing-data-on-webserver}
+
+There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
+
+This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). The local disk is not actually used: each `SELECT` query results in an HTTP request to fetch the required data. Any modification of the table data results in an exception, i.e. 
the following types of queries are not allowed: [CREATE TABLE](../sql-reference/statements/create/table.md), [ALTER TABLE](../sql-reference/statements/alter/index.md), [RENAME TABLE](../sql-reference/statements/rename.md#misc_operations-rename_table), [DETACH TABLE](../sql-reference/statements/detach.md) and [TRUNCATE TABLE](../sql-reference/statements/truncate.md).
+
+Web server storage is supported only for the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) and [Log](../engines/table-engines/log-family/log.md) engine families. To access the data stored on a `web` disk, use the [storage_policy](../engines/table-engines/mergetree-family/mergetree.md#terms) setting when executing the query. For example, `ATTACH TABLE table_web UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'`.
+
+Here is a ready test case. Add this configuration to the server config:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <web>
+                <type>web</type>
+                <endpoint>https://clickhouse-datasets.s3.yandex.net/disk-with-static-files-tests/test-hits/</endpoint>
+            </web>
+        </disks>
+        <policies>
+            <web>
+                <volumes>
+                    <main>
+                        <disk>web</disk>
+                    </main>
+                </volumes>
+            </web>
+        </policies>
+    </storage_configuration>
+</clickhouse>
+``` + +And then execute this query: + +```sql +ATTACH TABLE test_hits UUID '1ae36516-d62d-4218-9ae3-6516d62da218' +( + WatchID UInt64, + JavaEnable UInt8, + Title String, + GoodEvent Int16, + EventTime DateTime, + EventDate Date, + CounterID UInt32, + ClientIP UInt32, + ClientIP6 FixedString(16), + RegionID UInt32, + UserID UInt64, + CounterClass Int8, + OS UInt8, + UserAgent UInt8, + URL String, + Referer String, + URLDomain String, + RefererDomain String, + Refresh UInt8, + IsRobot UInt8, + RefererCategories Array(UInt16), + URLCategories Array(UInt16), + URLRegions Array(UInt32), + RefererRegions Array(UInt32), + ResolutionWidth UInt16, + ResolutionHeight UInt16, + ResolutionDepth UInt8, + FlashMajor UInt8, + FlashMinor UInt8, + FlashMinor2 String, + NetMajor UInt8, + NetMinor UInt8, + UserAgentMajor UInt16, + UserAgentMinor FixedString(2), + CookieEnable UInt8, + JavascriptEnable UInt8, + IsMobile UInt8, + MobilePhone UInt8, + MobilePhoneModel String, + Params String, + IPNetworkID UInt32, + TraficSourceID Int8, + SearchEngineID UInt16, + SearchPhrase String, + AdvEngineID UInt8, + IsArtifical UInt8, + WindowClientWidth UInt16, + WindowClientHeight UInt16, + ClientTimeZone Int16, + ClientEventTime DateTime, + SilverlightVersion1 UInt8, + SilverlightVersion2 UInt8, + SilverlightVersion3 UInt32, + SilverlightVersion4 UInt16, + PageCharset String, + CodeVersion UInt32, + IsLink UInt8, + IsDownload UInt8, + IsNotBounce UInt8, + FUniqID UInt64, + HID UInt32, + IsOldCounter UInt8, + IsEvent UInt8, + IsParameter UInt8, + DontCountHits UInt8, + WithHash UInt8, + HitColor FixedString(1), + UTCEventTime DateTime, + Age UInt8, + Sex UInt8, + Income UInt8, + Interests UInt16, + Robotness UInt8, + GeneralInterests Array(UInt16), + RemoteIP UInt32, + RemoteIP6 FixedString(16), + WindowName Int32, + OpenerName Int32, + HistoryLength Int16, + BrowserLanguage FixedString(2), + BrowserCountry FixedString(2), + SocialNetwork String, + SocialAction String, + HTTPError UInt16, + SendTiming Int32, + DNSTiming Int32, + ConnectTiming Int32, + ResponseStartTiming Int32, + ResponseEndTiming Int32, + FetchTiming Int32, + RedirectTiming Int32, + DOMInteractiveTiming Int32, + DOMContentLoadedTiming Int32, + DOMCompleteTiming Int32, + LoadEventStartTiming Int32, + LoadEventEndTiming Int32, + NSToDOMContentLoadedTiming Int32, + FirstPaintTiming Int32, + RedirectCount Int8, + SocialSourceNetworkID UInt8, + SocialSourcePage String, + ParamPrice Int64, + ParamOrderID String, + ParamCurrency FixedString(3), + ParamCurrencyID UInt16, + GoalsReached Array(UInt32), + OpenstatServiceName String, + OpenstatCampaignID String, + OpenstatAdID String, + OpenstatSourceID String, + UTMSource String, + UTMMedium String, + UTMCampaign String, + UTMContent String, + UTMTerm String, + FromTag String, + HasGCLID UInt8, + RefererHash UInt64, + URLHash UInt64, + CLID UInt32, + YCLID UInt64, + ShareService String, + ShareURL String, + ShareTitle String, + ParsedParams Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + IslandID FixedString(16), + RequestNum UInt32, + RequestTry UInt8 +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID)) +SAMPLE BY intHash32(UserID) +SETTINGS storage_policy='web'; +``` + +Required parameters: + +- `type` — `web`. Otherwise the disk is not created. +- `endpoint` — The endpoint URL in `path` format. Endpoint URL must contain a root path to store data, where they were uploaded. 
+ +Optional parameters: + +- `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1` Mb. +- `remote_fs_read_backoff_threashold` — The maximum wait time when trying to read data for remote disk. Default value: `10000` seconds. +- `remote_fs_read_backoff_max_tries` — The maximum number of attempts to read with backoff. Default value: `5`. + +If a query fails with an exception `DB:Exception Unreachable URL`, then you can try to adjust the settings: [http_connection_timeout](../operations/settings/settings.md#http_connection_timeout), [http_receive_timeout](../operations/settings/settings.md#http_receive_timeout), [keep_alive_timeout](../operations/server-configuration-parameters/settings.md#keep-alive-timeout). + +To get files for upload run: +`clickhouse static-files-disk-uploader --metadata-path --output-dir ` (`--metadata-path` can be found in query `SELECT data_paths FROM system.tables WHERE name = 'table_name'`). + +When loading files by `endpoint`, they must be loaded into `/store/` path, but config must contain only `endpoint`. + +If URL is not reachable on disk load when the server is starting up tables, then all errors are caught. If in this case there were errors, tables can be reloaded (become visible) via `DETACH TABLE table_name` -> `ATTACH TABLE table_name`. If metadata was successfully loaded at server startup, then tables are available straight away. + +Use [http_max_single_read_retries](../operations/settings/settings.md#http-max-single-read-retries) setting to limit the maximum number of retries during a single HTTP read. diff --git a/docs/en/reference/operations/system-tables/asynchronous_metric_log.md b/docs/en/reference/operations/system-tables/asynchronous_metric_log.md new file mode 100644 index 00000000000..2233406162b --- /dev/null +++ b/docs/en/reference/operations/system-tables/asynchronous_metric_log.md @@ -0,0 +1,39 @@ +# asynchronous_metric_log {#system-tables-async-log} + +Contains the historical values for `system.asynchronous_metrics`, which are saved once per minute. Enabled by default. + +Columns: + +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds resolution. +- `name` ([String](../../sql-reference/data-types/string.md)) — Metric name. +- `value` ([Float64](../../sql-reference/data-types/float.md)) — Metric value. 
+ +**Example** + +``` sql +SELECT * FROM system.asynchronous_metric_log LIMIT 10 +``` + +``` text +┌─event_date─┬──────────event_time─┬────event_time_microseconds─┬─name─────────────────────────────────────┬─────value─┐ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ CPUFrequencyMHz_0 │ 2120.9 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.arenas.all.pmuzzy │ 743 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.arenas.all.pdirty │ 26288 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.background_thread.run_intervals │ 0 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.background_thread.num_runs │ 0 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.retained │ 60694528 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.mapped │ 303161344 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.resident │ 260931584 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.metadata │ 12079488 │ +│ 2020-09-05 │ 2020-09-05 15:56:30 │ 2020-09-05 15:56:30.025227 │ jemalloc.allocated │ 133756128 │ +└────────────┴─────────────────────┴────────────────────────────┴──────────────────────────────────────────┴───────────┘ +``` + +**See Also** + +- [system.asynchronous_metrics](../system-tables/asynchronous_metrics.md) — Contains metrics, calculated periodically in the background. +- [system.metric_log](../system-tables/metric_log.md) — Contains history of metrics values from tables `system.metrics` and `system.events`, periodically flushed to disk. + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/asynchronous_metric_log) diff --git a/docs/en/reference/operations/system-tables/asynchronous_metrics.md b/docs/en/reference/operations/system-tables/asynchronous_metrics.md new file mode 100644 index 00000000000..162048b06ee --- /dev/null +++ b/docs/en/reference/operations/system-tables/asynchronous_metrics.md @@ -0,0 +1,38 @@ +# asynchronous_metrics {#system_tables-asynchronous_metrics} + +Contains metrics that are calculated periodically in the background. For example, the amount of RAM in use. + +Columns: + +- `metric` ([String](../../sql-reference/data-types/string.md)) — Metric name. +- `value` ([Float64](../../sql-reference/data-types/float.md)) — Metric value. + +**Example** + +``` sql +SELECT * FROM system.asynchronous_metrics LIMIT 10 +``` + +``` text +┌─metric──────────────────────────────────┬──────value─┐ +│ jemalloc.background_thread.run_interval │ 0 │ +│ jemalloc.background_thread.num_runs │ 0 │ +│ jemalloc.background_thread.num_threads │ 0 │ +│ jemalloc.retained │ 422551552 │ +│ jemalloc.mapped │ 1682989056 │ +│ jemalloc.resident │ 1656446976 │ +│ jemalloc.metadata_thp │ 0 │ +│ jemalloc.metadata │ 10226856 │ +│ UncompressedCacheCells │ 0 │ +│ MarkCacheFiles │ 0 │ +└─────────────────────────────────────────┴────────────┘ +``` + +**See Also** + +- [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.metrics](../../operations/system-tables/metrics.md#system_tables-metrics) — Contains instantly calculated metrics. +- [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that have occurred. 
+- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` and `system.events`. + + [Original article](https://clickhouse.com/docs/en/operations/system-tables/asynchronous_metrics) diff --git a/docs/en/reference/operations/system-tables/clusters.md b/docs/en/reference/operations/system-tables/clusters.md new file mode 100644 index 00000000000..776c90b9936 --- /dev/null +++ b/docs/en/reference/operations/system-tables/clusters.md @@ -0,0 +1,71 @@ +# clusters {#system-clusters} + +Contains information about clusters available in the config file and the servers in them. + +Columns: + +- `cluster` ([String](../../sql-reference/data-types/string.md)) — The cluster name. +- `shard_num` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The shard number in the cluster, starting from 1. +- `shard_weight` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The relative weight of the shard when writing data. +- `replica_num` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The replica number in the shard, starting from 1. +- `host_name` ([String](../../sql-reference/data-types/string.md)) — The host name, as specified in the config. +- `host_address` ([String](../../sql-reference/data-types/string.md)) — The host IP address obtained from DNS. +- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The port to use for connecting to the server. +- `is_local` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the host is local. +- `user` ([String](../../sql-reference/data-types/string.md)) — The name of the user for connecting to the server. +- `default_database` ([String](../../sql-reference/data-types/string.md)) — The default database name. +- `errors_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of times this host failed to reach replica. +- `slowdowns_count` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of slowdowns that led to changing replica when establishing a connection with hedged requests. +- `estimated_recovery_time` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Seconds remaining until the replica error count is zeroed and it is considered to be back to normal. 
+ +**Example** + +Query: + +```sql +SELECT * FROM system.clusters LIMIT 2 FORMAT Vertical; +``` + +Result: + +```text +Row 1: +────── +cluster: test_cluster_two_shards +shard_num: 1 +shard_weight: 1 +replica_num: 1 +host_name: 127.0.0.1 +host_address: 127.0.0.1 +port: 9000 +is_local: 1 +user: default +default_database: +errors_count: 0 +slowdowns_count: 0 +estimated_recovery_time: 0 + +Row 2: +────── +cluster: test_cluster_two_shards +shard_num: 2 +shard_weight: 1 +replica_num: 1 +host_name: 127.0.0.2 +host_address: 127.0.0.2 +port: 9000 +is_local: 0 +user: default +default_database: +errors_count: 0 +slowdowns_count: 0 +estimated_recovery_time: 0 +``` + +**See Also** + +- [Table engine Distributed](../../engines/table-engines/special/distributed.md) +- [distributed_replica_error_cap setting](../../operations/settings/settings.md#settings-distributed_replica_error_cap) +- [distributed_replica_error_half_life setting](../../operations/settings/settings.md#settings-distributed_replica_error_half_life) + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/clusters) diff --git a/docs/en/reference/operations/system-tables/columns.md b/docs/en/reference/operations/system-tables/columns.md new file mode 100644 index 00000000000..dd5674fe5b1 --- /dev/null +++ b/docs/en/reference/operations/system-tables/columns.md @@ -0,0 +1,89 @@ +# columns {#system-columns} + +Contains information about columns in all the tables. + +You can use this table to get information similar to the [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) query, but for multiple tables at once. + +Columns from [temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.columns` only in those session where they have been created. They are shown with the empty `database` field. + +The `system.columns` table contains the following columns (the column type is shown in brackets): + +- `database` ([String](../../sql-reference/data-types/string.md)) — Database name. +- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `name` ([String](../../sql-reference/data-types/string.md)) — Column name. +- `type` ([String](../../sql-reference/data-types/string.md)) — Column type. +- `position` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Ordinal position of a column in a table starting with 1. +- `default_kind` ([String](../../sql-reference/data-types/string.md)) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) for the default value, or an empty string if it is not defined. +- `default_expression` ([String](../../sql-reference/data-types/string.md)) — Expression for the default value, or an empty string if it is not defined. +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of compressed data, in bytes. +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of decompressed data, in bytes. +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of marks, in bytes. +- `comment` ([String](../../sql-reference/data-types/string.md)) — Comment on the column, or an empty string if it is not defined. +- `is_in_partition_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column is in the partition expression. +- `is_in_sorting_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column is in the sorting key expression. 
+- `is_in_primary_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column is in the primary key expression. +- `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column is in the sampling key expression. +- `compression_codec` ([String](../../sql-reference/data-types/string.md)) — Compression codec name. +- `character_octet_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum length in bytes for binary data, character data, or text data and images. In ClickHouse makes sense only for `FixedString` data type. Otherwise, the `NULL` value is returned. +- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Accuracy of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse it is bitness for integer types and decimal precision for `Decimal` types. Otherwise, the `NULL` value is returned. +- `numeric_precision_radix` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The base of the number system is the accuracy of approximate numeric data, exact numeric data, integer data or monetary data. In ClickHouse it's 2 for integer types and 10 for `Decimal` types. Otherwise, the `NULL` value is returned. +- `numeric_scale` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The scale of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse makes sense only for `Decimal` types. Otherwise, the `NULL` value is returned. +- `datetime_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Decimal precision of `DateTime64` data type. For other data types, the `NULL` value is returned. + +**Example** + +```sql +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: INFORMATION_SCHEMA +table: COLUMNS +name: table_catalog +type: String +position: 1 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: ᴺᵁᴸᴸ +numeric_precision_radix: ᴺᵁᴸᴸ +numeric_scale: ᴺᵁᴸᴸ +datetime_precision: ᴺᵁᴸᴸ + +Row 2: +────── +database: INFORMATION_SCHEMA +table: COLUMNS +name: table_schema +type: String +position: 2 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: ᴺᵁᴸᴸ +numeric_precision_radix: ᴺᵁᴸᴸ +numeric_scale: ᴺᵁᴸᴸ +datetime_precision: ᴺᵁᴸᴸ +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/columns) diff --git a/docs/en/reference/operations/system-tables/contributors.md b/docs/en/reference/operations/system-tables/contributors.md new file mode 100644 index 00000000000..3b76684b44b --- /dev/null +++ b/docs/en/reference/operations/system-tables/contributors.md @@ -0,0 +1,41 @@ +# contributors {#system-contributors} + +Contains information about contributors. The order is random at query execution time. 
+ +Columns: + +- `name` (String) — Contributor (author) name from git log. + +**Example** + +``` sql +SELECT * FROM system.contributors LIMIT 10 +``` + +``` text +┌─name─────────────┐ +│ Olga Khvostikova │ +│ Max Vetrov │ +│ LiuYangkuan │ +│ svladykin │ +│ zamulla │ +│ Šimon Podlipský │ +│ BayoNet │ +│ Ilya Khomutov │ +│ Amy Krishnevsky │ +│ Loud_Scream │ +└──────────────────┘ +``` + +To find out yourself in the table, use a query: + +``` sql +SELECT * FROM system.contributors WHERE name = 'Olga Khvostikova' +``` + +``` text +┌─name─────────────┐ +│ Olga Khvostikova │ +└──────────────────┘ +``` +[Original article](https://clickhouse.com/docs/en/operations/system-tables/contributors) diff --git a/docs/en/reference/operations/system-tables/crash-log.md b/docs/en/reference/operations/system-tables/crash-log.md new file mode 100644 index 00000000000..be85cb78c9f --- /dev/null +++ b/docs/en/reference/operations/system-tables/crash-log.md @@ -0,0 +1,48 @@ +# crash_log {#system-tables_crash_log} + +Contains information about stack traces for fatal errors. The table does not exist in the database by default, it is created only when fatal errors occur. + +Columns: + +- `event_date` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date of the event. +- `event_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Time of the event. +- `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Timestamp of the event with nanoseconds. +- `signal` ([Int32](../../sql-reference/data-types/int-uint.md)) — Signal number. +- `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Thread ID. +- `query_id` ([String](../../sql-reference/data-types/string.md)) — Query ID. +- `trace` ([Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Stack trace at the moment of crash. Each element is a virtual memory address inside ClickHouse server process. +- `trace_full` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Stack trace at the moment of crash. Each element contains a called method inside ClickHouse server process. +- `version` ([String](../../sql-reference/data-types/string.md)) — ClickHouse server version. +- `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse server revision. +- `build_id` ([String](../../sql-reference/data-types/string.md)) — BuildID that is generated by compiler. + +**Example** + +Query: + +``` sql +SELECT * FROM system.crash_log ORDER BY event_time DESC LIMIT 1; +``` + +Result (not full): + +``` text +Row 1: +────── +event_date: 2020-10-14 +event_time: 2020-10-14 15:47:40 +timestamp_ns: 1602679660271312710 +signal: 11 +thread_id: 23624 +query_id: 428aab7c-8f5c-44e9-9607-d16b44467e69 +trace: [188531193,...] +trace_full: ['3. DB::(anonymous namespace)::FunctionFormatReadableTimeDelta::executeImpl(std::__1::vector >&, std::__1::vector > const&, unsigned long, unsigned long) const @ 0xb3cc1f9 in /home/username/work/ClickHouse/build/programs/clickhouse',...] 
+version: ClickHouse 20.11.1.1 +revision: 54442 +build_id: +``` + +**See also** +- [trace_log](../../operations/system-tables/trace_log.md) system table + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/crash-log) diff --git a/docs/en/reference/operations/system-tables/current-roles.md b/docs/en/reference/operations/system-tables/current-roles.md new file mode 100644 index 00000000000..81d4fad24a8 --- /dev/null +++ b/docs/en/reference/operations/system-tables/current-roles.md @@ -0,0 +1,11 @@ +# current_roles {#system_tables-current_roles} + +Contains active roles of a current user. `SET ROLE` changes the contents of this table. + +Columns: + + - `role_name` ([String](../../sql-reference/data-types/string.md))) — Role name. + - `with_admin_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `current_role` is a role with `ADMIN OPTION` privilege. + - `is_default` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `current_role` is a default role. + + [Original article](https://clickhouse.com/docs/en/operations/system-tables/current-roles) diff --git a/docs/en/reference/operations/system-tables/data_skipping_indices.md b/docs/en/reference/operations/system-tables/data_skipping_indices.md new file mode 100644 index 00000000000..71dfb046dbb --- /dev/null +++ b/docs/en/reference/operations/system-tables/data_skipping_indices.md @@ -0,0 +1,47 @@ +# data_skipping_indices {#system-data-skipping-indices} + +Contains information about existing data skipping indices in all the tables. + +Columns: + +- `database` ([String](../../sql-reference/data-types/string.md)) — Database name. +- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `name` ([String](../../sql-reference/data-types/string.md)) — Index name. +- `type` ([String](../../sql-reference/data-types/string.md)) — Index type. +- `expr` ([String](../../sql-reference/data-types/string.md)) — Expression for the index calculation. +- `granularity` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of granules in the block. +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of compressed data, in bytes. +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of decompressed data, in bytes. +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of marks, in bytes. + +**Example** + +```sql +SELECT * FROM system.data_skipping_indices LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: default +table: user_actions +name: clicks_idx +type: minmax +expr: clicks +granularity: 1 +data_compressed_bytes: 58 +data_uncompressed_bytes: 6 +marks: 48 + +Row 2: +────── +database: default +table: users +name: contacts_null_idx +type: minmax +expr: assumeNotNull(contacts_null) +granularity: 1 +data_compressed_bytes: 58 +data_uncompressed_bytes: 6 +marks: 48 +``` diff --git a/docs/en/reference/operations/system-tables/data_type_families.md b/docs/en/reference/operations/system-tables/data_type_families.md new file mode 100644 index 00000000000..2e5e7b74c66 --- /dev/null +++ b/docs/en/reference/operations/system-tables/data_type_families.md @@ -0,0 +1,36 @@ +# data_type_families {#system_tables-data_type_families} + +Contains information about supported [data types](../../sql-reference/data-types/index.md). 
+ +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — Data type name. +- `case_insensitive` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Property that shows whether you can use a data type name in a query in case insensitive manner or not. For example, `Date` and `date` are both valid. +- `alias_to` ([String](../../sql-reference/data-types/string.md)) — Data type name for which `name` is an alias. + +**Example** + +``` sql +SELECT * FROM system.data_type_families WHERE alias_to = 'String' +``` + +``` text +┌─name───────┬─case_insensitive─┬─alias_to─┐ +│ LONGBLOB │ 1 │ String │ +│ LONGTEXT │ 1 │ String │ +│ TINYTEXT │ 1 │ String │ +│ TEXT │ 1 │ String │ +│ VARCHAR │ 1 │ String │ +│ MEDIUMBLOB │ 1 │ String │ +│ BLOB │ 1 │ String │ +│ TINYBLOB │ 1 │ String │ +│ CHAR │ 1 │ String │ +│ MEDIUMTEXT │ 1 │ String │ +└────────────┴──────────────────┴──────────┘ +``` + +**See Also** + +- [Syntax](../../sql-reference/syntax.md) — Information about supported syntax. + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/data_type_families) diff --git a/docs/en/reference/operations/system-tables/databases.md b/docs/en/reference/operations/system-tables/databases.md new file mode 100644 index 00000000000..7245ecdcdec --- /dev/null +++ b/docs/en/reference/operations/system-tables/databases.md @@ -0,0 +1,37 @@ +# databases {#system-databases} + +Contains information about the databases that are available to the current user. + +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — Database name. +- `engine` ([String](../../sql-reference/data-types/string.md)) — [Database engine](../../engines/database-engines/index.md). +- `data_path` ([String](../../sql-reference/data-types/string.md)) — Data path. +- `metadata_path` ([String](../../sql-reference/data-types/enum.md)) — Metadata path. +- `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — Database UUID. +- `comment` ([String](../../sql-reference/data-types/enum.md)) — Database comment. + +The `name` column from this system table is used for implementing the `SHOW DATABASES` query. + +**Example** + +Create a database. + +``` sql +CREATE DATABASE test; +``` + +Check all of the available databases to the user. 
+ +``` sql +SELECT * FROM system.databases; +``` + +``` text +┌─name───────────────┬─engine─┬─data_path──────────────────┬─metadata_path───────────────────────────────────────────────────────┬─uuid─────────────────────────────────┬─comment─┐ +│ INFORMATION_SCHEMA │ Memory │ /var/lib/clickhouse/ │ │ 00000000-0000-0000-0000-000000000000 │ │ +│ default │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/d31/d317b4bd-3595-4386-81ee-c2334694128a/ │ 24363899-31d7-42a0-a436-389931d752a0 │ │ +│ information_schema │ Memory │ /var/lib/clickhouse/ │ │ 00000000-0000-0000-0000-000000000000 │ │ +│ system │ Atomic │ /var/lib/clickhouse/store/ │ /var/lib/clickhouse/store/1d1/1d1c869d-e465-4b1b-a51f-be033436ebf9/ │ 03e9f3d1-cc88-4a49-83e9-f3d1cc881a49 │ │ +└────────────────────┴────────┴────────────────────────────┴─────────────────────────────────────────────────────────────────────┴──────────────────────────────────────┴─────────┘ +``` diff --git a/docs/en/reference/operations/system-tables/detached_parts.md b/docs/en/reference/operations/system-tables/detached_parts.md new file mode 100644 index 00000000000..2fe354a4471 --- /dev/null +++ b/docs/en/reference/operations/system-tables/detached_parts.md @@ -0,0 +1,11 @@ +# detached_parts {#system_tables-detached_parts} + +Contains information about detached parts of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. The `reason` column specifies why the part was detached. + +For user-detached parts, the reason is empty. Such parts can be attached with [ALTER TABLE ATTACH PARTITION\|PART](../../sql-reference/statements/alter/partition.md#alter_attach-partition) command. + +For the description of other columns, see [system.parts](../../operations/system-tables/parts.md#system_tables-parts). + +If part name is invalid, values of some columns may be `NULL`. Such parts can be deleted with [ALTER TABLE DROP DETACHED PART](../../sql-reference/statements/alter/partition.md#alter_drop-detached). + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/detached_parts) diff --git a/docs/en/reference/operations/system-tables/dictionaries.md b/docs/en/reference/operations/system-tables/dictionaries.md new file mode 100644 index 00000000000..c41d506ff0a --- /dev/null +++ b/docs/en/reference/operations/system-tables/dictionaries.md @@ -0,0 +1,88 @@ +# dictionaries {#system_tables-dictionaries} + +Contains information about [external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). + +Columns: + +- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database containing the dictionary created by DDL query. Empty string for other dictionaries. +- `name` ([String](../../sql-reference/data-types/string.md)) — [Dictionary name](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md). +- `uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — Dictionary UUID. +- `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Dictionary status. Possible values: + - `NOT_LOADED` — Dictionary was not loaded because it was not used. + - `LOADED` — Dictionary loaded successfully. + - `FAILED` — Unable to load the dictionary as a result of an error. + - `LOADING` — Dictionary is loading now. 
+ - `LOADED_AND_RELOADING` — Dictionary is loaded successfully, and is being reloaded right now (frequent reasons: [SYSTEM RELOAD DICTIONARY](../../sql-reference/statements/system.md#query_language-system-reload-dictionary) query, timeout, dictionary config has changed). + - `FAILED_AND_RELOADING` — Could not load the dictionary as a result of an error and is loading now. +- `origin` ([String](../../sql-reference/data-types/string.md)) — Path to the configuration file that describes the dictionary. +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of a dictionary allocation. [Storing Dictionaries in Memory](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). +- `key.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [key names](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key) provided by the dictionary. +- `key.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [key types](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-key) provided by the dictionary. +- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of [attribute names](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) provided by the dictionary. +- `attribute.types` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Corresponding array of [attribute types](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict_structure-attributes) provided by the dictionary. +- `bytes_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Amount of RAM allocated for the dictionary. +- `query_count` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of queries since the dictionary was loaded or since the last successful reboot. +- `hit_rate` ([Float64](../../sql-reference/data-types/float.md)) — For cache dictionaries, the percentage of uses for which the value was in the cache. +- `found_rate` ([Float64](../../sql-reference/data-types/float.md)) — The percentage of uses for which the value was found. +- `element_count` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of items stored in the dictionary. +- `load_factor` ([Float64](../../sql-reference/data-types/float.md)) — Percentage filled in the dictionary (for a hashed dictionary, the percentage filled in the hash table). +- `source` ([String](../../sql-reference/data-types/string.md)) — Text describing the [data source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) for the dictionary. +- `lifetime_min` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Minimum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. 
+- `lifetime_max` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Maximum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. +- `loading_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time for loading the dictionary. +- `last_successful_update_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with external sources and investigate causes. +- `loading_duration` ([Float32](../../sql-reference/data-types/float.md)) — Duration of a dictionary loading. +- `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text of the error that occurs when creating or reloading the dictionary if the dictionary couldn’t be created. +- `comment` ([String](../../sql-reference/data-types/string.md)) — Text of the comment to dictionary. + +**Example** + +Configure the dictionary: + +``` sql +CREATE DICTIONARY dictionary_with_comment +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'source_table')) +LAYOUT(FLAT()) +LIFETIME(MIN 0 MAX 1000) +COMMENT 'The temporary dictionary'; +``` + +Make sure that the dictionary is loaded. + +``` sql +SELECT * FROM system.dictionaries LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +database: default +name: dictionary_with_comment +uuid: 4654d460-0d03-433a-8654-d4600d03d33a +status: NOT_LOADED +origin: 4654d460-0d03-433a-8654-d4600d03d33a +type: +key.names: ['id'] +key.types: ['UInt64'] +attribute.names: ['value'] +attribute.types: ['String'] +bytes_allocated: 0 +query_count: 0 +hit_rate: 0 +found_rate: 0 +element_count: 0 +load_factor: 0 +source: +lifetime_min: 0 +lifetime_max: 0 +loading_start_time: 1970-01-01 00:00:00 +last_successful_update_time: 1970-01-01 00:00:00 +loading_duration: 0 +last_exception: +comment: The temporary dictionary +``` diff --git a/docs/en/reference/operations/system-tables/disks.md b/docs/en/reference/operations/system-tables/disks.md new file mode 100644 index 00000000000..869c0f3cee5 --- /dev/null +++ b/docs/en/reference/operations/system-tables/disks.md @@ -0,0 +1,27 @@ +# disks {#system_tables-disks} + +Contains information about disks defined in the [server configuration](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). + +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — Name of a disk in the server configuration. +- `path` ([String](../../sql-reference/data-types/string.md)) — Path to the mount point in the file system. +- `free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Free space on disk in bytes. +- `total_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Disk volume in bytes. +- `keep_free_space` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` parameter of disk configuration. 
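+
+The free and total space figures above are raw byte counts. For a quick, human-readable overview they can be formatted on the fly; a possible sketch (relying on the built-in `formatReadableSize` and `round` functions):
+
+```sql
+SELECT
+    name,
+    path,
+    formatReadableSize(free_space) AS free,
+    formatReadableSize(total_space) AS total,
+    round(100 * free_space / total_space, 2) AS free_percent
+FROM system.disks;
+```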
+ +**Example** + +```sql +:) SELECT * FROM system.disks; +``` + +```text +┌─name────┬─path─────────────────┬───free_space─┬──total_space─┬─keep_free_space─┐ +│ default │ /var/lib/clickhouse/ │ 276392587264 │ 490652508160 │ 0 │ +└─────────┴──────────────────────┴──────────────┴──────────────┴─────────────────┘ + +1 rows in set. Elapsed: 0.001 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/disks) diff --git a/docs/en/reference/operations/system-tables/distributed_ddl_queue.md b/docs/en/reference/operations/system-tables/distributed_ddl_queue.md new file mode 100644 index 00000000000..0597972197d --- /dev/null +++ b/docs/en/reference/operations/system-tables/distributed_ddl_queue.md @@ -0,0 +1,64 @@ +# distributed_ddl_queue {#system_tables-distributed_ddl_queue} + +Contains information about [distributed ddl queries (ON CLUSTER clause)](../../sql-reference/distributed-ddl.md) that were executed on a cluster. + +Columns: + +- `entry` ([String](../../sql-reference/data-types/string.md)) — Query id. +- `host_name` ([String](../../sql-reference/data-types/string.md)) — Hostname. +- `host_address` ([String](../../sql-reference/data-types/string.md)) — IP address that the Hostname resolves to. +- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — Host Port. +- `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Status of the query. +- `cluster` ([String](../../sql-reference/data-types/string.md)) — Cluster name. +- `query` ([String](../../sql-reference/data-types/string.md)) — Query executed. +- `initiator` ([String](../../sql-reference/data-types/string.md)) — Node that executed the query. +- `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query start time. +- `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query finish time. +- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration of query execution (in milliseconds). +- `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code from [ZooKeeper](../../operations/tips.md#zookeeper). + +**Example** + +``` sql +SELECT * +FROM system.distributed_ddl_queue +WHERE cluster = 'test_cluster' +LIMIT 2 +FORMAT Vertical + +Query id: f544e72a-6641-43f1-836b-24baa1c9632a + +Row 1: +────── +entry: query-0000000000 +host_name: clickhouse01 +host_address: 172.23.0.11 +port: 9000 +status: Finished +cluster: test_cluster +query: CREATE DATABASE test_db UUID '4a82697e-c85e-4e5b-a01e-a36f2a758456' ON CLUSTER test_cluster +initiator: clickhouse01:9000 +query_start_time: 2020-12-30 13:07:51 +query_finish_time: 2020-12-30 13:07:51 +query_duration_ms: 6 +exception_code: ZOK + +Row 2: +────── +entry: query-0000000000 +host_name: clickhouse02 +host_address: 172.23.0.12 +port: 9000 +status: Finished +cluster: test_cluster +query: CREATE DATABASE test_db UUID '4a82697e-c85e-4e5b-a01e-a36f2a758456' ON CLUSTER test_cluster +initiator: clickhouse01:9000 +query_start_time: 2020-12-30 13:07:51 +query_finish_time: 2020-12-30 13:07:51 +query_duration_ms: 6 +exception_code: ZOK + +2 rows in set. Elapsed: 0.025 sec. 
+``` + +[Original article](https://clickhouse.com/docs/en/operations/system_tables/distributed_ddl_queuedistributed_ddl_queue.md) diff --git a/docs/en/reference/operations/system-tables/distribution_queue.md b/docs/en/reference/operations/system-tables/distribution_queue.md new file mode 100644 index 00000000000..231a06458c8 --- /dev/null +++ b/docs/en/reference/operations/system-tables/distribution_queue.md @@ -0,0 +1,50 @@ +# distribution_queue {#system_tables-distribution_queue} + +Contains information about local files that are in the queue to be sent to the shards. These local files contain new parts that are created by inserting new data into the Distributed table in asynchronous mode. + +Columns: + +- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database. + +- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table. + +- `data_path` ([String](../../sql-reference/data-types/string.md)) — Path to the folder with local files. + +- `is_blocked` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag indicates whether sending local files to the server is blocked. + +- `error_count` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of errors. + +- `data_files` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of local files in a folder. + +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Size of compressed data in local files, in bytes. + +- `broken_data_files` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of files that has been marked as broken (due to an error). + +- `broken_data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Size of compressed data in broken files, in bytes. + +- `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text message about the last error that occurred (if any). + +**Example** + +``` sql +SELECT * FROM system.distribution_queue LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +database: default +table: dist +data_path: ./store/268/268bc070-3aad-4b1a-9cf2-4987580161af/default@127%2E0%2E0%2E2:9000/ +is_blocked: 1 +error_count: 0 +data_files: 1 +data_compressed_bytes: 499 +last_exception: +``` + +**See Also** + +- [Distributed table engine](../../engines/table-engines/special/distributed.md) + +[Original article](https://clickhouse.com/docs/en/operations/system_tables/distribution_queue) diff --git a/docs/en/reference/operations/system-tables/enabled-roles.md b/docs/en/reference/operations/system-tables/enabled-roles.md new file mode 100644 index 00000000000..832fc6aba42 --- /dev/null +++ b/docs/en/reference/operations/system-tables/enabled-roles.md @@ -0,0 +1,12 @@ +# enabled_roles {#system_tables-enabled_roles} + +Contains all active roles at the moment, including current role of the current user and granted roles for current role. + +Columns: + +- `role_name` ([String](../../sql-reference/data-types/string.md))) — Role name. +- `with_admin_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `enabled_role` is a role with `ADMIN OPTION` privilege. +- `is_current` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `enabled_role` is a current role of a current user. +- `is_default` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `enabled_role` is a default role. 
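+
+**Example**
+
+A minimal query sketch for inspecting the roles active in the current session (the exact output depends on the roles granted in your deployment):
+
+```sql
+SELECT * FROM system.enabled_roles FORMAT Vertical;
+```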
+ +[Original article](https://clickhouse.com/docs/en/operations/system-tables/enabled-roles) diff --git a/docs/en/reference/operations/system-tables/errors.md b/docs/en/reference/operations/system-tables/errors.md new file mode 100644 index 00000000000..8e60cf93bfa --- /dev/null +++ b/docs/en/reference/operations/system-tables/errors.md @@ -0,0 +1,36 @@ +# errors {#system_tables-errors} + +Contains error codes with the number of times they have been triggered. + +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — name of the error (`errorCodeToName`). +- `code` ([Int32](../../sql-reference/data-types/int-uint.md)) — code number of the error. +- `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened. +- `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened. +- `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error. +- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored. +- `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query). + +**Example** + +``` sql +SELECT name, code, value +FROM system.errors +WHERE value > 0 +ORDER BY code ASC +LIMIT 1 + +┌─name─────────────┬─code─┬─value─┐ +│ CANNOT_OPEN_FILE │ 76 │ 1 │ +└──────────────────┴──────┴───────┘ +``` + +``` sql +WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all +SELECT name, arrayStringConcat(all, '\n') AS res +FROM system.errors +LIMIT 1 +SETTINGS allow_introspection_functions=1\G +``` + diff --git a/docs/en/reference/operations/system-tables/events.md b/docs/en/reference/operations/system-tables/events.md new file mode 100644 index 00000000000..445573ec978 --- /dev/null +++ b/docs/en/reference/operations/system-tables/events.md @@ -0,0 +1,34 @@ +# events {#system_tables-events} + +Contains information about the number of events that have occurred in the system. For example, in the table, you can find how many `SELECT` queries were processed since the ClickHouse server started. + +Columns: + +- `event` ([String](../../sql-reference/data-types/string.md)) — Event name. +- `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of events occurred. +- `description` ([String](../../sql-reference/data-types/string.md)) — Event description. + +**Example** + +``` sql +SELECT * FROM system.events LIMIT 5 +``` + +``` text +┌─event─────────────────────────────────┬─value─┬─description────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Query │ 12 │ Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries. │ +│ SelectQuery │ 8 │ Same as Query, but only for SELECT queries. │ +│ FileOpen │ 73 │ Number of files opened. │ +│ ReadBufferFromFileDescriptorRead │ 155 │ Number of reads (read/pread) from a file descriptor. Does not include sockets. 
│ +│ ReadBufferFromFileDescriptorReadBytes │ 9931 │ Number of bytes read from file descriptors. If the file is compressed, this will show the compressed data size. │ +└───────────────────────────────────────┴───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. +- [system.metrics](../../operations/system-tables/metrics.md#system_tables-metrics) — Contains instantly calculated metrics. +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. +- [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/events) diff --git a/docs/en/reference/operations/system-tables/functions.md b/docs/en/reference/operations/system-tables/functions.md new file mode 100644 index 00000000000..097b6ccd22a --- /dev/null +++ b/docs/en/reference/operations/system-tables/functions.md @@ -0,0 +1,33 @@ +# functions {#system-functions} + +Contains information about normal and aggregate functions. + +Columns: + +- `name`(`String`) – The name of the function. +- `is_aggregate`(`UInt8`) — Whether the function is aggregate. + +**Example** + +```sql + SELECT * FROM system.functions LIMIT 10; +``` + +```text +┌─name──────────────────┬─is_aggregate─┬─case_insensitive─┬─alias_to─┬─create_query─┬─origin─┐ +│ logTrace │ 0 │ 0 │ │ │ System │ +│ aes_decrypt_mysql │ 0 │ 0 │ │ │ System │ +│ aes_encrypt_mysql │ 0 │ 0 │ │ │ System │ +│ decrypt │ 0 │ 0 │ │ │ System │ +│ encrypt │ 0 │ 0 │ │ │ System │ +│ toBool │ 0 │ 0 │ │ │ System │ +│ windowID │ 0 │ 0 │ │ │ System │ +│ hopStart │ 0 │ 0 │ │ │ System │ +│ hop │ 0 │ 0 │ │ │ System │ +│ snowflakeToDateTime64 │ 0 │ 0 │ │ │ System │ +└───────────────────────┴──────────────┴──────────────────┴──────────┴──────────────┴────────┘ + +10 rows in set. Elapsed: 0.002 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/functions) diff --git a/docs/en/reference/operations/system-tables/grants.md b/docs/en/reference/operations/system-tables/grants.md new file mode 100644 index 00000000000..c848972c2d8 --- /dev/null +++ b/docs/en/reference/operations/system-tables/grants.md @@ -0,0 +1,24 @@ +# grants {#system_tables-grants} + +Privileges granted to ClickHouse user accounts. + +Columns: +- `user_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — User name. + +- `role_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Role assigned to user account. + +- `access_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Access parameters for ClickHouse user account. + +- `database` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Name of a database. + +- `table` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Name of a table. 
+
+- `column` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Name of a column to which access is granted.
+
+- `is_partial_revoke` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Logical value. It shows whether some privileges have been revoked. Possible values:
+  - `0` — The row describes a grant.
+  - `1` — The row describes a partial revoke.
+
+- `grant_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Permission is granted `WITH GRANT OPTION`, see [GRANT](../../sql-reference/statements/grant.md#grant-privigele-syntax).
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/grants)
diff --git a/docs/en/reference/operations/system-tables/graphite_retentions.md b/docs/en/reference/operations/system-tables/graphite_retentions.md
new file mode 100644
index 00000000000..10e265815f4
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/graphite_retentions.md
@@ -0,0 +1,17 @@
+# graphite_retentions {#system-graphite-retentions}
+
+Contains information about the [graphite_rollup](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-graphite) parameters that are used in tables with [\*GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md) engines.
+
+Columns:
+
+- `config_name` (String) - `graphite_rollup` parameter name.
+- `regexp` (String) - A pattern for the metric name.
+- `function` (String) - The name of the aggregating function.
+- `age` (UInt64) - The minimum age of the data in seconds.
+- `precision` (UInt64) - How precisely to define the age of the data in seconds.
+- `priority` (UInt16) - Pattern priority.
+- `is_default` (UInt8) - Whether the pattern is the default.
+- `Tables.database` (Array(String)) - Array of names of database tables that use the `config_name` parameter.
+- `Tables.table` (Array(String)) - Array of table names that use the `config_name` parameter.
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/graphite_retentions)
diff --git a/docs/en/reference/operations/system-tables/index.md b/docs/en/reference/operations/system-tables/index.md
new file mode 100644
index 00000000000..7b977ab4d51
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/index.md
@@ -0,0 +1,74 @@
+---
+sidebar_position: 52
+sidebar_label: System Tables
+---
+
+# System Tables {#system-tables}
+
+## Introduction {#system-tables-introduction}
+
+System tables provide information about:
+
+- Server states, processes, and environment.
+- Server’s internal processes.
+
+System tables:
+
+- Located in the `system` database.
+- Available only for reading data.
+- Can’t be dropped or altered, but can be detached.
+
+Most system tables store their data in RAM. The ClickHouse server creates such system tables at startup.
+
+Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) and [text_log](../../operations/system-tables/text_log.md) are served by the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a filesystem by default.
If you remove such a table from the filesystem, the ClickHouse server creates an empty one again the next time data is written. If the schema of a system table changes in a new release, ClickHouse renames the current table and creates a new one.
+
+System log tables can be customized by creating a config file with the same name as the table under `/etc/clickhouse-server/config.d/`, or by setting the corresponding elements in `/etc/clickhouse-server/config.xml`. The elements that can be customized are:
+
+- `database`: database the system log table belongs to. This option is deprecated now. All system log tables are under database `system`.
+- `table`: table to insert data into.
+- `partition_by`: specify the [PARTITION BY](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) expression.
+- `ttl`: specify the table [TTL](../../sql-reference/statements/alter/ttl.md) expression.
+- `flush_interval_milliseconds`: interval of flushing data to disk.
+- `engine`: provide a full engine expression (starting with `ENGINE =`) with parameters. This option conflicts with `partition_by` and `ttl`. If they are set together, the server raises an exception and exits.
+
+An example:
+
+```xml
+<clickhouse>
+    <query_log>
+        <database>system</database>
+        <table>query_log</table>
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </query_log>
+</clickhouse>
+``` + +By default, table growth is unlimited. To control a size of a table, you can use [TTL](../../sql-reference/statements/alter/ttl.md#manipulations-with-table-ttl) settings for removing outdated log records. Also you can use the partitioning feature of `MergeTree`-engine tables. + +## Sources of System Metrics {#system-tables-sources-of-system-metrics} + +For collecting system metrics ClickHouse server uses: + +- `CAP_NET_ADMIN` capability. +- [procfs](https://en.wikipedia.org/wiki/Procfs) (only in Linux). + +**procfs** + +If ClickHouse server does not have `CAP_NET_ADMIN` capability, it tries to fall back to `ProcfsMetricsProvider`. `ProcfsMetricsProvider` allows collecting per-query system metrics (for CPU and I/O). + +If procfs is supported and enabled on the system, ClickHouse server collects these metrics: + +- `OSCPUVirtualTimeMicroseconds` +- `OSCPUWaitMicroseconds` +- `OSIOWaitMicroseconds` +- `OSReadChars` +- `OSWriteChars` +- `OSReadBytes` +- `OSWriteBytes` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/) diff --git a/docs/en/reference/operations/system-tables/information_schema.md b/docs/en/reference/operations/system-tables/information_schema.md new file mode 100644 index 00000000000..df5b012f2b6 --- /dev/null +++ b/docs/en/reference/operations/system-tables/information_schema.md @@ -0,0 +1,210 @@ +# INFORMATION_SCHEMA {#information-schema} + +`INFORMATION_SCHEMA` (`information_schema`) is a system database that contains views. Using these views, you can get information about the metadata of database objects. These views read data from the columns of the [system.columns](../../operations/system-tables/columns.md), [system.databases](../../operations/system-tables/databases.md) and [system.tables](../../operations/system-tables/tables.md) system tables. + +The structure and composition of system tables may change in different versions of the product, but the support of the `information_schema` makes it possible to change the structure of system tables without changing the method of access to metadata. Metadata requests do not depend on the DBMS used. + +``` sql +SHOW TABLES FROM INFORMATION_SCHEMA; +``` + +``` text +┌─name─────┐ +│ COLUMNS │ +│ SCHEMATA │ +│ TABLES │ +│ VIEWS │ +└──────────┘ +``` + +`INFORMATION_SCHEMA` contains the following views: + +- [COLUMNS](#columns) +- [SCHEMATA](#schemata) +- [TABLES](#tables) +- [VIEWS](#views) + +## COLUMNS {#columns} + +Contains columns read from the [system.columns](../../operations/system-tables/columns.md) system table and columns that are not supported in ClickHouse or do not make sense (always `NULL`), but must be by the standard. + +Columns: + +- `table_catalog` ([String](../../sql-reference/data-types/string.md)) — The name of the database in which the table is located. +- `table_schema` ([String](../../sql-reference/data-types/string.md)) — The name of the database in which the table is located. +- `table_name` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `column_name` ([String](../../sql-reference/data-types/string.md)) — Column name. +- `ordinal_position` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Ordinal position of a column in a table starting with 1. +- `column_default` ([String](../../sql-reference/data-types/string.md)) — Expression for the default value, or an empty string if it is not defined. +- `is_nullable` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the column type is `Nullable`. 
+- `data_type` ([String](../../sql-reference/data-types/string.md)) — Column type. +- `character_maximum_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum length in bytes for binary data, character data, or text data and images. In ClickHouse makes sense only for `FixedString` data type. Otherwise, the `NULL` value is returned. +- `character_octet_length` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum length in bytes for binary data, character data, or text data and images. In ClickHouse makes sense only for `FixedString` data type. Otherwise, the `NULL` value is returned. +- `numeric_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Accuracy of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse it is bitness for integer types and decimal precision for `Decimal` types. Otherwise, the `NULL` value is returned. +- `numeric_precision_radix` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The base of the number system is the accuracy of approximate numeric data, exact numeric data, integer data or monetary data. In ClickHouse it's 2 for integer types and 10 for `Decimal` types. Otherwise, the `NULL` value is returned. +- `numeric_scale` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The scale of approximate numeric data, exact numeric data, integer data, or monetary data. In ClickHouse makes sense only for `Decimal` types. Otherwise, the `NULL` value is returned. +- `datetime_precision` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Decimal precision of `DateTime64` data type. For other data types, the `NULL` value is returned. +- `character_set_catalog` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `character_set_schema` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `character_set_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `collation_catalog` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `collation_schema` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `collation_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `domain_catalog` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `domain_schema` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `domain_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. 
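+
+Because the view follows the standard `INFORMATION_SCHEMA` layout, generic introspection queries written for other DBMSs usually work unchanged. For instance, a sketch that lists the nullable columns of a hypothetical table `my_table` in the current database:
+
+```sql
+SELECT column_name, data_type
+FROM INFORMATION_SCHEMA.COLUMNS
+WHERE table_schema = currentDatabase()
+  AND table_name = 'my_table'
+  AND is_nullable = 1;
+```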
+ +**Example** + +Query: + +``` sql +SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE (table_schema=currentDatabase() OR table_schema='') AND table_name NOT LIKE '%inner%' LIMIT 1 FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +table_catalog: default +table_schema: default +table_name: describe_example +column_name: id +ordinal_position: 1 +column_default: +is_nullable: 0 +data_type: UInt64 +character_maximum_length: ᴺᵁᴸᴸ +character_octet_length: ᴺᵁᴸᴸ +numeric_precision: 64 +numeric_precision_radix: 2 +numeric_scale: 0 +datetime_precision: ᴺᵁᴸᴸ +character_set_catalog: ᴺᵁᴸᴸ +character_set_schema: ᴺᵁᴸᴸ +character_set_name: ᴺᵁᴸᴸ +collation_catalog: ᴺᵁᴸᴸ +collation_schema: ᴺᵁᴸᴸ +collation_name: ᴺᵁᴸᴸ +domain_catalog: ᴺᵁᴸᴸ +domain_schema: ᴺᵁᴸᴸ +domain_name: ᴺᵁᴸᴸ +``` + +## SCHEMATA {#schemata} + +Contains columns read from the [system.databases](../../operations/system-tables/databases.md) system table and columns that are not supported in ClickHouse or do not make sense (always `NULL`), but must be by the standard. + +Columns: + +- `catalog_name` ([String](../../sql-reference/data-types/string.md)) — The name of the database. +- `schema_name` ([String](../../sql-reference/data-types/string.md)) — The name of the database. +- `schema_owner` ([String](../../sql-reference/data-types/string.md)) — Schema owner name, always `'default'`. +- `default_character_set_catalog` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `default_character_set_schema` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `default_character_set_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. +- `sql_path` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — `NULL`, not supported. + +**Example** + +Query: + +``` sql +SELECT * FROM information_schema.schemata WHERE schema_name ILIKE 'information_schema' LIMIT 1 FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +catalog_name: INFORMATION_SCHEMA +schema_name: INFORMATION_SCHEMA +schema_owner: default +default_character_set_catalog: ᴺᵁᴸᴸ +default_character_set_schema: ᴺᵁᴸᴸ +default_character_set_name: ᴺᵁᴸᴸ +sql_path: ᴺᵁᴸᴸ +``` + +## TABLES {#tables} + +Contains columns read from the [system.tables](../../operations/system-tables/tables.md) system table. + +Columns: + +- `table_catalog` ([String](../../sql-reference/data-types/string.md)) — The name of the database in which the table is located. +- `table_schema` ([String](../../sql-reference/data-types/string.md)) — The name of the database in which the table is located. +- `table_name` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `table_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Table type. 
Possible values: + - `BASE TABLE` + - `VIEW` + - `FOREIGN TABLE` + - `LOCAL TEMPORARY` + - `SYSTEM VIEW` + +**Example** + +Query: + +``` sql +SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE (table_schema = currentDatabase() OR table_schema = '') AND table_name NOT LIKE '%inner%' LIMIT 1 FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +table_catalog: default +table_schema: default +table_name: describe_example +table_type: BASE TABLE +``` + +## VIEWS {#views} + +Contains columns read from the [system.tables](../../operations/system-tables/tables.md) system table, when the table engine [View](../../engines/table-engines/special/view.md) is used. + +Columns: + +- `table_catalog` ([String](../../sql-reference/data-types/string.md)) — The name of the database in which the table is located. +- `table_schema` ([String](../../sql-reference/data-types/string.md)) — The name of the database in which the table is located. +- `table_name` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `view_definition` ([String](../../sql-reference/data-types/string.md)) — `SELECT` query for view. +- `check_option` ([String](../../sql-reference/data-types/string.md)) — `NONE`, no checking. +- `is_updatable` ([Enum8](../../sql-reference/data-types/enum.md)) — `NO`, the view is not updated. +- `is_insertable_into` ([Enum8](../../sql-reference/data-types/enum.md)) — Shows whether the created view is [materialized](../../sql-reference/statements/create/view/#materialized). Possible values: + - `NO` — The created view is not materialized. + - `YES` — The created view is materialized. +- `is_trigger_updatable` ([Enum8](../../sql-reference/data-types/enum.md)) — `NO`, the trigger is not updated. +- `is_trigger_deletable` ([Enum8](../../sql-reference/data-types/enum.md)) — `NO`, the trigger is not deleted. +- `is_trigger_insertable_into` ([Enum8](../../sql-reference/data-types/enum.md)) — `NO`, no data is inserted into the trigger. + +**Example** + +Query: + +``` sql +CREATE VIEW v (n Nullable(Int32), f Float64) AS SELECT n, f FROM t; +CREATE MATERIALIZED VIEW mv ENGINE = Null AS SELECT * FROM system.one; +SELECT * FROM information_schema.views WHERE table_schema = currentDatabase() LIMIT 1 FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +table_catalog: default +table_schema: default +table_name: mv +view_definition: SELECT * FROM system.one +check_option: NONE +is_updatable: NO +is_insertable_into: YES +is_trigger_updatable: NO +is_trigger_deletable: NO +is_trigger_insertable_into: NO +``` diff --git a/docs/en/reference/operations/system-tables/licenses.md b/docs/en/reference/operations/system-tables/licenses.md new file mode 100644 index 00000000000..fad6e16fd8a --- /dev/null +++ b/docs/en/reference/operations/system-tables/licenses.md @@ -0,0 +1,39 @@ +# licenses + +Сontains licenses of third-party libraries that are located in the [contrib](https://github.com/ClickHouse/ClickHouse/tree/master/contrib) directory of ClickHouse sources. + +Columns: + +- `library_name` ([String](../../sql-reference/data-types/string.md)) — Name of the library, which is license connected with. +- `license_type` ([String](../../sql-reference/data-types/string.md)) — License type — e.g. Apache, MIT. +- `license_path` ([String](../../sql-reference/data-types/string.md)) — Path to the file with the license text. +- `license_text` ([String](../../sql-reference/data-types/string.md)) — License text. 
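+
+Beyond the per-library listing shown in the example below, the table can be aggregated; a sketch that counts the bundled libraries per license type (exact figures vary by ClickHouse version):
+
+```sql
+SELECT license_type, count() AS libraries
+FROM system.licenses
+GROUP BY license_type
+ORDER BY libraries DESC;
+```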
+ +**Example** + +``` sql +SELECT library_name, license_type, license_path FROM system.licenses LIMIT 15 +``` + +``` text +┌─library_name───────┬─license_type─┬─license_path────────────────────────┐ +│ FastMemcpy │ MIT │ /contrib/FastMemcpy/LICENSE │ +│ arrow │ Apache │ /contrib/arrow/LICENSE.txt │ +│ avro │ Apache │ /contrib/avro/LICENSE.txt │ +│ aws-c-common │ Apache │ /contrib/aws-c-common/LICENSE │ +│ aws-c-event-stream │ Apache │ /contrib/aws-c-event-stream/LICENSE │ +│ aws-checksums │ Apache │ /contrib/aws-checksums/LICENSE │ +│ aws │ Apache │ /contrib/aws/LICENSE.txt │ +│ base64 │ BSD 2-clause │ /contrib/base64/LICENSE │ +│ boost │ Boost │ /contrib/boost/LICENSE_1_0.txt │ +│ brotli │ MIT │ /contrib/brotli/LICENSE │ +│ capnproto │ MIT │ /contrib/capnproto/LICENSE │ +│ cassandra │ Apache │ /contrib/cassandra/LICENSE.txt │ +│ cctz │ Apache │ /contrib/cctz/LICENSE.txt │ +│ cityhash102 │ MIT │ /contrib/cityhash102/COPYING │ +│ cppkafka │ BSD 2-clause │ /contrib/cppkafka/LICENSE │ +└────────────────────┴──────────────┴─────────────────────────────────────┘ + +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/licenses) diff --git a/docs/en/reference/operations/system-tables/merge_tree_settings.md b/docs/en/reference/operations/system-tables/merge_tree_settings.md new file mode 100644 index 00000000000..0324d5c633d --- /dev/null +++ b/docs/en/reference/operations/system-tables/merge_tree_settings.md @@ -0,0 +1,54 @@ +# merge_tree_settings {#system-merge_tree_settings} + +Contains information about settings for `MergeTree` tables. + +Columns: + +- `name` (String) — Setting name. +- `value` (String) — Setting value. +- `description` (String) — Setting description. +- `type` (String) — Setting type (implementation specific string value). +- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed. + +**Example** +```sql +:) SELECT * FROM system.merge_tree_settings LIMIT 4 FORMAT Vertical; +``` + +```text +Row 1: +────── +name: index_granularity +value: 8192 +changed: 0 +description: How many rows correspond to one primary key value. +type: SettingUInt64 + +Row 2: +────── +name: min_bytes_for_wide_part +value: 0 +changed: 0 +description: Minimal uncompressed size in bytes to create part in wide format instead of compact +type: SettingUInt64 + +Row 3: +────── +name: min_rows_for_wide_part +value: 0 +changed: 0 +description: Minimal number of rows to create part in wide format instead of compact +type: SettingUInt64 + +Row 4: +────── +name: merge_max_block_size +value: 8192 +changed: 0 +description: How many rows in blocks should be formed for merge operations. +type: SettingUInt64 + +4 rows in set. Elapsed: 0.001 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/merge_tree_settings) diff --git a/docs/en/reference/operations/system-tables/merges.md b/docs/en/reference/operations/system-tables/merges.md new file mode 100644 index 00000000000..f512e00fc89 --- /dev/null +++ b/docs/en/reference/operations/system-tables/merges.md @@ -0,0 +1,25 @@ +# merges {#system-merges} + +Contains information about merges and part mutations currently in process for tables in the MergeTree family. + +Columns: + +- `database` (String) — The name of the database the table is in. +- `table` (String) — Table name. +- `elapsed` (Float64) — The time elapsed (in seconds) since the merge started. +- `progress` (Float64) — The percentage of completed work from 0 to 1. 
+- `num_parts` (UInt64) — The number of pieces to be merged. +- `result_part_name` (String) — The name of the part that will be formed as the result of merging. +- `is_mutation` (UInt8) — 1 if this process is a part mutation. +- `total_size_bytes_compressed` (UInt64) — The total size of the compressed data in the merged chunks. +- `total_size_marks` (UInt64) — The total number of marks in the merged parts. +- `bytes_read_uncompressed` (UInt64) — Number of bytes read, uncompressed. +- `rows_read` (UInt64) — Number of rows read. +- `bytes_written_uncompressed` (UInt64) — Number of bytes written, uncompressed. +- `rows_written` (UInt64) — Number of rows written. +- `memory_usage` (UInt64) — Memory consumption of the merge process. +- `thread_id` (UInt64) — Thread ID of the merge process. +- `merge_type` — The type of current merge. Empty if it's an mutation. +- `merge_algorithm` — The algorithm used in current merge. Empty if it's an mutation. + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/merges) diff --git a/docs/en/reference/operations/system-tables/metric_log.md b/docs/en/reference/operations/system-tables/metric_log.md new file mode 100644 index 00000000000..55b0d800ead --- /dev/null +++ b/docs/en/reference/operations/system-tables/metric_log.md @@ -0,0 +1,51 @@ +# metric_log {#system_tables-metric_log} + +Contains history of metrics values from tables `system.metrics` and `system.events`, periodically flushed to disk. + +Columns: +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds resolution. + +**Example** + +``` sql +SELECT * FROM system.metric_log LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +event_date: 2020-09-05 +event_time: 2020-09-05 16:22:33 +event_time_microseconds: 2020-09-05 16:22:33.196807 +milliseconds: 196 +ProfileEvent_Query: 0 +ProfileEvent_SelectQuery: 0 +ProfileEvent_InsertQuery: 0 +ProfileEvent_FailedQuery: 0 +ProfileEvent_FailedSelectQuery: 0 +... +... +CurrentMetric_Revision: 54439 +CurrentMetric_VersionInteger: 20009001 +CurrentMetric_RWLockWaitingReaders: 0 +CurrentMetric_RWLockWaitingWriters: 0 +CurrentMetric_RWLockActiveReaders: 0 +CurrentMetric_RWLockActiveWriters: 0 +CurrentMetric_GlobalThread: 74 +CurrentMetric_GlobalThreadActive: 26 +CurrentMetric_LocalThread: 0 +CurrentMetric_LocalThreadActive: 0 +CurrentMetric_DistributedFilesToInsert: 0 +``` + +**See also** + +- [metric_log setting](../../operations/server-configuration-parameters/settings.md#metric_log) — Enabling and disabling the setting. +- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md) — Contains periodically calculated metrics. +- [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred. +- [system.metrics](../../operations/system-tables/metrics.md) — Contains instantly calculated metrics. +- [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. 
+ +[Original article](https://clickhouse.com/docs/en/operations/system-tables/metric_log) diff --git a/docs/en/reference/operations/system-tables/metrics.md b/docs/en/reference/operations/system-tables/metrics.md new file mode 100644 index 00000000000..d4e06e1aca6 --- /dev/null +++ b/docs/en/reference/operations/system-tables/metrics.md @@ -0,0 +1,41 @@ +# metrics {#system_tables-metrics} + +Contains metrics which can be calculated instantly, or have a current value. For example, the number of simultaneously processed queries or the current replica delay. This table is always up to date. + +Columns: + +- `metric` ([String](../../sql-reference/data-types/string.md)) — Metric name. +- `value` ([Int64](../../sql-reference/data-types/int-uint.md)) — Metric value. +- `description` ([String](../../sql-reference/data-types/string.md)) — Metric description. + +The list of supported metrics you can find in the [src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CurrentMetrics.cpp) source file of ClickHouse. + +**Example** + +``` sql +SELECT * FROM system.metrics LIMIT 10 +``` + +``` text +┌─metric─────────────────────┬─value─┬─description──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Query │ 1 │ Number of executing queries │ +│ Merge │ 0 │ Number of executing background merges │ +│ PartMutation │ 0 │ Number of mutations (ALTER DELETE/UPDATE) │ +│ ReplicatedFetch │ 0 │ Number of data parts being fetched from replicas │ +│ ReplicatedSend │ 0 │ Number of data parts being sent to replicas │ +│ ReplicatedChecks │ 0 │ Number of data parts checking for consistency │ +│ BackgroundPoolTask │ 0 │ Number of active tasks in BackgroundProcessingPool (merges, mutations, fetches, or replication queue bookkeeping) │ +│ BackgroundSchedulePoolTask │ 0 │ Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc. │ +│ DiskSpaceReservedForMerge │ 0 │ Disk space reserved for currently running background merges. It is slightly more than the total size of currently merging parts. │ +│ DistributedSend │ 0 │ Number of connections to remote servers sending data that was INSERTed into Distributed tables. Both synchronous and asynchronous mode. │ +└────────────────────────────┴───────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. +- [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred. +- [system.metric_log](../../operations/system-tables/metric_log.md#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` and `system.events`. +- [Monitoring](../../operations/monitoring.md) — Base concepts of ClickHouse monitoring. 
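+
+A common use of this table is to spot-check a few metrics of interest, for example, how many merges, mutations, and queries are currently running (a sketch; the metric names are taken from the sample output above):
+
+``` sql
+SELECT metric, value
+FROM system.metrics
+WHERE metric IN ('Merge', 'PartMutation', 'Query');
+```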
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/metrics)
diff --git a/docs/en/reference/operations/system-tables/mutations.md b/docs/en/reference/operations/system-tables/mutations.md
new file mode 100644
index 00000000000..507146d93de
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/mutations.md
@@ -0,0 +1,49 @@
+# mutations {#system_tables-mutations}
+
+The table contains information about [mutations](../../sql-reference/statements/alter/index.md#mutations) of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables and their progress. Each mutation command is represented by a single row.
+
+Columns:
+
+- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database to which the mutation was applied.
+
+- `table` ([String](../../sql-reference/data-types/string.md)) — The name of the table to which the mutation was applied.
+
+- `mutation_id` ([String](../../sql-reference/data-types/string.md)) — The ID of the mutation. For replicated tables these IDs correspond to znode names in the `/mutations/` directory in ZooKeeper. For non-replicated tables the IDs correspond to file names in the data directory of the table.
+
+- `command` ([String](../../sql-reference/data-types/string.md)) — The mutation command string (the part of the query after `ALTER TABLE [db.]table`).
+
+- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the mutation command was submitted for execution.
+
+- `block_numbers.partition_id` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — For mutations of replicated tables, the array contains the partitions' IDs (one record for each partition). For mutations of non-replicated tables the array is empty.
+
+- `block_numbers.number` ([Array](../../sql-reference/data-types/array.md)([Int64](../../sql-reference/data-types/int-uint.md))) — For mutations of replicated tables, the array contains one record for each partition, with the block number that was acquired by the mutation. Only parts that contain blocks with numbers less than this number will be mutated in the partition.
+
+    In non-replicated tables, block numbers in all partitions form a single sequence. This means that for mutations of non-replicated tables, the column will contain one record with a single block number acquired by the mutation.
+
+- `parts_to_do_names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — An array of names of data parts that need to be mutated for the mutation to complete.
+
+- `parts_to_do` ([Int64](../../sql-reference/data-types/int-uint.md)) — The number of data parts that need to be mutated for the mutation to complete.
+
+- `is_done` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the mutation is done. Possible values:
+    - `1` if the mutation is completed,
+    - `0` if the mutation is still in process.
+
+:::note
+Even if `parts_to_do = 0`, it is possible that a mutation of a replicated table is not completed yet because a long-running `INSERT` query is still creating a new data part that will need to be mutated.
+:::
+
+If there were problems with mutating some data parts, the following columns contain additional information:
+
+- `latest_failed_part` ([String](../../sql-reference/data-types/string.md)) — The name of the most recent part that could not be mutated.
+ +- `latest_fail_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — The date and time of the most recent part mutation failure. + +- `latest_fail_reason` ([String](../../sql-reference/data-types/string.md)) — The exception message that caused the most recent part mutation failure. + +**See Also** + +- [Mutations](../../sql-reference/statements/alter/index.md#mutations) +- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine +- [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md) family + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/mutations) diff --git a/docs/en/reference/operations/system-tables/numbers.md b/docs/en/reference/operations/system-tables/numbers.md new file mode 100644 index 00000000000..29828bfe796 --- /dev/null +++ b/docs/en/reference/operations/system-tables/numbers.md @@ -0,0 +1,32 @@ +# numbers {#system-numbers} + +This table contains a single UInt64 column named `number` that contains almost all the natural numbers starting from zero. + +You can use this table for tests, or if you need to do a brute force search. + +Reads from this table are not parallelized. + +**Example** + +```sql +:) SELECT * FROM system.numbers LIMIT 10; +``` + +```text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ + +10 rows in set. Elapsed: 0.001 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/numbers) diff --git a/docs/en/reference/operations/system-tables/numbers_mt.md b/docs/en/reference/operations/system-tables/numbers_mt.md new file mode 100644 index 00000000000..02155db4711 --- /dev/null +++ b/docs/en/reference/operations/system-tables/numbers_mt.md @@ -0,0 +1,30 @@ +# numbers_mt {#system-numbers-mt} + +The same as [system.numbers](../../operations/system-tables/numbers.md) but reads are parallelized. The numbers can be returned in any order. + +Used for tests. + +**Example** + +```sql +:) SELECT * FROM system.numbers_mt LIMIT 10; +``` + +```text +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +│ 3 │ +│ 4 │ +│ 5 │ +│ 6 │ +│ 7 │ +│ 8 │ +│ 9 │ +└────────┘ + +10 rows in set. Elapsed: 0.001 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/numbers_mt) diff --git a/docs/en/reference/operations/system-tables/one.md b/docs/en/reference/operations/system-tables/one.md new file mode 100644 index 00000000000..9b84c0bfcd6 --- /dev/null +++ b/docs/en/reference/operations/system-tables/one.md @@ -0,0 +1,23 @@ +# one {#system-one} + +This table contains a single row with a single `dummy` UInt8 column containing the value 0. + +This table is used if a `SELECT` query does not specify the `FROM` clause. + +This is similar to the `DUAL` table found in other DBMSs. + +**Example** + +```sql +:) SELECT * FROM system.one LIMIT 10; +``` + +```text +┌─dummy─┐ +│ 0 │ +└───────┘ + +1 rows in set. Elapsed: 0.001 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/one) diff --git a/docs/en/reference/operations/system-tables/opentelemetry_span_log.md b/docs/en/reference/operations/system-tables/opentelemetry_span_log.md new file mode 100644 index 00000000000..89af72d6620 --- /dev/null +++ b/docs/en/reference/operations/system-tables/opentelemetry_span_log.md @@ -0,0 +1,53 @@ +# opentelemetry_span_log {#system_tables-opentelemetry_span_log} + +Contains information about [trace spans](https://opentracing.io/docs/overview/spans/) for executed queries. 
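+
+For a given `trace_id`, the spans of a query can be reassembled into a timeline. A minimal sketch (using the columns described below; the trace ID is the one from the sample row further down):
+
+``` sql
+SELECT operation_name, start_time_us, finish_time_us - start_time_us AS duration_us
+FROM system.opentelemetry_span_log
+WHERE trace_id = toUUID('cdab0847-0d62-61d5-4d38-dd65b19a1914')
+ORDER BY start_time_us;
+```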
+ +Columns: + +- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for executed query. + +- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`. + +- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the parent `trace span`. + +- `operation_name` ([String](../../sql-reference/data-types/string.md)) — The name of the operation. + +- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The start time of the `trace span` (in microseconds). + +- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The finish time of the `trace span` (in microseconds). + +- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — The finish date of the `trace span`. + +- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — [Attribute](https://opentelemetry.io/docs/go/instrumentation/#attributes) names depending on the `trace span`. They are filled in according to the recommendations in the [OpenTelemetry](https://opentelemetry.io/) standard. + +- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Attribute values depending on the `trace span`. They are filled in according to the recommendations in the `OpenTelemetry` standard. + +**Example** + +Query: + +``` sql +SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +trace_id: cdab0847-0d62-61d5-4d38-dd65b19a1914 +span_id: 701487461015578150 +parent_span_id: 2991972114672045096 +operation_name: DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl() +start_time_us: 1612374594529090 +finish_time_us: 1612374594529108 +finish_date: 2021-02-03 +attribute.names: [] +attribute.values: [] +``` + +**See Also** + +- [OpenTelemetry](../../operations/opentelemetry.md) + +[Original article](https://clickhouse.com/docs/en/operations/system_tables/opentelemetry_span_log) diff --git a/docs/en/reference/operations/system-tables/part_log.md b/docs/en/reference/operations/system-tables/part_log.md new file mode 100644 index 00000000000..00eaca23862 --- /dev/null +++ b/docs/en/reference/operations/system-tables/part_log.md @@ -0,0 +1,69 @@ +# part_log {#system_tables-part-log} + +The `system.part_log` table is created only if the [part_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-part-log) server setting is specified. + +This table contains information about events that occurred with [data parts](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) in the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family tables, such as adding or merging data. + +The `system.part_log` table contains the following columns: + +- `query_id` ([String](../../sql-reference/data-types/string.md)) — Identifier of the `INSERT` query that created this data part. +- `event_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of the event that occurred with the data part. Can have one of the following values: + - `NEW_PART` — Inserting of a new data part. + - `MERGE_PARTS` — Merging of data parts. + - `DOWNLOAD_PART` — Downloading a data part. + - `REMOVE_PART` — Removing or detaching a data part using [DETACH PARTITION](../../sql-reference/statements/alter/partition.md#alter_detach-partition). + - `MUTATE_PART` — Mutating of a data part. 
+ - `MOVE_PART` — Moving the data part from the one disk to another one. +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision. +- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration. +- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in. +- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in. +- `part_name` ([String](../../sql-reference/data-types/string.md)) — Name of the data part. +- `partition_id` ([String](../../sql-reference/data-types/string.md)) — ID of the partition that the data part was inserted to. The column takes the `all` value if the partitioning is by `tuple()`. +- `path_on_disk` ([String](../../sql-reference/data-types/string.md)) — Absolute path to the folder with data part files. +- `rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of rows in the data part. +- `size_in_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Size of the data part in bytes. +- `merged_from` ([Array(String)](../../sql-reference/data-types/array.md)) — An array of names of the parts which the current part was made up from (after the merge). +- `bytes_uncompressed` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Size of uncompressed bytes. +- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of rows was read during the merge. +- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of bytes was read during the merge. +- `peak_memory_usage` ([Int64](../../sql-reference/data-types/int-uint.md)) — The maximum difference between the amount of allocated and freed memory in context of this thread. +- `error` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The code number of the occurred error. +- `exception` ([String](../../sql-reference/data-types/string.md)) — Text message of the occurred error. + +The `system.part_log` table is created after the first inserting data to the `MergeTree` table. + +**Example** + +``` sql +SELECT * FROM system.part_log LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31 +event_type: NewPart +event_date: 2021-02-02 +event_time: 2021-02-02 11:14:28 +event_time_microseconds: 2021-02-02 11:14:28.861919 +duration_ms: 35 +database: default +table: log_mt_2 +part_name: all_1_1_0 +partition_id: all +path_on_disk: db/data/default/log_mt_2/all_1_1_0/ +rows: 115418 +size_in_bytes: 1074311 +merged_from: [] +bytes_uncompressed: 0 +read_rows: 0 +read_bytes: 0 +peak_memory_usage: 0 +error: 0 +exception: +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/part_log) diff --git a/docs/en/reference/operations/system-tables/parts.md b/docs/en/reference/operations/system-tables/parts.md new file mode 100644 index 00000000000..845c63e5626 --- /dev/null +++ b/docs/en/reference/operations/system-tables/parts.md @@ -0,0 +1,168 @@ +# parts {#system_tables-parts} + +Contains information about parts of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +Each row describes one data part. + +Columns: + +- `partition` ([String](../../sql-reference/data-types/string.md)) – The partition name. 
To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter/index.md#query_language_queries_alter) query. + + Formats: + + - `YYYYMM` for automatic partitioning by month. + - `any_string` when partitioning manually. + +- `name` ([String](../../sql-reference/data-types/string.md)) – Name of the data part. + +- `part_type` ([String](../../sql-reference/data-types/string.md)) — The data part storing format. + + Possible Values: + + - `Wide` — Each column is stored in a separate file in a filesystem. + - `Compact` — All columns are stored in one file in a filesystem. + + Data storing format is controlled by the `min_bytes_for_wide_part` and `min_rows_for_wide_part` settings of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table. + + - `active` ([UInt8](../../sql-reference/data-types/int-uint.md)) – Flag that indicates whether the data part is active. If a data part is active, it’s used in a table. Otherwise, it’s deleted. Inactive data parts remain after merging. + +- `marks` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint does not work for adaptive granularity). + +- `rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The number of rows. + +- `bytes_on_disk` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Total size of all the data part files in bytes. + +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The size of the file with marks. + +- `secondary_indices_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Total size of compressed data for secondary indices in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `secondary_indices_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Total size of uncompressed data for secondary indices in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `secondary_indices_marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The size of the file with marks for secondary indices. + +- `modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation. + +- `remove_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – The time when the data part became inactive. + +- `refcount` ([UInt32](../../sql-reference/data-types/int-uint.md)) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. + +- `min_date` ([Date](../../sql-reference/data-types/date.md)) – The minimum value of the date key in the data part. + +- `max_date` ([Date](../../sql-reference/data-types/date.md)) – The maximum value of the date key in the data part. 
+ +- `min_time` ([DateTime](../../sql-reference/data-types/datetime.md)) – The minimum value of the date and time key in the data part. + +- `max_time`([DateTime](../../sql-reference/data-types/datetime.md)) – The maximum value of the date and time key in the data part. + +- `partition_id` ([String](../../sql-reference/data-types/string.md)) – ID of the partition. + +- `min_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The minimum number of data parts that make up the current part after merging. + +- `max_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The maximum number of data parts that make up the current part after merging. + +- `level` ([UInt32](../../sql-reference/data-types/int-uint.md)) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. + +- `data_version` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). + +- `primary_key_bytes_in_memory` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The amount of memory (in bytes) used by primary key values. + +- `primary_key_bytes_in_memory_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md)) – The amount of memory (in bytes) reserved for primary key values. + +- `is_frozen` ([UInt8](../../sql-reference/data-types/int-uint.md)) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup does not exist. For more details, see [FREEZE PARTITION](../../sql-reference/statements/alter/partition.md#alter_freeze-partition) + +- `database` ([String](../../sql-reference/data-types/string.md)) – Name of the database. + +- `table` ([String](../../sql-reference/data-types/string.md)) – Name of the table. + +- `engine` ([String](../../sql-reference/data-types/string.md)) – Name of the table engine without parameters. + +- `path` ([String](../../sql-reference/data-types/string.md)) – Absolute path to the folder with data part files. + +- `disk_name` ([String](../../sql-reference/data-types/string.md)) – Name of a disk that stores the data part. + +- `hash_of_all_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) of compressed files. + +- `hash_of_uncompressed_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) of uncompressed files (files with marks, index file etc.). + +- `uncompressed_hash_of_compressed_files` ([String](../../sql-reference/data-types/string.md)) – [sipHash128](../../sql-reference/functions/hash-functions.md#hash_functions-siphash128) of data in the compressed files as if they were uncompressed. + +- `delete_ttl_info_min` ([DateTime](../../sql-reference/data-types/datetime.md)) — The minimum value of the date and time key for [TTL DELETE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +- `delete_ttl_info_max` ([DateTime](../../sql-reference/data-types/datetime.md)) — The maximum value of the date and time key for [TTL DELETE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +- `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of expressions. 
Each expression defines a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +:::warning +The `move_ttl_info.expression` array is kept mostly for backward compatibility, now the simpliest way to check `TTL MOVE` rule is to use the `move_ttl_info.min` and `move_ttl_info.max` fields. +::: + +- `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the minimum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +- `move_ttl_info.max` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the maximum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +- `bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Alias for `bytes_on_disk`. + +- `marks_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) – Alias for `marks_bytes`. + +**Example** + +``` sql +SELECT * FROM system.parts LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +partition: tuple() +name: all_1_4_1_6 +part_type: Wide +active: 1 +marks: 2 +rows: 6 +bytes_on_disk: 310 +data_compressed_bytes: 157 +data_uncompressed_bytes: 91 +secondary_indices_compressed_bytes: 58 +secondary_indices_uncompressed_bytes: 6 +secondary_indices_marks_bytes: 48 +marks_bytes: 144 +modification_time: 2020-06-18 13:01:49 +remove_time: 1970-01-01 00:00:00 +refcount: 1 +min_date: 1970-01-01 +max_date: 1970-01-01 +min_time: 1970-01-01 00:00:00 +max_time: 1970-01-01 00:00:00 +partition_id: all +min_block_number: 1 +max_block_number: 4 +level: 1 +data_version: 6 +primary_key_bytes_in_memory: 8 +primary_key_bytes_in_memory_allocated: 64 +is_frozen: 0 +database: default +table: months +engine: MergeTree +disk_name: default +path: /var/lib/clickhouse/data/default/months/all_1_4_1_6/ +hash_of_all_files: 2d0657a16d9430824d35e327fcbd87bf +hash_of_uncompressed_files: 84950cc30ba867c77a408ae21332ba29 +uncompressed_hash_of_compressed_files: 1ad78f1c6843bbfb99a2c931abe7df7d +delete_ttl_info_min: 1970-01-01 00:00:00 +delete_ttl_info_max: 1970-01-01 00:00:00 +move_ttl_info.expression: [] +move_ttl_info.min: [] +move_ttl_info.max: [] +``` + +**See Also** + +- [MergeTree family](../../engines/table-engines/mergetree-family/mergetree.md) +- [TTL for Columns and Tables](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/parts) diff --git a/docs/en/reference/operations/system-tables/parts_columns.md b/docs/en/reference/operations/system-tables/parts_columns.md new file mode 100644 index 00000000000..e87be3fcd43 --- /dev/null +++ b/docs/en/reference/operations/system-tables/parts_columns.md @@ -0,0 +1,148 @@ +# parts_columns {#system_tables-parts_columns} + +Contains information about parts and columns of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. + +Each row describes one data part. + +Columns: + +- `partition` ([String](../../sql-reference/data-types/string.md)) — The partition name. To learn what a partition is, see the description of the [ALTER](../../sql-reference/statements/alter/index.md#query_language_queries_alter) query. 
+ + Formats: + + - `YYYYMM` for automatic partitioning by month. + - `any_string` when partitioning manually. + +- `name` ([String](../../sql-reference/data-types/string.md)) — Name of the data part. + +- `part_type` ([String](../../sql-reference/data-types/string.md)) — The data part storing format. + + Possible values: + + - `Wide` — Each column is stored in a separate file in a filesystem. + - `Compact` — All columns are stored in one file in a filesystem. + + Data storing format is controlled by the `min_bytes_for_wide_part` and `min_rows_for_wide_part` settings of the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table. + +- `active` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the data part is active. If a data part is active, it’s used in a table. Otherwise, it’s deleted. Inactive data parts remain after merging. + +- `marks` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint does not work for adaptive granularity). + +- `rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of rows. + +- `bytes_on_disk` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Total size of all the data part files in bytes. + +- `data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of the file with marks. + +- `modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The time the directory with the data part was modified. This usually corresponds to the time of data part creation. + +- `remove_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The time when the data part became inactive. + +- `refcount` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. + +- `min_date` ([Date](../../sql-reference/data-types/date.md)) — The minimum value of the date key in the data part. + +- `max_date` ([Date](../../sql-reference/data-types/date.md)) — The maximum value of the date key in the data part. + +- `partition_id` ([String](../../sql-reference/data-types/string.md)) — ID of the partition. + +- `min_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The minimum number of data parts that make up the current part after merging. + +- `max_block_number` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The maximum number of data parts that make up the current part after merging. + +- `level` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. + +- `data_version` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). 
+ +- `primary_key_bytes_in_memory` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The amount of memory (in bytes) used by primary key values. + +- `primary_key_bytes_in_memory_allocated` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The amount of memory (in bytes) reserved for primary key values. + +- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database. + +- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table. + +- `engine` ([String](../../sql-reference/data-types/string.md)) — Name of the table engine without parameters. + +- `disk_name` ([String](../../sql-reference/data-types/string.md)) — Name of a disk that stores the data part. + +- `path` ([String](../../sql-reference/data-types/string.md)) — Absolute path to the folder with data part files. + +- `column` ([String](../../sql-reference/data-types/string.md)) — Name of the column. + +- `type` ([String](../../sql-reference/data-types/string.md)) — Column type. + +- `column_position` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Ordinal position of a column in a table starting with 1. + +- `default_kind` ([String](../../sql-reference/data-types/string.md)) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) for the default value, or an empty string if it is not defined. + +- `default_expression` ([String](../../sql-reference/data-types/string.md)) — Expression for the default value, or an empty string if it is not defined. + +- `column_bytes_on_disk` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Total size of the column in bytes. + +- `column_data_compressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Total size of compressed data in the column, in bytes. + +- `column_data_uncompressed_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Total size of the decompressed data in the column, in bytes. + +- `column_marks_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The size of the column with marks, in bytes. + +- `bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Alias for `bytes_on_disk`. + +- `marks_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Alias for `marks_bytes`. 
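+
+A typical aggregation over this table is estimating how much disk space each column of a table occupies in its active parts. A minimal sketch (the `default` database name is only a placeholder):
+
+``` sql
+SELECT
+    table,
+    column,
+    formatReadableSize(sum(column_bytes_on_disk)) AS size_on_disk,
+    formatReadableSize(sum(column_data_uncompressed_bytes)) AS uncompressed
+FROM system.parts_columns
+WHERE active AND database = 'default'
+GROUP BY table, column
+ORDER BY sum(column_bytes_on_disk) DESC
+LIMIT 10;
+```
+
+The full set of columns for a single row is shown below.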
+ +**Example** + +``` sql +SELECT * FROM system.parts_columns LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +partition: tuple() +name: all_1_2_1 +part_type: Wide +active: 1 +marks: 2 +rows: 2 +bytes_on_disk: 155 +data_compressed_bytes: 56 +data_uncompressed_bytes: 4 +marks_bytes: 96 +modification_time: 2020-09-23 10:13:36 +remove_time: 2106-02-07 06:28:15 +refcount: 1 +min_date: 1970-01-01 +max_date: 1970-01-01 +partition_id: all +min_block_number: 1 +max_block_number: 2 +level: 1 +data_version: 1 +primary_key_bytes_in_memory: 2 +primary_key_bytes_in_memory_allocated: 64 +database: default +table: 53r93yleapyears +engine: MergeTree +disk_name: default +path: /var/lib/clickhouse/data/default/53r93yleapyears/all_1_2_1/ +column: id +type: Int8 +column_position: 1 +default_kind: +default_expression: +column_bytes_on_disk: 76 +column_data_compressed_bytes: 28 +column_data_uncompressed_bytes: 2 +column_marks_bytes: 48 +``` + +**See Also** + +- [MergeTree family](../../engines/table-engines/mergetree-family/mergetree.md) + +[Original article](https://clickhouse.com/docs/en/operations/system_tables/parts_columns) diff --git a/docs/en/reference/operations/system-tables/processes.md b/docs/en/reference/operations/system-tables/processes.md new file mode 100644 index 00000000000..f261ee9b696 --- /dev/null +++ b/docs/en/reference/operations/system-tables/processes.md @@ -0,0 +1,61 @@ +# processes {#system_tables-processes} + +This system table is used for implementing the `SHOW PROCESSLIST` query. + +Columns: + +- `user` (String) – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the `default` user. The field contains the username for a specific query, not for a query that this query initiated. +- `address` (String) – The IP address the request was made from. The same for distributed processing. To track where a distributed query was originally made from, look at `system.processes` on the query requestor server. +- `elapsed` (Float64) – The time in seconds since request execution started. +- `rows_read` (UInt64) – The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. +- `bytes_read` (UInt64) – The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. +- `total_rows_approx` (UInt64) – The approximation of the total number of rows that should be read. For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. +- `memory_usage` (UInt64) – Amount of RAM the request uses. It might not include some types of dedicated memory. See the [max_memory_usage](../../operations/settings/query-complexity.md#settings_max_memory_usage) setting. +- `query` (String) – The query text. For `INSERT`, it does not include the data to insert. +- `query_id` (String) – Query ID, if defined. 
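+
+A quick way to find the queries that have been running the longest is to sort by `elapsed` (a sketch; any of the columns above can be added to the list):
+
+``` sql
+SELECT query_id, user, elapsed, memory_usage, query
+FROM system.processes
+ORDER BY elapsed DESC
+LIMIT 5;
+```
+
+A runaway query found this way can then be stopped with `KILL QUERY WHERE query_id = '...'`.
+
+**Example**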
+ +```sql +:) SELECT * FROM system.processes LIMIT 10 FORMAT Vertical; +``` + +```text +Row 1: +────── +is_initial_query: 1 +user: default +query_id: 35a360fa-3743-441d-8e1f-228c938268da +address: ::ffff:172.23.0.1 +port: 47588 +initial_user: default +initial_query_id: 35a360fa-3743-441d-8e1f-228c938268da +initial_address: ::ffff:172.23.0.1 +initial_port: 47588 +interface: 1 +os_user: bharatnc +client_hostname: tower +client_name: ClickHouse +client_revision: 54437 +client_version_major: 20 +client_version_minor: 7 +client_version_patch: 2 +http_method: 0 +http_user_agent: +quota_key: +elapsed: 0.000582537 +is_cancelled: 0 +read_rows: 0 +read_bytes: 0 +total_rows_approx: 0 +written_rows: 0 +written_bytes: 0 +memory_usage: 0 +peak_memory_usage: 0 +query: SELECT * from system.processes LIMIT 10 FORMAT Vertical; +thread_ids: [67] +ProfileEvents: {'Query':1,'SelectQuery':1,'ReadCompressedBytes':36,'CompressedReadBufferBlocks':1,'CompressedReadBufferBytes':10,'IOBufferAllocs':1,'IOBufferAllocBytes':89,'ContextLock':15,'RWLockAcquiredReadLocks':1} +Settings: {'background_pool_size':'32','load_balancing':'random','allow_suspicious_low_cardinality_types':'1','distributed_aggregation_memory_efficient':'1','skip_unavailable_shards':'1','log_queries':'1','max_bytes_before_external_group_by':'20000000000','max_bytes_before_external_sort':'20000000000','allow_introspection_functions':'1'} + +1 rows in set. Elapsed: 0.002 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/processes) diff --git a/docs/en/reference/operations/system-tables/query_log.md b/docs/en/reference/operations/system-tables/query_log.md new file mode 100644 index 00000000000..a8fda41f7c2 --- /dev/null +++ b/docs/en/reference/operations/system-tables/query_log.md @@ -0,0 +1,189 @@ +# query_log {#system_tables-query_log} + +Contains information about executed queries, for example, start time, duration of processing, error messages. + +:::note +This table does not contain the ingested data for `INSERT` queries. +::: + +You can change settings of queries logging in the [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) section of the server configuration. + +You can disable queries logging by setting [log_queries = 0](../../operations/settings/settings.md#settings-log-queries). We do not recommend to turn off logging because information in this table is important for solving issues. + +The flushing period of data is set in `flush_interval_milliseconds` parameter of the [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) server settings section. To force flushing, use the [SYSTEM FLUSH LOGS](../../sql-reference/statements/system.md#query_language-system-flush_logs) query. + +ClickHouse does not delete data from the table automatically. See [Introduction](../../operations/system-tables/index.md#system-tables-introduction) for more details. + +The `system.query_log` table registers two kinds of queries: + +1. Initial queries that were run directly by the client. +2. Child queries that were initiated by other queries (for distributed query execution). For these types of queries, information about the parent queries is shown in the `initial_*` columns. + +Each query creates one or two rows in the `query_log` table, depending on the status (see the `type` column) of the query: + +1. 
If the query execution was successful, two rows with the `QueryStart` and `QueryFinish` types are created.
+2. If an error occurred during query processing, two events with the `QueryStart` and `ExceptionWhileProcessing` types are created.
+3. If an error occurred before launching the query, a single event with the `ExceptionBeforeStart` type is created.
+
+You can use the [log_queries_probability](../../operations/settings/settings.md#log-queries-probability) setting to reduce the number of queries registered in the `query_log` table.
+
+You can use the [log_formatted_queries](../../operations/settings/settings.md#settings-log-formatted-queries) setting to log formatted queries to the `formatted_query` column.
+
+Columns:
+
+- `type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of an event that occurred when executing the query. Values:
+    - `'QueryStart' = 1` — Successful start of query execution.
+    - `'QueryFinish' = 2` — Successful end of query execution.
+    - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution.
+    - `'ExceptionWhileProcessing' = 4` — Exception during the query execution.
+- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Query starting date.
+- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query starting time.
+- `event_time_microseconds` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query starting time with microseconds precision.
+- `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time of query execution.
+- `query_start_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Start time of query execution with microsecond precision.
+- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Duration of query execution in milliseconds.
+- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number of rows read from all tables and table functions that participated in the query. It includes usual subqueries and subqueries for `IN` and `JOIN`. For distributed queries `read_rows` includes the total number of rows read at all replicas. Each replica sends its `read_rows` value, and the initiator server of the query summarizes all received and local values. The cache volumes do not affect this value.
+- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Total number of bytes read from all tables and table functions that participated in the query. It includes usual subqueries and subqueries for `IN` and `JOIN`. For distributed queries `read_bytes` includes the total number of bytes read at all replicas. Each replica sends its `read_bytes` value, and the initiator server of the query summarizes all received and local values. The cache volumes do not affect this value.
+- `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0.
+- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0.
+- `result_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of rows in the result of a `SELECT` query, or the number of rows in an `INSERT` query.
+- `result_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — RAM volume in bytes used to store a query result.
+- `memory_usage` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Memory consumption by the query. +- `current_database` ([String](../../sql-reference/data-types/string.md)) — Name of the current database. +- `query` ([String](../../sql-reference/data-types/string.md)) — Query string. +- `formatted_query` ([String](../../sql-reference/data-types/string.md)) — Formatted query string. +- `normalized_query_hash` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Identical hash value without the values of literals for similar queries. +- `query_kind` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — Type of the query. +- `databases` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the databases present in the query. +- `tables` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the tables present in the query. +- `views` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the (materialized or live) views present in the query. +- `columns` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the columns present in the query. +- `projections` ([String](../../sql-reference/data-types/string.md)) — Names of the projections used during the query execution. +- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — Code of an exception. +- `exception` ([String](../../sql-reference/data-types/string.md)) — Exception message. +- `stack_trace` ([String](../../sql-reference/data-types/string.md)) — [Stack trace](https://en.wikipedia.org/wiki/Stack_trace). An empty string, if the query was completed successfully. +- `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Query type. Possible values: + - 1 — Query was initiated by the client. + - 0 — Query was initiated by another query as part of distributed query execution. +- `user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who initiated the current query. +- `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query. +- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that was used to make the query. +- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to make the query. +- `initial_user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who ran the initial query (for distributed query execution). +- `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution). +- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that the parent query was launched from. +- `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to make the parent query. +- `initial_query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Initial query starting time (for distributed query execution). +- `initial_query_start_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Initial query starting time with microseconds precision (for distributed query execution). 
+- `interface` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Interface that the query was initiated from. Possible values: + - 1 — TCP. + - 2 — HTTP. +- `os_user` ([String](../../sql-reference/data-types/string.md)) — Operating system username who runs [clickhouse-client](../../interfaces/cli.md). +- `client_hostname` ([String](../../sql-reference/data-types/string.md)) — Hostname of the client machine where the [clickhouse-client](../../interfaces/cli.md) or another TCP client is run. +- `client_name` ([String](../../sql-reference/data-types/string.md)) — The [clickhouse-client](../../interfaces/cli.md) or another TCP client name. +- `client_revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Revision of the [clickhouse-client](../../interfaces/cli.md) or another TCP client. +- `client_version_major` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Major version of the [clickhouse-client](../../interfaces/cli.md) or another TCP client. +- `client_version_minor` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Minor version of the [clickhouse-client](../../interfaces/cli.md) or another TCP client. +- `client_version_patch` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Patch component of the [clickhouse-client](../../interfaces/cli.md) or another TCP client version. +- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: + - 0 — The query was launched from the TCP interface. + - 1 — `GET` method was used. + - 2 — `POST` method was used. +- `http_user_agent` ([String](../../sql-reference/data-types/string.md)) — HTTP header `UserAgent` passed in the HTTP query. +- `http_referer` ([String](../../sql-reference/data-types/string.md)) — HTTP header `Referer` passed in the HTTP query (contains an absolute or partial address of the page making the query). +- `forwarded_for` ([String](../../sql-reference/data-types/string.md)) — HTTP header `X-Forwarded-For` passed in the HTTP query. +- `quota_key` ([String](../../sql-reference/data-types/string.md)) — The `quota key` specified in the [quotas](../../operations/quotas.md) setting (see `keyed`). +- `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse revision. +- `ProfileEvents` ([Map(String, UInt64)](../../sql-reference/data-types/array.md)) — ProfileEvents that measure different metrics. The description of them could be found in the table [system.events](../../operations/system-tables/events.md#system_tables-events) +- `Settings` ([Map(String, String)](../../sql-reference/data-types/array.md)) — Settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. +- `log_comment` ([String](../../sql-reference/data-types/string.md)) — Log comment. It can be set to arbitrary string no longer than [max_query_size](../../operations/settings/settings.md#settings-max_query_size). An empty string if it is not defined. +- `thread_ids` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Thread ids that are participating in query execution. +- `used_aggregate_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `aggregate functions`, which were used during query execution. +- `used_aggregate_function_combinators` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `aggregate functions combinators`, which were used during query execution. 
+- `used_database_engines` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `database engines`, which were used during query execution. +- `used_data_type_families` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `data type families`, which were used during query execution. +- `used_dictionaries` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `dictionaries`, which were used during query execution. +- `used_formats` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `formats`, which were used during query execution. +- `used_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `functions`, which were used during query execution. +- `used_storages` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `storages`, which were used during query execution. +- `used_table_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `table functions`, which were used during query execution. + +**Example** + +``` sql +SELECT * FROM system.query_log WHERE type = 'QueryFinish' ORDER BY query_start_time DESC LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +type: QueryFinish +event_date: 2021-11-03 +event_time: 2021-11-03 16:13:54 +event_time_microseconds: 2021-11-03 16:13:54.953024 +query_start_time: 2021-11-03 16:13:54 +query_start_time_microseconds: 2021-11-03 16:13:54.952325 +query_duration_ms: 0 +read_rows: 69 +read_bytes: 6187 +written_rows: 0 +written_bytes: 0 +result_rows: 69 +result_bytes: 48256 +memory_usage: 0 +current_database: default +query: DESCRIBE TABLE system.query_log +formatted_query: +normalized_query_hash: 8274064835331539124 +query_kind: +databases: [] +tables: [] +columns: [] +projections: [] +views: [] +exception_code: 0 +exception: +stack_trace: +is_initial_query: 1 +user: default +query_id: 7c28bbbb-753b-4eba-98b1-efcbe2b9bdf6 +address: ::ffff:127.0.0.1 +port: 40452 +initial_user: default +initial_query_id: 7c28bbbb-753b-4eba-98b1-efcbe2b9bdf6 +initial_address: ::ffff:127.0.0.1 +initial_port: 40452 +initial_query_start_time: 2021-11-03 16:13:54 +initial_query_start_time_microseconds: 2021-11-03 16:13:54.952325 +interface: 1 +os_user: sevirov +client_hostname: clickhouse.ru-central1.internal +client_name: ClickHouse +client_revision: 54449 +client_version_major: 21 +client_version_minor: 10 +client_version_patch: 1 +http_method: 0 +http_user_agent: +http_referer: +forwarded_for: +quota_key: +revision: 54456 +log_comment: +thread_ids: [30776,31174] +ProfileEvents: {'Query':1,'NetworkSendElapsedMicroseconds':59,'NetworkSendBytes':2643,'SelectedRows':69,'SelectedBytes':6187,'ContextLock':9,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':817,'UserTimeMicroseconds':427,'SystemTimeMicroseconds':212,'OSCPUVirtualTimeMicroseconds':639,'OSReadChars':894,'OSWriteChars':319} +Settings: {'load_balancing':'random','max_memory_usage':'10000000000'} +used_aggregate_functions: [] +used_aggregate_function_combinators: [] +used_database_engines: [] +used_data_type_families: [] +used_dictionaries: [] +used_formats: [] +used_functions: [] +used_storages: [] +used_table_functions: [] +``` + +**See Also** + +- [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) — This table contains information about each query execution thread. 
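+
+In practice the table is usually filtered by `type` and `event_date`. A minimal sketch for finding the slowest queries that finished today:
+
+``` sql
+SELECT
+    event_time,
+    query_duration_ms,
+    read_rows,
+    result_rows,
+    query
+FROM system.query_log
+WHERE type = 'QueryFinish' AND event_date = today()
+ORDER BY query_duration_ms DESC
+LIMIT 10;
+```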
diff --git a/docs/en/reference/operations/system-tables/query_thread_log.md b/docs/en/reference/operations/system-tables/query_thread_log.md
new file mode 100644
index 00000000000..072a311b7db
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/query_thread_log.md
@@ -0,0 +1,119 @@
+# query_thread_log {#system_tables-query_thread_log}
+
+Contains information about threads that execute queries, for example, thread name, thread start time, duration of query processing.
+
+To start logging:
+
+1. Configure parameters in the [query_thread_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_thread_log) section.
+2. Set [log_query_threads](../../operations/settings/settings.md#settings-log-query-threads) to 1.
+
+The flushing period of data is set in `flush_interval_milliseconds` parameter of the [query_thread_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_thread_log) server settings section. To force flushing, use the [SYSTEM FLUSH LOGS](../../sql-reference/statements/system.md#query_language-system-flush_logs) query.
+
+ClickHouse does not delete data from the table automatically. See [Introduction](../../operations/system-tables/index.md#system-tables-introduction) for more details.
+
+You can use the [log_queries_probability](../../operations/settings/settings.md#log-queries-probability) setting to reduce the number of queries registered in the `query_thread_log` table.
+
+Columns:
+
+- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the thread has finished execution of the query.
+- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query.
+- `event_time_microseconds` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query with microseconds precision.
+- `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time of query execution.
+- `query_start_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Start time of query execution with microsecond precision.
+- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Duration of query execution.
+- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of read rows.
+- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of read bytes.
+- `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0.
+- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0.
+- `memory_usage` ([Int64](../../sql-reference/data-types/int-uint.md)) — The difference between the amount of allocated and freed memory in context of this thread.
+- `peak_memory_usage` ([Int64](../../sql-reference/data-types/int-uint.md)) — The maximum difference between the amount of allocated and freed memory in context of this thread.
+- `thread_name` ([String](../../sql-reference/data-types/string.md)) — Name of the thread.
+- `thread_number` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Internal thread ID.
+- `thread_id` ([Int32](../../sql-reference/data-types/int-uint.md)) — Thread ID.
+- `master_thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — OS thread ID of the initial thread.
+- `query` ([String](../../sql-reference/data-types/string.md)) — Query string.
+- `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Query type. Possible values:
+    - 1 — Query was initiated by the client.
+    - 0 — Query was initiated by another query for distributed query execution.
+- `user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who initiated the current query.
+- `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query.
+- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that was used to make the query.
+- `port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The client port that was used to make the query.
+- `initial_user` ([String](../../sql-reference/data-types/string.md)) — Name of the user who ran the initial query (for distributed query execution).
+- `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution).
+- `initial_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address that the parent query was launched from.
+- `initial_port` ([UInt16](../../sql-reference/data-types/int-uint.md#uint-ranges)) — The client port that was used to make the parent query.
+- `interface` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Interface that the query was initiated from. Possible values:
+    - 1 — TCP.
+    - 2 — HTTP.
+- `os_user` ([String](../../sql-reference/data-types/string.md)) — OS username of the user who runs [clickhouse-client](../../interfaces/cli.md).
+- `client_hostname` ([String](../../sql-reference/data-types/string.md)) — Hostname of the client machine where the [clickhouse-client](../../interfaces/cli.md) or another TCP client is run.
+- `client_name` ([String](../../sql-reference/data-types/string.md)) — The [clickhouse-client](../../interfaces/cli.md) or another TCP client name.
+- `client_revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Revision of the [clickhouse-client](../../interfaces/cli.md) or another TCP client.
+- `client_version_major` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Major version of the [clickhouse-client](../../interfaces/cli.md) or another TCP client.
+- `client_version_minor` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Minor version of the [clickhouse-client](../../interfaces/cli.md) or another TCP client.
+- `client_version_patch` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Patch component of the [clickhouse-client](../../interfaces/cli.md) or another TCP client version.
+- `http_method` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — HTTP method that initiated the query. Possible values:
+    - 0 — The query was launched from the TCP interface.
+    - 1 — `GET` method was used.
+    - 2 — `POST` method was used.
+- `http_user_agent` ([String](../../sql-reference/data-types/string.md)) — The `UserAgent` header passed in the HTTP request.
+- `quota_key` ([String](../../sql-reference/data-types/string.md)) — The “quota key” specified in the [quotas](../../operations/quotas.md) setting (see `keyed`).
+- `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse revision.
+- `ProfileEvents` ([Map(String, UInt64)](../../sql-reference/data-types/array.md)) — ProfileEvents that measure different metrics for this thread. The description of them could be found in the table [system.events](#system_tables-events). + +**Example** + +``` sql + SELECT * FROM system.query_thread_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2020-09-11 +event_time: 2020-09-11 10:08:17 +event_time_microseconds: 2020-09-11 10:08:17.134042 +query_start_time: 2020-09-11 10:08:17 +query_start_time_microseconds: 2020-09-11 10:08:17.063150 +query_duration_ms: 70 +read_rows: 0 +read_bytes: 0 +written_rows: 1 +written_bytes: 12 +memory_usage: 4300844 +peak_memory_usage: 4300844 +thread_name: TCPHandler +thread_id: 638133 +master_thread_id: 638133 +query: INSERT INTO test1 VALUES +is_initial_query: 1 +user: default +query_id: 50a320fd-85a8-49b8-8761-98a86bcbacef +address: ::ffff:127.0.0.1 +port: 33452 +initial_user: default +initial_query_id: 50a320fd-85a8-49b8-8761-98a86bcbacef +initial_address: ::ffff:127.0.0.1 +initial_port: 33452 +interface: 1 +os_user: bharatnc +client_hostname: tower +client_name: ClickHouse +client_revision: 54437 +client_version_major: 20 +client_version_minor: 7 +client_version_patch: 2 +http_method: 0 +http_user_agent: +quota_key: +revision: 54440 +ProfileEvents: {'Query':1,'SelectQuery':1,'ReadCompressedBytes':36,'CompressedReadBufferBlocks':1,'CompressedReadBufferBytes':10,'IOBufferAllocs':1,'IOBufferAllocBytes':89,'ContextLock':15,'RWLockAcquiredReadLocks':1} +``` + +**See Also** + +- [system.query_log](../../operations/system-tables/query_log.md#system_tables-query_log) — Description of the `query_log` system table which contains common information about queries execution. +- [system.query_views_log](../../operations/system-tables/query_views_log.md#system_tables-query_views_log) — This table contains information about each view executed during a query. + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/query_thread_log) diff --git a/docs/en/reference/operations/system-tables/query_views_log.md b/docs/en/reference/operations/system-tables/query_views_log.md new file mode 100644 index 00000000000..5aa69522869 --- /dev/null +++ b/docs/en/reference/operations/system-tables/query_views_log.md @@ -0,0 +1,86 @@ +# query_views_log {#system_tables-query_views_log} + +Contains information about the dependent views executed when running a query, for example, the view type or the execution time. + +To start logging: + +1. Configure parameters in the [query_views_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_views_log) section. +2. Set [log_query_views](../../operations/settings/settings.md#settings-log-query-views) to 1. + +The flushing period of data is set in `flush_interval_milliseconds` parameter of the [query_views_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query_views_log) server settings section. To force flushing, use the [SYSTEM FLUSH LOGS](../../sql-reference/statements/system.md#query_language-system-flush_logs) query. + +ClickHouse does not delete data from the table automatically. See [Introduction](../../operations/system-tables/index.md#system-tables-introduction) for more details. + +You can use the [log_queries_probability](../../operations/settings/settings.md#log-queries-probability) setting to reduce the number of queries, registered in the `query_views_log` table. 
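+
+For example, a minimal way to try this out for the current session only (assuming the server-side `query_views_log` section is already configured) is to enable the setting with `SET`:
+
+``` sql
+-- Log dependent views for queries run in this session
+SET log_query_views = 1;
+
+-- Optionally sample the log to reduce its size (see log_queries_probability above)
+SET log_queries_probability = 0.5;
+```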
+
+Columns:
+
+- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the last event of the view happened.
+- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the view finished execution.
+- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the view finished execution with microseconds precision.
+- `view_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Duration of view execution (sum of its stages) in milliseconds.
+- `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution).
+- `view_name` ([String](../../sql-reference/data-types/string.md)) — Name of the view.
+- `view_uuid` ([UUID](../../sql-reference/data-types/uuid.md)) — UUID of the view.
+- `view_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of the view. Values:
+    - `'Default' = 1` — [Default views](../../sql-reference/statements/create/view.md#normal). Should not appear in this log.
+    - `'Materialized' = 2` — [Materialized views](../../sql-reference/statements/create/view.md#materialized).
+    - `'Live' = 3` — [Live views](../../sql-reference/statements/create/view.md#live-view).
+- `view_query` ([String](../../sql-reference/data-types/string.md)) — The query executed by the view.
+- `view_target` ([String](../../sql-reference/data-types/string.md)) — The name of the view target table.
+- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of read rows.
+- `read_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of read bytes.
+- `written_rows` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of written rows.
+- `written_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Number of written bytes.
+- `peak_memory_usage` ([Int64](../../sql-reference/data-types/int-uint.md)) — The maximum difference between the amount of allocated and freed memory in the context of this view.
+- `ProfileEvents` ([Map(String, UInt64)](../../sql-reference/data-types/map.md)) — ProfileEvents that measure different metrics. Their descriptions can be found in the table [system.events](../../operations/system-tables/events.md#system_tables-events).
+- `status` ([Enum8](../../sql-reference/data-types/enum.md)) — Status of the view. Values:
+    - `'QueryStart' = 1` — Successful start of the view execution. Should not appear in this log.
+    - `'QueryFinish' = 2` — Successful end of the view execution.
+    - `'ExceptionBeforeStart' = 3` — Exception before the start of the view execution.
+    - `'ExceptionWhileProcessing' = 4` — Exception during the view execution.
+- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — Code of an exception.
+- `exception` ([String](../../sql-reference/data-types/string.md)) — Exception message.
+- `stack_trace` ([String](../../sql-reference/data-types/string.md)) — [Stack trace](https://en.wikipedia.org/wiki/Stack_trace). An empty string if the query was completed successfully.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT * FROM system.query_views_log LIMIT 1 \G;
+```
+
+Result:
+
+``` text
+Row 1:
+──────
+event_date: 2021-06-22
+event_time: 2021-06-22 13:23:07
+event_time_microseconds: 2021-06-22 13:23:07.738221
+view_duration_ms: 0
+initial_query_id: c3a1ac02-9cad-479b-af54-9e9c0a7afd70
+view_name: default.matview_inner
+view_uuid: 00000000-0000-0000-0000-000000000000
+view_type: Materialized
+view_query: SELECT * FROM default.table_b
+view_target: default.`.inner.matview_inner`
+read_rows: 4
+read_bytes: 64
+written_rows: 2
+written_bytes: 32
+peak_memory_usage: 4196188
+ProfileEvents: {'FileOpen':2,'WriteBufferFromFileDescriptorWrite':2,'WriteBufferFromFileDescriptorWriteBytes':187,'IOBufferAllocs':3,'IOBufferAllocBytes':3145773,'FunctionExecute':3,'DiskWriteElapsedMicroseconds':13,'InsertedRows':2,'InsertedBytes':16,'SelectedRows':4,'SelectedBytes':48,'ContextLock':16,'RWLockAcquiredReadLocks':1,'RealTimeMicroseconds':698,'SoftPageFaults':4,'OSReadChars':463}
+status: QueryFinish
+exception_code: 0
+exception:
+stack_trace:
+```
+
+**See Also**
+
+- [system.query_log](../../operations/system-tables/query_log.md#system_tables-query_log) — Description of the `query_log` system table which contains common information about query execution.
+- [system.query_thread_log](../../operations/system-tables/query_thread_log.md#system_tables-query_thread_log) — This table contains information about each query execution thread.
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/query_views_log)
diff --git a/docs/en/reference/operations/system-tables/quota_limits.md b/docs/en/reference/operations/system-tables/quota_limits.md
new file mode 100644
index 00000000000..e1873ecfa92
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/quota_limits.md
@@ -0,0 +1,21 @@
+# quota_limits {#system_tables-quota_limits}
+
+Contains information about maximums for all intervals of all quotas. Any number of rows or zero can correspond to one quota.
+
+Columns:
+- `quota_name` ([String](../../sql-reference/data-types/string.md)) — Quota name.
+- `duration` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Length of the time interval for calculating resource consumption, in seconds.
+- `is_randomized_interval` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Logical value. It shows whether the interval is randomized. If the interval is not randomized, it always starts at the same time. For example, an interval of 1 minute always starts at an integer number of minutes (i.e. it can start at 11:20:00, but it never starts at 11:20:01), and an interval of one day always starts at midnight UTC. If the interval is randomized, the very first interval starts at a random time, and subsequent intervals start one after another. Values:
+    - `0` — Interval is not randomized.
+    - `1` — Interval is randomized.
+- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of queries.
+- `max_query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of select queries.
+- `max_query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of insert queries.
+- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of errors.
+- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of result rows.
+- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum RAM volume, in bytes, used to store a query result.
+- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of rows read from all tables and table functions that participated in queries.
+- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of bytes read from all tables and table functions that participated in queries.
+- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum query execution time, in seconds.
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/quota_limits)
diff --git a/docs/en/reference/operations/system-tables/quota_usage.md b/docs/en/reference/operations/system-tables/quota_usage.md
new file mode 100644
index 00000000000..ad9f9b8c44f
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/quota_usage.md
@@ -0,0 +1,32 @@
+# quota_usage {#system_tables-quota_usage}
+
+Quota usage by the current user: how much is used and how much is left.
+
+Columns:
+- `quota_name` ([String](../../sql-reference/data-types/string.md)) — Quota name.
+- `quota_key` ([String](../../sql-reference/data-types/string.md)) — Key value. For example, if keys = \[`ip address`\], `quota_key` may have the value ‘192.168.1.1’.
+- `start_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Start time for calculating resource consumption.
+- `end_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — End time for calculating resource consumption.
+- `duration` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Length of the time interval for calculating resource consumption, in seconds.
+- `queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of requests in this interval.
+- `query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of select requests in this interval.
+- `query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of insert requests in this interval.
+- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of requests.
+- `errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The number of queries that threw an exception.
+- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of errors.
+- `result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of rows given as a result.
+- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of result rows.
+- `result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — RAM volume in bytes used to store a query result.
+- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum RAM volume used to store a query result, in bytes.
+- `read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of source rows read from tables for running the query on all remote servers.
+- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of rows read from all tables and table functions that participated in queries.
+- `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of bytes read from all tables and table functions that participated in queries.
+- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of bytes read from all tables and table functions.
+- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — The total query execution time, in seconds (wall time).
+- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum query execution time, in seconds.
+
+## See Also {#see-also}
+
+- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement)
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/quota_usage)
diff --git a/docs/en/reference/operations/system-tables/quotas.md b/docs/en/reference/operations/system-tables/quotas.md
new file mode 100644
index 00000000000..0a435919b14
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/quotas.md
@@ -0,0 +1,28 @@
+# quotas {#system_tables-quotas}
+
+Contains information about [quotas](../../operations/quotas.md).
+
+Columns:
+- `name` ([String](../../sql-reference/data-types/string.md)) — Quota name.
+- `id` ([UUID](../../sql-reference/data-types/uuid.md)) — Quota ID.
+- `storage` ([String](../../sql-reference/data-types/string.md)) — Storage of quotas. Possible values: “users.xml” if the quota is configured in the users.xml file, “disk” if the quota is configured by an SQL query.
+- `keys` ([Array](../../sql-reference/data-types/array.md)([Enum8](../../sql-reference/data-types/enum.md))) — Key specifies how the quota should be shared. If two connections use the same quota and key, they share the same amounts of resources. Values:
+    - `[]` — All users share the same quota.
+    - `['user_name']` — Connections with the same user name share the same quota.
+    - `['ip_address']` — Connections from the same IP share the same quota.
+    - `['client_key']` — Connections with the same key share the same quota. A key must be explicitly provided by a client. When using [clickhouse-client](../../interfaces/cli.md), pass a key value in the `--quota_key` parameter, or use the `quota_key` parameter in the client configuration file. When using the HTTP interface, use the `X-ClickHouse-Quota` header.
+    - `['user_name', 'client_key']` — Connections with the same `client_key` share the same quota. If a key isn’t provided by a client, the quota is tracked for `user_name`.
+    - `['client_key', 'ip_address']` — Connections with the same `client_key` share the same quota. If a key isn’t provided by a client, the quota is tracked for `ip_address`.
+- `durations` ([Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Time interval lengths in seconds.
+- `apply_to_all` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Logical value. It shows which users the quota is applied to. Values:
+    - `0` — The quota applies to the users specified in `apply_to_list`.
+    - `1` — The quota applies to all users except those listed in `apply_to_except`.
+- `apply_to_list` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of user names/[roles](../../operations/access-rights.md#role-management) that the quota should be applied to.
+- `apply_to_except` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of user names/roles that the quota should not apply to.
+
+## See Also {#see-also}
+
+- [SHOW QUOTAS](../../sql-reference/statements/show.md#show-quotas-statement)
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/quotas)
+
diff --git a/docs/en/reference/operations/system-tables/quotas_usage.md b/docs/en/reference/operations/system-tables/quotas_usage.md
new file mode 100644
index 00000000000..43811a75187
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/quotas_usage.md
@@ -0,0 +1,35 @@
+# quotas_usage {#system_tables-quotas_usage}
+
+Quota usage by all users.
+
+Columns:
+- `quota_name` ([String](../../sql-reference/data-types/string.md)) — Quota name.
+- `quota_key` ([String](../../sql-reference/data-types/string.md)) — Key value.
+- `is_current` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Quota usage for the current user.
+- `start_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Start time for calculating resource consumption.
+- `end_time` ([Nullable](../../sql-reference/data-types/nullable.md)([DateTime](../../sql-reference/data-types/datetime.md))) — End time for calculating resource consumption.
+- `duration` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt32](../../sql-reference/data-types/int-uint.md))) — Length of the time interval for calculating resource consumption, in seconds.
+- `queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of requests in this interval.
+- `max_queries` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of requests.
+- `query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of select requests in this interval.
+- `max_query_selects` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of select requests.
+- `query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of insert requests in this interval.
+- `max_query_inserts` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of insert requests.
+- `errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The number of queries that threw an exception.
+- `max_errors` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of errors.
+- `result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of rows given as a result.
+- `max_result_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of result rows.
+- `result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — RAM volume in bytes used to store a query result.
+- `max_result_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum RAM volume used to store a query result, in bytes.
+- `read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of source rows read from tables for running the query on all remote servers.
+- `max_read_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of rows read from all tables and table functions that participated in queries.
+- `read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — The total number of bytes read from all tables and table functions that participated in queries.
+- `max_read_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Maximum number of bytes read from all tables and table functions.
+- `execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — The total query execution time, in seconds (wall time).
+- `max_execution_time` ([Nullable](../../sql-reference/data-types/nullable.md)([Float64](../../sql-reference/data-types/float.md))) — Maximum query execution time, in seconds.
+
+## See Also {#see-also}
+
+- [SHOW QUOTA](../../sql-reference/statements/show.md#show-quota-statement)
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/quotas_usage)
diff --git a/docs/en/reference/operations/system-tables/replicas.md b/docs/en/reference/operations/system-tables/replicas.md
new file mode 100644
index 00000000000..6ec0f184e15
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/replicas.md
@@ -0,0 +1,132 @@
+# replicas {#system_tables-replicas}
+
+Contains information and status for replicated tables residing on the local server.
+This table can be used for monitoring. The table contains a row for every Replicated\* table.
+ +Example: + +``` sql +SELECT * +FROM system.replicas +WHERE table = 'test_table' +FORMAT Vertical +``` + +``` text +Query id: dc6dcbcb-dc28-4df9-ae27-4354f5b3b13e + +Row 1: +─────── +database: db +table: test_table +engine: ReplicatedMergeTree +is_leader: 1 +can_become_leader: 1 +is_readonly: 0 +is_session_expired: 0 +future_parts: 0 +parts_to_check: 0 +zookeeper_path: /test/test_table +replica_name: r1 +replica_path: /test/test_table/replicas/r1 +columns_version: -1 +queue_size: 27 +inserts_in_queue: 27 +merges_in_queue: 0 +part_mutations_in_queue: 0 +queue_oldest_time: 2021-10-12 14:48:48 +inserts_oldest_time: 2021-10-12 14:48:48 +merges_oldest_time: 1970-01-01 03:00:00 +part_mutations_oldest_time: 1970-01-01 03:00:00 +oldest_part_to_get: 1_17_17_0 +oldest_part_to_merge_to: +oldest_part_to_mutate_to: +log_max_index: 206 +log_pointer: 207 +last_queue_update: 2021-10-12 14:50:08 +absolute_delay: 99 +total_replicas: 5 +active_replicas: 5 +last_queue_update_exception: +zookeeper_exception: +replica_is_active: {'r1':1,'r2':1} +``` + +Columns: + +- `database` (`String`) - Database name +- `table` (`String`) - Table name +- `engine` (`String`) - Table engine name +- `is_leader` (`UInt8`) - Whether the replica is the leader. + Multiple replicas can be leaders at the same time. A replica can be prevented from becoming a leader using the `merge_tree` setting `replicated_can_become_leader`. The leaders are responsible for scheduling background merges. + Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader. +- `can_become_leader` (`UInt8`) - Whether the replica can be a leader. +- `is_readonly` (`UInt8`) - Whether the replica is in read-only mode. + This mode is turned on if the config does not have sections with ZooKeeper, if an unknown error occurred when reinitializing sessions in ZooKeeper, and during session reinitialization in ZooKeeper. +- `is_session_expired` (`UInt8`) - the session with ZooKeeper has expired. Basically the same as `is_readonly`. +- `future_parts` (`UInt32`) - The number of data parts that will appear as the result of INSERTs or merges that haven’t been done yet. +- `parts_to_check` (`UInt32`) - The number of data parts in the queue for verification. A part is put in the verification queue if there is suspicion that it might be damaged. +- `zookeeper_path` (`String`) - Path to table data in ZooKeeper. +- `replica_name` (`String`) - Replica name in ZooKeeper. Different replicas of the same table have different names. +- `replica_path` (`String`) - Path to replica data in ZooKeeper. The same as concatenating ‘zookeeper_path/replicas/replica_path’. +- `columns_version` (`Int32`) - Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven’t made all of the ALTERs yet. +- `queue_size` (`UInt32`) - Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with `future_parts`. +- `inserts_in_queue` (`UInt32`) - Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. +- `merges_in_queue` (`UInt32`) - The number of merges waiting to be made. Sometimes merges are lengthy, so this value may be greater than zero for a long time. +- `part_mutations_in_queue` (`UInt32`) - The number of mutations waiting to be made. 
+- `queue_oldest_time` (`DateTime`) - If `queue_size` is greater than 0, shows when the oldest operation was added to the queue.
+- `inserts_oldest_time` (`DateTime`) - See `queue_oldest_time`
+- `merges_oldest_time` (`DateTime`) - See `queue_oldest_time`
+- `part_mutations_oldest_time` (`DateTime`) - See `queue_oldest_time`
+
+The next 4 columns have a non-zero value only when there is an active session with ZK.
+
+- `log_max_index` (`UInt64`) - Maximum entry number in the log of general activity.
+- `log_pointer` (`UInt64`) - Maximum entry number in the log of general activity that the replica copied to its execution queue, plus one. If `log_pointer` is much smaller than `log_max_index`, something is wrong.
+- `last_queue_update` (`DateTime`) - When the queue was updated last time.
+- `absolute_delay` (`UInt64`) - How much lag, in seconds, the current replica has.
+- `total_replicas` (`UInt8`) - The total number of known replicas of this table.
+- `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ZooKeeper (i.e., the number of functioning replicas).
+- `last_queue_update_exception` (`String`) - The exception message if the queue contains broken entries. Especially important when ClickHouse breaks backward compatibility between versions and log entries written by newer versions aren't parseable by old versions.
+- `zookeeper_exception` (`String`) - The last exception message received if an error occurred when fetching the info from ZooKeeper.
+- `replica_is_active` ([Map(String, UInt8)](../../sql-reference/data-types/map.md)) — Map between replica name and whether the replica is active.
+
+If you request all the columns, the table may work a bit slowly, since several reads from ZooKeeper are made for each row.
+If you do not request the last 4 columns (log_max_index, log_pointer, total_replicas, active_replicas), the table works quickly.
+
+For example, you can check that everything is working correctly like this:
+
+``` sql
+SELECT
+    database,
+    table,
+    is_leader,
+    is_readonly,
+    is_session_expired,
+    future_parts,
+    parts_to_check,
+    columns_version,
+    queue_size,
+    inserts_in_queue,
+    merges_in_queue,
+    log_max_index,
+    log_pointer,
+    total_replicas,
+    active_replicas
+FROM system.replicas
+WHERE
+    is_readonly
+    OR is_session_expired
+    OR future_parts > 20
+    OR parts_to_check > 10
+    OR queue_size > 20
+    OR inserts_in_queue > 10
+    OR log_max_index - log_pointer > 10
+    OR total_replicas < 2
+    OR active_replicas < total_replicas
+```
+
+If this query does not return anything, it means that everything is fine.
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/replicas)
+
diff --git a/docs/en/reference/operations/system-tables/replicated_fetches.md b/docs/en/reference/operations/system-tables/replicated_fetches.md
new file mode 100644
index 00000000000..438d1572109
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/replicated_fetches.md
@@ -0,0 +1,70 @@
+# replicated_fetches {#system_tables-replicated_fetches}
+
+Contains information about currently running background fetches.
+
+Columns:
+
+- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database.
+
+- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table.
+
+- `elapsed` ([Float64](../../sql-reference/data-types/float.md)) — The time elapsed (in seconds) since the currently running background fetch started.
+
+- `progress` ([Float64](../../sql-reference/data-types/float.md)) — The percentage of completed work from 0 to 1.
+
+- `result_part_name` ([String](../../sql-reference/data-types/string.md)) — The name of the part that will be formed as a result of the currently running background fetch.
+
+- `result_part_path` ([String](../../sql-reference/data-types/string.md)) — Absolute path to the part that will be formed as a result of the currently running background fetch.
+
+- `partition_id` ([String](../../sql-reference/data-types/string.md)) — ID of the partition.
+
+- `total_size_bytes_compressed` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The total size (in bytes) of the compressed data in the result part.
+
+- `bytes_read_compressed` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of compressed bytes read from the result part.
+
+- `source_replica_path` ([String](../../sql-reference/data-types/string.md)) — Absolute path to the source replica.
+
+- `source_replica_hostname` ([String](../../sql-reference/data-types/string.md)) — Hostname of the source replica.
+
+- `source_replica_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — Port number of the source replica.
+
+- `interserver_scheme` ([String](../../sql-reference/data-types/string.md)) — Name of the interserver scheme.
+
+- `URI` ([String](../../sql-reference/data-types/string.md)) — Uniform resource identifier.
+
+- `to_detached` ([UInt8](../../sql-reference/data-types/int-uint.md)) — The flag indicates whether the currently running background fetch is being performed using the `TO DETACHED` expression.
+
+- `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Thread identifier.
+
+**Example**
+
+``` sql
+SELECT * FROM system.replicated_fetches LIMIT 1 FORMAT Vertical;
+```
+
+``` text
+Row 1:
+──────
+database: default
+table: t
+elapsed: 7.243039876
+progress: 0.41832135995612835
+result_part_name: all_0_0_0
+result_part_path: /var/lib/clickhouse/store/700/70080a04-b2de-4adf-9fa5-9ea210e81766/all_0_0_0/
+partition_id: all
+total_size_bytes_compressed: 1052783726
+bytes_read_compressed: 440401920
+source_replica_path: /clickhouse/test/t/replicas/1
+source_replica_hostname: node1
+source_replica_port: 9009
+interserver_scheme: http
+URI: http://node1:9009/?endpoint=DataPartsExchange%3A%2Fclickhouse%2Ftest%2Ft%2Freplicas%2F1&part=all_0_0_0&client_protocol_version=4&compress=false
+to_detached: 0
+thread_id: 54
+```
+
+**See Also**
+
+- [Managing ReplicatedMergeTree Tables](../../sql-reference/statements/system.md#query-language-system-replicated)
+
+[Original article](https://clickhouse.com/docs/en/operations/system_tables/replicated_fetches)
diff --git a/docs/en/reference/operations/system-tables/replication_queue.md b/docs/en/reference/operations/system-tables/replication_queue.md
new file mode 100644
index 00000000000..a8a51162dae
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/replication_queue.md
@@ -0,0 +1,91 @@
+# replication_queue {#system_tables-replication_queue}
+
+Contains information about tasks from replication queues stored in ZooKeeper for tables in the `ReplicatedMergeTree` family.
+
+Columns:
+
+- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database.
+
+- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table.
+
+- `replica_name` ([String](../../sql-reference/data-types/string.md)) — Replica name in ZooKeeper. Different replicas of the same table have different names.
+
+- `position` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Position of the task in the queue.
+ +- `node_name` ([String](../../sql-reference/data-types/string.md)) — Node name in ZooKeeper. + +- `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue, one of: + + - `GET_PART` — Get the part from another replica. + - `ATTACH_PART` — Attach the part, possibly from our own replica (if found in the `detached` folder). You may think of it as a `GET_PART` with some optimizations as they're nearly identical. + - `MERGE_PARTS` — Merge the parts. + - `DROP_RANGE` — Delete the parts in the specified partition in the specified number range. + - `CLEAR_COLUMN` — NOTE: Deprecated. Drop specific column from specified partition. + - `CLEAR_INDEX` — NOTE: Deprecated. Drop specific index from specified partition. + - `REPLACE_RANGE` — Drop a certain range of parts and replace them with new ones. + - `MUTATE_PART` — Apply one or several mutations to the part. + - `ALTER_METADATA` — Apply alter modification according to global /metadata and /columns paths. + +- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. + +- `required_quorum` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of replicas waiting for the task to complete with confirmation of completion. This column is only relevant for the `GET_PARTS` task. + +- `source_replica` ([String](../../sql-reference/data-types/string.md)) — Name of the source replica. + +- `new_part_name` ([String](../../sql-reference/data-types/string.md)) — Name of the new part. + +- `parts_to_merge` ([Array](../../sql-reference/data-types/array.md) ([String](../../sql-reference/data-types/string.md))) — Names of parts to merge or update. + +- `is_detach` ([UInt8](../../sql-reference/data-types/int-uint.md)) — The flag indicates whether the `DETACH_PARTS` task is in the queue. + +- `is_currently_executing` ([UInt8](../../sql-reference/data-types/int-uint.md)) — The flag indicates whether a specific task is being performed right now. + +- `num_tries` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of failed attempts to complete the task. + +- `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text message about the last error that occurred (if any). + +- `last_attempt_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last attempted. + +- `num_postponed` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of postponed tasks. + +- `postpone_reason` ([String](../../sql-reference/data-types/string.md)) — The reason why the task was postponed. + +- `last_postpone_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last postponed. + +- `merge_type` ([String](../../sql-reference/data-types/string.md)) — Type of the current merge. Empty if it's a mutation. 
+ +**Example** + +``` sql +SELECT * FROM system.replication_queue LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +database: merge +table: visits_v2 +replica_name: mtgiga001-1t +position: 15 +node_name: queue-0009325559 +type: MERGE_PARTS +create_time: 2020-12-07 14:04:21 +required_quorum: 0 +source_replica: mtgiga001-1t +new_part_name: 20201130_121373_121384_2 +parts_to_merge: ['20201130_121373_121378_1','20201130_121379_121379_0','20201130_121380_121380_0','20201130_121381_121381_0','20201130_121382_121382_0','20201130_121383_121383_0','20201130_121384_121384_0'] +is_detach: 0 +is_currently_executing: 0 +num_tries: 36 +last_exception: Code: 226, e.displayText() = DB::Exception: Marks file '/opt/clickhouse/data/merge/visits_v2/tmp_fetch_20201130_121373_121384_2/CounterID.mrk' does not exist (version 20.8.7.15 (official build)) +last_attempt_time: 2020-12-08 17:35:54 +num_postponed: 0 +postpone_reason: +last_postpone_time: 1970-01-01 03:00:00 +``` + +**See Also** + +- [Managing ReplicatedMergeTree Tables](../../sql-reference/statements/system.md#query-language-system-replicated) + +[Original article](https://clickhouse.com/docs/en/operations/system_tables/replication_queue) diff --git a/docs/en/reference/operations/system-tables/role-grants.md b/docs/en/reference/operations/system-tables/role-grants.md new file mode 100644 index 00000000000..cb0c5bf0b0b --- /dev/null +++ b/docs/en/reference/operations/system-tables/role-grants.md @@ -0,0 +1,21 @@ +# role_grants + +Contains the role grants for users and roles. To add entries to this table, use `GRANT role TO user`. + +Columns: + +- `user_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — User name. + +- `role_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Role name. + +- `granted_role_name` ([String](../../sql-reference/data-types/string.md)) — Name of role granted to the `role_name` role. To grant one role to another one use `GRANT role1 TO role2`. + +- `granted_role_is_default` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `granted_role` is a default role. Possible values: + - 1 — `granted_role` is a default role. + - 0 — `granted_role` is not a default role. + +- `with_admin_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Flag that shows whether `granted_role` is a role with [ADMIN OPTION](../../sql-reference/statements/grant.md#admin-option-privilege) privilege. Possible values: + - 1 — The role has `ADMIN OPTION` privilege. + - 0 — The role without `ADMIN OPTION` privilege. + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/role-grants) diff --git a/docs/en/reference/operations/system-tables/roles.md b/docs/en/reference/operations/system-tables/roles.md new file mode 100644 index 00000000000..1f8fe349c7f --- /dev/null +++ b/docs/en/reference/operations/system-tables/roles.md @@ -0,0 +1,15 @@ +# roles {#system_tables-roles} + +Contains information about configured [roles](../../operations/access-rights.md#role-management). + +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — Role name. +- `id` ([UUID](../../sql-reference/data-types/uuid.md)) — Role ID. +- `storage` ([String](../../sql-reference/data-types/string.md)) — Path to the storage of roles. Configured in the `access_control_path` parameter. 
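+
+**Example**
+
+A minimal query sketch for listing the configured roles (the exact output depends on the roles defined on your server):
+
+``` sql
+SELECT name, id, storage FROM system.roles FORMAT Vertical;
+```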
+
+## See Also {#see-also}
+
+- [SHOW ROLES](../../sql-reference/statements/show.md#show-roles-statement)
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/roles)
diff --git a/docs/en/reference/operations/system-tables/row_policies.md b/docs/en/reference/operations/system-tables/row_policies.md
new file mode 100644
index 00000000000..2bff037751b
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/row_policies.md
@@ -0,0 +1,34 @@
+# row_policies {#system_tables-row_policies}
+
+Contains filters for one particular table, as well as a list of roles and/or users that should use this row policy.
+
+Columns:
+- `name` ([String](../../sql-reference/data-types/string.md)) — Name of a row policy.
+
+- `short_name` ([String](../../sql-reference/data-types/string.md)) — Short name of a row policy. Names of row policies are compound, for example: myfilter ON mydb.mytable. Here "myfilter ON mydb.mytable" is the name of the row policy, and "myfilter" is its short name.
+
+- `database` ([String](../../sql-reference/data-types/string.md)) — Database name.
+
+- `table` ([String](../../sql-reference/data-types/string.md)) — Table name.
+
+- `id` ([UUID](../../sql-reference/data-types/uuid.md)) — Row policy ID.
+
+- `storage` ([String](../../sql-reference/data-types/string.md)) — Name of the directory where the row policy is stored.
+
+- `select_filter` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Condition which is used to filter rows.
+
+- `is_restrictive` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the row policy restricts access to rows, see [CREATE ROW POLICY](../../sql-reference/statements/create/row-policy.md#create-row-policy-as). Values:
+    - `0` — The row policy is defined with `AS PERMISSIVE` clause.
+    - `1` — The row policy is defined with `AS RESTRICTIVE` clause.
+
+- `apply_to_all` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the row policy is set for all roles and/or users.
+
+- `apply_to_list` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of the roles and/or users to which the row policy is applied.
+
+- `apply_to_except` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — The row policy is applied to all roles and/or users except the listed ones.
+
+## See Also {#see-also}
+
+- [SHOW POLICIES](../../sql-reference/statements/show.md#show-policies-statement)
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/row_policies)
diff --git a/docs/en/reference/operations/system-tables/session_log.md b/docs/en/reference/operations/system-tables/session_log.md
new file mode 100644
index 00000000000..9ee7e294bfd
--- /dev/null
+++ b/docs/en/reference/operations/system-tables/session_log.md
@@ -0,0 +1,77 @@
+# session_log {#system_tables-session_log}
+
+Contains information about all successful and failed login and logout events.
+
+Columns:
+
+- `type` ([Enum8](../../sql-reference/data-types/enum.md)) — Login/logout result. Possible values:
+    - `LoginFailure` — Login error.
+    - `LoginSuccess` — Successful login.
+    - `Logout` — Logout from the system.
+- `auth_id` ([UUID](../../sql-reference/data-types/uuid.md)) — Authentication ID, which is a UUID that is automatically generated each time a user logs in.
+- `session_id` ([String](../../sql-reference/data-types/string.md)) — Session ID that is passed by client via [HTTP](../../interfaces/http.md) interface. +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Login/logout date. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Login/logout time. +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Login/logout starting time with microseconds precision. +- `user` ([String](../../sql-reference/data-types/string.md)) — User name. +- `auth_type` ([Enum8](../../sql-reference/data-types/enum.md)) — The authentication type. Possible values: + - `NO_PASSWORD` + - `PLAINTEXT_PASSWORD` + - `SHA256_PASSWORD` + - `DOUBLE_SHA1_PASSWORD` + - `LDAP` + - `KERBEROS` +- `profiles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — The list of profiles set for all roles and/or users. +- `roles` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — The list of roles to which the profile is applied. +- `settings` ([Array](../../sql-reference/data-types/array.md)([Tuple](../../sql-reference/data-types/tuple.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md), [String](../../sql-reference/data-types/string.md)))) — Settings that were changed when the client logged in/out. +- `client_address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — The IP address that was used to log in/out. +- `client_port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The client port that was used to log in/out. +- `interface` ([Enum8](../../sql-reference/data-types/enum.md)) — The interface from which the login was initiated. Possible values: + - `TCP` + - `HTTP` + - `gRPC` + - `MySQL` + - `PostgreSQL` +- `client_hostname` ([String](../../sql-reference/data-types/string.md)) — The hostname of the client machine where the [clickhouse-client](../../interfaces/cli.md) or another TCP client is run. +- `client_name` ([String](../../sql-reference/data-types/string.md)) — The `clickhouse-client` or another TCP client name. +- `client_revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Revision of the `clickhouse-client` or another TCP client. +- `client_version_major` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The major version of the `clickhouse-client` or another TCP client. +- `client_version_minor` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The minor version of the `clickhouse-client` or another TCP client. +- `client_version_patch` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Patch component of the `clickhouse-client` or another TCP client version. +- `failure_reason` ([String](../../sql-reference/data-types/string.md)) — The exception message containing the reason for the login/logout failure. 
+ +**Example** + +Query: + +``` sql +SELECT * FROM system.session_log LIMIT 1 FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +type: LoginSuccess +auth_id: 45e6bd83-b4aa-4a23-85e6-bd83b4aa1a23 +session_id: +event_date: 2021-10-14 +event_time: 2021-10-14 20:33:52 +event_time_microseconds: 2021-10-14 20:33:52.104247 +user: default +auth_type: PLAINTEXT_PASSWORD +profiles: ['default'] +roles: [] +settings: [('load_balancing','random'),('max_memory_usage','10000000000')] +client_address: ::ffff:127.0.0.1 +client_port: 38490 +interface: TCP +client_hostname: +client_name: ClickHouse client +client_revision: 54449 +client_version_major: 21 +client_version_minor: 10 +client_version_patch: 0 +failure_reason: +``` diff --git a/docs/en/reference/operations/system-tables/settings.md b/docs/en/reference/operations/system-tables/settings.md new file mode 100644 index 00000000000..ce6f3cd4724 --- /dev/null +++ b/docs/en/reference/operations/system-tables/settings.md @@ -0,0 +1,53 @@ +# settings {#system-tables-system-settings} + +Contains information about session settings for current user. + +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — Setting name. +- `value` ([String](../../sql-reference/data-types/string.md)) — Setting value. +- `changed` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether a setting is changed from its default value. +- `description` ([String](../../sql-reference/data-types/string.md)) — Short setting description. +- `min` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Minimum value of the setting, if any is set via [constraints](../../operations/settings/constraints-on-settings.md#constraints-on-settings). If the setting has no minimum value, contains [NULL](../../sql-reference/syntax.md#null-literal). +- `max` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Maximum value of the setting, if any is set via [constraints](../../operations/settings/constraints-on-settings.md#constraints-on-settings). If the setting has no maximum value, contains [NULL](../../sql-reference/syntax.md#null-literal). +- `readonly` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether the current user can change the setting: + - `0` — Current user can change the setting. + - `1` — Current user can’t change the setting. + +**Example** + +The following example shows how to get information about settings which name contains `min_i`. + +``` sql +SELECT * +FROM system.settings +WHERE name LIKE '%min_i%' +``` + +``` text +┌─name────────────────────────────────────────┬─value─────┬─changed─┬─description───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─min──┬─max──┬─readonly─┐ +│ min_insert_block_size_rows │ 1048576 │ 0 │ Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough. │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 0 │ +│ min_insert_block_size_bytes │ 268435456 │ 0 │ Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough. │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 0 │ +│ read_backoff_min_interval_between_events_ms │ 1000 │ 0 │ Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time. 
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 0 │ +└─────────────────────────────────────────────┴───────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────┴──────┴──────────┘ +``` + +Using of `WHERE changed` can be useful, for example, when you want to check: + +- Whether settings in configuration files are loaded correctly and are in use. +- Settings that changed in the current session. + + + +``` sql +SELECT * FROM system.settings WHERE changed AND name='load_balancing' +``` + +**See also** + +- [Settings](../../operations/settings/index.md#session-settings-intro) +- [Permissions for Queries](../../operations/settings/permissions-for-queries.md#settings_readonly) +- [Constraints on Settings](../../operations/settings/constraints-on-settings.md) +- [SHOW SETTINGS](../../sql-reference/statements/show.md#show-settings) statement + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/settings) diff --git a/docs/en/reference/operations/system-tables/settings_profile_elements.md b/docs/en/reference/operations/system-tables/settings_profile_elements.md new file mode 100644 index 00000000000..5a010d6239a --- /dev/null +++ b/docs/en/reference/operations/system-tables/settings_profile_elements.md @@ -0,0 +1,30 @@ +# settings_profile_elements {#system_tables-settings_profile_elements} + +Describes the content of the settings profile: + +- Сonstraints. +- Roles and users that the setting applies to. +- Parent settings profiles. + +Columns: +- `profile_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Setting profile name. + +- `user_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — User name. + +- `role_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Role name. + +- `index` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Sequential number of the settings profile element. + +- `setting_name` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Setting name. + +- `value` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Setting value. + +- `min` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — The minimum value of the setting. `NULL` if not set. + +- `max` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — The maximum value of the setting. NULL if not set. + +- `readonly` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges))) — Profile that allows only read queries. + +- `inherit_profile` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — A parent profile for this setting profile. `NULL` if not set. Setting profile will inherit all the settings' values and constraints (`min`, `max`, `readonly`) from its parent profiles. 
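+
+**Example**
+
+A minimal query sketch for inspecting the elements of a single profile (the profile name `default` here is only an illustration; use a profile that exists on your server):
+
+``` sql
+SELECT * FROM system.settings_profile_elements WHERE profile_name = 'default' FORMAT Vertical;
+```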
+ +[Original article](https://clickhouse.com/docs/en/operations/system-tables/settings_profile_elements) diff --git a/docs/en/reference/operations/system-tables/settings_profiles.md b/docs/en/reference/operations/system-tables/settings_profiles.md new file mode 100644 index 00000000000..ab2020b375d --- /dev/null +++ b/docs/en/reference/operations/system-tables/settings_profiles.md @@ -0,0 +1,24 @@ +# settings_profiles {#system_tables-settings_profiles} + +Contains properties of configured setting profiles. + +Columns: +- `name` ([String](../../sql-reference/data-types/string.md)) — Setting profile name. + +- `id` ([UUID](../../sql-reference/data-types/uuid.md)) — Setting profile ID. + +- `storage` ([String](../../sql-reference/data-types/string.md)) — Path to the storage of setting profiles. Configured in the `access_control_path` parameter. + +- `num_elements` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of elements for this profile in the `system.settings_profile_elements` table. + +- `apply_to_all` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows that the settings profile set for all roles and/or users. + +- `apply_to_list` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of the roles and/or users to which the setting profile is applied. + +- `apply_to_except` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — The setting profile is applied to all roles and/or users excepting of the listed ones. + +## See Also {#see-also} + +- [SHOW PROFILES](../../sql-reference/statements/show.md#show-profiles-statement) + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/settings_profiles) diff --git a/docs/en/reference/operations/system-tables/stack_trace.md b/docs/en/reference/operations/system-tables/stack_trace.md new file mode 100644 index 00000000000..2aa1c5af125 --- /dev/null +++ b/docs/en/reference/operations/system-tables/stack_trace.md @@ -0,0 +1,91 @@ +# stack_trace {#system-tables_stack_trace} + +Contains stack traces of all server threads. Allows developers to introspect the server state. + +To analyze stack frames, use the `addressToLine`, `addressToLineWithInlines`, `addressToSymbol` and `demangle` [introspection functions](../../sql-reference/functions/introspection.md). + +Columns: + +- `thread_name` ([String](../../sql-reference/data-types/string.md)) — Thread name. +- `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Thread identifier. +- `query_id` ([String](../../sql-reference/data-types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query_log](../system-tables/query_log.md) system table. +- `trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored. 
+ +**Example** + +Enabling introspection functions: + +``` sql +SET allow_introspection_functions = 1; +``` + +Getting symbols from ClickHouse object files: + +``` sql +WITH arrayMap(x -> demangle(addressToSymbol(x)), trace) AS all SELECT thread_name, thread_id, query_id, arrayStringConcat(all, '\n') AS res FROM system.stack_trace LIMIT 1 \G; +``` + +``` text +Row 1: +────── +thread_name: clickhouse-serv + +thread_id: 686 +query_id: 1a11f70b-626d-47c1-b948-f9c7b206395d +res: sigqueue +DB::StorageSystemStackTrace::fillData(std::__1::vector::mutable_ptr, std::__1::allocator::mutable_ptr > >&, DB::Context const&, DB::SelectQueryInfo const&) const +DB::IStorageSystemOneBlock::read(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&, DB::SelectQueryInfo const&, DB::Context const&, DB::QueryProcessingStage::Enum, unsigned long, unsigned int) +DB::InterpreterSelectQuery::executeFetchColumns(DB::QueryProcessingStage::Enum, DB::QueryPipeline&, std::__1::shared_ptr const&, std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) +DB::InterpreterSelectQuery::executeImpl(DB::QueryPipeline&, std::__1::shared_ptr const&, std::__1::optional) +DB::InterpreterSelectQuery::execute() +DB::InterpreterSelectWithUnionQuery::execute() +DB::executeQueryImpl(char const*, char const*, DB::Context&, bool, DB::QueryProcessingStage::Enum, bool, DB::ReadBuffer*) +DB::executeQuery(std::__1::basic_string, std::__1::allocator > const&, DB::Context&, bool, DB::QueryProcessingStage::Enum, bool) +DB::TCPHandler::runImpl() +DB::TCPHandler::run() +Poco::Net::TCPServerConnection::start() +Poco::Net::TCPServerDispatcher::run() +Poco::PooledThread::run() +Poco::ThreadImpl::runnableEntry(void*) +start_thread +__clone +``` + +Getting filenames and line numbers in ClickHouse source code: + +``` sql +WITH arrayMap(x -> addressToLine(x), trace) AS all, arrayFilter(x -> x LIKE '%/dbms/%', all) AS dbms SELECT thread_name, thread_id, query_id, arrayStringConcat(notEmpty(dbms) ? 
dbms : all, '\n') AS res FROM system.stack_trace LIMIT 1 \G; +``` + +``` text +Row 1: +────── +thread_name: clickhouse-serv + +thread_id: 686 +query_id: cad353e7-1c29-4b2e-949f-93e597ab7a54 +res: /lib/x86_64-linux-gnu/libc-2.27.so +/build/obj-x86_64-linux-gnu/../src/Storages/System/StorageSystemStackTrace.cpp:182 +/build/obj-x86_64-linux-gnu/../contrib/libcxx/include/vector:656 +/build/obj-x86_64-linux-gnu/../src/Interpreters/InterpreterSelectQuery.cpp:1338 +/build/obj-x86_64-linux-gnu/../src/Interpreters/InterpreterSelectQuery.cpp:751 +/build/obj-x86_64-linux-gnu/../contrib/libcxx/include/optional:224 +/build/obj-x86_64-linux-gnu/../src/Interpreters/InterpreterSelectWithUnionQuery.cpp:192 +/build/obj-x86_64-linux-gnu/../src/Interpreters/executeQuery.cpp:384 +/build/obj-x86_64-linux-gnu/../src/Interpreters/executeQuery.cpp:643 +/build/obj-x86_64-linux-gnu/../src/Server/TCPHandler.cpp:251 +/build/obj-x86_64-linux-gnu/../src/Server/TCPHandler.cpp:1197 +/build/obj-x86_64-linux-gnu/../contrib/poco/Net/src/TCPServerConnection.cpp:57 +/build/obj-x86_64-linux-gnu/../contrib/libcxx/include/atomic:856 +/build/obj-x86_64-linux-gnu/../contrib/poco/Foundation/include/Poco/Mutex_POSIX.h:59 +/build/obj-x86_64-linux-gnu/../contrib/poco/Foundation/include/Poco/AutoPtr.h:223 +/lib/x86_64-linux-gnu/libpthread-2.27.so +/lib/x86_64-linux-gnu/libc-2.27.so +``` + +**See Also** + +- [Introspection Functions](../../sql-reference/functions/introspection.md) — Which introspection functions are available and how to use them. +- [system.trace_log](../system-tables/trace_log.md) — Contains stack traces collected by the sampling query profiler. +- [arrayMap](../../sql-reference/functions/array-functions.md#array-map) — Description and usage example of the `arrayMap` function. +- [arrayFilter](../../sql-reference/functions/array-functions.md#array-filter) — Description and usage example of the `arrayFilter` function. diff --git a/docs/en/reference/operations/system-tables/storage_policies.md b/docs/en/reference/operations/system-tables/storage_policies.md new file mode 100644 index 00000000000..adbb2f8434e --- /dev/null +++ b/docs/en/reference/operations/system-tables/storage_policies.md @@ -0,0 +1,17 @@ +# storage_policies {#system_tables-storage_policies} + +Contains information about storage policies and volumes defined in the [server configuration](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). + +Columns: + +- `policy_name` ([String](../../sql-reference/data-types/string.md)) — Name of the storage policy. +- `volume_name` ([String](../../sql-reference/data-types/string.md)) — Volume name defined in the storage policy. +- `volume_priority` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Volume order number in the configuration, the data fills the volumes according this priority, i.e. data during inserts and merges is written to volumes with a lower priority (taking into account other rules: TTL, `max_data_part_size`, `move_factor`). +- `disks` ([Array(String)](../../sql-reference/data-types/array.md)) — Disk names, defined in the storage policy. +- `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit). +- `move_factor` ([Float64](../../sql-reference/data-types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of configuration parameter, ClickHouse start to move data to the next volume in order. 
+- `prefer_not_to_merge` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Value of the `prefer_not_to_merge` setting. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks.
+
+If the storage policy contains more than one volume, then information for each volume is stored in an individual row of the table.
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/storage_policies) diff --git a/docs/en/reference/operations/system-tables/table_engines.md b/docs/en/reference/operations/system-tables/table_engines.md new file mode 100644 index 00000000000..d3ac8da1d70 --- /dev/null +++ b/docs/en/reference/operations/system-tables/table_engines.md @@ -0,0 +1,38 @@ +# table_engines {#system-table-engines}
+
+Contains descriptions of the table engines supported by the server and their feature support information.
+
+This table contains the following columns (the column type is shown in brackets):
+
+- `name` (String) — The name of the table engine.
+- `supports_settings` (UInt8) — Flag that indicates if the table engine supports the `SETTINGS` clause.
+- `supports_skipping_indices` (UInt8) — Flag that indicates if the table engine supports [skipping indices](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-data_skipping-indexes).
+- `supports_ttl` (UInt8) — Flag that indicates if the table engine supports [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl).
+- `supports_sort_order` (UInt8) — Flag that indicates if the table engine supports the clauses `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` and `SAMPLE_BY`.
+- `supports_replication` (UInt8) — Flag that indicates if the table engine supports [data replication](../../engines/table-engines/mergetree-family/replication.md).
+- `supports_deduplication` (UInt8) — Flag that indicates if the table engine supports data deduplication.
+- `supports_parallel_insert` (UInt8) — Flag that indicates if the table engine supports parallel insert (see the [`max_insert_threads`](../../operations/settings/settings.md#settings-max-insert-threads) setting).
+ +Example: + +``` sql +SELECT * +FROM system.table_engines +WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree') +``` + +``` text +┌─name──────────────────────────┬─supports_settings─┬─supports_skipping_indices─┬─supports_sort_order─┬─supports_ttl─┬─supports_replication─┬─supports_deduplication─┬─supports_parallel_insert─┐ +│ MergeTree │ 1 │ 1 │ 1 │ 1 │ 0 │ 0 │ 1 │ +│ Kafka │ 1 │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ +│ ReplicatedCollapsingMergeTree │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +└───────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────────┴──────────────┴──────────────────────┴────────────────────────┴──────────────────────────┘ +``` + +**See also** + +- MergeTree family [query clauses](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) +- Kafka [settings](../../engines/table-engines/integrations/kafka.md#table_engine-kafka-creating-a-table) +- Join [settings](../../engines/table-engines/special/join.md#join-limitations-and-settings) + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/table_engines) diff --git a/docs/en/reference/operations/system-tables/tables.md b/docs/en/reference/operations/system-tables/tables.md new file mode 100644 index 00000000000..8286d51aed6 --- /dev/null +++ b/docs/en/reference/operations/system-tables/tables.md @@ -0,0 +1,125 @@ +# tables {#system-tables} + +Contains metadata of each table that the server knows about. + +[Detached](../../sql-reference/statements/detach.md) tables are not shown in `system.tables`. + +[Temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.tables` only in those session where they have been created. They are shown with the empty `database` field and with the `is_temporary` flag switched on. + +Columns: + +- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. + +- `name` ([String](../../sql-reference/data-types/string.md)) — Table name. + +- `engine` ([String](../../sql-reference/data-types/string.md)) — Table engine name (without parameters). + +- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - Flag that indicates whether the table is temporary. + +- `data_path` ([String](../../sql-reference/data-types/string.md)) - Path to the table data in the file system. + +- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - Path to the table metadata in the file system. + +- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - Time of latest modification of the table metadata. + +- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Database dependencies. + +- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Table dependencies ([MaterializedView](../../engines/table-engines/special/materializedview.md) tables based on the current table). + +- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - The query that was used to create the table. + +- `engine_full` ([String](../../sql-reference/data-types/string.md)) - Parameters of the table engine. + +- `as_select` ([String](../../sql-reference/data-types/string.md)) - `SELECT` query for view. + +- `partition_key` ([String](../../sql-reference/data-types/string.md)) - The partition key expression specified in the table. 
+
+- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - The sorting key expression specified in the table.
+
+- `primary_key` ([String](../../sql-reference/data-types/string.md)) - The primary key expression specified in the table.
+
+- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - The sampling key expression specified in the table.
+
+- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - The storage policy:
+
+    - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)
+    - [Distributed](../../engines/table-engines/special/distributed.md#distributed)
+
+- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows, if it is possible to quickly determine the exact number of rows in the table, otherwise `NULL` (including the underlying `Buffer` table).
+
+- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes, if it is possible to quickly determine the exact number of bytes for the table on storage, otherwise `NULL` (does not include any underlying storage).
+
+    - If the table stores data on disk, returns used space on disk (i.e. compressed).
+    - If the table stores data in memory, returns the approximate number of used bytes in memory.
+
+- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows INSERTed since server start (only for `Buffer` tables).
+
+- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes INSERTed since server start (only for `Buffer` tables).
+
+- `comment` ([String](../../sql-reference/data-types/string.md)) - The comment for the table.
+
+- `has_own_data` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Flag that indicates whether the table itself stores some data on disk or only accesses some other source.
+
+The `system.tables` table is used in the implementation of the `SHOW TABLES` query.
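+
+For example, the `total_bytes` column can be aggregated to get a rough per-database size estimate (an illustrative query; tables whose engines cannot quickly report a size are skipped because their `total_bytes` is `NULL`):
+
+``` sql
+SELECT
+    database,
+    formatReadableSize(sum(total_bytes)) AS size_on_storage
+FROM system.tables
+-- NULL total_bytes means the engine cannot quickly report a size
+WHERE total_bytes IS NOT NULL
+GROUP BY database
+ORDER BY sum(total_bytes) DESC;
+```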
+ +**Example** + +```sql +SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: base +name: t1 +uuid: 81b1c20a-b7c6-4116-a2ce-7583fb6b6736 +engine: MergeTree +is_temporary: 0 +data_paths: ['/var/lib/clickhouse/store/81b/81b1c20a-b7c6-4116-a2ce-7583fb6b6736/'] +metadata_path: /var/lib/clickhouse/store/461/461cf698-fd0b-406d-8c01-5d8fd5748a91/t1.sql +metadata_modification_time: 2021-01-25 19:14:32 +dependencies_database: [] +dependencies_table: [] +create_table_query: CREATE TABLE base.t1 (`n` UInt64) ENGINE = MergeTree ORDER BY n SETTINGS index_granularity = 8192 +engine_full: MergeTree ORDER BY n SETTINGS index_granularity = 8192 +as_select: SELECT database AS table_catalog +partition_key: +sorting_key: n +primary_key: n +sampling_key: +storage_policy: default +total_rows: 1 +total_bytes: 99 +lifetime_rows: ᴺᵁᴸᴸ +lifetime_bytes: ᴺᵁᴸᴸ +comment: +has_own_data: 0 + +Row 2: +────── +database: default +name: 53r93yleapyears +uuid: 00000000-0000-0000-0000-000000000000 +engine: MergeTree +is_temporary: 0 +data_paths: ['/var/lib/clickhouse/data/default/53r93yleapyears/'] +metadata_path: /var/lib/clickhouse/metadata/default/53r93yleapyears.sql +metadata_modification_time: 2020-09-23 09:05:36 +dependencies_database: [] +dependencies_table: [] +create_table_query: CREATE TABLE default.`53r93yleapyears` (`id` Int8, `febdays` Int8) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192 +engine_full: MergeTree ORDER BY id SETTINGS index_granularity = 8192 +as_select: SELECT name AS catalog_name +partition_key: +sorting_key: id +primary_key: id +sampling_key: +storage_policy: default +total_rows: 2 +total_bytes: 155 +lifetime_rows: ᴺᵁᴸᴸ +lifetime_bytes: ᴺᵁᴸᴸ +comment: +has_own_data: 0 +``` diff --git a/docs/en/reference/operations/system-tables/text_log.md b/docs/en/reference/operations/system-tables/text_log.md new file mode 100644 index 00000000000..e4967dc8d0b --- /dev/null +++ b/docs/en/reference/operations/system-tables/text_log.md @@ -0,0 +1,53 @@ +# text_log {#system_tables-text_log} + +Contains logging entries. The logging level which goes to this table can be limited to the `text_log.level` server setting. + +Columns: + +- `event_date` (Date) — Date of the entry. +- `event_time` (DateTime) — Time of the entry. +- `event_time_microseconds` (DateTime) — Time of the entry with microseconds precision. +- `microseconds` (UInt32) — Microseconds of the entry. +- `thread_name` (String) — Name of the thread from which the logging was done. +- `thread_id` (UInt64) — OS thread ID. +- `level` (`Enum8`) — Entry level. Possible values: + - `1` or `'Fatal'`. + - `2` or `'Critical'`. + - `3` or `'Error'`. + - `4` or `'Warning'`. + - `5` or `'Notice'`. + - `6` or `'Information'`. + - `7` or `'Debug'`. + - `8` or `'Trace'`. +- `query_id` (String) — ID of the query. +- `logger_name` (LowCardinality(String)) — Name of the logger (i.e. `DDLWorker`). +- `message` (String) — The message itself. +- `revision` (UInt32) — ClickHouse revision. +- `source_file` (LowCardinality(String)) — Source file from which the logging was done. +- `source_line` (UInt64) — Source line from which the logging was done. 
+ +**Example** + +``` sql +SELECT * FROM system.text_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2020-09-10 +event_time: 2020-09-10 11:23:07 +event_time_microseconds: 2020-09-10 11:23:07.871397 +microseconds: 871397 +thread_name: clickhouse-serv +thread_id: 564917 +level: Information +query_id: +logger_name: DNSCacheUpdater +message: Update period 15 seconds +revision: 54440 +source_file: /ClickHouse/src/Interpreters/DNSCacheUpdater.cpp; void DB::DNSCacheUpdater::start() +source_line: 45 +``` + + [Original article](https://clickhouse.com/docs/en/operations/system-tables/text_log) diff --git a/docs/en/reference/operations/system-tables/time_zones.md b/docs/en/reference/operations/system-tables/time_zones.md new file mode 100644 index 00000000000..899e115152f --- /dev/null +++ b/docs/en/reference/operations/system-tables/time_zones.md @@ -0,0 +1,30 @@ +# time_zones {#system-time_zones} + +Contains a list of time zones that are supported by the ClickHouse server. This list of timezones might vary depending on the version of ClickHouse. + +Columns: + +- `time_zone` (String) — List of supported time zones. + +**Example** + +``` sql +SELECT * FROM system.time_zones LIMIT 10 +``` + +``` text +┌─time_zone──────────┐ +│ Africa/Abidjan │ +│ Africa/Accra │ +│ Africa/Addis_Ababa │ +│ Africa/Algiers │ +│ Africa/Asmara │ +│ Africa/Asmera │ +│ Africa/Bamako │ +│ Africa/Bangui │ +│ Africa/Banjul │ +│ Africa/Bissau │ +└────────────────────┘ +``` + +[Original article](https://clickhouse.com/docs/en/operations/system-tables/time_zones) diff --git a/docs/en/reference/operations/system-tables/trace_log.md b/docs/en/reference/operations/system-tables/trace_log.md new file mode 100644 index 00000000000..ace5662e919 --- /dev/null +++ b/docs/en/reference/operations/system-tables/trace_log.md @@ -0,0 +1,57 @@ +# trace_log {#system_tables-trace_log} + +Contains stack traces collected by the sampling query profiler. + +ClickHouse creates this table when the [trace_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-trace_log) server configuration section is set. Also the [query_profiler_real_time_period_ns](../../operations/settings/settings.md#query_profiler_real_time_period_ns) and [query_profiler_cpu_time_period_ns](../../operations/settings/settings.md#query_profiler_cpu_time_period_ns) settings should be set. + +To analyze logs, use the `addressToLine`, `addressToLineWithInlines`, `addressToSymbol` and `demangle` introspection functions. + +Columns: + +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Date of sampling moment. + +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Timestamp of the sampling moment. + +- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Timestamp of the sampling moment with microseconds precision. + +- `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Timestamp of the sampling moment in nanoseconds. + +- `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse server build revision. + + When connecting to the server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server. + +- `trace_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Trace type: + + - `Real` represents collecting stack traces by wall-clock time. 
+ - `CPU` represents collecting stack traces by CPU time. + - `Memory` represents collecting allocations and deallocations when memory allocation exceeds the subsequent watermark. + - `MemorySample` represents collecting random allocations and deallocations. + +- `thread_number` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Thread identifier. + +- `query_id` ([String](../../sql-reference/data-types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query_log](#system_tables-query_log) system table. + +- `trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process. + +**Example** + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2020-09-10 +event_time: 2020-09-10 11:23:09 +event_time_microseconds: 2020-09-10 11:23:09.872924 +timestamp_ns: 1599762189872924510 +revision: 54440 +trace_type: Memory +thread_id: 564963 +query_id: +trace: [371912858,371912789,371798468,371799717,371801313,371790250,624462773,566365041,566440261,566445834,566460071,566459914,566459842,566459580,566459469,566459389,566459341,566455774,371993941,371988245,372158848,372187428,372187309,372187093,372185478,140222123165193,140222122205443] +size: 5244400 +``` + + [Original article](https://clickhouse.com/docs/en/operations/system-tables/trace_log) diff --git a/docs/en/reference/operations/system-tables/users.md b/docs/en/reference/operations/system-tables/users.md new file mode 100644 index 00000000000..95691f4497c --- /dev/null +++ b/docs/en/reference/operations/system-tables/users.md @@ -0,0 +1,34 @@ +# users {#system_tables-users} + +Contains a list of [user accounts](../../operations/access-rights.md#user-account-management) configured at the server. + +Columns: +- `name` ([String](../../sql-reference/data-types/string.md)) — User name. + +- `id` ([UUID](../../sql-reference/data-types/uuid.md)) — User ID. + +- `storage` ([String](../../sql-reference/data-types/string.md)) — Path to the storage of users. Configured in the `access_control_path` parameter. + +- `auth_type` ([Enum8](../../sql-reference/data-types/enum.md)('no_password' = 0,'plaintext_password' = 1, 'sha256_password' = 2, 'double_sha1_password' = 3)) — Shows the authentication type. There are multiple ways of user identification: with no password, with plain text password, with [SHA256](https://ru.wikipedia.org/wiki/SHA-2)-encoded password or with [double SHA-1](https://ru.wikipedia.org/wiki/SHA-1)-encoded password. + +- `auth_params` ([String](../../sql-reference/data-types/string.md)) — Authentication parameters in the JSON format depending on the `auth_type`. + +- `host_ip` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — IP addresses of hosts that are allowed to connect to the ClickHouse server. + +- `host_names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Names of hosts that are allowed to connect to the ClickHouse server. + +- `host_names_regexp` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Regular expression for host names that are allowed to connect to the ClickHouse server. 
+
+- `host_names_like` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Names of hosts that are allowed to connect to the ClickHouse server, set using the `LIKE` predicate.
+
+- `default_roles_all` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Shows whether all granted roles are set for the user by default.
+
+- `default_roles_list` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — List of granted roles provided by default.
+
+- `default_roles_except` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — All the granted roles set as default except the listed ones.
+
+## See Also {#see-also}
+
+- [SHOW USERS](../../sql-reference/statements/show.md#show-users-statement)
+
+[Original article](https://clickhouse.com/docs/en/operations/system-tables/users) diff --git a/docs/en/reference/operations/system-tables/zookeeper.md b/docs/en/reference/operations/system-tables/zookeeper.md new file mode 100644 index 00000000000..e8232483f6f --- /dev/null +++ b/docs/en/reference/operations/system-tables/zookeeper.md @@ -0,0 +1,75 @@ +# zookeeper {#system-zookeeper}
+
+Allows reading data from the ZooKeeper cluster defined in the config. The table does not exist if ZooKeeper is not configured.
+The query must have either a `path =` condition or a `path IN` condition set with the `WHERE` clause as shown below. This corresponds to the path of the children in ZooKeeper that you want to get data for.
+
+The query `SELECT * FROM system.zookeeper WHERE path = '/clickhouse'` outputs data for all children of the `/clickhouse` node.
+To output data for all root nodes, write `path = '/'`.
+If the path specified in `path` does not exist, an exception will be thrown.
+
+The query `SELECT * FROM system.zookeeper WHERE path IN ('/', '/clickhouse')` outputs data for all children of the `/` and `/clickhouse` nodes.
+If some path in the specified `path` collection does not exist, an exception will be thrown.
+This form can be used to run a batch of ZooKeeper path queries.
+
+Columns:
+
+- `name` (String) — The name of the node.
+- `path` (String) — The path to the node.
+- `value` (String) — Node value.
+- `dataLength` (Int32) — Size of the value.
+- `numChildren` (Int32) — Number of descendants.
+- `czxid` (Int64) — ID of the transaction that created the node.
+- `mzxid` (Int64) — ID of the transaction that last changed the node.
+- `pzxid` (Int64) — ID of the transaction that last deleted or added descendants.
+- `ctime` (DateTime) — Time of node creation.
+- `mtime` (DateTime) — Time of the last modification of the node.
+- `version` (Int32) — Node version: the number of times the node was changed.
+- `cversion` (Int32) — Number of added or removed descendants.
+- `aversion` (Int32) — Number of changes to the ACL.
+- `ephemeralOwner` (Int64) — For ephemeral nodes, the ID of the session that owns this node.
+ +Example: + +``` sql +SELECT * +FROM system.zookeeper +WHERE path = '/clickhouse/tables/01-08/visits/replicas' +FORMAT Vertical +``` + +``` text +Row 1: +────── +name: example01-08-1 +value: +czxid: 932998691229 +mzxid: 932998691229 +ctime: 2015-03-27 16:49:51 +mtime: 2015-03-27 16:49:51 +version: 0 +cversion: 47 +aversion: 0 +ephemeralOwner: 0 +dataLength: 0 +numChildren: 7 +pzxid: 987021031383 +path: /clickhouse/tables/01-08/visits/replicas + +Row 2: +────── +name: example01-08-2 +value: +czxid: 933002738135 +mzxid: 933002738135 +ctime: 2015-03-27 16:57:01 +mtime: 2015-03-27 16:57:01 +version: 0 +cversion: 37 +aversion: 0 +ephemeralOwner: 0 +dataLength: 0 +numChildren: 7 +pzxid: 987021252247 +path: /clickhouse/tables/01-08/visits/replicas +``` +[Original article](https://clickhouse.com/docs/en/operations/system-tables/zookeeper) diff --git a/docs/en/reference/operations/system-tables/zookeeper_log.md b/docs/en/reference/operations/system-tables/zookeeper_log.md new file mode 100644 index 00000000000..919c4245d5d --- /dev/null +++ b/docs/en/reference/operations/system-tables/zookeeper_log.md @@ -0,0 +1,129 @@ +# zookeeper_log {#system-zookeeper_log} + +This table contains information about the parameters of the request to the ZooKeeper server and the response from it. + +For requests, only columns with request parameters are filled in, and the remaining columns are filled with default values (`0` or `NULL`). When the response arrives, the data from the response is added to the other columns. + +Columns with request parameters: + +- `type` ([Enum](../../sql-reference/data-types/enum.md)) — Event type in the ZooKeeper client. Can have one of the following values: + - `Request` — The request has been sent. + - `Response` — The response was received. + - `Finalize` — The connection is lost, no response was received. +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the event happened. +- `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the event happened. +- `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address of ZooKeeper server that was used to make the request. +- `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The port of ZooKeeper server that was used to make the request. +- `session_id` ([Int64](../../sql-reference/data-types/int-uint.md)) — The session ID that the ZooKeeper server sets for each connection. +- `xid` ([Int32](../../sql-reference/data-types/int-uint.md)) — The ID of the request within the session. This is usually a sequential request number. It is the same for the request row and the paired `response`/`finalize` row. +- `has_watch` ([UInt8](../../sql-reference/data-types/int-uint.md)) — The request whether the [watch](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#ch_zkWatches) has been set. +- `op_num` ([Enum](../../sql-reference/data-types/enum.md)) — The type of request or response. +- `path` ([String](../../sql-reference/data-types/string.md)) — The path to the ZooKeeper node specified in the request, or an empty string if the request not requires specifying a path. +- `data` ([String](../../sql-reference/data-types/string.md)) — The data written to the ZooKeeper node (for the `SET` and `CREATE` requests — what the request wanted to write, for the response to the `GET` request — what was read) or an empty string. 
+- `is_ephemeral` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Whether the ZooKeeper node is being created as an [ephemeral](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#Ephemeral+Nodes) node.
+- `is_sequential` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Whether the ZooKeeper node is being created as a [sequential](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#Sequence+Nodes+--+Unique+Naming) node.
+- `version` ([Nullable(Int32)](../../sql-reference/data-types/nullable.md)) — The version of the ZooKeeper node that the request expects when executing. This is supported for `CHECK`, `SET`, `REMOVE` requests (`-1` if the request does not check the version, or `NULL` for other requests that do not support version checking).
+- `requests_size` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of requests included in the multi request (this is a special request that consists of several consecutive ordinary requests and executes them atomically). All requests included in the multi request will have the same `xid`.
+- `request_idx` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of the request included in the multi request (`0` for the multi request itself, then in order from `1`).
+
+Columns with request response parameters:
+
+- `zxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — ZooKeeper transaction ID. The serial number issued by the ZooKeeper server in response to a successfully executed request (`0` if the request was not executed/returned an error/the client does not know whether the request was executed).
+- `error` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — Error code. Can have many values, here are just some of them:
+    - `ZOK` — The request was executed successfully.
+    - `ZCONNECTIONLOSS` — The connection was lost.
+    - `ZOPERATIONTIMEOUT` — The request execution timeout has expired.
+    - `ZSESSIONEXPIRED` — The session has expired.
+    - `NULL` — The request is completed.
+- `watch_type` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — The type of the `watch` event (for responses with `op_num` = `Watch`), for the remaining responses: `NULL`.
+- `watch_state` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — The status of the `watch` event (for responses with `op_num` = `Watch`), for the remaining responses: `NULL`.
+- `path_created` ([String](../../sql-reference/data-types/string.md)) — The path to the created ZooKeeper node (for responses to the `CREATE` request); may differ from `path` if the node is created as a `sequential` node.
+- `stat_czxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The `zxid` of the change that caused this ZooKeeper node to be created.
+- `stat_mzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The `zxid` of the change that last modified this ZooKeeper node.
+- `stat_pzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The transaction ID of the change that last modified children of this ZooKeeper node.
+- `stat_version` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of changes to the data of this ZooKeeper node.
+- `stat_cversion` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of changes to the children of this ZooKeeper node.
+- `stat_dataLength` ([Int32](../../sql-reference/data-types/int-uint.md)) — The length of the data field of this ZooKeeper node.
+- `stat_numChildren` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of children of this ZooKeeper node. +- `children` ([Array(String)](../../sql-reference/data-types/array.md)) — The list of child ZooKeeper nodes (for responses to `LIST` request). + +**Example** + +Query: + +``` sql +SELECT * FROM system.zookeeper_log WHERE (session_id = '106662742089334927') AND (xid = '10858') FORMAT Vertical; +``` + +Result: + +``` text +Row 1: +────── +type: Request +event_date: 2021-08-09 +event_time: 2021-08-09 21:38:30.291792 +address: :: +port: 2181 +session_id: 106662742089334927 +xid: 10858 +has_watch: 1 +op_num: List +path: /clickhouse/task_queue/ddl +data: +is_ephemeral: 0 +is_sequential: 0 +version: ᴺᵁᴸᴸ +requests_size: 0 +request_idx: 0 +zxid: 0 +error: ᴺᵁᴸᴸ +watch_type: ᴺᵁᴸᴸ +watch_state: ᴺᵁᴸᴸ +path_created: +stat_czxid: 0 +stat_mzxid: 0 +stat_pzxid: 0 +stat_version: 0 +stat_cversion: 0 +stat_dataLength: 0 +stat_numChildren: 0 +children: [] + +Row 2: +────── +type: Response +event_date: 2021-08-09 +event_time: 2021-08-09 21:38:30.292086 +address: :: +port: 2181 +session_id: 106662742089334927 +xid: 10858 +has_watch: 1 +op_num: List +path: /clickhouse/task_queue/ddl +data: +is_ephemeral: 0 +is_sequential: 0 +version: ᴺᵁᴸᴸ +requests_size: 0 +request_idx: 0 +zxid: 16926267 +error: ZOK +watch_type: ᴺᵁᴸᴸ +watch_state: ᴺᵁᴸᴸ +path_created: +stat_czxid: 16925469 +stat_mzxid: 16925469 +stat_pzxid: 16926179 +stat_version: 0 +stat_cversion: 7 +stat_dataLength: 0 +stat_numChildren: 7 +children: ['query-0000000006','query-0000000005','query-0000000004','query-0000000003','query-0000000002','query-0000000001','query-0000000000'] +``` + +**See Also** + +- [ZooKeeper](../../operations/tips.md#zookeeper) +- [ZooKeeper guide](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html) diff --git a/docs/en/reference/operations/tips.md b/docs/en/reference/operations/tips.md new file mode 100644 index 00000000000..c727c636579 --- /dev/null +++ b/docs/en/reference/operations/tips.md @@ -0,0 +1,279 @@ +--- +sidebar_position: 58 +sidebar_label: Usage Recommendations +--- + +# Usage Recommendations {#usage-recommendations} + +## CPU Scaling Governor {#cpu-scaling-governor} + +Always use the `performance` scaling governor. The `on-demand` scaling governor works much worse with constantly high demand. + +``` bash +$ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor +``` + +## CPU Limitations {#cpu-limitations} + +Processors can overheat. Use `dmesg` to see if the CPU’s clock rate was limited due to overheating. +The restriction can also be set externally at the datacenter level. You can use `turbostat` to monitor it under a load. + +## RAM {#ram} + +For small amounts of data (up to ~200 GB compressed), it is best to use as much memory as the volume of data. +For large amounts of data and when processing interactive (online) queries, you should use a reasonable amount of RAM (128 GB or more) so the hot data subset will fit in the cache of pages. +Even for data volumes of ~50 TB per server, using 128 GB of RAM significantly improves query performance compared to 64 GB. + +Do not disable overcommit. The value `cat /proc/sys/vm/overcommit_memory` should be 0 or 1. Run + +``` bash +$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory +``` + +Use `perf top` to watch the time spent in the kernel for memory management. +Permanent huge pages also do not need to be allocated. 
+ +:::warning +If your system has less than 16 GB of RAM, you may experience various memory exceptions because default settings do not match this amount of memory. The recommended amount of RAM is 32 GB or more. You can use ClickHouse in a system with a small amount of RAM, even with 2 GB of RAM, but it requires additional tuning and can ingest at a low rate. +::: + +## Storage Subsystem {#storage-subsystem} + +If your budget allows you to use SSD, use SSD. +If not, use HDD. SATA HDDs 7200 RPM will do. + +Give preference to a lot of servers with local hard drives over a smaller number of servers with attached disk shelves. +But for storing archives with rare queries, shelves will work. + +## RAID {#raid} + +When using HDD, you can combine their RAID-10, RAID-5, RAID-6 or RAID-50. +For Linux, software RAID is better (with `mdadm`). We do not recommend using LVM. +When creating RAID-10, select the `far` layout. +If your budget allows, choose RAID-10. + +If you have more than 4 disks, use RAID-6 (preferred) or RAID-50, instead of RAID-5. +When using RAID-5, RAID-6 or RAID-50, always increase stripe_cache_size, since the default value is usually not the best choice. + +``` bash +$ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size +``` + +Calculate the exact number from the number of devices and the block size, using the formula: `2 * num_devices * chunk_size_in_bytes / 4096`. + +A block size of 64 KB is sufficient for most RAID configurations. The average clickhouse-server write size is approximately 1 MB (1024 KB), and thus the recommended stripe size is also 1 MB. The block size can be optimized if needed when set to 1 MB divided by the number of non-parity disks in the RAID array, such that each write is parallelized across all available non-parity disks. +Never set the block size too small or too large. + +You can use RAID-0 on SSD. +Regardless of RAID use, always use replication for data security. + +Enable NCQ with a long queue. For HDD, choose the CFQ scheduler, and for SSD, choose noop. Don’t reduce the ‘readahead’ setting. +For HDD, enable the write cache. + +Make sure that [fstrim](https://en.wikipedia.org/wiki/Trim_(computing)) is enabled for NVME and SSD disks in your OS (usually it's implemented using a cronjob or systemd service). + +## File System {#file-system} + +Ext4 is the most reliable option. Set the mount options `noatime`. +XFS should be avoided. It works mostly fine but there are some reports about lower performance. +Most other file systems should also work fine. + +Do not use compressed filesystems, because ClickHouse does compression on its own and better. +It's not recommended to use encrypted filesystems, because you can use builtin encryption in ClickHouse, which is better. + +## Linux Kernel {#linux-kernel} + +Don’t use an outdated Linux kernel. + +## Network {#network} + +If you are using IPv6, increase the size of the route cache. +The Linux kernel prior to 3.2 had a multitude of problems with IPv6 implementation. + +Use at least a 10 GB network, if possible. 1 Gb will also work, but it will be much worse for patching replicas with tens of terabytes of data, or for processing distributed queries with a large amount of intermediate data. + +## Huge Pages {#huge-pages} + +If you are using old Linux kernel, disable transparent huge pages. It interferes with memory allocators, which leads to significant performance degradation. +On newer Linux kernels transparent huge pages are alright. 
+ +``` bash +$ echo 'madvise' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled +``` + +## Hypervisor configuration + +If you are using OpenStack, set +``` +cpu_mode=host-passthrough +``` +in nova.conf. + +If you are using libvirt, set +``` + +``` +in XML configuration. + +This is important for ClickHouse to be able to get correct information with `cpuid` instruction. +Otherwise you may get `Illegal instruction` crashes when hypervisor is run on old CPU models. + +## ZooKeeper {#zookeeper} + +You are probably already using ZooKeeper for other purposes. You can use the same installation of ZooKeeper, if it isn’t already overloaded. + +It’s best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. + +You should never use manually written scripts to transfer data between different ZooKeeper clusters, because the result will be incorrect for sequential nodes. Never use the “zkcopy” utility for the same reason: https://github.com/ksprojects/zkcopy/issues/15 + +If you want to divide an existing ZooKeeper cluster into two, the correct way is to increase the number of its replicas and then reconfigure it as two independent clusters. + +Do not run ZooKeeper on the same servers as ClickHouse. Because ZooKeeper is very sensitive for latency and ClickHouse may utilize all available system resources. + +You can have ZooKeeper observers in an ensemble but ClickHouse servers should not interact with observers. + +Do not change `minSessionTimeout` setting, large values may affect ClickHouse restart stability. + +With the default settings, ZooKeeper is a time bomb: + +> The ZooKeeper server won’t delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator. + +This bomb must be defused. + +The ZooKeeper (3.5.1) configuration below is used in a large production environment: + +zoo.cfg: + +``` bash +# http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html + +# The number of milliseconds of each tick +tickTime=2000 +# The number of ticks that the initial +# synchronization phase can take +# This value is not quite motivated +initLimit=300 +# The number of ticks that can pass between +# sending a request and getting an acknowledgement +syncLimit=10 + +maxClientCnxns=2000 + +# It is the maximum value that client may request and the server will accept. +# It is Ok to have high maxSessionTimeout on server to allow clients to work with high session timeout if they want. +# But we request session timeout of 30 seconds by default (you can change it with session_timeout_ms in ClickHouse config). +maxSessionTimeout=60000000 +# the directory where the snapshot is stored. +dataDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/data +# Place the dataLogDir to a separate physical disc for better performance +dataLogDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/logs + +autopurge.snapRetainCount=10 +autopurge.purgeInterval=1 + + +# To avoid seeks ZooKeeper allocates space in the transaction log file in +# blocks of preAllocSize kilobytes. The default block size is 64M. One reason +# for changing the size of the blocks is to reduce the block size if snapshots +# are taken more often. (Also, see snapCount). +preAllocSize=131072 + +# Clients can submit requests faster than ZooKeeper can process them, +# especially if there are a lot of clients. 
To prevent ZooKeeper from running +# out of memory due to queued requests, ZooKeeper will throttle clients so that +# there is no more than globalOutstandingLimit outstanding requests in the +# system. The default limit is 1,000.ZooKeeper logs transactions to a +# transaction log. After snapCount transactions are written to a log file a +# snapshot is started and a new transaction log file is started. The default +# snapCount is 10,000. +snapCount=3000000 + +# If this option is defined, requests will be will logged to a trace file named +# traceFile.year.month.day. +#traceFile= + +# Leader accepts client connections. Default value is "yes". The leader machine +# coordinates updates. For higher update throughput at thes slight expense of +# read throughput the leader can be configured to not accept clients and focus +# on coordination. +leaderServes=yes + +standaloneEnabled=false +dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/zoo.cfg.dynamic +``` + +Java version: + +``` text +openjdk 11.0.5-shenandoah 2019-10-15 +OpenJDK Runtime Environment (build 11.0.5-shenandoah+10-adhoc.heretic.src) +OpenJDK 64-Bit Server VM (build 11.0.5-shenandoah+10-adhoc.heretic.src, mixed mode) +``` + +JVM parameters: + +``` bash +NAME=zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} +ZOOCFGDIR=/etc/$NAME/conf + +# TODO this is really ugly +# How to find out, which jars are needed? +# seems, that log4j requires the log4j.properties file to be in the classpath +CLASSPATH="$ZOOCFGDIR:/usr/build/classes:/usr/build/lib/*.jar:/usr/share/zookeeper-3.6.2/lib/audience-annotations-0.5.0.jar:/usr/share/zookeeper-3.6.2/lib/commons-cli-1.2.jar:/usr/share/zookeeper-3.6.2/lib/commons-lang-2.6.jar:/usr/share/zookeeper-3.6.2/lib/jackson-annotations-2.10.3.jar:/usr/share/zookeeper-3.6.2/lib/jackson-core-2.10.3.jar:/usr/share/zookeeper-3.6.2/lib/jackson-databind-2.10.3.jar:/usr/share/zookeeper-3.6.2/lib/javax.servlet-api-3.1.0.jar:/usr/share/zookeeper-3.6.2/lib/jetty-http-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-io-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-security-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-server-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-servlet-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jetty-util-9.4.24.v20191120.jar:/usr/share/zookeeper-3.6.2/lib/jline-2.14.6.jar:/usr/share/zookeeper-3.6.2/lib/json-simple-1.1.1.jar:/usr/share/zookeeper-3.6.2/lib/log4j-1.2.17.jar:/usr/share/zookeeper-3.6.2/lib/metrics-core-3.2.5.jar:/usr/share/zookeeper-3.6.2/lib/netty-buffer-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-codec-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-common-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-handler-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-resolver-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-transport-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-transport-native-epoll-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/netty-transport-native-unix-common-4.1.50.Final.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient_common-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient_hotspot-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/simpleclient_servlet-0.6.0.jar:/usr/share/zookeeper-3.6.2/lib/slf4j-api-1.7.25.jar:/usr/share/zookeeper-3.6.2/lib/slf4j-log4j12-1.7.25.jar:/usr/share/zookeeper-3.6.2/lib/snappy-java-1.1.7.jar:/usr/share/zookeeper-3.6.2/lib/zookeeper-3.6.2.jar:/usr/share/zookeeper-3.6.2/lib/zook
eeper-jute-3.6.2.jar:/usr/share/zookeeper-3.6.2/lib/zookeeper-prometheus-metrics-3.6.2.jar:/usr/share/zookeeper-3.6.2/etc" + +ZOOCFG="$ZOOCFGDIR/zoo.cfg" +ZOO_LOG_DIR=/var/log/$NAME +USER=zookeeper +GROUP=zookeeper +PIDDIR=/var/run/$NAME +PIDFILE=$PIDDIR/$NAME.pid +SCRIPTNAME=/etc/init.d/$NAME +JAVA=/usr/local/jdk-11/bin/java +ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" +ZOO_LOG4J_PROP="INFO,ROLLINGFILE" +JMXLOCALONLY=false +JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ + -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '}}' }} \ + -Xlog:safepoint,gc*=info,age*=debug:file=/var/log/$NAME/zookeeper-gc.log:time,level,tags:filecount=16,filesize=16M + -verbose:gc \ + -XX:+UseG1GC \ + -Djute.maxbuffer=8388608 \ + -XX:MaxGCPauseMillis=50" +``` + +Salt init: + +``` text +description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" + +start on runlevel [2345] +stop on runlevel [!2345] + +respawn + +limit nofile 8192 8192 + +pre-start script + [ -r "/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment" ] || exit 0 + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment + [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR + chown $USER:$GROUP $ZOO_LOG_DIR +end script + +script + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment + [ -r /etc/default/zookeeper ] && . /etc/default/zookeeper + if [ -z "$JMXDISABLE" ]; then + JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" + fi + exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} \ + -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ + -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG +end script +``` + +## Antivirus software {#antivirus-software} + +If you use antivirus software configure it to skip folders with Clickhouse datafiles (`/var/lib/clickhouse`) otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges. + +[Original article](https://clickhouse.com/docs/en/operations/tips/) diff --git a/docs/en/reference/operations/troubleshooting.md b/docs/en/reference/operations/troubleshooting.md new file mode 100644 index 00000000000..b67282c8aa1 --- /dev/null +++ b/docs/en/reference/operations/troubleshooting.md @@ -0,0 +1,144 @@ +--- +sidebar_position: 46 +sidebar_label: Troubleshooting +--- + +# Troubleshooting + +- [Installation](#troubleshooting-installation-errors) +- [Connecting to the server](#troubleshooting-accepts-no-connections) +- [Query processing](#troubleshooting-does-not-process-queries) +- [Efficiency of query processing](#troubleshooting-too-slow) + +## Installation {#troubleshooting-installation-errors} + +### You Cannot Get Deb Packages from ClickHouse Repository with Apt-get {#you-cannot-get-deb-packages-from-clickhouse-repository-with-apt-get} + +- Check firewall settings. +- If you cannot access the repository for any reason, download packages as described in the [install guide](../install.md) article and install them manually using the `sudo dpkg -i ` command. You will also need the `tzdata` package. + +## Connecting to the Server {#troubleshooting-accepts-no-connections} + +Possible issues: + +- The server is not running. +- Unexpected or wrong configuration parameters. 
+ +### Server Is Not Running {#server-is-not-running} + +**Check if server is runnnig** + +Command: + +``` bash +$ sudo service clickhouse-server status +``` + +If the server is not running, start it with the command: + +``` bash +$ sudo service clickhouse-server start +``` + +**Check logs** + +The main log of `clickhouse-server` is in `/var/log/clickhouse-server/clickhouse-server.log` by default. + +If the server started successfully, you should see the strings: + +- ` Application: starting up.` — Server started. +- ` Application: Ready for connections.` — Server is running and ready for connections. + +If `clickhouse-server` start failed with a configuration error, you should see the `` string with an error description. For example: + +``` text +2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused +``` + +If you do not see an error at the end of the file, look through the entire file starting from the string: + +``` text + Application: starting up. +``` + +If you try to start a second instance of `clickhouse-server` on the server, you see the following log: + +``` text +2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 +2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up +2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: +PID: 8510 +Started at: 2019-01-11 15:24:23 +Revision: 54413 + +2019.01.11 15:25:11.156673 [ 1 ] {} Application: DB::Exception: Cannot lock file ./status. Another server instance in same directory is already running. +2019.01.11 15:25:11.156682 [ 1 ] {} Application: shutting down +2019.01.11 15:25:11.156686 [ 1 ] {} Application: Uninitializing subsystem: Logging Subsystem +2019.01.11 15:25:11.156716 [ 2 ] {} BaseDaemon: Stop SignalListener thread +``` + +**See system.d logs** + +If you do not find any useful information in `clickhouse-server` logs or there aren’t any logs, you can view `system.d` logs using the command: + +``` bash +$ sudo journalctl -u clickhouse-server +``` + +**Start clickhouse-server in interactive mode** + +``` bash +$ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml +``` + +This command starts the server as an interactive app with standard parameters of the autostart script. In this mode `clickhouse-server` prints all the event messages in the console. + +### Configuration Parameters {#configuration-parameters} + +Check: + +- Docker settings. + + If you run ClickHouse in Docker in an IPv6 network, make sure that `network=host` is set. + +- Endpoint settings. + + Check [listen_host](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-listen_host) and [tcp_port](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) settings. + + ClickHouse server accepts localhost connections only by default. + +- HTTP protocol settings. + + Check protocol settings for the HTTP API. + +- Secure connection settings. + + Check: + + - The [tcp_port_secure](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) setting. + - Settings for [SSL certificates](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-openssl). + + Use proper parameters while connecting. 
For example, use the `port_secure` parameter with `clickhouse-client`.
+
+- User settings.
+
+    You might be using the wrong user name or password.
+
+## Query Processing {#troubleshooting-does-not-process-queries}
+
+If ClickHouse is not able to process the query, it sends an error description to the client. In `clickhouse-client` you get a description of the error in the console. If you are using the HTTP interface, ClickHouse sends the error description in the response body. For example:
+
+``` bash
+$ curl 'http://localhost:8123/' --data-binary "SELECT a"
+Code: 47, e.displayText() = DB::Exception: Unknown identifier: a. Note that there are no tables (FROM clause) in your query, context: required_names: 'a' source_tables: table_aliases: private_aliases: column_aliases: public_columns: 'a' masked_columns: array_join_columns: source_columns: , e.what() = DB::Exception
+```
+
+If you start `clickhouse-client` with the `--stacktrace` parameter, ClickHouse returns the server stack trace along with the error description.
+
+You might see a message about a broken connection. In this case, you can repeat the query. If the connection breaks every time you perform the query, check the server logs for errors.
+
+## Efficiency of Query Processing {#troubleshooting-too-slow}
+
+If you see that ClickHouse is working too slowly, you need to profile the load on the server resources and network for your queries.
+
+You can use the `clickhouse-benchmark` utility to profile queries. It shows the number of queries processed per second, the number of rows processed per second, and percentiles of query processing times.
diff --git a/docs/en/reference/operations/update.md b/docs/en/reference/operations/update.md
new file mode 100644
index 00000000000..fb5fb7803a9
--- /dev/null
+++ b/docs/en/reference/operations/update.md
@@ -0,0 +1,32 @@
+---
+sidebar_position: 47
+sidebar_label: ClickHouse Upgrade
+---
+
+# ClickHouse Upgrade {#clickhouse-upgrade}
+
+If ClickHouse was installed from `deb` packages, execute the following commands on the server:
+
+``` bash
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-client clickhouse-server
+$ sudo service clickhouse-server restart
+```
+
+If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method.
+
+:::note
+You can update multiple servers at once as long as there is no moment when all replicas of one shard are offline.
+:::
+
+To upgrade an older version of ClickHouse to a specific version, install the packages for that version explicitly. As an example, `xx.yy.a.b` is a current stable version. The latest stable version can be found [here](https://github.com/ClickHouse/ClickHouse/releases):
+
+```bash
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b
+$ sudo service clickhouse-server restart
+```
diff --git a/docs/en/reference/operations/utilities/clickhouse-benchmark.md b/docs/en/reference/operations/utilities/clickhouse-benchmark.md
new file mode 100644
index 00000000000..3a52ec92dc3
--- /dev/null
+++ b/docs/en/reference/operations/utilities/clickhouse-benchmark.md
@@ -0,0 +1,163 @@
+---
+sidebar_position: 61
+sidebar_label: clickhouse-benchmark
+---
+
+# clickhouse-benchmark
+
+Connects to a ClickHouse server and repeatedly sends the specified queries. 
+
+**Syntax**
+
+``` bash
+$ clickhouse-benchmark --query ["single query"] [keys]
+```
+
+or
+
+``` bash
+$ echo "single query" | clickhouse-benchmark [keys]
+```
+
+or
+
+``` bash
+$ clickhouse-benchmark [keys] <<< "single query"
+```
+
+If you want to send a set of queries, create a text file and place each query on a separate line in this file. For example:
+
+``` sql
+SELECT * FROM system.numbers LIMIT 10000000;
+SELECT 1;
+```
+
+Then pass this file to the standard input of `clickhouse-benchmark`:
+
+``` bash
+clickhouse-benchmark [keys] < queries_file;
+```
+
+## Keys {#clickhouse-benchmark-keys}
+
+- `--query=QUERY` — Query to execute. If this parameter is not passed, `clickhouse-benchmark` will read queries from standard input.
+- `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` sends simultaneously. Default value: 1.
+- `-d N`, `--delay=N` — Interval in seconds between intermediate reports (set 0 to disable reports). Default value: 1.
+- `-h HOST`, `--host=HOST` — Server host. Default value: `localhost`. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-h` keys.
+- `-p N`, `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys.
+- `-i N`, `--iterations=N` — Total number of queries. Default value: 0 (repeat forever).
+- `-r`, `--randomize` — Random order of query execution if there is more than one input query.
+- `-s`, `--secure` — Use a `TLS` connection.
+- `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled).
+- `--confidence=N` — Level of confidence for the T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student’s t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) to determine whether the two distributions aren’t different with the selected level of confidence.
+- `--cumulative` — Print cumulative data instead of data per interval.
+- `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`.
+- `--json=FILEPATH` — `JSON` output. When the key is set, `clickhouse-benchmark` outputs a report to the specified JSON file.
+- `--user=USERNAME` — ClickHouse user name. Default value: `default`.
+- `--password=PSWD` — ClickHouse user password. Default value: empty string.
+- `--stacktrace` — Stack traces output. When the key is set, `clickhouse-benchmark` outputs stack traces of exceptions.
+- `--stage=WORD` — Query processing stage at server. ClickHouse stops query processing and returns an answer to `clickhouse-benchmark` at the specified stage. Possible values: `complete`, `fetch_columns`, `with_mergeable_state`. Default value: `complete`.
+- `--help` — Shows the help message.
+
+If you want to apply some [settings](../../operations/settings/index.md) for queries, pass them as a key `--= SETTING_VALUE`. For example, `--max_memory_usage=1048576`.
+
+## Output {#clickhouse-benchmark-output}
+
+By default, `clickhouse-benchmark` reports for each `--delay` interval.
+
+Example of the report:
+
+``` text
+Queries executed: 10.
+
+localhost:9000, queries 10, QPS: 6.772, RPS: 67904487.440, MiB/s: 518.070, result RPS: 67721584.984, result MiB/s: 516.675.
+
+0.000% 0.145 sec. 
+10.000% 0.146 sec. +20.000% 0.146 sec. +30.000% 0.146 sec. +40.000% 0.147 sec. +50.000% 0.148 sec. +60.000% 0.148 sec. +70.000% 0.148 sec. +80.000% 0.149 sec. +90.000% 0.150 sec. +95.000% 0.150 sec. +99.000% 0.150 sec. +99.900% 0.150 sec. +99.990% 0.150 sec. +``` + +In the report you can find: + +- Number of queries in the `Queries executed:` field. + +- Status string containing (in order): + + - Endpoint of ClickHouse server. + - Number of processed queries. + - QPS: How many queries the server performed per second during a period specified in the `--delay` argument. + - RPS: How many rows the server reads per second during a period specified in the `--delay` argument. + - MiB/s: How many mebibytes the server reads per second during a period specified in the `--delay` argument. + - result RPS: How many rows placed by the server to the result of a query per second during a period specified in the `--delay` argument. + - result MiB/s. How many mebibytes placed by the server to the result of a query per second during a period specified in the `--delay` argument. + +- Percentiles of queries execution time. + +## Comparison Mode {#clickhouse-benchmark-comparison-mode} + +`clickhouse-benchmark` can compare performances for two running ClickHouse servers. + +To use the comparison mode, specify endpoints of both servers by two pairs of `--host`, `--port` keys. Keys matched together by position in arguments list, the first `--host` is matched with the first `--port` and so on. `clickhouse-benchmark` establishes connections to both servers, then sends queries. Each query addressed to a randomly selected server. The results are shown for each server separately. + +## Example {#clickhouse-benchmark-example} + +``` bash +$ echo "SELECT * FROM system.numbers LIMIT 10000000 OFFSET 10000000" | clickhouse-benchmark -i 10 +``` + +``` text +Loaded 1 queries. + +Queries executed: 6. + +localhost:9000, queries 6, QPS: 6.153, RPS: 123398340.957, MiB/s: 941.455, result RPS: 61532982.200, result MiB/s: 469.459. + +0.000% 0.159 sec. +10.000% 0.159 sec. +20.000% 0.159 sec. +30.000% 0.160 sec. +40.000% 0.160 sec. +50.000% 0.162 sec. +60.000% 0.164 sec. +70.000% 0.165 sec. +80.000% 0.166 sec. +90.000% 0.166 sec. +95.000% 0.167 sec. +99.000% 0.167 sec. +99.900% 0.167 sec. +99.990% 0.167 sec. + + + +Queries executed: 10. + +localhost:9000, queries 10, QPS: 6.082, RPS: 121959604.568, MiB/s: 930.478, result RPS: 60815551.642, result MiB/s: 463.986. + +0.000% 0.159 sec. +10.000% 0.159 sec. +20.000% 0.160 sec. +30.000% 0.163 sec. +40.000% 0.164 sec. +50.000% 0.165 sec. +60.000% 0.166 sec. +70.000% 0.166 sec. +80.000% 0.167 sec. +90.000% 0.167 sec. +95.000% 0.170 sec. +99.000% 0.172 sec. +99.900% 0.172 sec. +99.990% 0.172 sec. +``` + +[Original article](https://clickhouse.com/docs/en/operations/utilities/clickhouse-benchmark.md) diff --git a/docs/en/reference/operations/utilities/clickhouse-compressor.md b/docs/en/reference/operations/utilities/clickhouse-compressor.md new file mode 100644 index 00000000000..2f8f4794ba8 --- /dev/null +++ b/docs/en/reference/operations/utilities/clickhouse-compressor.md @@ -0,0 +1,28 @@ + +# clickhouse-compressor + +Simple program for data compression and decompression. 
+ +### Examples + +Compress data with LZ4: +``` +$ ./clickhouse-compressor < input_file > output_file +``` + +Decompress data from LZ4 format: +``` +$ ./clickhouse-compressor --decompress < input_file > output_file +``` + +Compress data with ZSTD at level 5: + +``` +$ ./clickhouse-compressor --codec 'ZSTD(5)' < input_file > output_file +``` + +Compress data with Delta of four bytes and ZSTD level 10. + +``` +$ ./clickhouse-compressor --codec 'Delta(4)' --codec 'ZSTD(10)' < input_file > output_file +``` diff --git a/docs/en/reference/operations/utilities/clickhouse-copier.md b/docs/en/reference/operations/utilities/clickhouse-copier.md new file mode 100644 index 00000000000..f152c177992 --- /dev/null +++ b/docs/en/reference/operations/utilities/clickhouse-copier.md @@ -0,0 +1,188 @@ +--- +sidebar_position: 59 +sidebar_label: clickhouse-copier +--- + +# clickhouse-copier + +Copies data from the tables in one cluster to tables in another (or the same) cluster. + +:::warning +To get a consistent copy, the data in the source tables and partitions should not change during the entire process. +::: + +You can run multiple `clickhouse-copier` instances on different servers to perform the same job. ZooKeeper is used for syncing the processes. + +After starting, `clickhouse-copier`: + +- Connects to ZooKeeper and receives: + + - Copying jobs. + - The state of the copying jobs. + +- It performs the jobs. + + Each running process chooses the “closest” shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary. + +`clickhouse-copier` tracks the changes in ZooKeeper and applies them on the fly. + +To reduce network traffic, we recommend running `clickhouse-copier` on the same server where the source data is located. + +## Running Clickhouse-copier {#running-clickhouse-copier} + +The utility should be run manually: + +``` bash +$ clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir +``` + +Parameters: + +- `daemon` — Starts `clickhouse-copier` in daemon mode. +- `config` — The path to the `zookeeper.xml` file with the parameters for the connection to ZooKeeper. +- `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`. +- `task-file` — Optional path to file with task configuration for initial upload to ZooKeeper. +- `task-upload-force` — Force upload `task-file` even if node already exists. +- `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched. + +## Format of Zookeeper.xml {#format-of-zookeeper-xml} + +``` xml + + + trace + 100M + 3 + + + + + 127.0.0.1 + 2181 + + + +``` + +## Configuration of Copying Tasks {#configuration-of-copying-tasks} + +``` xml + + + + + + + false + + 127.0.0.1 + 9000 + + + + ... + + + + ... + + + + + 2 + + + + 1 + + + + + 0 + + + + + 3 + + 1 + + + + + + + + source_cluster + test + hits + + + destination_cluster + test + hits2 + + + + ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/hits2', '{replica}') + PARTITION BY toMonday(date) + ORDER BY (CounterID, EventDate) + + + + jumpConsistentHash(intHash64(UserID), 2) + + + CounterID != 0 + + + + '2018-02-26' + '2018-03-05' + ... + + + + + + ... + + ... 
+ + +``` + +`clickhouse-copier` tracks the changes in `/task/path/description` and applies them on the fly. For instance, if you change the value of `max_workers`, the number of processes running tasks will also change. + +[Original article](https://clickhouse.com/docs/en/operations/utils/clickhouse-copier/) diff --git a/docs/en/reference/operations/utilities/clickhouse-format.md b/docs/en/reference/operations/utilities/clickhouse-format.md new file mode 100644 index 00000000000..219a170fc23 --- /dev/null +++ b/docs/en/reference/operations/utilities/clickhouse-format.md @@ -0,0 +1,109 @@ +# clickhouse-format + +Allows formatting input queries. + +Keys: + +- `--help` or`-h` — Produce help message. +- `--query` — Format queries of any length and complexity. +- `--hilite` — Add syntax highlight with ANSI terminal escape sequences. +- `--oneline` — Format in single line. +- `--quiet` or `-q` — Just check syntax, no output on success. +- `--multiquery` or `-n` — Allow multiple queries in the same file. +- `--obfuscate` — Obfuscate instead of formatting. +- `--seed ` — Seed arbitrary string that determines the result of obfuscation. +- `--backslash` — Add a backslash at the end of each line of the formatted query. Can be useful when you copy a query from web or somewhere else with multiple lines, and want to execute it in command line. + +## Examples {#examples} + +1. Formatting a query: + +```bash +$ clickhouse-format --query "select number from numbers(10) where number%2 order by number desc;" +``` + +Result: + +```text +SELECT number +FROM numbers(10) +WHERE number % 2 +ORDER BY number DESC +``` + +2. Highlighting and single line: + +```bash +$ clickhouse-format --oneline --hilite <<< "SELECT sum(number) FROM numbers(5);" +``` + +Result: + +```sql +SELECT sum(number) FROM numbers(5) +``` + +3. Multiqueries: + +```bash +$ clickhouse-format -n <<< "SELECT * FROM (SELECT 1 AS x UNION ALL SELECT 1 UNION DISTINCT SELECT 3);" +``` + +Result: + +```text +SELECT * +FROM +( + SELECT 1 AS x + UNION ALL + SELECT 1 + UNION DISTINCT + SELECT 3 +) +; +``` + +4. Obfuscating: + +```bash +$ clickhouse-format --seed Hello --obfuscate <<< "SELECT cost_first_screen BETWEEN a AND b, CASE WHEN x >= 123 THEN y ELSE NULL END;" +``` + +Result: + +```text +SELECT treasury_mammoth_hazelnut BETWEEN nutmeg AND span, CASE WHEN chive >= 116 THEN switching ELSE ANYTHING END; +``` + +Same query and another seed string: + +```bash +$ clickhouse-format --seed World --obfuscate <<< "SELECT cost_first_screen BETWEEN a AND b, CASE WHEN x >= 123 THEN y ELSE NULL END;" +``` + +Result: + +```text +SELECT horse_tape_summer BETWEEN folklore AND moccasins, CASE WHEN intestine >= 116 THEN nonconformist ELSE FORESTRY END; +``` + +5. Adding backslash: + +```bash +$ clickhouse-format --backslash <<< "SELECT * FROM (SELECT 1 AS x UNION ALL SELECT 1 UNION DISTINCT SELECT 3);" +``` + +Result: + +```text +SELECT * \ +FROM \ +( \ + SELECT 1 AS x \ + UNION ALL \ + SELECT 1 \ + UNION DISTINCT \ + SELECT 3 \ +) +``` diff --git a/docs/en/reference/operations/utilities/clickhouse-local.md b/docs/en/reference/operations/utilities/clickhouse-local.md new file mode 100644 index 00000000000..3c35ab933e2 --- /dev/null +++ b/docs/en/reference/operations/utilities/clickhouse-local.md @@ -0,0 +1,118 @@ +--- +sidebar_position: 60 +sidebar_label: clickhouse-local +--- + +# clickhouse-local + +The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. 
+
+Accepts data that represent tables and queries them using the [ClickHouse SQL dialect](../../sql-reference/index.md).
+
+`clickhouse-local` uses the same core as ClickHouse server, so it supports most of the features and the same set of formats and table engines.
+
+By default `clickhouse-local` does not have access to data on the same host, but it supports loading server configuration using the `--config-file` argument.
+
+:::warning
+It is not recommended to load production server configuration into `clickhouse-local` because data can be damaged in case of human error.
+:::
+
+For temporary data, a unique temporary data directory is created by default.
+
+## Usage {#usage}
+
+Basic usage:
+
+``` bash
+$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" \
+    --query "query"
+```
+
+Arguments:
+
+- `-S`, `--structure` — table structure for input data.
+- `-if`, `--input-format` — input format, `TSV` by default.
+- `-f`, `--file` — path to data, `stdin` by default.
+- `-q`, `--query` — queries to execute with `;` as the delimiter. You must specify either the `query` or the `queries-file` option.
+- `-qf`, `--queries-file` - file path with queries to execute. You must specify either the `query` or the `queries-file` option.
+- `-N`, `--table` — table name to put the output data in, `table` by default.
+- `-of`, `--format`, `--output-format` — output format, `TSV` by default.
+- `-d`, `--database` — default database, `_local` by default.
+- `--stacktrace` — whether to dump debug output in case of an exception.
+- `--echo` — print the query before execution.
+- `--verbose` — more details on query execution.
+- `--logger.console` — Log to console.
+- `--logger.log` — Log file name.
+- `--logger.level` — Log level.
+- `--ignore-error` — do not stop processing if a query failed.
+- `-c`, `--config-file` — path to a configuration file in the same format as for the ClickHouse server; by default the configuration is empty.
+- `--no-system-tables` — do not attach system tables.
+- `--help` — argument reference for `clickhouse-local`.
+- `-V`, `--version` — print version information and exit.
+
+There are also arguments for each ClickHouse configuration variable, which are more commonly used instead of `--config-file`.
+
+
+## Examples {#examples}
+
+``` bash
+$ echo -e "1,2\n3,4" | clickhouse-local --structure "a Int64, b Int64" \
+    --input-format "CSV" --query "SELECT * FROM table"
+Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec.
+1 2
+3 4
+```
+
+The previous example is equivalent to:
+
+``` bash
+$ echo -e "1,2\n3,4" | clickhouse-local --query "
+    CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin);
+    SELECT a, b FROM table;
+    DROP TABLE table"
+Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. 
+1 2
+3 4
+```
+
+You do not have to use `stdin` or the `--file` argument; you can open any number of files using the [`file` table function](../../sql-reference/table-functions/file.md):
+
+``` bash
+$ echo 1 | tee 1.tsv
+1
+
+$ echo 2 | tee 2.tsv
+2
+
+$ clickhouse-local --query "
+    select * from file('1.tsv', TSV, 'a int') t1
+    cross join file('2.tsv', TSV, 'b int') t2"
+1 2
+```
+
+Now let’s output the memory usage for each Unix user:
+
+Query:
+
+``` bash
+$ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \
+    | clickhouse-local --structure "user String, mem Float64" \
+        --query "SELECT user, round(sum(mem), 2) as memTotal
+            FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty"
+```
+
+Result:
+
+``` text
+Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
+┏━━━━━━━━━━┳━━━━━━━━━━┓
+┃ user     ┃ memTotal ┃
+┡━━━━━━━━━━╇━━━━━━━━━━┩
+│ bayonet  │    113.5 │
+├──────────┼──────────┤
+│ root     │      8.8 │
+├──────────┼──────────┤
+...
+```
+
+[Original article](https://clickhouse.com/docs/en/operations/utils/clickhouse-local/)
diff --git a/docs/en/reference/operations/utilities/clickhouse-obfuscator.md b/docs/en/reference/operations/utilities/clickhouse-obfuscator.md
new file mode 100644
index 00000000000..baa0f19dda9
--- /dev/null
+++ b/docs/en/reference/operations/utilities/clickhouse-obfuscator.md
@@ -0,0 +1,42 @@
+# clickhouse-obfuscator
+
+A simple tool for table data obfuscation.
+
+It reads an input table and produces an output table that retains some properties of the input but contains different data.
+It allows publishing almost real production data for use in benchmarks.
+
+It is designed to retain the following properties of data:
+- cardinalities of values (number of distinct values) for every column and every tuple of columns;
+- conditional cardinalities: number of distinct values of one column under the condition on the value of another column;
+- probability distributions of the absolute value of integers; the sign of signed integers; exponent and sign for floats;
+- probability distributions of the length of strings;
+- probability of zero values of numbers; empty strings and arrays, `NULL`s;
+
+- data compression ratio when compressed with LZ77 and entropy family of codecs;
+- continuity (magnitude of difference) of time values across the table; continuity of floating-point values;
+- date component of `DateTime` values;
+
+- UTF-8 validity of string values;
+- string values look natural.
+
+Most of the properties above are viable for performance testing: reading data, filtering, aggregation, and sorting will work at almost the same speed
+as on the original data due to the saved cardinalities, magnitudes, compression ratios, etc.
+
+It works in a deterministic fashion: you define a seed value and the transformation is determined by the input data and by the seed.
+Some transformations are one-to-one and could be reversed, so you need to use a large seed and keep it secret.
+
+It uses some cryptographic primitives to transform data, but it does not do so properly from a cryptographic point of view, so you should not consider the result secure unless you have another reason to. The result may retain some data you don't want to publish.
+
+
+It always leaves the numbers 0, 1, and -1, dates, lengths of arrays, and null flags exactly as in the source data.
+For example, if your table has a column `IsMobile` with values 0 and 1, the transformed data will have the same values.
+
+So, the user will be able to count the exact ratio of mobile traffic. 
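+
+As an illustration, a typical run reads a dump from standard input and writes the obfuscated dump to standard output. The column names, file names, and options below are only an assumed sketch (the option set mirrors the `clickhouse-local` style of arguments); check `clickhouse-obfuscator --help` for the exact parameters:
+
+``` bash
+# Hypothetical example: deterministically obfuscate a TSV dump with an explicit seed.
+$ clickhouse-obfuscator \
+    --seed "my-secret-seed" \
+    --input-format TSV --output-format TSV \
+    --structure 'UserID UInt64, URLDomain String, IsMobile UInt8' \
+    < hits.tsv > hits_obfuscated.tsv
+```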
+
+Let's give another example. Suppose you have some private data in your table, like user emails, and you do not want to publish any single email address.
+If your table is large enough and contains many different emails, and no email has a much higher frequency than all the others, it will anonymize all data. But if you have a small number of different values in a column, it can reproduce some of them.
+You should look at how the algorithm of this tool works and fine-tune its command-line parameters.
+
+This tool works well only with a reasonable amount of data (at least thousands of rows).
diff --git a/docs/en/reference/operations/utilities/index.md b/docs/en/reference/operations/utilities/index.md
new file mode 100644
index 00000000000..7fdc783f9c4
--- /dev/null
+++ b/docs/en/reference/operations/utilities/index.md
@@ -0,0 +1,15 @@
+---
+sidebar_position: 56
+sidebar_label: Utilities
+---
+
+# ClickHouse Utilities
+
+- [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without starting the ClickHouse server, similar to how `awk` does this.
+- [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster.
+- [clickhouse-benchmark](../../operations/utilities/clickhouse-benchmark.md) — Loads the server with custom queries and settings.
+- [clickhouse-format](../../operations/utilities/clickhouse-format.md) — Enables formatting input queries.
+- [ClickHouse obfuscator](../../operations/utilities/clickhouse-obfuscator.md) — Obfuscates data.
+- [ClickHouse compressor](../../operations/utilities/clickhouse-compressor.md) — Compresses and decompresses data.
+- [clickhouse-odbc-bridge](../../operations/utilities/odbc-bridge.md) — A proxy server for the ODBC driver.
+
diff --git a/docs/en/reference/operations/utilities/odbc-bridge.md b/docs/en/reference/operations/utilities/odbc-bridge.md
new file mode 100644
index 00000000000..e5967085c49
--- /dev/null
+++ b/docs/en/reference/operations/utilities/odbc-bridge.md
@@ -0,0 +1,38 @@
+# clickhouse-odbc-bridge
+
+A simple HTTP server that works like a proxy for the ODBC driver. The main motivation
+was possible segfaults or other faults in ODBC implementations, which can
+crash the whole clickhouse-server process.
+
+This tool works via HTTP, not via pipes, shared memory, or TCP because:
+- It's simpler to implement
+- It's simpler to debug
+- jdbc-bridge can be implemented in the same way
+
+## Usage
+
+`clickhouse-server` uses this tool inside the `odbc` table function and StorageODBC.
+However, it can also be used as a standalone tool from the command line with the following
+parameters in the POST request URL:
+- `connection_string` -- ODBC connection string.
+- `columns` -- columns in ClickHouse NamesAndTypesList format, name in backticks,
+  type as string. Name and type are space separated, rows are separated with
+  newlines.
+- `max_block_size` -- optional parameter, sets the maximum size of a single block.
+The query is sent in the POST body. The response is returned in RowBinary format. 
+ +## Example: + +```bash +$ clickhouse-odbc-bridge --http-port 9018 --daemon + +$ curl -d "query=SELECT PageID, ImpID, AdType FROM Keys ORDER BY PageID, ImpID" --data-urlencode "connection_string=DSN=ClickHouse;DATABASE=stat" --data-urlencode "sample_block=columns format version: 1 +3 columns: +\`PageID\` String +\`ImpID\` String +\`AdType\` String +" "http://localhost:9018/" > result.txt + +$ cat result.txt +12246623837185725195925621517 +``` diff --git a/docs/en/reference/sql-reference/_category_.yml b/docs/en/reference/sql-reference/_category_.yml new file mode 100644 index 00000000000..8c5a40b6a85 --- /dev/null +++ b/docs/en/reference/sql-reference/_category_.yml @@ -0,0 +1,4 @@ +position: 15 +label: 'SQL Reference' +collapsible: true +collapsed: true \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/combinators.md b/docs/en/reference/sql-reference/aggregate-functions/combinators.md new file mode 100644 index 00000000000..6a8c178919c --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/combinators.md @@ -0,0 +1,287 @@ +--- +sidebar_position: 37 +sidebar_label: Combinators +--- + +# Aggregate Function Combinators {#aggregate_functions_combinators} + +The name of an aggregate function can have a suffix appended to it. This changes the way the aggregate function works. + +## -If {#agg-functions-combinator-if} + +The suffix -If can be appended to the name of any aggregate function. In this case, the aggregate function accepts an extra argument – a condition (Uint8 type). The aggregate function processes only the rows that trigger the condition. If the condition was not triggered even once, it returns a default value (usually zeros or empty strings). + +Examples: `sumIf(column, cond)`, `countIf(cond)`, `avgIf(x, cond)`, `quantilesTimingIf(level1, level2)(x, cond)`, `argMinIf(arg, val, cond)` and so on. + +With conditional aggregate functions, you can calculate aggregates for several conditions at once, without using subqueries and `JOIN`s. For example, conditional aggregate functions can be used to implement the segment comparison functionality. + +## -Array {#agg-functions-combinator-array} + +The -Array suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the ‘Array(T)’ type (arrays) instead of ‘T’ type arguments. If the aggregate function accepts multiple arguments, this must be arrays of equal lengths. When processing arrays, the aggregate function works like the original aggregate function across all array elements. + +Example 1: `sumArray(arr)` - Totals all the elements of all ‘arr’ arrays. In this example, it could have been written more simply: `sum(arraySum(arr))`. + +Example 2: `uniqArray(arr)` – Counts the number of unique elements in all ‘arr’ arrays. This could be done an easier way: `uniq(arrayJoin(arr))`, but it’s not always possible to add ‘arrayJoin’ to a query. + +-If and -Array can be combined. However, ‘Array’ must come first, then ‘If’. Examples: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Due to this order, the ‘cond’ argument won’t be an array. + +## -Map {#agg-functions-combinator-map} + +The -Map suffix can be appended to any aggregate function. This will create an aggregate function which gets Map type as an argument, and aggregates values of each key of the map separately using the specified aggregate function. The result is also of a Map type. + +Examples: `sumMap(map(1,1))`, `avgMap(map('a', 1))`. 
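+
+To see a few of these combinators end to end, here is a small sketch that runs through `clickhouse-local` (any regular client works the same way); the expected output is noted in a comment and its formatting may differ between versions:
+
+``` bash
+# -If counts only the rows matching the condition, -Array aggregates over all array
+# elements, and -Map aggregates map values per key.
+$ clickhouse-local --query "
+    SELECT
+        countIf(number % 2 = 0)    AS even_numbers,
+        sumArray([number, 10])     AS sum_of_array_elements,
+        sumMap(map(number % 3, 1)) AS rows_per_remainder
+    FROM numbers(10)"
+# Expected output (TSV): 5   145   {0:4,1:3,2:3}
+```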
+ +## -SimpleState {#agg-functions-combinator-simplestate} + +If you apply this combinator, the aggregate function returns the same value but with a different type. This is a [SimpleAggregateFunction(...)](../../sql-reference/data-types/simpleaggregatefunction.md) that can be stored in a table to work with [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) tables. + +**Syntax** + +``` sql +SimpleState(x) +``` + +**Arguments** + +- `x` — Aggregate function parameters. + +**Returned values** + +The value of an aggregate function with the `SimpleAggregateFunction(...)` type. + +**Example** + +Query: + +``` sql +WITH anySimpleState(number) AS c SELECT toTypeName(c), c FROM numbers(1); +``` + +Result: + +``` text +┌─toTypeName(c)────────────────────────┬─c─┐ +│ SimpleAggregateFunction(any, UInt64) │ 0 │ +└──────────────────────────────────────┴───┘ +``` + +## -State {#agg-functions-combinator-state} + +If you apply this combinator, the aggregate function does not return the resulting value (such as the number of unique values for the [uniq](../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function), but an intermediate state of the aggregation (for `uniq`, this is the hash table for calculating the number of unique values). This is an `AggregateFunction(...)` that can be used for further processing or stored in a table to finish aggregating later. + +To work with these states, use: + +- [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engine. +- [finalizeAggregation](../../sql-reference/functions/other-functions.md#function-finalizeaggregation) function. +- [runningAccumulate](../../sql-reference/functions/other-functions.md#runningaccumulate) function. +- [-Merge](#aggregate_functions_combinators-merge) combinator. +- [-MergeState](#aggregate_functions_combinators-mergestate) combinator. + +## -Merge {#aggregate_functions_combinators-merge} + +If you apply this combinator, the aggregate function takes the intermediate aggregation state as an argument, combines the states to finish aggregation, and returns the resulting value. + +## -MergeState {#aggregate_functions_combinators-mergestate} + +Merges the intermediate aggregation states in the same way as the -Merge combinator. However, it does not return the resulting value, but an intermediate aggregation state, similar to the -State combinator. + +## -ForEach {#agg-functions-combinator-foreach} + +Converts an aggregate function for tables into an aggregate function for arrays that aggregates the corresponding array items and returns an array of results. For example, `sumForEach` for the arrays `[1, 2]`, `[3, 4, 5]`and`[6, 7]`returns the result `[10, 13, 5]` after adding together the corresponding array items. + +## -Distinct {#agg-functions-combinator-distinct} + +Every unique combination of arguments will be aggregated only once. Repeating values are ignored. +Examples: `sum(DISTINCT x)`, `groupArray(DISTINCT x)`, `corrStableDistinct(DISTINCT x, y)` and so on. + +## -OrDefault {#agg-functions-combinator-ordefault} + +Changes behavior of an aggregate function. + +If an aggregate function does not have input values, with this combinator it returns the default value for its return data type. Applies to the aggregate functions that can take empty input data. + +`-OrDefault` can be used with other combinators. + +**Syntax** + +``` sql +OrDefault(x) +``` + +**Arguments** + +- `x` — Aggregate function parameters. 
+ +**Returned values** + +Returns the default value of an aggregate function’s return type if there is nothing to aggregate. + +Type depends on the aggregate function used. + +**Example** + +Query: + +``` sql +SELECT avg(number), avgOrDefault(number) FROM numbers(0) +``` + +Result: + +``` text +┌─avg(number)─┬─avgOrDefault(number)─┐ +│ nan │ 0 │ +└─────────────┴──────────────────────┘ +``` + +Also `-OrDefault` can be used with another combinators. It is useful when the aggregate function does not accept the empty input. + +Query: + +``` sql +SELECT avgOrDefaultIf(x, x > 10) +FROM +( + SELECT toDecimal32(1.23, 2) AS x +) +``` + +Result: + +``` text +┌─avgOrDefaultIf(x, greater(x, 10))─┐ +│ 0.00 │ +└───────────────────────────────────┘ +``` + +## -OrNull {#agg-functions-combinator-ornull} + +Changes behavior of an aggregate function. + +This combinator converts a result of an aggregate function to the [Nullable](../../sql-reference/data-types/nullable.md) data type. If the aggregate function does not have values to calculate it returns [NULL](../../sql-reference/syntax.md#null-literal). + +`-OrNull` can be used with other combinators. + +**Syntax** + +``` sql +OrNull(x) +``` + +**Arguments** + +- `x` — Aggregate function parameters. + +**Returned values** + +- The result of the aggregate function, converted to the `Nullable` data type. +- `NULL`, if there is nothing to aggregate. + +Type: `Nullable(aggregate function return type)`. + +**Example** + +Add `-orNull` to the end of aggregate function. + +Query: + +``` sql +SELECT sumOrNull(number), toTypeName(sumOrNull(number)) FROM numbers(10) WHERE number > 10 +``` + +Result: + +``` text +┌─sumOrNull(number)─┬─toTypeName(sumOrNull(number))─┐ +│ ᴺᵁᴸᴸ │ Nullable(UInt64) │ +└───────────────────┴───────────────────────────────┘ +``` + +Also `-OrNull` can be used with another combinators. It is useful when the aggregate function does not accept the empty input. + +Query: + +``` sql +SELECT avgOrNullIf(x, x > 10) +FROM +( + SELECT toDecimal32(1.23, 2) AS x +) +``` + +Result: + +``` text +┌─avgOrNullIf(x, greater(x, 10))─┐ +│ ᴺᵁᴸᴸ │ +└────────────────────────────────┘ +``` + +## -Resample {#agg-functions-combinator-resample} + +Lets you divide data into groups, and then separately aggregates the data in those groups. Groups are created by splitting the values from one column into intervals. + +``` sql +Resample(start, end, step)(, resampling_key) +``` + +**Arguments** + +- `start` — Starting value of the whole required interval for `resampling_key` values. +- `stop` — Ending value of the whole required interval for `resampling_key` values. The whole interval does not include the `stop` value `[start, stop)`. +- `step` — Step for separating the whole interval into subintervals. The `aggFunction` is executed over each of those subintervals independently. +- `resampling_key` — Column whose values are used for separating data into intervals. +- `aggFunction_params` — `aggFunction` parameters. + +**Returned values** + +- Array of `aggFunction` results for each subinterval. + +**Example** + +Consider the `people` table with the following data: + +``` text +┌─name───┬─age─┬─wage─┐ +│ John │ 16 │ 10 │ +│ Alice │ 30 │ 15 │ +│ Mary │ 35 │ 8 │ +│ Evelyn │ 48 │ 11.5 │ +│ David │ 62 │ 9.9 │ +│ Brian │ 60 │ 16 │ +└────────┴─────┴──────┘ +``` + +Let’s get the names of the people whose age lies in the intervals of `[30,60)` and `[60,75)`. Since we use integer representation for age, we get ages in the `[30, 59]` and `[60,74]` intervals. 
+ +To aggregate names in an array, we use the [groupArray](../../sql-reference/aggregate-functions/reference/grouparray.md#agg_function-grouparray) aggregate function. It takes one argument. In our case, it’s the `name` column. The `groupArrayResample` function should use the `age` column to aggregate names by age. To define the required intervals, we pass the `30, 75, 30` arguments into the `groupArrayResample` function. + +``` sql +SELECT groupArrayResample(30, 75, 30)(name, age) FROM people +``` + +``` text +┌─groupArrayResample(30, 75, 30)(name, age)─────┐ +│ [['Alice','Mary','Evelyn'],['David','Brian']] │ +└───────────────────────────────────────────────┘ +``` + +Consider the results. + +`Jonh` is out of the sample because he’s too young. Other people are distributed according to the specified age intervals. + +Now let’s count the total number of people and their average wage in the specified age intervals. + +``` sql +SELECT + countResample(30, 75, 30)(name, age) AS amount, + avgResample(30, 75, 30)(wage, age) AS avg_wage +FROM people +``` + +``` text +┌─amount─┬─avg_wage──────────────────┐ +│ [3,2] │ [11.5,12.949999809265137] │ +└────────┴───────────────────────────┘ +``` + diff --git a/docs/en/reference/sql-reference/aggregate-functions/index.md b/docs/en/reference/sql-reference/aggregate-functions/index.md new file mode 100644 index 00000000000..1e6cc0f88c2 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/index.md @@ -0,0 +1,60 @@ +--- +sidebar_label: Aggregate Functions +sidebar_position: 33 +--- + +# Aggregate Functions + +Aggregate functions work in the [normal](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) way as expected by database experts. + +ClickHouse also supports: + +- [Parametric aggregate functions](../../sql-reference/aggregate-functions/parametric-functions.md#aggregate_functions_parametric), which accept other parameters in addition to columns. +- [Combinators](../../sql-reference/aggregate-functions/combinators.md#aggregate_functions_combinators), which change the behavior of aggregate functions. + + +## NULL Processing {#null-processing} + +During aggregation, all `NULL`s are skipped. + +**Examples:** + +Consider this table: + +``` text +┌─x─┬────y─┐ +│ 1 │ 2 │ +│ 2 │ ᴺᵁᴸᴸ │ +│ 3 │ 2 │ +│ 3 │ 3 │ +│ 3 │ ᴺᵁᴸᴸ │ +└───┴──────┘ +``` + +Let’s say you need to total the values in the `y` column: + +``` sql +SELECT sum(y) FROM t_null_big +``` + +```text +┌─sum(y)─┐ +│ 7 │ +└────────┘ +``` + +Now you can use the `groupArray` function to create an array from the `y` column: + +``` sql +SELECT groupArray(y) FROM t_null_big +``` + +``` text +┌─groupArray(y)─┐ +│ [2,2,3] │ +└───────────────┘ +``` + +`groupArray` does not include `NULL` in the resulting array. + + diff --git a/docs/en/reference/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/reference/sql-reference/aggregate-functions/parametric-functions.md new file mode 100644 index 00000000000..7708bcb8129 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/parametric-functions.md @@ -0,0 +1,766 @@ +--- +sidebar_position: 38 +sidebar_label: Parametric +--- + +# Parametric Aggregate Functions {#aggregate_functions_parametric} + +Some aggregate functions can accept not only argument columns (used for compression), but a set of parameters – constants for initialization. The syntax is two pairs of brackets instead of one. The first is for parameters, and the second is for arguments. 
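+
+As a quick illustration of the two-bracket syntax (run here through `clickhouse-local` purely for convenience; `quantiles` is just one familiar parametric function):
+
+``` bash
+# First pair of brackets: parameters (the quantile levels).
+# Second pair of brackets: arguments (the column to aggregate).
+$ clickhouse-local --query "SELECT quantiles(0.5, 0.9)(number) FROM numbers(101)"
+# Expected output (approximately): [50,90]
+```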
+ +## histogram {#histogram} + +Calculates an adaptive histogram. It does not guarantee precise results. + +``` sql +histogram(number_of_bins)(values) +``` + +The functions uses [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). The borders of histogram bins are adjusted as new data enters a function. In common case, the widths of bins are not equal. + +**Arguments** + +`values` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in input values. + +**Parameters** + +`number_of_bins` — Upper limit for the number of bins in the histogram. The function automatically calculates the number of bins. It tries to reach the specified number of bins, but if it fails, it uses fewer bins. + +**Returned values** + +- [Array](../../sql-reference/data-types/array.md) of [Tuples](../../sql-reference/data-types/tuple.md) of the following format: + + ``` + [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] + ``` + + - `lower` — Lower bound of the bin. + - `upper` — Upper bound of the bin. + - `height` — Calculated height of the bin. + +**Example** + +``` sql +SELECT histogram(5)(number + 1) +FROM ( + SELECT * + FROM system.numbers + LIMIT 20 +) +``` + +``` text +┌─histogram(5)(plus(number, 1))───────────────────────────────────────────┐ +│ [(1,4.5,4),(4.5,8.5,4),(8.5,12.75,4.125),(12.75,17,4.625),(17,20,3.25)] │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +You can visualize a histogram with the [bar](../../sql-reference/functions/other-functions.md#function-bar) function, for example: + +``` sql +WITH histogram(5)(rand() % 100) AS hist +SELECT + arrayJoin(hist).3 AS height, + bar(height, 0, 6, 5) AS bar +FROM +( + SELECT * + FROM system.numbers + LIMIT 20 +) +``` + +``` text +┌─height─┬─bar───┐ +│ 2.125 │ █▋ │ +│ 3.25 │ ██▌ │ +│ 5.625 │ ████▏ │ +│ 5.625 │ ████▏ │ +│ 3.375 │ ██▌ │ +└────────┴───────┘ +``` + +In this case, you should remember that you do not know the histogram bin borders. + +## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} + +Checks whether the sequence contains an event chain that matches the pattern. + +``` sql +sequenceMatch(pattern)(timestamp, cond1, cond2, ...) +``` + +:::warning +Events that occur at the same second may lay in the sequence in an undefined order affecting the result. +::: + +**Arguments** + +- `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../sql-reference/data-types/int-uint.md) data types. + +- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn’t described in a condition, the function skips them. + +**Parameters** + +- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). + +**Returned values** + +- 1, if the pattern is matched. +- 0, if the pattern isn’t matched. + +Type: `UInt8`. + + +**Pattern syntax** + +- `(?N)` — Matches the condition argument at position `N`. Conditions are numbered in the `[1, 32]` range. For example, `(?1)` matches the argument passed to the `cond1` parameter. + +- `.*` — Matches any number of events. You do not need conditional arguments to match this element of the pattern. 
+ +- `(?t operator value)` — Sets the time in seconds that should separate two events. For example, pattern `(?1)(?t>1800)(?2)` matches events that occur more than 1800 seconds from each other. An arbitrary number of any events can lay between these events. You can use the `>=`, `>`, `<`, `<=`, `==` operators. + +**Examples** + +Consider data in the `t` table: + +``` text +┌─time─┬─number─┐ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 3 │ 2 │ +└──────┴────────┘ +``` + +Perform the query: + +``` sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t +``` + +``` text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2))─┐ +│ 1 │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +The function found the event chain where number 2 follows number 1. It skipped number 3 between them, because the number is not described as an event. If we want to take this number into account when searching for the event chain given in the example, we should make a condition for it. + +``` sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM t +``` + +``` text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 3))─┐ +│ 0 │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +In this case, the function couldn’t find the event chain matching the pattern, because the event for number 3 occurred between 1 and 2. If in the same case we checked the condition for number 4, the sequence would match the pattern. + +``` sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM t +``` + +``` text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 4))─┐ +│ 1 │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [sequenceCount](#function-sequencecount) + +## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} + +Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. + +:::warning +Events that occur at the same second may lay in the sequence in an undefined order affecting the result. +::: + +``` sql +sequenceCount(pattern)(timestamp, cond1, cond2, ...) +``` + +**Arguments** + +- `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../sql-reference/data-types/int-uint.md) data types. + +- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn’t described in a condition, the function skips them. + +**Parameters** + +- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). + +**Returned values** + +- Number of non-overlapping event chains that are matched. + +Type: `UInt64`. 
+ +**Example** + +Consider data in the `t` table: + +``` text +┌─time─┬─number─┐ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 3 │ 2 │ +│ 4 │ 1 │ +│ 5 │ 3 │ +│ 6 │ 2 │ +└──────┴────────┘ +``` + +Count how many times the number 2 occurs after the number 1 with any amount of other numbers between them: + +``` sql +SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t +``` + +``` text +┌─sequenceCount('(?1).*(?2)')(time, equals(number, 1), equals(number, 2))─┐ +│ 2 │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [sequenceMatch](#function-sequencematch) + +## windowFunnel {#windowfunnel} + +Searches for event chains in a sliding time window and calculates the maximum number of events that occurred from the chain. + +The function works according to the algorithm: + +- The function searches for data that triggers the first condition in the chain and sets the event counter to 1. This is the moment when the sliding window starts. + +- If events from the chain occur sequentially within the window, the counter is incremented. If the sequence of events is disrupted, the counter isn’t incremented. + +- If the data has multiple event chains at varying points of completion, the function will only output the size of the longest chain. + +**Syntax** + +``` sql +windowFunnel(window, [mode, [mode, ... ]])(timestamp, cond1, cond2, ..., condN) +``` + +**Arguments** + +- `timestamp` — Name of the column containing the timestamp. Data types supported: [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, it’s value can’t exceed the Int64 maximum, which is 2^63 - 1). +- `cond` — Conditions or data describing the chain of events. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Parameters** + +- `window` — Length of the sliding window, it is the time interval between the first and the last condition. The unit of `window` depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond1 <= timestamp of cond2 <= ... <= timestamp of condN <= timestamp of cond1 + window`. +- `mode` — It is an optional argument. One or more modes can be set. + - `'strict_deduplication'` — If the same condition holds for the sequence of events, then such repeating event interrupts further processing. + - `'strict_order'` — Don't allow interventions of other events. E.g. in the case of `A->B->D->C`, it stops finding `A->B->C` at the `D` and the max event level is 2. + - `'strict_increase'` — Apply conditions only to events with strictly increasing timestamps. + +**Returned value** + +The maximum number of consecutive triggered conditions from the chain within the sliding time window. +All the chains in the selection are analyzed. + +Type: `Integer`. + +**Example** + +Determine if a set period of time is enough for the user to select a phone and purchase it twice in the online store. + +Set the following chain of events: + +1. The user logged in to their account on the store (`eventID = 1003`). +2. The user searches for a phone (`eventID = 1007, product = 'phone'`). +3. The user placed an order (`eventID = 1009`). +4. The user made the order again (`eventID = 1010`). 
+ +Input table: + +``` text +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-01-28 │ 1 │ 2019-01-29 10:00:00 │ 1003 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-01-31 │ 1 │ 2019-01-31 09:00:00 │ 1007 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-01-30 │ 1 │ 2019-01-30 08:00:00 │ 1009 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-02-01 │ 1 │ 2019-02-01 08:00:00 │ 1010 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +``` + +Find out how far the user `user_id` could get through the chain in a period in January-February of 2019. + +Query: + +``` sql +SELECT + level, + count() AS c +FROM +( + SELECT + user_id, + windowFunnel(6048000000000000)(timestamp, eventID = 1003, eventID = 1009, eventID = 1007, eventID = 1010) AS level + FROM trend + WHERE (event_date >= '2019-01-01') AND (event_date <= '2019-02-02') + GROUP BY user_id +) +GROUP BY level +ORDER BY level ASC; +``` + +Result: + +``` text +┌─level─┬─c─┐ +│ 4 │ 1 │ +└───────┴───┘ +``` + +## retention {#retention} + +The function takes as arguments a set of conditions from 1 to 32 arguments of type `UInt8` that indicate whether a certain condition was met for the event. +Any condition can be specified as an argument (as in [WHERE](../../sql-reference/statements/select/where.md#select-where)). + +The conditions, except the first, apply in pairs: the result of the second will be true if the first and second are true, of the third if the first and third are true, etc. + +**Syntax** + +``` sql +retention(cond1, cond2, ..., cond32); +``` + +**Arguments** + +- `cond` — An expression that returns a `UInt8` result (1 or 0). + +**Returned value** + +The array of 1 or 0. + +- 1 — Condition was met for the event. +- 0 — Condition wasn’t met for the event. + +Type: `UInt8`. + +**Example** + +Let’s consider an example of calculating the `retention` function to determine site traffic. + +**1.** Сreate a table to illustrate an example. + +``` sql +CREATE TABLE retention_test(date Date, uid Int32) ENGINE = Memory; + +INSERT INTO retention_test SELECT '2020-01-01', number FROM numbers(5); +INSERT INTO retention_test SELECT '2020-01-02', number FROM numbers(10); +INSERT INTO retention_test SELECT '2020-01-03', number FROM numbers(15); +``` + +Input table: + +Query: + +``` sql +SELECT * FROM retention_test +``` + +Result: + +``` text +┌───────date─┬─uid─┐ +│ 2020-01-01 │ 0 │ +│ 2020-01-01 │ 1 │ +│ 2020-01-01 │ 2 │ +│ 2020-01-01 │ 3 │ +│ 2020-01-01 │ 4 │ +└────────────┴─────┘ +┌───────date─┬─uid─┐ +│ 2020-01-02 │ 0 │ +│ 2020-01-02 │ 1 │ +│ 2020-01-02 │ 2 │ +│ 2020-01-02 │ 3 │ +│ 2020-01-02 │ 4 │ +│ 2020-01-02 │ 5 │ +│ 2020-01-02 │ 6 │ +│ 2020-01-02 │ 7 │ +│ 2020-01-02 │ 8 │ +│ 2020-01-02 │ 9 │ +└────────────┴─────┘ +┌───────date─┬─uid─┐ +│ 2020-01-03 │ 0 │ +│ 2020-01-03 │ 1 │ +│ 2020-01-03 │ 2 │ +│ 2020-01-03 │ 3 │ +│ 2020-01-03 │ 4 │ +│ 2020-01-03 │ 5 │ +│ 2020-01-03 │ 6 │ +│ 2020-01-03 │ 7 │ +│ 2020-01-03 │ 8 │ +│ 2020-01-03 │ 9 │ +│ 2020-01-03 │ 10 │ +│ 2020-01-03 │ 11 │ +│ 2020-01-03 │ 12 │ +│ 2020-01-03 │ 13 │ +│ 2020-01-03 │ 14 │ +└────────────┴─────┘ +``` + +**2.** Group users by unique ID `uid` using the `retention` function. 
+ +Query: + +``` sql +SELECT + uid, + retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r +FROM retention_test +WHERE date IN ('2020-01-01', '2020-01-02', '2020-01-03') +GROUP BY uid +ORDER BY uid ASC +``` + +Result: + +``` text +┌─uid─┬─r───────┐ +│ 0 │ [1,1,1] │ +│ 1 │ [1,1,1] │ +│ 2 │ [1,1,1] │ +│ 3 │ [1,1,1] │ +│ 4 │ [1,1,1] │ +│ 5 │ [0,0,0] │ +│ 6 │ [0,0,0] │ +│ 7 │ [0,0,0] │ +│ 8 │ [0,0,0] │ +│ 9 │ [0,0,0] │ +│ 10 │ [0,0,0] │ +│ 11 │ [0,0,0] │ +│ 12 │ [0,0,0] │ +│ 13 │ [0,0,0] │ +│ 14 │ [0,0,0] │ +└─────┴─────────┘ +``` + +**3.** Calculate the total number of site visits per day. + +Query: + +``` sql +SELECT + sum(r[1]) AS r1, + sum(r[2]) AS r2, + sum(r[3]) AS r3 +FROM +( + SELECT + uid, + retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r + FROM retention_test + WHERE date IN ('2020-01-01', '2020-01-02', '2020-01-03') + GROUP BY uid +) +``` + +Result: + +``` text +┌─r1─┬─r2─┬─r3─┐ +│ 5 │ 5 │ 5 │ +└────┴────┴────┘ +``` + +Where: + +- `r1`- the number of unique visitors who visited the site during 2020-01-01 (the `cond1` condition). +- `r2`- the number of unique visitors who visited the site during a specific time period between 2020-01-01 and 2020-01-02 (`cond1` and `cond2` conditions). +- `r3`- the number of unique visitors who visited the site during a specific time period between 2020-01-01 and 2020-01-03 (`cond1` and `cond3` conditions). + +## uniqUpTo(N)(x) {#uniquptonx} + +Calculates the number of different argument values ​​if it is less than or equal to N. If the number of different argument values is greater than N, it returns N + 1. + +Recommended for use with small Ns, up to 10. The maximum value of N is 100. + +For the state of an aggregate function, it uses the amount of memory equal to 1 + N \* the size of one value of bytes. +For strings, it stores a non-cryptographic hash of 8 bytes. That is, the calculation is approximated for strings. + +The function also works for several arguments. + +It works as fast as possible, except for cases when a large N value is used and the number of unique values is slightly less than N. + +Usage example: + +``` text +Problem: Generate a report that shows only keywords that produced at least 5 unique users. +Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 +``` + + +## sumMapFiltered(keys_to_keep)(keys, values) {#summapfilteredkeys-to-keepkeys-values} + +Same behavior as [sumMap](../../sql-reference/aggregate-functions/reference/summap.md#agg_functions-summap) except that an array of keys is passed as a parameter. This can be especially useful when working with a high cardinality of keys. + +## sequenceNextNode {#sequenceNextNode} + +Returns a value of the next event that matched an event chain. + +_Experimental function, `SET allow_experimental_funnel_functions = 1` to enable it._ + +**Syntax** + +``` sql +sequenceNextNode(direction, base)(timestamp, event_column, base_condition, event1, event2, event3, ...) +``` + +**Parameters** + +- `direction` — Used to navigate to directions. + - forward — Moving forward. + - backward — Moving backward. + +- `base` — Used to set the base point. + - head — Set the base point to the first event. + - tail — Set the base point to the last event. + - first_match — Set the base point to the first matched `event1`. + - last_match — Set the base point to the last matched `event1`. + +**Arguments** + +- `timestamp` — Name of the column containing the timestamp. 
Data types supported: [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) and other unsigned integer types. +- `event_column` — Name of the column containing the value of the next event to be returned. Data types supported: [String](../../sql-reference/data-types/string.md) and [Nullable(String)](../../sql-reference/data-types/nullable.md). +- `base_condition` — Condition that the base point must fulfill. +- `event1`, `event2`, ... — Conditions describing the chain of events. [UInt8](../../sql-reference/data-types/int-uint.md). + +**Returned values** + +- `event_column[next_index]` — If the pattern is matched and next value exists. +- `NULL` - If the pattern isn’t matched or next value doesn't exist. + +Type: [Nullable(String)](../../sql-reference/data-types/nullable.md). + +**Example** + +It can be used when events are A->B->C->D->E and you want to know the event following B->C, which is D. + +The query statement searching the event following A->B: + +``` sql +CREATE TABLE test_flow ( + dt DateTime, + id int, + page String) +ENGINE = MergeTree() +PARTITION BY toYYYYMMDD(dt) +ORDER BY id; + +INSERT INTO test_flow VALUES (1, 1, 'A') (2, 1, 'B') (3, 1, 'C') (4, 1, 'D') (5, 1, 'E'); + +SELECT id, sequenceNextNode('forward', 'head')(dt, page, page = 'A', page = 'A', page = 'B') as next_flow FROM test_flow GROUP BY id; +``` + +Result: + +``` text +┌─id─┬─next_flow─┐ +│ 1 │ C │ +└────┴───────────┘ +``` + +**Behavior for `forward` and `head`** + +``` sql +ALTER TABLE test_flow DELETE WHERE 1 = 1 settings mutations_sync = 1; + +INSERT INTO test_flow VALUES (1, 1, 'Home') (2, 1, 'Gift') (3, 1, 'Exit'); +INSERT INTO test_flow VALUES (1, 2, 'Home') (2, 2, 'Home') (3, 2, 'Gift') (4, 2, 'Basket'); +INSERT INTO test_flow VALUES (1, 3, 'Gift') (2, 3, 'Home') (3, 3, 'Gift') (4, 3, 'Basket'); +``` + +``` sql +SELECT id, sequenceNextNode('forward', 'head')(dt, page, page = 'Home', page = 'Home', page = 'Gift') FROM test_flow GROUP BY id; + + dt id page + 1970-01-01 09:00:01 1 Home // Base point, Matched with Home + 1970-01-01 09:00:02 1 Gift // Matched with Gift + 1970-01-01 09:00:03 1 Exit // The result + + 1970-01-01 09:00:01 2 Home // Base point, Matched with Home + 1970-01-01 09:00:02 2 Home // Unmatched with Gift + 1970-01-01 09:00:03 2 Gift + 1970-01-01 09:00:04 2 Basket + + 1970-01-01 09:00:01 3 Gift // Base point, Unmatched with Home + 1970-01-01 09:00:02 3 Home + 1970-01-01 09:00:03 3 Gift + 1970-01-01 09:00:04 3 Basket +``` + +**Behavior for `backward` and `tail`** + +``` sql +SELECT id, sequenceNextNode('backward', 'tail')(dt, page, page = 'Basket', page = 'Basket', page = 'Gift') FROM test_flow GROUP BY id; + + dt id page +1970-01-01 09:00:01 1 Home +1970-01-01 09:00:02 1 Gift +1970-01-01 09:00:03 1 Exit // Base point, Unmatched with Basket + +1970-01-01 09:00:01 2 Home +1970-01-01 09:00:02 2 Home // The result +1970-01-01 09:00:03 2 Gift // Matched with Gift +1970-01-01 09:00:04 2 Basket // Base point, Matched with Basket + +1970-01-01 09:00:01 3 Gift +1970-01-01 09:00:02 3 Home // The result +1970-01-01 09:00:03 3 Gift // Base point, Matched with Gift +1970-01-01 09:00:04 3 Basket // Base point, Matched with Basket +``` + + +**Behavior for `forward` and `first_match`** + +``` sql +SELECT id, sequenceNextNode('forward', 'first_match')(dt, page, page = 'Gift', page = 'Gift') FROM test_flow GROUP BY id; + + dt id page +1970-01-01 09:00:01 1 Home +1970-01-01 09:00:02 1 Gift // Base point +1970-01-01 09:00:03 1 Exit // The result 
+ +1970-01-01 09:00:01 2 Home +1970-01-01 09:00:02 2 Home +1970-01-01 09:00:03 2 Gift // Base point +1970-01-01 09:00:04 2 Basket The result + +1970-01-01 09:00:01 3 Gift // Base point +1970-01-01 09:00:02 3 Home // The result +1970-01-01 09:00:03 3 Gift +1970-01-01 09:00:04 3 Basket +``` + +``` sql +SELECT id, sequenceNextNode('forward', 'first_match')(dt, page, page = 'Gift', page = 'Gift', page = 'Home') FROM test_flow GROUP BY id; + + dt id page +1970-01-01 09:00:01 1 Home +1970-01-01 09:00:02 1 Gift // Base point +1970-01-01 09:00:03 1 Exit // Unmatched with Home + +1970-01-01 09:00:01 2 Home +1970-01-01 09:00:02 2 Home +1970-01-01 09:00:03 2 Gift // Base point +1970-01-01 09:00:04 2 Basket // Unmatched with Home + +1970-01-01 09:00:01 3 Gift // Base point +1970-01-01 09:00:02 3 Home // Matched with Home +1970-01-01 09:00:03 3 Gift // The result +1970-01-01 09:00:04 3 Basket +``` + + +**Behavior for `backward` and `last_match`** + +``` sql +SELECT id, sequenceNextNode('backward', 'last_match')(dt, page, page = 'Gift', page = 'Gift') FROM test_flow GROUP BY id; + + dt id page +1970-01-01 09:00:01 1 Home // The result +1970-01-01 09:00:02 1 Gift // Base point +1970-01-01 09:00:03 1 Exit + +1970-01-01 09:00:01 2 Home +1970-01-01 09:00:02 2 Home // The result +1970-01-01 09:00:03 2 Gift // Base point +1970-01-01 09:00:04 2 Basket + +1970-01-01 09:00:01 3 Gift +1970-01-01 09:00:02 3 Home // The result +1970-01-01 09:00:03 3 Gift // Base point +1970-01-01 09:00:04 3 Basket +``` + +``` sql +SELECT id, sequenceNextNode('backward', 'last_match')(dt, page, page = 'Gift', page = 'Gift', page = 'Home') FROM test_flow GROUP BY id; + + dt id page +1970-01-01 09:00:01 1 Home // Matched with Home, the result is null +1970-01-01 09:00:02 1 Gift // Base point +1970-01-01 09:00:03 1 Exit + +1970-01-01 09:00:01 2 Home // The result +1970-01-01 09:00:02 2 Home // Matched with Home +1970-01-01 09:00:03 2 Gift // Base point +1970-01-01 09:00:04 2 Basket + +1970-01-01 09:00:01 3 Gift // The result +1970-01-01 09:00:02 3 Home // Matched with Home +1970-01-01 09:00:03 3 Gift // Base point +1970-01-01 09:00:04 3 Basket +``` + + +**Behavior for `base_condition`** + +``` sql +CREATE TABLE test_flow_basecond +( + `dt` DateTime, + `id` int, + `page` String, + `ref` String +) +ENGINE = MergeTree +PARTITION BY toYYYYMMDD(dt) +ORDER BY id; + +INSERT INTO test_flow_basecond VALUES (1, 1, 'A', 'ref4') (2, 1, 'A', 'ref3') (3, 1, 'B', 'ref2') (4, 1, 'B', 'ref1'); +``` + +``` sql +SELECT id, sequenceNextNode('forward', 'head')(dt, page, ref = 'ref1', page = 'A') FROM test_flow_basecond GROUP BY id; + + dt id page ref + 1970-01-01 09:00:01 1 A ref4 // The head can not be base point because the ref column of the head unmatched with 'ref1'. + 1970-01-01 09:00:02 1 A ref3 + 1970-01-01 09:00:03 1 B ref2 + 1970-01-01 09:00:04 1 B ref1 + ``` + +``` sql +SELECT id, sequenceNextNode('backward', 'tail')(dt, page, ref = 'ref4', page = 'B') FROM test_flow_basecond GROUP BY id; + + dt id page ref + 1970-01-01 09:00:01 1 A ref4 + 1970-01-01 09:00:02 1 A ref3 + 1970-01-01 09:00:03 1 B ref2 + 1970-01-01 09:00:04 1 B ref1 // The tail can not be base point because the ref column of the tail unmatched with 'ref4'. +``` + +``` sql +SELECT id, sequenceNextNode('forward', 'first_match')(dt, page, ref = 'ref3', page = 'A') FROM test_flow_basecond GROUP BY id; + + dt id page ref + 1970-01-01 09:00:01 1 A ref4 // This row can not be base point because the ref column unmatched with 'ref3'. 
+ 1970-01-01 09:00:02 1 A ref3 // Base point + 1970-01-01 09:00:03 1 B ref2 // The result + 1970-01-01 09:00:04 1 B ref1 +``` + +``` sql +SELECT id, sequenceNextNode('backward', 'last_match')(dt, page, ref = 'ref2', page = 'B') FROM test_flow_basecond GROUP BY id; + + dt id page ref + 1970-01-01 09:00:01 1 A ref4 + 1970-01-01 09:00:02 1 A ref3 // The result + 1970-01-01 09:00:03 1 B ref2 // Base point + 1970-01-01 09:00:04 1 B ref1 // This row can not be base point because the ref column unmatched with 'ref2'. +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/any.md b/docs/en/reference/sql-reference/aggregate-functions/reference/any.md new file mode 100644 index 00000000000..3b5539c5b8d --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/any.md @@ -0,0 +1,13 @@ +--- +sidebar_position: 6 +--- + +# any {#agg_function-any} + +Selects the first encountered value. +The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. +To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’. + +In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY. + +When a `SELECT` query has the `GROUP BY` clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the `SELECT`, `HAVING`, and `ORDER BY` clauses be calculated from keys or from aggregate functions. In other words, each column selected from the table must be used either in keys or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the `any` aggregate function. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/en/reference/sql-reference/aggregate-functions/reference/anyheavy.md new file mode 100644 index 00000000000..29144ee2f50 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/anyheavy.md @@ -0,0 +1,30 @@ +--- +sidebar_position: 103 +--- + +# anyHeavy {#anyheavyx} + +Selects a frequently occurring value using the [heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf) algorithm. If there is a value that occurs more than in half the cases in each of the query’s execution threads, this value is returned. Normally, the result is nondeterministic. + +``` sql +anyHeavy(column) +``` + +**Arguments** + +- `column` – The column name. + +**Example** + +Take the [OnTime](../../../example-datasets/ontime.md) data set and select any frequently occurring value in the `AirlineID` column. + +``` sql +SELECT anyHeavy(AirlineID) AS res +FROM ontime +``` + +``` text +┌───res─┐ +│ 19690 │ +└───────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/anylast.md b/docs/en/reference/sql-reference/aggregate-functions/reference/anylast.md new file mode 100644 index 00000000000..2a01a587f70 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/anylast.md @@ -0,0 +1,8 @@ +--- +sidebar_position: 104 +--- + +## anyLast {#anylastx} + +Selects the last value encountered. +The result is just as indeterminate as for the [any](../../../sql-reference/aggregate-functions/reference/any.md) function. 
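+
+A minimal usage sketch (the `events` table and its columns here are hypothetical, shown only for illustration):
+
+``` sql
+-- For each user, pick the last `page` value encountered while reading the data.
+-- The read order is not guaranteed, so the result is indeterminate unless the
+-- source is a subquery with ORDER BY.
+SELECT user_id, anyLast(page) AS last_page
+FROM events
+GROUP BY user_id;
+```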
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/reference/sql-reference/aggregate-functions/reference/argmax.md new file mode 100644 index 00000000000..f09bcd0bba2 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/argmax.md @@ -0,0 +1,50 @@ +--- +sidebar_position: 106 +--- + +# argMax {#agg-function-argmax} + +Calculates the `arg` value for a maximum `val` value. If there are several different values of `arg` for maximum values of `val`, returns the first of these values encountered. + +**Syntax** + +``` sql +argMax(arg, val) +``` + +**Arguments** + +- `arg` — Argument. +- `val` — Value. + +**Returned value** + +- `arg` value that corresponds to maximum `val` value. + +Type: matches `arg` type. + +**Example** + +Input table: + +``` text +┌─user─────┬─salary─┐ +│ director │ 5000 │ +│ manager │ 3000 │ +│ worker │ 1000 │ +└──────────┴────────┘ +``` + +Query: + +``` sql +SELECT argMax(user, salary) FROM salary; +``` + +Result: + +``` text +┌─argMax(user, salary)─┐ +│ director │ +└──────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/reference/sql-reference/aggregate-functions/reference/argmin.md new file mode 100644 index 00000000000..926fda5a512 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/argmin.md @@ -0,0 +1,50 @@ +--- +sidebar_position: 105 +--- + +# argMin {#agg-function-argmin} + +Calculates the `arg` value for a minimum `val` value. If there are several different values of `arg` for minimum values of `val`, returns the first of these values encountered. + +**Syntax** + +``` sql +argMin(arg, val) +``` + +**Arguments** + +- `arg` — Argument. +- `val` — Value. + +**Returned value** + +- `arg` value that corresponds to minimum `val` value. + +Type: matches `arg` type. + +**Example** + +Input table: + +``` text +┌─user─────┬─salary─┐ +│ director │ 5000 │ +│ manager │ 3000 │ +│ worker │ 1000 │ +└──────────┴────────┘ +``` + +Query: + +``` sql +SELECT argMin(user, salary) FROM salary +``` + +Result: + +``` text +┌─argMin(user, salary)─┐ +│ worker │ +└──────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/avg.md b/docs/en/reference/sql-reference/aggregate-functions/reference/avg.md new file mode 100644 index 00000000000..b7b5e9fbed4 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/avg.md @@ -0,0 +1,66 @@ +--- +sidebar_position: 5 +--- + +# avg {#agg_function-avg} + +Calculates the arithmetic mean. + +**Syntax** + +``` sql +avg(x) +``` + +**Arguments** + +- `x` — input values, must be [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), or [Decimal](../../../sql-reference/data-types/decimal.md). + +**Returned value** + +- The arithmetic mean, always as [Float64](../../../sql-reference/data-types/float.md). +- `NaN` if the input parameter `x` is empty. 
+ +**Example** + +Query: + +``` sql +SELECT avg(x) FROM values('x Int8', 0, 1, 2, 3, 4, 5); +``` + +Result: + +``` text +┌─avg(x)─┐ +│ 2.5 │ +└────────┘ +``` + +**Example** + +Create a temp table: + +Query: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +``` + +Get the arithmetic mean: + +Query: + +``` +SELECT avg(t) FROM test; +``` + +Result: + +``` text +┌─avg(x)─┐ +│ nan │ +└────────┘ +``` + +[Original article](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/avg/) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/en/reference/sql-reference/aggregate-functions/reference/avgweighted.md new file mode 100644 index 00000000000..126c0c2f1d7 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/avgweighted.md @@ -0,0 +1,99 @@ +--- +sidebar_position: 107 +--- + +# avgWeighted {#avgweighted} + +Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted_arithmetic_mean). + +**Syntax** + +``` sql +avgWeighted(x, weight) +``` + +**Arguments** + +- `x` — Values. +- `weight` — Weights of the values. + +`x` and `weight` must both be +[Integer](../../../sql-reference/data-types/int-uint.md), +[floating-point](../../../sql-reference/data-types/float.md), or +[Decimal](../../../sql-reference/data-types/decimal.md), +but may have different types. + +**Returned value** + +- `NaN` if all the weights are equal to 0 or the supplied weights parameter is empty. +- Weighted mean otherwise. + +**Return type** is always [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +Query: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Int8', (4, 1), (1, 0), (10, 2)) +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ 8 │ +└────────────────────────┘ +``` + +**Example** + +Query: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Float64', (4, 1), (1, 0), (10, 2)) +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ 8 │ +└────────────────────────┘ +``` + +**Example** + +Query: + +``` sql +SELECT avgWeighted(x, w) +FROM values('x Int8, w Int8', (0, 0), (1, 0), (10, 0)) +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ nan │ +└────────────────────────┘ +``` + +**Example** + +Query: + +``` sql +CREATE table test (t UInt8) ENGINE = Memory; +SELECT avgWeighted(t) FROM test +``` + +Result: + +``` text +┌─avgWeighted(x, weight)─┐ +│ nan │ +└────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md b/docs/en/reference/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md new file mode 100644 index 00000000000..e836dbe868a --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md @@ -0,0 +1,13 @@ +--- +sidebar_position: 250 +--- + +# categoricalInformationValue {#categoricalinformationvalue} + +Calculates the value of `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` for each category. + +``` sql +categoricalInformationValue(category1, category2, ..., tag) +``` + +The result indicates how a discrete (categorical) feature `[category1, category2, ...]` contribute to a learning model which predicting the value of `tag`. 
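+
+A minimal sketch of a call (the `samples` table and its columns are hypothetical; the categories are assumed to be encoded as 0/1 indicators):
+
+``` sql
+-- `cat1` and `cat2` are categorical feature indicators, `tag` is the binary label being predicted.
+SELECT categoricalInformationValue(cat1, cat2, tag) FROM samples;
+```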
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/corr.md b/docs/en/reference/sql-reference/aggregate-functions/reference/corr.md new file mode 100644 index 00000000000..c6d7fd5baed --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/corr.md @@ -0,0 +1,13 @@ +--- +sidebar_position: 107 +--- + +# corr {#corrx-y} + +Syntax: `corr(x, y)` + +Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `corrStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/count.md b/docs/en/reference/sql-reference/aggregate-functions/reference/count.md new file mode 100644 index 00000000000..8df4aef9d03 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/count.md @@ -0,0 +1,72 @@ +--- +sidebar_position: 1 +--- + +# count {#agg_function-count} + +Counts the number of rows or not-NULL values. + +ClickHouse supports the following syntaxes for `count`: + +- `count(expr)` or `COUNT(DISTINCT expr)`. +- `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific. + +**Arguments** + +The function can take: + +- Zero parameters. +- One [expression](../../../sql-reference/syntax.md#syntax-expressions). + +**Returned value** + +- If the function is called without parameters it counts the number of rows. +- If the [expression](../../../sql-reference/syntax.md#syntax-expressions) is passed, then the function counts how many times this expression returned not null. If the expression returns a [Nullable](../../../sql-reference/data-types/nullable.md)-type value, then the result of `count` stays not `Nullable`. The function returns 0 if the expression returned `NULL` for all the rows. + +In both cases the type of the returned value is [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Details** + +ClickHouse supports the `COUNT(DISTINCT ...)` syntax. The behavior of this construction depends on the [count_distinct_implementation](../../../operations/settings/settings.md#settings-count_distinct_implementation) setting. It defines which of the [uniq\*](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) functions is used to perform the operation. The default is the [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) function. + +The `SELECT count() FROM table` query is optimized by default using metadata from MergeTree. If you need to use row-level security, disable optimization using the [optimize_trivial_count_query](../../../operations/settings/settings.md#optimize-trivial-count-query) setting. + +However `SELECT count(nullable_column) FROM table` query can be optimized by enabling the [optimize_functions_to_subcolumns](../../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [null](../../../sql-reference/data-types/nullable.md#finding-null) subcolumn instead of reading and processing the whole column data. The query `SELECT count(n) FROM table` transforms to `SELECT sum(NOT n.null) FROM table`. 
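+
+A small illustration of the optimization described above (a sketch; `table` and the Nullable column `n` are placeholders):
+
+``` sql
+-- With the setting enabled, count(n) reads only the `n.null` subcolumn
+-- instead of the whole column.
+SET optimize_functions_to_subcolumns = 1;
+SELECT count(n) FROM table;
+```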
+ +**Examples** + +Example 1: + +``` sql +SELECT count() FROM t +``` + +``` text +┌─count()─┐ +│ 5 │ +└─────────┘ +``` + +Example 2: + +``` sql +SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' +``` + +``` text +┌─name──────────────────────────┬─value─────┐ +│ count_distinct_implementation │ uniqExact │ +└───────────────────────────────┴───────────┘ +``` + +``` sql +SELECT count(DISTINCT num) FROM t +``` + +``` text +┌─uniqExact(num)─┐ +│ 3 │ +└────────────────┘ +``` + +This example shows that `count(DISTINCT num)` is performed by the `uniqExact` function according to the `count_distinct_implementation` setting value. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/covarpop.md b/docs/en/reference/sql-reference/aggregate-functions/reference/covarpop.md new file mode 100644 index 00000000000..363a98c3f16 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/covarpop.md @@ -0,0 +1,13 @@ +--- +sidebar_position: 36 +--- + +# covarPop {#covarpop} + +Syntax: `covarPop(x, y)` + +Calculates the value of `Σ((x - x̅)(y - y̅)) / n`. + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarPopStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/en/reference/sql-reference/aggregate-functions/reference/covarsamp.md new file mode 100644 index 00000000000..977b3f3b5b4 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/covarsamp.md @@ -0,0 +1,13 @@ +--- +sidebar_position: 37 +--- + +# covarSamp {#covarsamp} + +Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`. + +Returns Float64. When `n <= 1`, returns +∞. + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarSampStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/deltasum.md b/docs/en/reference/sql-reference/aggregate-functions/reference/deltasum.md new file mode 100644 index 00000000000..ac35938e26d --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/deltasum.md @@ -0,0 +1,73 @@ +--- +sidebar_position: 141 +--- + +# deltaSum {#agg_functions-deltasum} + +Sums the arithmetic difference between consecutive rows. If the difference is negative, it is ignored. + +:::note +The underlying data must be sorted for this function to work properly. If you would like to use this function in a [materialized view](../../../sql-reference/statements/create/view.md#materialized), you most likely want to use the [deltaSumTimestamp](../../../sql-reference/aggregate-functions/reference/deltasumtimestamp.md#agg_functions-deltasumtimestamp) method instead. +::: + +**Syntax** + +``` sql +deltaSum(value) +``` + +**Arguments** + +- `value` — Input values, must be [Integer](../../data-types/int-uint.md) or [Float](../../data-types/float.md) type. + +**Returned value** + +- A gained arithmetic difference of the `Integer` or `Float` type. 
+ +**Examples** + +Query: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3])); +``` + +Result: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3]))─┐ +│ 2 │ +└────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); +``` + +Result: + +``` text +┌─deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3]))─┐ +│ 7 │ +└───────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT deltaSum(arrayJoin([2.25, 3, 4.5])); +``` + +Result: + +``` text +┌─deltaSum(arrayJoin([2.25, 3, 4.5]))─┐ +│ 2.25 │ +└─────────────────────────────────────┘ +``` + +## See Also {#see-also} + +- [runningDifference](../../functions/other-functions.md#other_functions-runningdifference) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/deltasumtimestamp.md b/docs/en/reference/sql-reference/aggregate-functions/reference/deltasumtimestamp.md new file mode 100644 index 00000000000..e1024e58328 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/deltasumtimestamp.md @@ -0,0 +1,45 @@ +--- +sidebar_position: 141 +--- + +# deltaSumTimestamp {#agg_functions-deltasumtimestamp} + +Adds the difference between consecutive rows. If the difference is negative, it is ignored. + +This function is primarily for [materialized views](../../../sql-reference/statements/create/view.md#materialized) that are ordered by some time bucket-aligned timestamp, for example, a `toStartOfMinute` bucket. Because the rows in such a materialized view will all have the same timestamp, it is impossible for them to be merged in the "right" order. This function keeps track of the `timestamp` of the values it's seen, so it's possible to order the states correctly during merging. + +To calculate the delta sum across an ordered collection you can simply use the [deltaSum](../../../sql-reference/aggregate-functions/reference/deltasum.md#agg_functions-deltasum) function. + +**Syntax** + +``` sql +deltaSumTimestamp(value, timestamp) +``` + +**Arguments** + +- `value` — Input values, must be some [Integer](../../data-types/int-uint.md) type or [Float](../../data-types/float.md) type or a [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md). +- `timestamp` — The parameter for order values, must be some [Integer](../../data-types/int-uint.md) type or [Float](../../data-types/float.md) type or a [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md). + +**Returned value** + +- Accumulated differences between consecutive values, ordered by the `timestamp` parameter. + +Type: [Integer](../../data-types/int-uint.md) or [Float](../../data-types/float.md) or [Date](../../data-types/date.md) or [DateTime](../../data-types/datetime.md). 
+
+**Example**
+
+Query:
+
+```sql
+SELECT deltaSumTimestamp(value, timestamp)
+FROM (SELECT number AS timestamp, [0, 4, 8, 3, 0, 0, 0, 1, 3, 5][number] AS value FROM numbers(1, 10));
+```
+
+Result:
+
+``` text
+┌─deltaSumTimestamp(value, timestamp)─┐
+│ 13 │
+└─────────────────────────────────────┘
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/entropy.md b/docs/en/reference/sql-reference/aggregate-functions/reference/entropy.md
new file mode 100644
index 00000000000..9f1576c3ed8
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/entropy.md
@@ -0,0 +1,43 @@
+---
+sidebar_position: 302
+---
+
+# entropy {#entropy}
+
+Calculates [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) of a column of values.
+
+**Syntax**
+
+``` sql
+entropy(val)
+```
+
+**Arguments**
+
+- `val` — Column of values of any type.
+
+**Returned value**
+
+- Shannon entropy.
+
+Type: [Float64](../../../sql-reference/data-types/float.md).
+
+**Example**
+
+Query:
+
+``` sql
+CREATE TABLE entropy (`vals` UInt32,`strings` String) ENGINE = Memory;
+
+INSERT INTO entropy VALUES (1, 'A'), (1, 'A'), (1,'A'), (1,'A'), (2,'B'), (2,'B'), (2,'C'), (2,'D');
+
+SELECT entropy(vals), entropy(strings) FROM entropy;
+```
+
+Result:
+
+``` text
+┌─entropy(vals)─┬─entropy(strings)─┐
+│ 1 │ 1.75 │
+└───────────────┴──────────────────┘
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md b/docs/en/reference/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md
new file mode 100644
index 00000000000..2337a0c8dab
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md
@@ -0,0 +1,148 @@
+---
+sidebar_position: 108
+---
+
+## exponentialMovingAverage {#exponential-moving-average}
+
+Calculates the exponential moving average of values for a given period of time.
+
+**Syntax**
+
+```sql
+exponentialMovingAverage(x)(value, timestamp)
+```
+
+Each `value` corresponds to a particular `timestamp`. The half-life `x` is the time lag at which the exponential weights decay by one-half. The function returns a weighted average: the older the time point, the less weight the corresponding value carries.
+
+**Arguments**
+
+- `value` — Value. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
+- `timestamp` — Timestamp. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
+
+**Parameters**
+
+- `x` — Half-life period. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
+
+**Returned values**
+
+- Returns an [exponentially smoothed moving average](https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average) of the values for the past `x` time at the latest point in time.
+
+Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+ +**Examples** + +Input table: + +``` text +┌──temperature─┬─timestamp──┐ +│ 95 │ 1 │ +│ 95 │ 2 │ +│ 95 │ 3 │ +│ 96 │ 4 │ +│ 96 │ 5 │ +│ 96 │ 6 │ +│ 96 │ 7 │ +│ 97 │ 8 │ +│ 97 │ 9 │ +│ 97 │ 10 │ +│ 97 │ 11 │ +│ 98 │ 12 │ +│ 98 │ 13 │ +│ 98 │ 14 │ +│ 98 │ 15 │ +│ 99 │ 16 │ +│ 99 │ 17 │ +│ 99 │ 18 │ +│ 100 │ 19 │ +│ 100 │ 20 │ +└──────────────┴────────────┘ +``` + +Query: + +```sql +SELECT exponentialMovingAverage(5)(temperature, timestamp); +``` + +Result: + +``` text +┌──exponentialMovingAverage(5)(temperature, timestamp)──┐ +│ 92.25779635374204 │ +└───────────────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT + value, + time, + round(exp_smooth, 3), + bar(exp_smooth, 0, 1, 50) AS bar +FROM +( + SELECT + (number = 0) OR (number >= 25) AS value, + number AS time, + exponentialMovingAverage(10)(value, time) OVER (Rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS exp_smooth + FROM numbers(50) +) +``` + +Result: + +``` text +┌─value─┬─time─┬─round(exp_smooth, 3)─┬─bar────────────────────────────────────────┐ +│ 1 │ 0 │ 0.067 │ ███▎ │ +│ 0 │ 1 │ 0.062 │ ███ │ +│ 0 │ 2 │ 0.058 │ ██▊ │ +│ 0 │ 3 │ 0.054 │ ██▋ │ +│ 0 │ 4 │ 0.051 │ ██▌ │ +│ 0 │ 5 │ 0.047 │ ██▎ │ +│ 0 │ 6 │ 0.044 │ ██▏ │ +│ 0 │ 7 │ 0.041 │ ██ │ +│ 0 │ 8 │ 0.038 │ █▊ │ +│ 0 │ 9 │ 0.036 │ █▋ │ +│ 0 │ 10 │ 0.033 │ █▋ │ +│ 0 │ 11 │ 0.031 │ █▌ │ +│ 0 │ 12 │ 0.029 │ █▍ │ +│ 0 │ 13 │ 0.027 │ █▎ │ +│ 0 │ 14 │ 0.025 │ █▎ │ +│ 0 │ 15 │ 0.024 │ █▏ │ +│ 0 │ 16 │ 0.022 │ █ │ +│ 0 │ 17 │ 0.021 │ █ │ +│ 0 │ 18 │ 0.019 │ ▊ │ +│ 0 │ 19 │ 0.018 │ ▊ │ +│ 0 │ 20 │ 0.017 │ ▋ │ +│ 0 │ 21 │ 0.016 │ ▋ │ +│ 0 │ 22 │ 0.015 │ ▋ │ +│ 0 │ 23 │ 0.014 │ ▋ │ +│ 0 │ 24 │ 0.013 │ ▋ │ +│ 1 │ 25 │ 0.079 │ ███▊ │ +│ 1 │ 26 │ 0.14 │ ███████ │ +│ 1 │ 27 │ 0.198 │ █████████▊ │ +│ 1 │ 28 │ 0.252 │ ████████████▌ │ +│ 1 │ 29 │ 0.302 │ ███████████████ │ +│ 1 │ 30 │ 0.349 │ █████████████████▍ │ +│ 1 │ 31 │ 0.392 │ ███████████████████▌ │ +│ 1 │ 32 │ 0.433 │ █████████████████████▋ │ +│ 1 │ 33 │ 0.471 │ ███████████████████████▌ │ +│ 1 │ 34 │ 0.506 │ █████████████████████████▎ │ +│ 1 │ 35 │ 0.539 │ ██████████████████████████▊ │ +│ 1 │ 36 │ 0.57 │ ████████████████████████████▌ │ +│ 1 │ 37 │ 0.599 │ █████████████████████████████▊ │ +│ 1 │ 38 │ 0.626 │ ███████████████████████████████▎ │ +│ 1 │ 39 │ 0.651 │ ████████████████████████████████▌ │ +│ 1 │ 40 │ 0.674 │ █████████████████████████████████▋ │ +│ 1 │ 41 │ 0.696 │ ██████████████████████████████████▋ │ +│ 1 │ 42 │ 0.716 │ ███████████████████████████████████▋ │ +│ 1 │ 43 │ 0.735 │ ████████████████████████████████████▋ │ +│ 1 │ 44 │ 0.753 │ █████████████████████████████████████▋ │ +│ 1 │ 45 │ 0.77 │ ██████████████████████████████████████▍ │ +│ 1 │ 46 │ 0.785 │ ███████████████████████████████████████▎ │ +│ 1 │ 47 │ 0.8 │ ███████████████████████████████████████▊ │ +│ 1 │ 48 │ 0.813 │ ████████████████████████████████████████▋ │ +│ 1 │ 49 │ 0.825 │ █████████████████████████████████████████▎│ +└───────┴──────┴──────────────────────┴────────────────────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/grouparray.md b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparray.md new file mode 100644 index 00000000000..348ac98c75b --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparray.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 110 +--- + +# groupArray {#agg_function-grouparray} + +Syntax: `groupArray(x)` or `groupArray(max_size)(x)` + +Creates an array of argument values. 
+Values can be added to the array in any (indeterminate) order. + +The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements. For example, `groupArray(1)(x)` is equivalent to `[any (x)]`. + +In some cases, you can still rely on the order of execution. This applies to cases when `SELECT` comes from a subquery that uses `ORDER BY`. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparrayinsertat.md new file mode 100644 index 00000000000..0699326725e --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -0,0 +1,91 @@ +--- +sidebar_position: 112 +--- + +# groupArrayInsertAt {#grouparrayinsertat} + +Inserts a value into the array at the specified position. + +**Syntax** + +``` sql +groupArrayInsertAt(default_x, size)(x, pos) +``` + +If in one query several values are inserted into the same position, the function behaves in the following ways: + +- If a query is executed in a single thread, the first one of the inserted values is used. +- If a query is executed in multiple threads, the resulting value is an undetermined one of the inserted values. + +**Arguments** + +- `x` — Value to be inserted. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in one of the [supported data types](../../../sql-reference/data-types/index.md). +- `pos` — Position at which the specified element `x` is to be inserted. Index numbering in the array starts from zero. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). +- `default_x` — Default value for substituting in empty positions. Optional parameter. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in the data type configured for the `x` parameter. If `default_x` is not defined, the [default values](../../../sql-reference/statements/create/table.md#create-default-values) are used. +- `size` — Length of the resulting array. Optional parameter. When using this parameter, the default value `default_x` must be specified. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges). + +**Returned value** + +- Array with inserted values. + +Type: [Array](../../../sql-reference/data-types/array.md#data-type-array). + +**Example** + +Query: + +``` sql +SELECT groupArrayInsertAt(toString(number), number * 2) FROM numbers(5); +``` + +Result: + +``` text +┌─groupArrayInsertAt(toString(number), multiply(number, 2))─┐ +│ ['0','','1','','2','','3','','4'] │ +└───────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT groupArrayInsertAt('-')(toString(number), number * 2) FROM numbers(5); +``` + +Result: + +``` text +┌─groupArrayInsertAt('-')(toString(number), multiply(number, 2))─┐ +│ ['0','-','1','-','2','-','3','-','4'] │ +└────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT groupArrayInsertAt('-', 5)(toString(number), number * 2) FROM numbers(5); +``` + +Result: + +``` text +┌─groupArrayInsertAt('-', 5)(toString(number), multiply(number, 2))─┐ +│ ['0','-','1','-','2'] │ +└───────────────────────────────────────────────────────────────────┘ +``` + +Multi-threaded insertion of elements into one position. + +Query: + +``` sql +SELECT groupArrayInsertAt(number, 0) FROM numbers_mt(10) SETTINGS max_block_size = 1; +``` + +As a result of this query you get random integer in the `[0,9]` range. 
For example: + +``` text +┌─groupArrayInsertAt(number, 0)─┐ +│ [7] │ +└───────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingavg.md new file mode 100644 index 00000000000..dc3cc74721e --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingavg.md @@ -0,0 +1,78 @@ +--- +sidebar_position: 114 +--- + +# groupArrayMovingAvg {#agg_function-grouparraymovingavg} + +Calculates the moving average of input values. + +``` sql +groupArrayMovingAvg(numbers_for_summing) +groupArrayMovingAvg(window_size)(numbers_for_summing) +``` + +The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column. + +**Arguments** + +- `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value. +- `window_size` — Size of the calculation window. + +**Returned values** + +- Array of the same size and type as the input data. + +The function uses [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). It truncates the decimal places insignificant for the resulting data type. + +**Example** + +The sample table `b`: + +``` sql +CREATE TABLE t +( + `int` UInt8, + `float` Float32, + `dec` Decimal32(2) +) +ENGINE = TinyLog +``` + +``` text +┌─int─┬─float─┬──dec─┐ +│ 1 │ 1.1 │ 1.10 │ +│ 2 │ 2.2 │ 2.20 │ +│ 4 │ 4.4 │ 4.40 │ +│ 7 │ 7.77 │ 7.77 │ +└─────┴───────┴──────┘ +``` + +The queries: + +``` sql +SELECT + groupArrayMovingAvg(int) AS I, + groupArrayMovingAvg(float) AS F, + groupArrayMovingAvg(dec) AS D +FROM t +``` + +``` text +┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ +│ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ +└───────────┴─────────────────────────────────────┴───────────────────────┘ +``` + +``` sql +SELECT + groupArrayMovingAvg(2)(int) AS I, + groupArrayMovingAvg(2)(float) AS F, + groupArrayMovingAvg(2)(dec) AS D +FROM t +``` + +``` text +┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ +│ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ +└───────────┴──────────────────────────────────┴───────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingsum.md new file mode 100644 index 00000000000..563280b7dec --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraymovingsum.md @@ -0,0 +1,76 @@ +--- +sidebar_position: 113 +--- + +# groupArrayMovingSum {#agg_function-grouparraymovingsum} + +Calculates the moving sum of input values. + +``` sql +groupArrayMovingSum(numbers_for_summing) +groupArrayMovingSum(window_size)(numbers_for_summing) +``` + +The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column. + +**Arguments** + +- `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value. +- `window_size` — Size of the calculation window. + +**Returned values** + +- Array of the same size and type as the input data. 
+ +**Example** + +The sample table: + +``` sql +CREATE TABLE t +( + `int` UInt8, + `float` Float32, + `dec` Decimal32(2) +) +ENGINE = TinyLog +``` + +``` text +┌─int─┬─float─┬──dec─┐ +│ 1 │ 1.1 │ 1.10 │ +│ 2 │ 2.2 │ 2.20 │ +│ 4 │ 4.4 │ 4.40 │ +│ 7 │ 7.77 │ 7.77 │ +└─────┴───────┴──────┘ +``` + +The queries: + +``` sql +SELECT + groupArrayMovingSum(int) AS I, + groupArrayMovingSum(float) AS F, + groupArrayMovingSum(dec) AS D +FROM t +``` + +``` text +┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ +│ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ +└────────────┴─────────────────────────────────┴────────────────────────┘ +``` + +``` sql +SELECT + groupArrayMovingSum(2)(int) AS I, + groupArrayMovingSum(2)(float) AS F, + groupArrayMovingSum(2)(dec) AS D +FROM t +``` + +``` text +┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ +│ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ +└────────────┴─────────────────────────────────┴────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysample.md new file mode 100644 index 00000000000..f0406ddc93c --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -0,0 +1,81 @@ +--- +sidebar_position: 114 +--- + +# groupArraySample {#grouparraysample} + +Creates an array of sample argument values. The size of the resulting array is limited to `max_size` elements. Argument values are selected and added to the array randomly. + +**Syntax** + +``` sql +groupArraySample(max_size[, seed])(x) +``` + +**Arguments** + +- `max_size` — Maximum size of the resulting array. [UInt64](../../data-types/int-uint.md). +- `seed` — Seed for the random number generator. Optional. [UInt64](../../data-types/int-uint.md). Default value: `123456`. +- `x` — Argument (column name or expression). + +**Returned values** + +- Array of randomly selected `x` arguments. + +Type: [Array](../../data-types/array.md). 
+ +**Examples** + +Consider table `colors`: + +``` text +┌─id─┬─color──┐ +│ 1 │ red │ +│ 2 │ blue │ +│ 3 │ green │ +│ 4 │ white │ +│ 5 │ orange │ +└────┴────────┘ +``` + +Query with column name as argument: + +``` sql +SELECT groupArraySample(3)(color) as newcolors FROM colors; +``` + +Result: + +```text +┌─newcolors──────────────────┐ +│ ['white','blue','green'] │ +└────────────────────────────┘ +``` + +Query with column name and different seed: + +``` sql +SELECT groupArraySample(3, 987654321)(color) as newcolors FROM colors; +``` + +Result: + +```text +┌─newcolors──────────────────┐ +│ ['red','orange','green'] │ +└────────────────────────────┘ +``` + +Query with expression as argument: + +``` sql +SELECT groupArraySample(3)(concat('light-', color)) as newcolors FROM colors; +``` + +Result: + +```text +┌─newcolors───────────────────────────────────┐ +│ ['light-blue','light-orange','light-green'] │ +└─────────────────────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysorted.md b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysorted.md new file mode 100644 index 00000000000..e34fcbc5788 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/grouparraysorted.md @@ -0,0 +1,48 @@ +--- +sidebar_position: 108 +--- + +# groupArraySorted {#groupArraySorted} + +Returns an array with the first N items in ascending order. + +``` sql +groupArraySorted(N)(column) +``` + +**Arguments** + +- `N` – The number of elements to return. + +If the parameter is omitted, default value 10 is used. + +**Arguments** + +- `column` – The value. +- `expr` — Optional. The field or expresion to sort by. If not set values are sorted by themselves. + +**Example** + +Gets the first 10 numbers: + +``` sql +SELECT groupArraySorted(10)(number) FROM numbers(100) +``` + +``` text +┌─groupArraySorted(10)(number)─┐ +│ [0,1,2,3,4,5,6,7,8,9] │ +└──────────────────────────────┘ +``` + +Or the last 10: + +``` sql +SELECT groupArraySorted(10)(number, -number) FROM numbers(100) +``` + +``` text +┌─groupArraySorted(10)(number, negate(number))─┐ +│ [99,98,97,96,95,94,93,92,91,90] │ +└──────────────────────────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitand.md new file mode 100644 index 00000000000..0ebb9aec495 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitand.md @@ -0,0 +1,46 @@ +--- +sidebar_position: 125 +--- + +# groupBitAnd {#groupbitand} + +Applies bitwise `AND` for series of numbers. + +``` sql +groupBitAnd(expr) +``` + +**Arguments** + +`expr` – An expression that results in `UInt*` type. + +**Return value** + +Value of the `UInt*` type. + +**Example** + +Test data: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +Query: + +``` sql +SELECT groupBitAnd(num) FROM t +``` + +Where `num` is the column with the test data. 
+
+Result:
+
+``` text
+binary decimal
+00000100 = 4
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmap.md
new file mode 100644
index 00000000000..7f1fee6a9f0
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmap.md
@@ -0,0 +1,44 @@
+---
+sidebar_position: 128
+---
+
+# groupBitmap {#groupbitmap}
+
+Bitmap or aggregate calculations from an unsigned integer column. Returns cardinality of the `UInt64` type; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md).
+
+``` sql
+groupBitmap(expr)
+```
+
+**Arguments**
+
+`expr` – An expression that results in `UInt*` type.
+
+**Returned value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+Test data:
+
+``` text
+UserID
+1
+1
+2
+3
+```
+
+Query:
+
+``` sql
+SELECT groupBitmap(UserID) as num FROM t
+```
+
+Result:
+
+``` text
+num
+3
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapand.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapand.md
new file mode 100644
index 00000000000..89c94547f8b
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapand.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 129
+---
+
+# groupBitmapAnd {#groupbitmapand}
+
+Calculates the AND of a bitmap column. Returns cardinality of the `UInt64` type; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md).
+
+``` sql
+groupBitmapAnd(expr)
+```
+
+**Arguments**
+
+`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
+
+**Returned value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+``` sql
+DROP TABLE IF EXISTS bitmap_column_expr_test2;
+CREATE TABLE bitmap_column_expr_test2
+(
+    tag_id String,
+    z AggregateFunction(groupBitmap, UInt32)
+)
+ENGINE = MergeTree
+ORDER BY tag_id;
+
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32))));
+
+SELECT groupBitmapAnd(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─groupBitmapAnd(z)─┐
+│ 3 │
+└───────────────────┘
+
+SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐
+│ [6,8,10] │
+└──────────────────────────────────────────────────┘
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapor.md
new file mode 100644
index 00000000000..172a3bb29ac
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapor.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 130
+---
+
+# groupBitmapOr {#groupbitmapor}
+
+Calculates the OR of a bitmap column. Returns cardinality of the `UInt64` type; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md). This is equivalent to `groupBitmapMerge`.
+
+``` sql
+groupBitmapOr(expr)
+```
+
+**Arguments**
+
+`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
+
+**Returned value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+``` sql
+DROP TABLE IF EXISTS bitmap_column_expr_test2;
+CREATE TABLE bitmap_column_expr_test2
+(
+    tag_id String,
+    z AggregateFunction(groupBitmap, UInt32)
+)
+ENGINE = MergeTree
+ORDER BY tag_id;
+
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32))));
+
+SELECT groupBitmapOr(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─groupBitmapOr(z)─┐
+│ 15 │
+└──────────────────┘
+
+SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐
+│ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] │
+└─────────────────────────────────────────────────┘
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapxor.md
new file mode 100644
index 00000000000..52c45815cc5
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitmapxor.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 131
+---
+
+# groupBitmapXor {#groupbitmapxor}
+
+Calculates the XOR of a bitmap column. Returns cardinality of the `UInt64` type; if the `-State` suffix is added, it returns a [bitmap object](../../../sql-reference/functions/bitmap-functions.md).
+
+``` sql
+groupBitmapXor(expr)
+```
+
+**Arguments**
+
+`expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
+
+**Returned value**
+
+Value of the `UInt64` type.
+
+**Example**
+
+``` sql
+DROP TABLE IF EXISTS bitmap_column_expr_test2;
+CREATE TABLE bitmap_column_expr_test2
+(
+    tag_id String,
+    z AggregateFunction(groupBitmap, UInt32)
+)
+ENGINE = MergeTree
+ORDER BY tag_id;
+
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag1', bitmapBuild(cast([1,2,3,4,5,6,7,8,9,10] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag2', bitmapBuild(cast([6,7,8,9,10,11,12,13,14,15] as Array(UInt32))));
+INSERT INTO bitmap_column_expr_test2 VALUES ('tag3', bitmapBuild(cast([2,4,6,8,10,12] as Array(UInt32))));
+
+SELECT groupBitmapXor(z) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─groupBitmapXor(z)─┐
+│ 10 │
+└───────────────────┘
+
+SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_test2 WHERE like(tag_id, 'tag%');
+┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐
+│ [1,3,5,6,8,10,11,13,14,15] │
+└──────────────────────────────────────────────────┘
+```
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitor.md
new file mode 100644
index 00000000000..c1ee1c40894
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitor.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 126
+---
+
+# groupBitOr {#groupbitor}
+
+Applies bitwise `OR` to a series of numbers.
+
+``` sql
+groupBitOr(expr)
+```
+
+**Arguments**
+
+`expr` – An expression that results in `UInt*` type.
+
+**Returned value**
+
+Value of the `UInt*` type.
+ +**Example** + +Test data: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +Query: + +``` sql +SELECT groupBitOr(num) FROM t +``` + +Where `num` is the column with the test data. + +Result: + +``` text +binary decimal +01111101 = 125 +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitxor.md new file mode 100644 index 00000000000..472bcdf65c1 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -0,0 +1,46 @@ +--- +sidebar_position: 127 +--- + +# groupBitXor {#groupbitxor} + +Applies bitwise `XOR` for series of numbers. + +``` sql +groupBitXor(expr) +``` + +**Arguments** + +`expr` – An expression that results in `UInt*` type. + +**Return value** + +Value of the `UInt*` type. + +**Example** + +Test data: + +``` text +binary decimal +00101100 = 44 +00011100 = 28 +00001101 = 13 +01010101 = 85 +``` + +Query: + +``` sql +SELECT groupBitXor(num) FROM t +``` + +Where `num` is the column with the test data. + +Result: + +``` text +binary decimal +01101000 = 104 +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/en/reference/sql-reference/aggregate-functions/reference/groupuniqarray.md new file mode 100644 index 00000000000..9b5058032e5 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -0,0 +1,12 @@ +--- +sidebar_position: 111 +--- + +# groupUniqArray {#groupuniqarray} + +Syntax: `groupUniqArray(x)` or `groupUniqArray(max_size)(x)` + +Creates an array from different argument values. Memory consumption is the same as for the [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) function. + +The second version (with the `max_size` parameter) limits the size of the resulting array to `max_size` elements. +For example, `groupUniqArray(1)(x)` is equivalent to `[any(x)]`. 
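+
+A brief usage sketch (the `visits` table and its columns are hypothetical):
+
+``` sql
+-- Collect the distinct browsers seen per user; element order is not guaranteed.
+SELECT user_id, groupUniqArray(browser) AS browsers FROM visits GROUP BY user_id;
+
+-- Keep at most 3 distinct values per group.
+SELECT user_id, groupUniqArray(3)(browser) AS some_browsers FROM visits GROUP BY user_id;
+```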
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/index.md b/docs/en/reference/sql-reference/aggregate-functions/reference/index.md new file mode 100644 index 00000000000..cd71bca2556 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/index.md @@ -0,0 +1,76 @@ +--- +toc_folder_title: Reference +sidebar_position: 36 +toc_hidden: true +--- + +# List of Aggregate Functions {#aggregate-functions-reference} + +Standard aggregate functions: + +- [count](../../../sql-reference/aggregate-functions/reference/count.md) +- [min](../../../sql-reference/aggregate-functions/reference/min.md) +- [max](../../../sql-reference/aggregate-functions/reference/max.md) +- [sum](../../../sql-reference/aggregate-functions/reference/sum.md) +- [avg](../../../sql-reference/aggregate-functions/reference/avg.md) +- [any](../../../sql-reference/aggregate-functions/reference/any.md) +- [stddevPop](../../../sql-reference/aggregate-functions/reference/stddevpop.md) +- [stddevSamp](../../../sql-reference/aggregate-functions/reference/stddevsamp.md) +- [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md) +- [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md) +- [covarPop](../../../sql-reference/aggregate-functions/reference/covarpop.md) +- [covarSamp](../../../sql-reference/aggregate-functions/reference/covarsamp.md) + +ClickHouse-specific aggregate functions: + +- [anyHeavy](../../../sql-reference/aggregate-functions/reference/anyheavy.md) +- [anyLast](../../../sql-reference/aggregate-functions/reference/anylast.md) +- [argMin](../../../sql-reference/aggregate-functions/reference/argmin.md) +- [argMax](../../../sql-reference/aggregate-functions/reference/argmax.md) +- [avgWeighted](../../../sql-reference/aggregate-functions/reference/avgweighted.md) +- [topK](../../../sql-reference/aggregate-functions/reference/topk.md) +- [topKWeighted](../../../sql-reference/aggregate-functions/reference/topkweighted.md) +- [groupArray](../../../sql-reference/aggregate-functions/reference/grouparray.md) +- [groupUniqArray](../../../sql-reference/aggregate-functions/reference/groupuniqarray.md) +- [groupArrayInsertAt](../../../sql-reference/aggregate-functions/reference/grouparrayinsertat.md) +- [groupArrayMovingAvg](../../../sql-reference/aggregate-functions/reference/grouparraymovingavg.md) +- [groupArrayMovingSum](../../../sql-reference/aggregate-functions/reference/grouparraymovingsum.md) +- [groupBitAnd](../../../sql-reference/aggregate-functions/reference/groupbitand.md) +- [groupBitOr](../../../sql-reference/aggregate-functions/reference/groupbitor.md) +- [groupBitXor](../../../sql-reference/aggregate-functions/reference/groupbitxor.md) +- [groupBitmap](../../../sql-reference/aggregate-functions/reference/groupbitmap.md) +- [groupBitmapAnd](../../../sql-reference/aggregate-functions/reference/groupbitmapand.md) +- [groupBitmapOr](../../../sql-reference/aggregate-functions/reference/groupbitmapor.md) +- [groupBitmapXor](../../../sql-reference/aggregate-functions/reference/groupbitmapxor.md) +- [sumWithOverflow](../../../sql-reference/aggregate-functions/reference/sumwithoverflow.md) +- [sumMap](../../../sql-reference/aggregate-functions/reference/summap.md) +- [minMap](../../../sql-reference/aggregate-functions/reference/minmap.md) +- [maxMap](../../../sql-reference/aggregate-functions/reference/maxmap.md) +- [skewSamp](../../../sql-reference/aggregate-functions/reference/skewsamp.md) +- 
[skewPop](../../../sql-reference/aggregate-functions/reference/skewpop.md) +- [kurtSamp](../../../sql-reference/aggregate-functions/reference/kurtsamp.md) +- [kurtPop](../../../sql-reference/aggregate-functions/reference/kurtpop.md) +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md) +- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md) +- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md) +- [quantileExactLow](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactlow) +- [quantileExactHigh](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexacthigh) +- [quantileExactWeighted](../../../sql-reference/aggregate-functions/reference/quantileexactweighted.md) +- [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md) +- [quantileTimingWeighted](../../../sql-reference/aggregate-functions/reference/quantiletimingweighted.md) +- [quantileDeterministic](../../../sql-reference/aggregate-functions/reference/quantiledeterministic.md) +- [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) +- [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md) +- [quantileBFloat16](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16) +- [quantileBFloat16Weighted](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16weighted) +- [simpleLinearRegression](../../../sql-reference/aggregate-functions/reference/simplelinearregression.md) +- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md) +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md) +- [categoricalInformationValue](../../../sql-reference/aggregate-functions/reference/categoricalinformationvalue.md) + +[Original article](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/intervalLengthSum.md b/docs/en/reference/sql-reference/aggregate-functions/reference/intervalLengthSum.md new file mode 100644 index 00000000000..33c5686cbbc --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/intervalLengthSum.md @@ -0,0 +1,108 @@ +--- +sidebar_position: 146 +sidebar_label: intervalLengthSum +--- + +# intervalLengthSum {#agg_function-intervallengthsum} + +Calculates the total length of union of all ranges (segments on numeric axis). + +**Syntax** + +``` sql +intervalLengthSum(start, end) +``` + +**Arguments** + +- `start` — The starting value of the interval. 
[Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date). +- `end` — The ending value of the interval. [Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date). + +:::note +Arguments must be of the same data type. Otherwise, an exception will be thrown. +::: + +**Returned value** + +- Total length of union of all ranges (segments on numeric axis). Depending on the type of the argument, the return value may be [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64) type. + +**Examples** + +1. Input table: + +``` text +┌─id─┬─start─┬─end─┐ +│ a │ 1.1 │ 2.9 │ +│ a │ 2.5 │ 3.2 │ +│ a │ 4 │ 5 │ +└────┴───────┴─────┘ +``` + +In this example, the arguments of the Float32 type are used. The function returns a value of the Float64 type. + +Result is the sum of lengths of intervals `[1.1, 3.2]` (union of `[1.1, 2.9]` and `[2.5, 3.2]`) and `[4, 5]` + +Query: + +``` sql +SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM fl_interval GROUP BY id ORDER BY id; +``` + +Result: + +``` text +┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐ +│ a │ 3.1 │ Float64 │ +└────┴───────────────────────────────┴───────────────────────────────────────────┘ +``` + +2. Input table: + +``` text +┌─id─┬───────────────start─┬─────────────────end─┐ +│ a │ 2020-01-01 01:12:30 │ 2020-01-01 02:10:10 │ +│ a │ 2020-01-01 02:05:30 │ 2020-01-01 02:50:31 │ +│ a │ 2020-01-01 03:11:22 │ 2020-01-01 03:23:31 │ +└────┴─────────────────────┴─────────────────────┘ +``` + +In this example, the arguments of the DateTime type are used. The function returns a value in seconds. + +Query: + +``` sql +SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM dt_interval GROUP BY id ORDER BY id; +``` + +Result: + +``` text +┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐ +│ a │ 6610 │ UInt64 │ +└────┴───────────────────────────────┴───────────────────────────────────────────┘ +``` + +3. 
Input table: + +``` text +┌─id─┬──────start─┬────────end─┐ +│ a  │ 2020-01-01 │ 2020-01-04 │ +│ a  │ 2020-01-12 │ 2020-01-18 │ +└────┴────────────┴────────────┘ +``` + +In this example, the arguments of the Date type are used. The function returns a value in days. + +Query: + +``` sql +SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM date_interval GROUP BY id ORDER BY id; +``` + +Result: + +``` text +┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐ +│ a  │                             9 │ UInt64                                    │ +└────┴───────────────────────────────┴───────────────────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/en/reference/sql-reference/aggregate-functions/reference/kurtpop.md new file mode 100644 index 00000000000..5640e69ba7c --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/kurtpop.md @@ -0,0 +1,25 @@ +--- +sidebar_position: 153 +--- + +# kurtPop {#kurtpop} + +Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. + +``` sql +kurtPop(expr) +``` + +**Arguments** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The kurtosis of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). + +**Example** + +``` sql +SELECT kurtPop(value) FROM series_with_value_column; +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/en/reference/sql-reference/aggregate-functions/reference/kurtsamp.md new file mode 100644 index 00000000000..c0768edaf2d --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -0,0 +1,27 @@ +--- +sidebar_position: 154 +--- + +# kurtSamp {#kurtsamp} + +Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. + +It represents an unbiased estimate of the kurtosis of a random variable if the passed values form its sample. + +``` sql +kurtSamp(expr) +``` + +**Arguments** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The kurtosis of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). If `n <= 1` (`n` is the size of the sample), then the function returns `nan`. + +**Example** + +``` sql +SELECT kurtSamp(value) FROM series_with_value_column; +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/en/reference/sql-reference/aggregate-functions/reference/mannwhitneyutest.md new file mode 100644 index 00000000000..32e56b8de10 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -0,0 +1,74 @@ +--- +sidebar_position: 310 +sidebar_label: mannWhitneyUTest +--- + +# mannWhitneyUTest {#mannwhitneyutest} + +Applies the Mann-Whitney rank test to samples from two populations. + +**Syntax** + +``` sql +mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_index) +``` + +Values of both samples are in the `sample_data` column. If `sample_index` equals 0, then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population. +The null hypothesis is that the two populations are stochastically equal. One-sided hypotheses can also be tested. This test does not assume that the data are normally distributed.
+ +**Arguments** + +- `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md). + +**Parameters** + +- `alternative` — alternative hypothesis. (Optional, default: `'two-sided'`.) [String](../../../sql-reference/data-types/string.md). + - `'two-sided'`; + - `'greater'`; + - `'less'`. +- `continuity_correction` — if not 0 then continuity correction in the normal approximation for the p-value is applied. (Optional, default: 1.) [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned values** + +[Tuple](../../../sql-reference/data-types/tuple.md) with two elements: + +- calculated U-statistic. [Float64](../../../sql-reference/data-types/float.md). +- calculated p-value. [Float64](../../../sql-reference/data-types/float.md). + + +**Example** + +Input table: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 10 │ 0 │ +│ 11 │ 0 │ +│ 12 │ 0 │ +│ 1 │ 1 │ +│ 2 │ 1 │ +│ 3 │ 1 │ +└─────────────┴──────────────┘ +``` + +Query: + +``` sql +SELECT mannWhitneyUTest('greater')(sample_data, sample_index) FROM mww_ttest; +``` + +Result: + +``` text +┌─mannWhitneyUTest('greater')(sample_data, sample_index)─┐ +│ (9,0.04042779918503192) │ +└────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [Mann–Whitney U test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test) +- [Stochastic ordering](https://en.wikipedia.org/wiki/Stochastic_ordering) + +[Original article](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest/) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/max.md b/docs/en/reference/sql-reference/aggregate-functions/reference/max.md new file mode 100644 index 00000000000..845d0c5ecee --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/max.md @@ -0,0 +1,24 @@ +--- +sidebar_position: 3 +--- + +# max {#agg_function-max} + +Aggregate function that calculates the maximum across a group of values. + +Example: + +``` +SELECT max(salary) FROM employees; +``` + +``` +SELECT department, max(salary) FROM employees GROUP BY department; +``` + +If you need non-aggregate function to choose a maximum of two values, see `greatest`: + +``` +SELECT greatest(a, b) FROM table; +``` + diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/maxmap.md b/docs/en/reference/sql-reference/aggregate-functions/reference/maxmap.md new file mode 100644 index 00000000000..243a3375552 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/maxmap.md @@ -0,0 +1,28 @@ +--- +sidebar_position: 143 +--- + +# maxMap {#agg_functions-maxmap} + +Syntax: `maxMap(key, value)` or `maxMap(Tuple(key, value))` + +Calculates the maximum from `value` array according to the keys specified in the `key` array. + +Passing a tuple of keys and value arrays is identical to passing two arrays of keys and values. + +The number of elements in `key` and `value` must be the same for each row that is totaled. + +Returns a tuple of two arrays: keys and values calculated for the corresponding keys. 
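+ +The tuple form can be sketched with the same data as the example below (shown only to illustrate the equivalence described above): + +``` sql +SELECT maxMap(tuple(a, b)) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +It should return the same `([1,2,3],[2,2,1])` as the two-array form in the example below.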
+ +Example: + +``` sql +SELECT maxMap(a, b) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +``` text +┌─maxMap(a, b)──────┐ +│ ([1,2,3],[2,2,1]) │ +└───────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/meanztest.md b/docs/en/reference/sql-reference/aggregate-functions/reference/meanztest.md new file mode 100644 index 00000000000..02b89b1b31d --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/meanztest.md @@ -0,0 +1,70 @@ +--- +sidebar_position: 303 +sidebar_label: meanZTest +--- + +# meanZTest {#meanztest} + +Applies mean z-test to samples from two populations. + +**Syntax** + +``` sql +meanZTest(population_variance_x, population_variance_y, confidence_level)(sample_data, sample_index) +``` + +Values of both samples are in the `sample_data` column. If `sample_index` equals to 0 then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population. +The null hypothesis is that means of populations are equal. Normal distribution is assumed. Populations may have unequal variance and the variances are known. + +**Arguments** + +- `sample_data` — Sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — Sample index. [Integer](../../../sql-reference/data-types/int-uint.md). + +**Parameters** + +- `population_variance_x` — Variance for population x. [Float](../../../sql-reference/data-types/float.md). +- `population_variance_y` — Variance for population y. [Float](../../../sql-reference/data-types/float.md). +- `confidence_level` — Confidence level in order to calculate confidence intervals. [Float](../../../sql-reference/data-types/float.md). + +**Returned values** + +[Tuple](../../../sql-reference/data-types/tuple.md) with four elements: + +- calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md). +- calculated p-value. [Float64](../../../sql-reference/data-types/float.md). +- calculated confidence-interval-low. [Float64](../../../sql-reference/data-types/float.md). +- calculated confidence-interval-high. [Float64](../../../sql-reference/data-types/float.md). + + +**Example** + +Input table: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 21.9 │ 0 │ +│ 22.1 │ 0 │ +│ 18.9 │ 1 │ +│ 19 │ 1 │ +│ 20.3 │ 1 │ +└─────────────┴──────────────┘ +``` + +Query: + +``` sql +SELECT meanZTest(0.7, 0.45, 0.95)(sample_data, sample_index) FROM mean_ztest +``` + +Result: + +``` text +┌─meanZTest(0.7, 0.45, 0.95)(sample_data, sample_index)────────────────────────────┐ +│ (3.2841296025548123,0.0010229786769086013,0.8198428246768334,3.2468238419898365) │ +└──────────────────────────────────────────────────────────────────────────────────┘ +``` + + +[Original article](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/meanZTest/) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/median.md b/docs/en/reference/sql-reference/aggregate-functions/reference/median.md new file mode 100644 index 00000000000..3e84b4b169c --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/median.md @@ -0,0 +1,46 @@ +--- +sidebar_position: 212 +--- + +# median {#median} + +The `median*` functions are the aliases for the corresponding `quantile*` functions. 
They calculate median of a numeric data sample. + +Functions: + +- `median` — Alias for [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +- `medianDeterministic` — Alias for [quantileDeterministic](../../../sql-reference/aggregate-functions/reference/quantiledeterministic.md#quantiledeterministic). +- `medianExact` — Alias for [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact). +- `medianExactWeighted` — Alias for [quantileExactWeighted](../../../sql-reference/aggregate-functions/reference/quantileexactweighted.md#quantileexactweighted). +- `medianTiming` — Alias for [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). +- `medianTimingWeighted` — Alias for [quantileTimingWeighted](../../../sql-reference/aggregate-functions/reference/quantiletimingweighted.md#quantiletimingweighted). +- `medianTDigest` — Alias for [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md#quantiletdigest). +- `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md#quantiletdigestweighted). +- `medianBFloat16` — Alias for [quantileBFloat16](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16). + +**Example** + +Input table: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +Query: + +``` sql +SELECT medianDeterministic(val, 1) FROM t; +``` + +Result: + +``` text +┌─medianDeterministic(val, 1)─┐ +│ 1.5 │ +└─────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/min.md b/docs/en/reference/sql-reference/aggregate-functions/reference/min.md new file mode 100644 index 00000000000..0525066e9f3 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/min.md @@ -0,0 +1,23 @@ +--- +sidebar_position: 2 +--- + +## min {#agg_function-min} + +Aggregate function that calculates the minimum across a group of values. + +Example: + +``` +SELECT min(salary) FROM employees; +``` + +``` +SELECT department, min(salary) FROM employees GROUP BY department; +``` + +If you need non-aggregate function to choose a minimum of two values, see `least`: + +``` +SELECT least(a, b) FROM table; +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/minmap.md b/docs/en/reference/sql-reference/aggregate-functions/reference/minmap.md new file mode 100644 index 00000000000..8a4d50dd46c --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/minmap.md @@ -0,0 +1,28 @@ +--- +sidebar_position: 142 +--- + +# minMap {#agg_functions-minmap} + +Syntax: `minMap(key, value)` or `minMap(Tuple(key, value))` + +Calculates the minimum from `value` array according to the keys specified in the `key` array. + +Passing a tuple of keys and value ​​arrays is identical to passing two arrays of keys and values. + +The number of elements in `key` and `value` must be the same for each row that is totaled. + +Returns a tuple of two arrays: keys in sorted order, and values calculated for the corresponding keys. 
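+ +The tuple form can be used the same way; a sketch reusing the data from the example below (it relies only on the equivalence described above): + +``` sql +SELECT minMap(tuple(a, b)) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +```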
+ +Example: + +``` sql +SELECT minMap(a, b) +FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1])) +``` + +``` text +┌─minMap(a, b)──────┐ +│ ([1,2,3],[2,1,1]) │ +└───────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantile.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantile.md new file mode 100644 index 00000000000..6a0479da77f --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantile.md @@ -0,0 +1,68 @@ +--- +sidebar_position: 200 +--- + +# quantile {#quantile} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) with a reservoir size up to 8192 and a random number generator for sampling. The result is non-deterministic. To get an exact quantile, use the [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) function. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +Note that for an empty numeric sequence, `quantile` will return NaN, but its `quantile*` variants will return either NaN or a default value for the sequence type, depending on the variant. + +**Syntax** + +``` sql +quantile(level)(expr) +``` + +Alias: `median`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Input table: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +Query: + +``` sql +SELECT quantile(val) FROM t +``` + +Result: + +``` text +┌─quantile(val)─┐ +│ 1.5 │ +└───────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantilebfloat16.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantilebfloat16.md new file mode 100644 index 00000000000..f0bd51f0add --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantilebfloat16.md @@ -0,0 +1,68 @@ +--- +sidebar_position: 209 +--- + +# quantileBFloat16 {#quantilebfloat16} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a sample consisting of [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) numbers. 
`bfloat16` is a floating-point data type with 1 sign bit, 8 exponent bits and 7 fraction bits. +The function converts input values to 32-bit floats and takes the most significant 16 bits. Then it calculates `bfloat16` quantile value and converts the result to a 64-bit float by appending zero bits. +The function is a fast quantile estimator with a relative error no more than 0.390625%. + +**Syntax** + +``` sql +quantileBFloat16[(level)](expr) +``` + +Alias: `medianBFloat16` + +**Arguments** + +- `expr` — Column with numeric data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md). + +**Parameters** + +- `level` — Level of quantile. Optional. Possible values are in the range from 0 to 1. Default value: 0.5. [Float](../../../sql-reference/data-types/float.md). + +**Returned value** + +- Approximate quantile of the specified level. + +Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64). + +**Example** + +Input table has an integer and a float columns: + +``` text +┌─a─┬─────b─┐ +│ 1 │ 1.001 │ +│ 2 │ 1.002 │ +│ 3 │ 1.003 │ +│ 4 │ 1.004 │ +└───┴───────┘ +``` + +Query to calculate 0.75-quantile (third quartile): + +``` sql +SELECT quantileBFloat16(0.75)(a), quantileBFloat16(0.75)(b) FROM example_table; +``` + +Result: + +``` text +┌─quantileBFloat16(0.75)(a)─┬─quantileBFloat16(0.75)(b)─┐ +│ 3 │ 1 │ +└───────────────────────────┴───────────────────────────┘ +``` +Note that all floating point values in the example are truncated to 1.0 when converting to `bfloat16`. + +# quantileBFloat16Weighted {#quantilebfloat16weighted} + +Like `quantileBFloat16` but takes into account the weight of each sequence member. + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiledeterministic.md new file mode 100644 index 00000000000..bb23ce63cea --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -0,0 +1,67 @@ +--- +sidebar_position: 206 +--- + +# quantileDeterministic {#quantiledeterministic} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) with a reservoir size up to 8192 and deterministic algorithm of sampling. The result is deterministic. To get an exact quantile, use the [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact) function. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileDeterministic(level)(expr, determinator) +``` + +Alias: `medianDeterministic`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). 
+- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `determinator` — Number whose hash is used instead of a random number generator in the reservoir sampling algorithm to make the result of sampling deterministic. As a determinator you can use any deterministic positive number, for example, a user id or an event id. If the same determinator value occures too often, the function works incorrectly. + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Input table: + +``` text +┌─val─┐ +│ 1 │ +│ 1 │ +│ 2 │ +│ 3 │ +└─────┘ +``` + +Query: + +``` sql +SELECT quantileDeterministic(val, 1) FROM t +``` + +Result: + +``` text +┌─quantileDeterministic(val, 1)─┐ +│ 1.5 │ +└───────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantileexact.md new file mode 100644 index 00000000000..b3a384b0cfd --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantileexact.md @@ -0,0 +1,270 @@ +--- +sidebar_position: 202 +--- + +# quantileExact Functions {#quantileexact-functions} + +## quantileExact {#quantileexact} + +Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExact(level)(expr) +``` + +Alias: `medianExact`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+ +**Example** + +Query: + +``` sql +SELECT quantileExact(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileExact(number)─┐ +│ 5 │ +└───────────────────────┘ +``` + +## quantileExactLow {#quantileexactlow} + +Similar to `quantileExact`, this computes the exact [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get the exact value, all the passed values are combined into an array, which is then fully sorted. The sorting [algorithm's](https://en.cppreference.com/w/cpp/algorithm/sort) complexity is `O(N·log(N))`, where `N = std::distance(first, last)` comparisons. + +The return value depends on the quantile level and the number of elements in the selection, i.e. if the level is 0.5, then the function returns the lower median value for an even number of elements and the middle median value for an odd number of elements. Median is calculated similarly to the [median_low](https://docs.python.org/3/library/statistics.html#statistics.median_low) implementation which is used in python. + +For all other levels, the element at the index corresponding to the value of `level * size_of_array` is returned. For example: + +``` sql +SELECT quantileExactLow(0.1)(number) FROM numbers(10) + +┌─quantileExactLow(0.1)(number)─┐ +│ 1 │ +└───────────────────────────────┘ +``` + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExactLow(level)(expr) +``` + +Alias: `medianExactLow`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileExactLow(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileExactLow(number)─┐ +│ 4 │ +└──────────────────────────┘ +``` +## quantileExactHigh {#quantileexacthigh} + +Similar to `quantileExact`, this computes the exact [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +All the passed values are combined into an array, which is then fully sorted, to get the exact value. The sorting [algorithm's](https://en.cppreference.com/w/cpp/algorithm/sort) complexity is `O(N·log(N))`, where `N = std::distance(first, last)` comparisons. + +The return value depends on the quantile level and the number of elements in the selection, i.e. if the level is 0.5, then the function returns the higher median value for an even number of elements and the middle median value for an odd number of elements. 
Median is calculated similarly to the [median_high](https://docs.python.org/3/library/statistics.html#statistics.median_high) implementation which is used in python. For all other levels, the element at the index corresponding to the value of `level * size_of_array` is returned. + +This implementation behaves exactly similar to the current `quantileExact` implementation. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExactHigh(level)(expr) +``` + +Alias: `medianExactHigh`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileExactHigh(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileExactHigh(number)─┐ +│ 5 │ +└───────────────────────────┘ +``` + +## quantileExactExclusive {#quantileexactexclusive} + +Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. + +This function is equivalent to [PERCENTILE.EXC](https://support.microsoft.com/en-us/office/percentile-exc-function-bbaa7204-e9e1-4010-85bf-c31dc5dce4ba) Excel function, ([type R6](https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample)). + +When using multiple `quantileExactExclusive` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantilesExactExclusive](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantilesexactexclusive) function. + +**Syntax** + +``` sql +quantileExactExclusive(level)(expr) +``` + +**Arguments** + +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Parameters** + +- `level` — Level of quantile. Optional. Possible values: (0, 1) — bounds not included. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). [Float](../../../sql-reference/data-types/float.md). + +**Returned value** + +- Quantile of the specified level. 
+ +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +CREATE TABLE num AS numbers(1000); + +SELECT quantileExactExclusive(0.6)(x) FROM (SELECT number AS x FROM num); +``` + +Result: + +``` text +┌─quantileExactExclusive(0.6)(x)─┐ +│ 599.6 │ +└────────────────────────────────┘ +``` + +## quantileExactInclusive {#quantileexactinclusive} + +Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. + +This function is equivalent to [PERCENTILE.INC](https://support.microsoft.com/en-us/office/percentile-inc-function-680f9539-45eb-410b-9a5e-c1355e5fe2ed) Excel function, ([type R7](https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample)). + +When using multiple `quantileExactInclusive` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantilesExactInclusive](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantilesexactinclusive) function. + +**Syntax** + +``` sql +quantileExactInclusive(level)(expr) +``` + +**Arguments** + +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Parameters** + +- `level` — Level of quantile. Optional. Possible values: [0, 1] — bounds included. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). [Float](../../../sql-reference/data-types/float.md). + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+ +**Example** + +Query: + +``` sql +CREATE TABLE num AS numbers(1000); + +SELECT quantileExactInclusive(0.6)(x) FROM (SELECT number AS x FROM num); +``` + +Result: + +``` text +┌─quantileExactInclusive(0.6)(x)─┐ +│ 599.4 │ +└────────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantileexactweighted.md new file mode 100644 index 00000000000..4740d4a26f8 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -0,0 +1,67 @@ +--- +sidebar_position: 203 +--- + +# quantileExactWeighted {#quantileexactweighted} + +Exactly computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence, taking into account the weight of each element. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Each value is counted with its weight, as if it is present `weight` times. A hash table is used in the algorithm. Because of this, if the passed values ​​are frequently repeated, the function consumes less RAM than [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexact). You can use this function instead of `quantileExact` and specify the weight 1. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileExactWeighted(level)(expr, weight) +``` + +Alias: `medianExactWeighted`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. + +**Returned value** + +- Quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+ +**Example** + +Input table: + +``` text +┌─n─┬─val─┐ +│ 0 │ 3 │ +│ 1 │ 2 │ +│ 2 │ 1 │ +│ 5 │ 4 │ +└───┴─────┘ +``` + +Query: + +``` sql +SELECT quantileExactWeighted(n, val) FROM t +``` + +Result: + +``` text +┌─quantileExactWeighted(n, val)─┐ +│ 1 │ +└───────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiles.md new file mode 100644 index 00000000000..6d0cf37f25e --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiles.md @@ -0,0 +1,115 @@ +--- +sidebar_position: 201 +--- + +# quantiles Functions {#quantiles-functions} + +## quantiles {#quantiles} + +Syntax: `quantiles(level1, level2, …)(x)` + +All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. + +## quantilesExactExclusive {#quantilesexactexclusive} + +Exactly computes the [quantiles](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. + +This function is equivalent to [PERCENTILE.EXC](https://support.microsoft.com/en-us/office/percentile-exc-function-bbaa7204-e9e1-4010-85bf-c31dc5dce4ba) Excel function, ([type R6](https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample)). + +Works more efficiently with sets of levels than [quantileExactExclusive](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactexclusive). + +**Syntax** + +``` sql +quantilesExactExclusive(level1, level2, ...)(expr) +``` + +**Arguments** + +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Parameters** + +- `level` — Levels of quantiles. Possible values: (0, 1) — bounds not included. [Float](../../../sql-reference/data-types/float.md). + +**Returned value** + +- [Array](../../../sql-reference/data-types/array.md) of quantiles of the specified levels. + +Type of array values: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. 
+ +**Example** + +Query: + +``` sql +CREATE TABLE num AS numbers(1000); + +SELECT quantilesExactExclusive(0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999)(x) FROM (SELECT number AS x FROM num); +``` + +Result: + +``` text +┌─quantilesExactExclusive(0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999)(x)─┐ +│ [249.25,499.5,749.75,899.9,949.9499999999999,989.99,998.999] │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +## quantilesExactInclusive {#quantilesexactinclusive} + +Exactly computes the [quantiles](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +To get exact value, all the passed values ​​are combined into an array, which is then partially sorted. Therefore, the function consumes `O(n)` memory, where `n` is a number of values that were passed. However, for a small number of values, the function is very effective. + +This function is equivalent to [PERCENTILE.INC](https://support.microsoft.com/en-us/office/percentile-inc-function-680f9539-45eb-410b-9a5e-c1355e5fe2ed) Excel function, ([type R7](https://en.wikipedia.org/wiki/Quantile#Estimating_quantiles_from_a_sample)). + +Works more efficiently with sets of levels than [quantileExactInclusive](../../../sql-reference/aggregate-functions/reference/quantileexact.md#quantileexactinclusive). + +**Syntax** + +``` sql +quantilesExactInclusive(level1, level2, ...)(expr) +``` + +**Arguments** + +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Parameters** + +- `level` — Levels of quantiles. Possible values: [0, 1] — bounds included. [Float](../../../sql-reference/data-types/float.md). + +**Returned value** + +- [Array](../../../sql-reference/data-types/array.md) of quantiles of the specified levels. + +Type of array values: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +CREATE TABLE num AS numbers(1000); + +SELECT quantilesExactInclusive(0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999)(x) FROM (SELECT number AS x FROM num); +``` + +Result: + +``` text +┌─quantilesExactInclusive(0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999)(x)─┐ +│ [249.75,499.5,749.25,899.1,949.05,989.01,998.001] │ +└─────────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigest.md new file mode 100644 index 00000000000..f42c88b2aca --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -0,0 +1,57 @@ +--- +sidebar_position: 207 +--- + +# quantileTDigest {#quantiletdigest} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. + +Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. 
+ +The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileTDigest(level)(expr) +``` + +Alias: `medianTDigest`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileTDigest(number) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileTDigest(number)─┐ +│ 4.5 │ +└─────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md new file mode 100644 index 00000000000..684e438f0c7 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -0,0 +1,62 @@ +--- +sidebar_position: 208 +--- + +# quantileTDigestWeighted {#quantiletdigestweighted} + +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. The function takes into account the weight of each sequence member. The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. + +The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. + +The result depends on the order of running the query, and is nondeterministic. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). 
In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +:::note +Using `quantileTDigestWeighted` [is not recommended for tiny data sets](https://github.com/tdunning/t-digest/issues/167#issuecomment-828650275) and can lead to significant error. In this case, consider using [`quantileTDigest`](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) instead. +::: + +**Syntax** + +``` sql +quantileTDigestWeighted(level)(expr, weight) +``` + +Alias: `medianTDigestWeighted`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md). +- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. + +**Returned value** + +- Approximate quantile of the specified level. + +Type: + +- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input. +- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type. +- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type. + +**Example** + +Query: + +``` sql +SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) +``` + +Result: + +``` text +┌─quantileTDigestWeighted(number, 1)─┐ +│                                4.5 │ +└────────────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletiming.md new file mode 100644 index 00000000000..f282f7e2004 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -0,0 +1,88 @@ +--- +sidebar_position: 204 +--- + +# quantileTiming {#quantiletiming} + +With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. + +The result is deterministic (it does not depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileTiming(level)(expr) +``` + +Alias: `medianTiming`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
+ +- `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) over a column values returning a [Float\*](../../../sql-reference/data-types/float.md)-type number. + + - If negative values are passed to the function, the behavior is undefined. + - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. + +**Accuracy** + +The calculation is accurate if: + +- Total number of values does not exceed 5670. +- Total number of values exceeds 5670, but the page loading time is less than 1024ms. + +Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. + +:::note +For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +::: + +**Returned value** + +- Quantile of the specified level. + +Type: `Float32`. + +:::note +If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. +::: + +**Example** + +Input table: + +``` text +┌─response_time─┐ +│ 72 │ +│ 112 │ +│ 126 │ +│ 145 │ +│ 104 │ +│ 242 │ +│ 313 │ +│ 168 │ +│ 108 │ +└───────────────┘ +``` + +Query: + +``` sql +SELECT quantileTiming(response_time) FROM t +``` + +Result: + +``` text +┌─quantileTiming(response_time)─┐ +│ 126 │ +└───────────────────────────────┘ +``` + +**See Also** + +- [median](../../../sql-reference/aggregate-functions/reference/median.md#median) +- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletimingweighted.md new file mode 100644 index 00000000000..c773f900764 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -0,0 +1,121 @@ +--- +sidebar_position: 205 +--- + +# quantileTimingWeighted {#quantiletimingweighted} + +With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence according to the weight of each sequence member. + +The result is deterministic (it does not depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. + +When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. + +**Syntax** + +``` sql +quantileTimingWeighted(level)(expr, weight) +``` + +Alias: `medianTimingWeighted`. + +**Arguments** + +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). 
+
+- `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) over column values returning a [Float\*](../../../sql-reference/data-types/float.md)-type number.
+
+    - If negative values are passed to the function, the behavior is undefined.
+    - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000.
+
+- `weight` — Column with weights of sequence elements. Weight is the number of value occurrences.
+
+**Accuracy**
+
+The calculation is accurate if:
+
+- Total number of values does not exceed 5670.
+- Total number of values exceeds 5670, but the page loading time is less than 1024ms.
+
+Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms.
+
+:::note
+For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile).
+:::
+
+**Returned value**
+
+- Quantile of the specified level.
+
+Type: `Float32`.
+
+:::note
+If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values.
+:::
+
+**Example**
+
+Input table:
+
+``` text
+┌─response_time─┬─weight─┐
+│            68 │      1 │
+│           104 │      2 │
+│           112 │      3 │
+│           126 │      2 │
+│           138 │      1 │
+│           162 │      1 │
+└───────────────┴────────┘
+```
+
+Query:
+
+``` sql
+SELECT quantileTimingWeighted(response_time, weight) FROM t
+```
+
+Result:
+
+``` text
+┌─quantileTimingWeighted(response_time, weight)─┐
+│                                           112 │
+└───────────────────────────────────────────────┘
+```
+
+# quantilesTimingWeighted {#quantilestimingweighted}
+
+Same as `quantileTimingWeighted`, but accepts multiple parameters with quantile levels and returns an Array filled with the values of those quantiles.
+
+**Example**
+
+Input table:
+
+``` text
+┌─response_time─┬─weight─┐
+│            68 │      1 │
+│           104 │      2 │
+│           112 │      3 │
+│           126 │      2 │
+│           138 │      1 │
+│           162 │      1 │
+└───────────────┴────────┘
+```
+
+Query:
+
+``` sql
+SELECT quantilesTimingWeighted(0.5, 0.99)(response_time, weight) FROM t
+```
+
+Result:
+
+``` text
+┌─quantilesTimingWeighted(0.5, 0.99)(response_time, weight)─┐
+│ [112,162]                                                 │
+└───────────────────────────────────────────────────────────┘
+```
+
+**See Also**
+
+- [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
+- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/en/reference/sql-reference/aggregate-functions/reference/rankCorr.md
new file mode 100644
index 00000000000..399fd88cf0e
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/rankCorr.md
@@ -0,0 +1,57 @@
+---
+sidebar_position: 145
+---
+
+# rankCorr {#agg_function-rankcorr}
+
+Computes a rank correlation coefficient.
+
+**Syntax**
+
+``` sql
+rankCorr(x, y)
+```
+
+**Arguments**
+
+- `x` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+- `y` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+ +**Returned value(s)** + +- Returns a rank correlation coefficient of the ranks of x and y. The value of the correlation coefficient ranges from -1 to +1. If less than two arguments are passed, the function will return an exception. The value close to +1 denotes a high linear relationship, and with an increase of one random variable, the second random variable also increases. The value close to -1 denotes a high linear relationship, and with an increase of one random variable, the second random variable decreases. The value close or equal to 0 denotes no relationship between the two random variables. + +Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64). + +**Example** + +Query: + +``` sql +SELECT rankCorr(number, number) FROM numbers(100); +``` + +Result: + +``` text +┌─rankCorr(number, number)─┐ +│ 1 │ +└──────────────────────────┘ +``` + +Query: + +``` sql +SELECT roundBankers(rankCorr(exp(number), sin(number)), 3) FROM numbers(100); +``` + +Result: + +``` text +┌─roundBankers(rankCorr(exp(number), sin(number)), 3)─┐ +│ -0.037 │ +└─────────────────────────────────────────────────────┘ +``` +**See Also** + +- [Spearman's rank correlation coefficient](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/en/reference/sql-reference/aggregate-functions/reference/simplelinearregression.md new file mode 100644 index 00000000000..8684cd4c3bb --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -0,0 +1,42 @@ +--- +sidebar_position: 220 +--- + +# simpleLinearRegression {#simplelinearregression} + +Performs simple (unidimensional) linear regression. + +``` sql +simpleLinearRegression(x, y) +``` + +Parameters: + +- `x` — Column with dependent variable values. +- `y` — Column with explanatory variable values. + +Returned values: + +Constants `(a, b)` of the resulting line `y = a*x + b`. + +**Examples** + +``` sql +SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) +``` + +``` text +┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ +│ (1,0) │ +└───────────────────────────────────────────────────────────────────┘ +``` + +``` sql +SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) +``` + +``` text +┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ +│ (1,3) │ +└───────────────────────────────────────────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/skewpop.md b/docs/en/reference/sql-reference/aggregate-functions/reference/skewpop.md new file mode 100644 index 00000000000..4cb3d58304f --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/skewpop.md @@ -0,0 +1,25 @@ +--- +sidebar_position: 150 +--- + +# skewPop {#skewpop} + +Computes the [skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. + +``` sql +skewPop(expr) +``` + +**Arguments** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The skewness of the given distribution. 
Type — [Float64](../../../sql-reference/data-types/float.md) + +**Example** + +``` sql +SELECT skewPop(value) FROM series_with_value_column; +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/en/reference/sql-reference/aggregate-functions/reference/skewsamp.md new file mode 100644 index 00000000000..92e807d2d7d --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/skewsamp.md @@ -0,0 +1,27 @@ +--- +sidebar_position: 151 +--- + +# skewSamp {#skewsamp} + +Computes the [sample skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. + +It represents an unbiased estimate of the skewness of a random variable if passed values form its sample. + +``` sql +skewSamp(expr) +``` + +**Arguments** + +`expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number. + +**Returned value** + +The skewness of the given distribution. Type — [Float64](../../../sql-reference/data-types/float.md). If `n <= 1` (`n` is the size of the sample), then the function returns `nan`. + +**Example** + +``` sql +SELECT skewSamp(value) FROM series_with_value_column; +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/sparkbar.md b/docs/en/reference/sql-reference/aggregate-functions/reference/sparkbar.md new file mode 100644 index 00000000000..ebb9cccbd40 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/sparkbar.md @@ -0,0 +1,64 @@ +--- +sidebar_position: 311 +sidebar_label: sparkbar +--- + +# sparkbar {#sparkbar} + +The function plots a frequency histogram for values `x` and the repetition rate `y` of these values over the interval `[min_x, max_x]`. + + +If no interval is specified, then the minimum `x` is used as the interval start, and the maximum `x` — as the interval end. + +**Syntax** + +``` sql +sparkbar(width[, min_x, max_x])(x, y) +``` + +**Parameters** + +- `width` — The number of segments. Type: [Integer](../../../sql-reference/data-types/int-uint.md). +- `min_x` — The interval start. Optional parameter. +- `max_x` — The interval end. Optional parameter. + +**Arguments** + +- `x` — The field with values. +- `y` — The field with the frequency of values. + +**Returned value** + +- The frequency histogram. 
+ +**Example** + +Query: + +``` sql +CREATE TABLE spark_bar_data (`cnt` UInt64,`event_date` Date) ENGINE = MergeTree ORDER BY event_date SETTINGS index_granularity = 8192; + +INSERT INTO spark_bar_data VALUES(1,'2020-01-01'),(4,'2020-01-02'),(5,'2020-01-03'),(2,'2020-01-04'),(3,'2020-01-05'),(7,'2020-01-06'),(6,'2020-01-07'),(8,'2020-01-08'),(2,'2020-01-11'); + +SELECT sparkbar(9)(event_date,cnt) FROM spark_bar_data; + +SELECT sparkbar(9,toDate('2020-01-01'),toDate('2020-01-10'))(event_date,cnt) FROM spark_bar_data; +``` + +Result: + +``` text + +┌─sparkbar(9)(event_date, cnt)─┐ +│ │ +│ ▁▅▄▃██▅ ▁ │ +│ │ +└──────────────────────────────┘ + +┌─sparkbar(9, toDate('2020-01-01'), toDate('2020-01-10'))(event_date, cnt)─┐ +│ │ +│▁▄▄▂▅▇█▁ │ +│ │ +└──────────────────────────────────────────────────────────────────────────┘ +``` + diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/en/reference/sql-reference/aggregate-functions/reference/stddevpop.md new file mode 100644 index 00000000000..2b22320ae7a --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/stddevpop.md @@ -0,0 +1,11 @@ +--- +sidebar_position: 30 +--- + +# stddevPop {#stddevpop} + +The result is equal to the square root of [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md). + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevPopStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/en/reference/sql-reference/aggregate-functions/reference/stddevsamp.md new file mode 100644 index 00000000000..3dcee821606 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -0,0 +1,11 @@ +--- +sidebar_position: 31 +--- + +# stddevSamp {#stddevsamp} + +The result is equal to the square root of [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md). + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevSampStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlinearregression.md new file mode 100644 index 00000000000..e171629e90d --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -0,0 +1,75 @@ +--- +sidebar_position: 221 +--- + +# stochasticLinearRegression {#agg_functions-stochasticlinearregression} + +This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size and has few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). + +### Parameters {#agg_functions-stochasticlinearregression-parameters} + +There are 4 customizable parameters. 
They are passed to the function sequentially, but you do not need to pass all four: default values are used for the omitted ones. However, a good model usually requires some parameter tuning.
+
+``` text
+stochasticLinearRegression(1.0, 1.0, 10, 'SGD')
+```
+
+1. `learning rate` is the coefficient on the step length when a gradient descent step is performed. A learning rate that is too large may cause the model weights to diverge. Default is `0.00001`.
+2. `l2 regularization coefficient`, which may help to prevent overfitting. Default is `0.1`.
+3. `mini-batch size` sets the number of elements whose gradients are computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element; however, small batches (about 10 elements) make the gradient steps more stable. Default is `15`.
+4. `method for updating weights`: `Adam` (default), `SGD`, `Momentum`, or `Nesterov`. `Momentum` and `Nesterov` require a little more computation and memory, but they tend to improve the convergence speed and stability of stochastic gradient methods.
+
+### Usage {#agg_functions-stochasticlinearregression-usage}
+
+`stochasticLinearRegression` is used in two steps: fitting the model and predicting on new data. In order to fit the model and save its state for later usage, we use the `-State` combinator, which saves the state (model weights and so on).
+To predict, we use the function [evalMLMethod](../../../sql-reference/functions/machine-learning-functions.md#machine_learning_methods-evalmlmethod), which takes a state as an argument as well as the features to predict on.
+
+
+**1.** Fitting
+
+The following query may be used.
+
+``` sql
+CREATE TABLE IF NOT EXISTS train_data
+(
+    param1 Float64,
+    param2 Float64,
+    target Float64
+) ENGINE = Memory;
+
+CREATE TABLE your_model ENGINE = Memory AS SELECT
+stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2)
+AS state FROM train_data;
+```
+
+Here we also need to insert data into the `train_data` table. The number of parameters is not fixed; it depends only on the number of arguments passed into `linearRegressionState`. They must all be numeric values.
+Note that the column with the target value (which we would like to learn to predict) is inserted as the first argument.
+
+**2.** Predicting
+
+After saving the state into the table, we may use it multiple times for prediction, or even merge it with other states to create new, even better models.
+
+``` sql
+WITH (SELECT state FROM your_model) AS model SELECT
+evalMLMethod(model, param1, param2) FROM test_data
+```
+
+The query returns a column of predicted values. Note that the first argument of `evalMLMethod` is an `AggregateFunctionState` object; the following arguments are columns of features.
+
+`test_data` is a table like `train_data`, but it may omit the target value.
+
+### Notes {#agg_functions-stochasticlinearregression-notes}
+
+1. To merge two models, the user may create the following query:
+    `sql SELECT state1 + state2 FROM your_models`
+    where the `your_models` table contains both models. This query returns a new `AggregateFunctionState` object.
+
+2. The user may fetch the weights of the created model for their own purposes without saving the model if no `-State` combinator is used.
+    `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data`
+    Such a query fits the model and returns its weights: first come the weights that correspond to the parameters of the model, and the last one is the bias. So in the example above the query returns a column with 3 values.
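+
+### Example
+
+The end-to-end sketch below combines the fitting and prediction steps above on a toy dataset; the table names and sample values are illustrative only.
+
+``` sql
+-- Toy training data: target is roughly 2*param1 + 3*param2.
+CREATE TABLE toy_train_data (param1 Float64, param2 Float64, target Float64) ENGINE = Memory;
+INSERT INTO toy_train_data VALUES (1, 1, 5), (2, 1, 7), (1, 2, 8), (3, 2, 12), (2, 3, 13);
+
+-- Fit the model and store its state in a table.
+CREATE TABLE toy_model ENGINE = Memory AS SELECT
+stochasticLinearRegressionState(0.01, 0.1, 5, 'SGD')(target, param1, param2)
+AS state FROM toy_train_data;
+
+-- Predict for a new pair of feature values.
+WITH (SELECT state FROM toy_model) AS model SELECT
+evalMLMethod(model, param1, param2) AS prediction
+FROM (SELECT toFloat64(4) AS param1, toFloat64(1) AS param2);
+```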
+ +**See Also** + +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md new file mode 100644 index 00000000000..a7d4c640126 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -0,0 +1,55 @@ +--- +sidebar_position: 222 +--- + +# stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} + +This function implements stochastic logistic regression. It can be used for binary classification problem, supports the same custom parameters as stochasticLinearRegression and works the same way. + +### Parameters {#agg_functions-stochasticlogisticregression-parameters} + +Parameters are exactly the same as in stochasticLinearRegression: +`learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. +For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). + +``` text +stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') +``` + +**1.** Fitting + + + + See the `Fitting` section in the [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) description. + + Predicted labels have to be in \[-1, 1\]. + +**2.** Predicting + + + + Using saved state we can predict probability of object having label `1`. + + ``` sql + WITH (SELECT state FROM your_model) AS model SELECT + evalMLMethod(model, param1, param2) FROM test_data + ``` + + The query will return a column of probabilities. Note that first argument of `evalMLMethod` is `AggregateFunctionState` object, next are columns of features. + + We can also set a bound of probability, which assigns elements to different labels. + + ``` sql + SELECT ans < 1.1 AND ans > 0.5 FROM + (WITH (SELECT state FROM your_model) AS model SELECT + evalMLMethod(model, param1, param2) AS ans FROM test_data) + ``` + + Then the result will be labels. + + `test_data` is a table like `train_data` but may not contain target value. + +**See Also** + +- [stochasticLinearRegression](../../../sql-reference/aggregate-functions/reference/stochasticlinearregression.md#agg_functions-stochasticlinearregression) +- [Difference between linear and logistic regressions.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/studentttest.md b/docs/en/reference/sql-reference/aggregate-functions/reference/studentttest.md new file mode 100644 index 00000000000..86207a35c04 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/studentttest.md @@ -0,0 +1,73 @@ +--- +sidebar_position: 300 +sidebar_label: studentTTest +--- + +# studentTTest {#studentttest} + +Applies Student's t-test to samples from two populations. + +**Syntax** + +``` sql +studentTTest([confidence_level])(sample_data, sample_index) +``` + +Values of both samples are in the `sample_data` column. If `sample_index` equals to 0 then the value in that row belongs to the sample from the first population. 
Otherwise it belongs to the sample from the second population. +The null hypothesis is that means of populations are equal. Normal distribution with equal variances is assumed. + +**Arguments** + +- `sample_data` — Sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — Sample index. [Integer](../../../sql-reference/data-types/int-uint.md). + +**Parameters** + +- `confidence_level` — Confidence level in order to calculate confidence intervals. [Float](../../../sql-reference/data-types/float.md). + + +**Returned values** + +[Tuple](../../../sql-reference/data-types/tuple.md) with two or four elements (if the optional `confidence_level` is specified): + +- calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md). +- calculated p-value. [Float64](../../../sql-reference/data-types/float.md). +- [calculated confidence-interval-low.] [Float64](../../../sql-reference/data-types/float.md). +- [calculated confidence-interval-high.] [Float64](../../../sql-reference/data-types/float.md). + + +**Example** + +Input table: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 21.1 │ 0 │ +│ 21.9 │ 1 │ +│ 21.7 │ 0 │ +│ 19.9 │ 1 │ +│ 21.8 │ 1 │ +└─────────────┴──────────────┘ +``` + +Query: + +``` sql +SELECT studentTTest(sample_data, sample_index) FROM student_ttest; +``` + +Result: + +``` text +┌─studentTTest(sample_data, sample_index)───┐ +│ (-0.21739130434783777,0.8385421208415731) │ +└───────────────────────────────────────────┘ +``` + +**See Also** + +- [Student's t-test](https://en.wikipedia.org/wiki/Student%27s_t-test) +- [welchTTest function](welchttest.md#welchttest) + +[Original article](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/studentttest/) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/sum.md b/docs/en/reference/sql-reference/aggregate-functions/reference/sum.md new file mode 100644 index 00000000000..b72cb84e74f --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/sum.md @@ -0,0 +1,7 @@ +--- +sidebar_position: 4 +--- + +# sum {#agg_function-sum} + +Calculates the sum. Only works for numbers. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/sumcount.md b/docs/en/reference/sql-reference/aggregate-functions/reference/sumcount.md new file mode 100644 index 00000000000..dbc0601241e --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/sumcount.md @@ -0,0 +1,46 @@ +--- +sidebar_position: 144 +--- + +# sumCount {#agg_function-sumCount} + +Calculates the sum of the numbers and counts the number of rows at the same time. The function is used by ClickHouse query optimizer: if there are multiple `sum`, `count` or `avg` functions in a query, they can be replaced to single `sumCount` function to reuse the calculations. The function is rarely needed to use explicitly. + +**Syntax** + +``` sql +sumCount(x) +``` + +**Arguments** + +- `x` — Input value, must be [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), or [Decimal](../../../sql-reference/data-types/decimal.md). + +**Returned value** + +- Tuple `(sum, count)`, where `sum` is the sum of numbers and `count` is the number of rows with not-NULL values. + +Type: [Tuple](../../../sql-reference/data-types/tuple.md). 
+ +**Example** + +Query: + +``` sql +CREATE TABLE s_table (x Int8) Engine = Log; +INSERT INTO s_table SELECT number FROM numbers(0, 20); +INSERT INTO s_table VALUES (NULL); +SELECT sumCount(x) from s_table; +``` + +Result: + +``` text +┌─sumCount(x)─┐ +│ (190,20) │ +└─────────────┘ +``` + +**See also** + +- [optimize_syntax_fuse_functions](../../../operations/settings/settings.md#optimize_syntax_fuse_functions) setting. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/sumkahan.md b/docs/en/reference/sql-reference/aggregate-functions/reference/sumkahan.md new file mode 100644 index 00000000000..8c96464dfd5 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/sumkahan.md @@ -0,0 +1,40 @@ +--- +sidebar_position: 145 +--- + +# sumKahan {#agg_function-sumKahan} + +Calculates the sum of the numbers with [Kahan compensated summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm) +Slower than [sum](./sum.md) function. +The compensation works only for [Float](../../../sql-reference/data-types/float.md) types. + + +**Syntax** + +``` sql +sumKahan(x) +``` + +**Arguments** + +- `x` — Input value, must be [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), or [Decimal](../../../sql-reference/data-types/decimal.md). + +**Returned value** + +- the sum of numbers, with type [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md), or [Decimal](../../../sql-reference/data-types/decimal.md) depends on type of input arguments + +**Example** + +Query: + +``` sql +SELECT sum(0.1), sumKahan(0.1) FROM numbers(10); +``` + +Result: + +``` text +┌───────────sum(0.1)─┬─sumKahan(0.1)─┐ +│ 0.9999999999999999 │ 1 │ +└────────────────────┴───────────────┘ +``` \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/summap.md b/docs/en/reference/sql-reference/aggregate-functions/reference/summap.md new file mode 100644 index 00000000000..78ce6a9e835 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/summap.md @@ -0,0 +1,48 @@ +--- +sidebar_position: 141 +--- + +# sumMap {#agg_functions-summap} + +Syntax: `sumMap(key, value)` or `sumMap(Tuple(key, value))` + +Totals the `value` array according to the keys specified in the `key` array. + +Passing tuple of keys and values arrays is a synonym to passing two arrays of keys and values. + +The number of elements in `key` and `value` must be the same for each row that is totaled. + +Returns a tuple of two arrays: keys in sorted order, and values ​​summed for the corresponding keys. 
+ +Example: + +``` sql +CREATE TABLE sum_map( + date Date, + timeslot DateTime, + statusMap Nested( + status UInt16, + requests UInt64 + ), + statusMapTuple Tuple(Array(Int32), Array(Int32)) +) ENGINE = Log; +INSERT INTO sum_map VALUES + ('2000-01-01', '2000-01-01 00:00:00', [1, 2, 3], [10, 10, 10], ([1, 2, 3], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:00:00', [3, 4, 5], [10, 10, 10], ([3, 4, 5], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', [4, 5, 6], [10, 10, 10], ([4, 5, 6], [10, 10, 10])), + ('2000-01-01', '2000-01-01 00:01:00', [6, 7, 8], [10, 10, 10], ([6, 7, 8], [10, 10, 10])); + +SELECT + timeslot, + sumMap(statusMap.status, statusMap.requests), + sumMap(statusMapTuple) +FROM sum_map +GROUP BY timeslot +``` + +``` text +┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┬─sumMap(statusMapTuple)─────────┐ +│ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ ([1,2,3,4,5],[10,10,20,10,10]) │ +│ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ ([4,5,6,7,8],[10,10,20,10,10]) │ +└─────────────────────┴──────────────────────────────────────────────┴────────────────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/sumwithoverflow.md b/docs/en/reference/sql-reference/aggregate-functions/reference/sumwithoverflow.md new file mode 100644 index 00000000000..0582eb5fb7b --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/sumwithoverflow.md @@ -0,0 +1,9 @@ +--- +sidebar_position: 140 +--- + +# sumWithOverflow {#sumwithoverflowx} + +Computes the sum of the numbers, using the same data type for the result as for the input parameters. If the sum exceeds the maximum value for this data type, it is calculated with overflow. + +Only works for numbers. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/topk.md b/docs/en/reference/sql-reference/aggregate-functions/reference/topk.md new file mode 100644 index 00000000000..d0f445c6710 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/topk.md @@ -0,0 +1,42 @@ +--- +sidebar_position: 108 +--- + +# topK {#topk} + +Returns an array of the approximately most frequent values in the specified column. The resulting array is sorted in descending order of approximate frequency of values (not by the values themselves). + +Implements the [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) algorithm for analyzing TopK, based on the reduce-and-combine algorithm from [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). + +``` sql +topK(N)(column) +``` + +This function does not provide a guaranteed result. In certain situations, errors might occur and it might return frequent values that aren’t the most frequent values. + +We recommend using the `N < 10` value; performance is reduced with large `N` values. Maximum value of `N = 65536`. + +**Arguments** + +- `N` – The number of elements to return. + +If the parameter is omitted, default value 10 is used. + +**Arguments** + +- `x` – The value to calculate frequency. + +**Example** + +Take the [OnTime](../../../example-datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. 
+ +``` sql +SELECT topK(3)(AirlineID) AS res +FROM ontime +``` + +``` text +┌─res─────────────────┐ +│ [19393,19790,19805] │ +└─────────────────────┘ +``` diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/en/reference/sql-reference/aggregate-functions/reference/topkweighted.md new file mode 100644 index 00000000000..2d6e86667ef --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/topkweighted.md @@ -0,0 +1,43 @@ +--- +sidebar_position: 109 +--- + +# topKWeighted {#topkweighted} + +Returns an array of the approximately most frequent values in the specified column. The resulting array is sorted in descending order of approximate frequency of values (not by the values themselves). Additionally, the weight of the value is taken into account. + +**Syntax** + +``` sql +topKWeighted(N)(x, weight) +``` + +**Arguments** + +- `N` — The number of elements to return. +- `x` — The value. +- `weight` — The weight. Every value is accounted `weight` times for frequency calculation. [UInt64](../../../sql-reference/data-types/int-uint.md). + +**Returned value** + +Returns an array of the values with maximum approximate sum of weights. + +**Example** + +Query: + +``` sql +SELECT topKWeighted(10)(number, number) FROM numbers(1000) +``` + +Result: + +``` text +┌─topKWeighted(10)(number, number)──────────┐ +│ [999,998,997,996,995,994,993,992,991,990] │ +└───────────────────────────────────────────┘ +``` + +**See Also** + +- [topK](../../../sql-reference/aggregate-functions/reference/topk.md) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/uniq.md b/docs/en/reference/sql-reference/aggregate-functions/reference/uniq.md new file mode 100644 index 00000000000..6e6791702ef --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/uniq.md @@ -0,0 +1,39 @@ +--- +sidebar_position: 190 +--- + +# uniq {#agg_function-uniq} + +Calculates the approximate number of different values of the argument. + +``` sql +uniq(x[, ...]) +``` + +**Arguments** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +**Returned value** + +- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. + +**Implementation details** + +Function: + +- Calculates a hash for all parameters in the aggregate, then uses it in calculations. + +- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. + +- Provides the result deterministically (it does not depend on the query processing order). + +We recommend using this function in almost all scenarios. 
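+
+**Example**
+
+A minimal illustrative query; because `uniq` is approximate, the returned value can in principle deviate slightly from the exact distinct count (here, 10):
+
+``` sql
+SELECT uniq(number % 10) AS u FROM numbers(1000);
+```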
+ +**See Also** + +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) +- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined.md new file mode 100644 index 00000000000..79357cb14ce --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -0,0 +1,53 @@ +--- +sidebar_position: 192 +--- + +# uniqCombined {#agg_function-uniqcombined} + +Calculates the approximate number of different argument values. + +``` sql +uniqCombined(HLL_precision)(x[, ...]) +``` + +The `uniqCombined` function is a good choice for calculating the number of different values. + +**Arguments** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +`HLL_precision` is the base-2 logarithm of the number of cells in [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). Optional, you can use the function as `uniqCombined(x[, ...])`. The default value for `HLL_precision` is 17, which is effectively 96 KiB of space (2^17 cells, 6 bits each). + +**Returned value** + +- A number [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. + +**Implementation details** + +Function: + +- Calculates a hash (64-bit hash for `String` and 32-bit otherwise) for all parameters in the aggregate, then uses it in calculations. + +- Uses a combination of three algorithms: array, hash table, and HyperLogLog with an error correction table. + + For a small number of distinct elements, an array is used. When the set size is larger, a hash table is used. For a larger number of elements, HyperLogLog is used, which will occupy a fixed amount of memory. + +- Provides the result deterministically (it does not depend on the query processing order). + +:::note +Since it uses 32-bit hash for non-`String` type, the result will have very high error for cardinalities significantly larger than `UINT_MAX` (error will raise quickly after a few tens of billions of distinct values), hence in this case you should use [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +::: + +Compared to the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function, the `uniqCombined`: + +- Consumes several times less memory. +- Calculates with several times higher accuracy. +- Usually has slightly lower performance. In some scenarios, `uniqCombined` can perform better than `uniq`, for example, with distributed queries that transmit a large number of aggregation states over the network. 
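+
+**Example**
+
+A sketch of how the optional `HLL_precision` parameter is supplied; the precision value `12` below is illustrative only and trades some accuracy for a smaller state:
+
+``` sql
+SELECT
+    uniqCombined(number % 100000)     AS default_precision,
+    uniqCombined(12)(number % 100000) AS lower_precision
+FROM numbers(1000000);
+```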
+ +**See Also** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) +- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined64.md b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined64.md new file mode 100644 index 00000000000..fb0be23c768 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqcombined64.md @@ -0,0 +1,7 @@ +--- +sidebar_position: 193 +--- + +# uniqCombined64 {#agg_function-uniqcombined64} + +Same as [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined), but uses 64-bit hash for all data types. diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqexact.md new file mode 100644 index 00000000000..68e6bc562f9 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqexact.md @@ -0,0 +1,26 @@ +--- +sidebar_position: 191 +--- + +# uniqExact {#agg_function-uniqexact} + +Calculates the exact number of different argument values. + +``` sql +uniqExact(x[, ...]) +``` + +Use the `uniqExact` function if you absolutely need an exact result. Otherwise use the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function. + +The `uniqExact` function uses more memory than `uniq`, because the size of the state has unbounded growth as the number of different values increases. + +**Arguments** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +**See Also** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqcombined) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqhll12) +- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqhll12.md new file mode 100644 index 00000000000..1a13b365560 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -0,0 +1,40 @@ +--- +sidebar_position: 194 +--- + +# uniqHLL12 {#agg_function-uniqhll12} + +Calculates the approximate number of different argument values, using the [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm. + +``` sql +uniqHLL12(x[, ...]) +``` + +**Arguments** + +The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. + +**Returned value** + +- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number. + +**Implementation details** + +Function: + +- Calculates a hash for all parameters in the aggregate, then uses it in calculations. 
+
+- Uses the HyperLogLog algorithm to approximate the number of different argument values.
+
+    2^12 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements).
+
+- Provides the result deterministically (it does not depend on the query processing order).
+
+We do not recommend using this function. In most cases, use the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) or [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) function.
+
+**See Also**
+
+- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)
+- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)
+- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)
+- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch)
diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/uniqthetasketch.md b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqthetasketch.md
new file mode 100644
index 00000000000..9b9c16922b1
--- /dev/null
+++ b/docs/en/reference/sql-reference/aggregate-functions/reference/uniqthetasketch.md
@@ -0,0 +1,39 @@
+---
+sidebar_position: 195
+---
+
+# uniqTheta {#agg_function-uniqthetasketch}
+
+Calculates the approximate number of different argument values, using the [Theta Sketch Framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html).
+
+``` sql
+uniqTheta(x[, ...])
+```
+
+**Arguments**
+
+The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types.
+
+**Returned value**
+
+- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number.
+
+**Implementation details**
+
+Function:
+
+- Calculates a hash for all parameters in the aggregate, then uses it in calculations.
+
+- Uses the [KMV](https://datasketches.apache.org/docs/Theta/InverseEstimate.html) algorithm to approximate the number of different argument values.
+
+    A sketch of 4096 (2^12) 64-bit values is used. The size of the state is about 41 KB.
+
+- The relative error is 3.125% (95% confidence); see the [relative error table](https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html) for details.
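+
+**Example**
+
+A minimal illustrative query (assuming the server build includes Theta sketch support); it looks the same as for the other `uniq*` variants:
+
+``` sql
+SELECT uniqTheta(number % 1000) AS approx_distinct FROM numbers(100000);
+```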
+ +**See Also** + +- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) +- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined) +- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12) +- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact) diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/reference/sql-reference/aggregate-functions/reference/varpop.md new file mode 100644 index 00000000000..f16cfcdc63f --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/varpop.md @@ -0,0 +1,13 @@ +--- +sidebar_position: 32 +--- + +# varPop(x) {#varpopx} + +Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. + +In other words, dispersion for a set of values. Returns `Float64`. + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/reference/sql-reference/aggregate-functions/reference/varsamp.md new file mode 100644 index 00000000000..b323f78fbd1 --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/varsamp.md @@ -0,0 +1,15 @@ +--- +sidebar_position: 33 +--- + +# varSamp {#varsamp} + +Calculates the amount `Σ((x - x̅)^2) / (n - 1)`, where `n` is the sample size and `x̅`is the average value of `x`. + +It represents an unbiased estimate of the variance of a random variable if passed values form its sample. + +Returns `Float64`. When `n <= 1`, returns `+∞`. + +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/reference/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/reference/sql-reference/aggregate-functions/reference/welchttest.md new file mode 100644 index 00000000000..0aff60e7bbf --- /dev/null +++ b/docs/en/reference/sql-reference/aggregate-functions/reference/welchttest.md @@ -0,0 +1,72 @@ +--- +sidebar_position: 301 +sidebar_label: welchTTest +--- + +# welchTTest {#welchttest} + +Applies Welch's t-test to samples from two populations. + +**Syntax** + +``` sql +welchTTest([confidence_level])(sample_data, sample_index) +``` + +Values of both samples are in the `sample_data` column. If `sample_index` equals to 0 then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population. +The null hypothesis is that means of populations are equal. Normal distribution is assumed. Populations may have unequal variance. + +**Arguments** + +- `sample_data` — Sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). +- `sample_index` — Sample index. 
[Integer](../../../sql-reference/data-types/int-uint.md). + +**Parameters** + +- `confidence_level` — Confidence level in order to calculate confidence intervals. [Float](../../../sql-reference/data-types/float.md). + +**Returned values** + +[Tuple](../../../sql-reference/data-types/tuple.md) with two or four elements (if the optional `confidence_level` is specified) + +- calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md). +- calculated p-value. [Float64](../../../sql-reference/data-types/float.md). +- [calculated confidence-interval-low.] [Float64](../../../sql-reference/data-types/float.md). +- [calculated confidence-interval-high.] [Float64](../../../sql-reference/data-types/float.md). + + +**Example** + +Input table: + +``` text +┌─sample_data─┬─sample_index─┐ +│ 20.3 │ 0 │ +│ 22.1 │ 0 │ +│ 21.9 │ 0 │ +│ 18.9 │ 1 │ +│ 20.3 │ 1 │ +│ 19 │ 1 │ +└─────────────┴──────────────┘ +``` + +Query: + +``` sql +SELECT welchTTest(sample_data, sample_index) FROM welch_ttest; +``` + +Result: + +``` text +┌─welchTTest(sample_data, sample_index)─────┐ +│ (2.7988719532211235,0.051807360348581945) │ +└───────────────────────────────────────────┘ +``` + +**See Also** + +- [Welch's t-test](https://en.wikipedia.org/wiki/Welch%27s_t-test) +- [studentTTest function](studentttest.md#studentttest) + +[Original article](https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/welchTTest/) diff --git a/docs/en/reference/sql-reference/ansi.md b/docs/en/reference/sql-reference/ansi.md new file mode 100644 index 00000000000..d6473e64872 --- /dev/null +++ b/docs/en/reference/sql-reference/ansi.md @@ -0,0 +1,190 @@ +--- +sidebar_position: 40 +sidebar_label: ANSI Compatibility +--- + +# ANSI SQL Compatibility of ClickHouse SQL Dialect {#ansi-sql-compatibility-of-clickhouse-sql-dialect} + +:::note +This article relies on Table 38, “Feature taxonomy and definition for mandatory features”, Annex F of [ISO/IEC CD 9075-2:2011](https://www.iso.org/obp/ui/#iso:std:iso-iec:9075:-2:ed-4:v1:en:sec:8). +::: + +## Differences in Behaviour {#differences-in-behaviour} + +The following table lists cases when query feature works in ClickHouse, but behaves not as specified in ANSI SQL. 
+ +| Feature ID | Feature Name | Difference | +|------------|-----------------------------|-----------------------------------------------------------------------------------------------------------| +| E011 | Numeric data types | Numeric literal with period is interpreted as approximate (`Float64`) instead of exact (`Decimal`) | +| E051-05 | Select items can be renamed | Item renames have a wider visibility scope than just the SELECT result | +| E141-01 | NOT NULL constraints | `NOT NULL` is implied for table columns by default | +| E011-04 | Arithmetic operators | ClickHouse overflows instead of checked arithmetic and changes the result data type based on custom rules | + +## Feature Status {#feature-status} + +| Feature ID | Feature Name | Status | Comment | +|------------|--------------------------------------------------------------------------------------------------------------------------|----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **E011** | **Numeric data types** | Partial | | +| E011-01 | INTEGER and SMALLINT data types | Yes | | +| E011-02 | REAL, DOUBLE PRECISION and FLOAT data types data types | Yes | | +| E011-03 | DECIMAL and NUMERIC data types | Yes | | +| E011-04 | Arithmetic operators | Yes | | +| E011-05 | Numeric comparison | Yes | | +| E011-06 | Implicit casting among the numeric data types | No | ANSI SQL allows arbitrary implicit cast between numeric types, while ClickHouse relies on functions having multiple overloads instead of implicit cast | +| **E021** | **Character string types** | Partial | | +| E021-01 | CHARACTER data type | Yes | | +| E021-02 | CHARACTER VARYING data type | Yes | | +| E021-03 | Character literals | Yes | | +| E021-04 | CHARACTER_LENGTH function | Partial | No `USING` clause | +| E021-05 | OCTET_LENGTH function | No | `LENGTH` behaves similarly | +| E021-06 | SUBSTRING | Partial | No support for `SIMILAR` and `ESCAPE` clauses, no `SUBSTRING_REGEX` variant | +| E021-07 | Character concatenation | Partial | No `COLLATE` clause | +| E021-08 | UPPER and LOWER functions | Yes | | +| E021-09 | TRIM function | Yes | | +| E021-10 | Implicit casting among the fixed-length and variable-length character string types | Partial | ANSI SQL allows arbitrary implicit cast between string types, while ClickHouse relies on functions having multiple overloads instead of implicit cast | +| E021-11 | POSITION function | Partial | No support for `IN` and `USING` clauses, no `POSITION_REGEX` variant | +| E021-12 | Character comparison | Yes | | +| **E031** | **Identifiers** | Partial| | +| E031-01 | Delimited identifiers | Partial | Unicode literal support is limited | +| E031-02 | Lower case identifiers | Yes | | +| E031-03 | Trailing underscore | Yes | | +| **E051** | **Basic query specification** | Partial| | +| E051-01 | SELECT DISTINCT | Yes | | +| E051-02 | GROUP BY clause | Yes | | +| E051-04 | GROUP BY can contain columns not in `