/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* SPDX-License-Identifier: (AGPL-3.0-or-later and Apache-2.0)
*/
#include <vector>
#include <map>
#include <functional>
#include <utility>
#include <assert.h>
#include <algorithm>
#include <boost/range/algorithm.hpp>
#include <boost/range/adaptors.hpp>
#include <boost/range/join.hpp>
#include <boost/algorithm/cxx11/any_of.hpp>
#include <boost/algorithm/string/join.hpp>
#include <seastar/core/future-util.hh>
#include <seastar/core/scheduling.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/util/closeable.hh>
#include "sstables/sstables.hh"
#include "sstables/sstable_writer.hh"
#include "sstables/progress_monitor.hh"
#include "sstables/sstables_manager.hh"
#include "compaction.hh"
#include "compaction_manager.hh"
#include "schema.hh"
#include "db/system_keyspace.hh"
#include "service/priority_manager.hh"
#include "db_clock.hh"
#include "mutation_compactor.hh"
#include "leveled_manifest.hh"
#include "dht/token.hh"
#include "mutation_writer/shard_based_splitting_writer.hh"
#include "mutation_writer/partition_based_splitting_writer.hh"
#include "mutation_source_metadata.hh"
#include "mutation_fragment_stream_validator.hh"
#include "utils/UUID_gen.hh"
#include "utils/utf8.hh"
#include "utils/fmt-compat.hh"
#include "readers/filtering.hh"
#include "readers/compacting.hh"
#include "tombstone_gc.hh"
namespace sstables {
bool is_eligible_for_compaction(const shared_sstable& sst) noexcept {
return !sst->requires_view_building() && !sst->is_quarantined();
}
logging::logger clogger("compaction");
static const std::unordered_map<compaction_type, sstring> compaction_types = {
{ compaction_type::Compaction, "COMPACTION" },
{ compaction_type::Cleanup, "CLEANUP" },
{ compaction_type::Validation, "VALIDATION" },
{ compaction_type::Scrub, "SCRUB" },
{ compaction_type::Index_build, "INDEX_BUILD" },
{ compaction_type::Reshard, "RESHARD" },
{ compaction_type::Upgrade, "UPGRADE" },
{ compaction_type::Reshape, "RESHAPE" },
};
sstring compaction_name(compaction_type type) {
auto ret = compaction_types.find(type);
if (ret != compaction_types.end()) {
return ret->second;
}
throw std::runtime_error("Invalid Compaction Type");
}
compaction_type to_compaction_type(sstring type_name) {
for (auto& it : compaction_types) {
if (it.second == type_name) {
return it.first;
}
}
throw std::runtime_error("Invalid Compaction Type Name");
}
std::string_view to_string(compaction_type type) {
switch (type) {
case compaction_type::Compaction: return "Compact";
case compaction_type::Cleanup: return "Cleanup";
case compaction_type::Validation: return "Validate";
case compaction_type::Scrub: return "Scrub";
case compaction_type::Index_build: return "Index_build";
case compaction_type::Reshard: return "Reshard";
case compaction_type::Upgrade: return "Upgrade";
case compaction_type::Reshape: return "Reshape";
}
on_internal_error_noexcept(clogger, format("Invalid compaction type {}", int(type)));
return "(invalid)";
}
std::ostream& operator<<(std::ostream& os, compaction_type type) {
os << to_string(type);
return os;
}
std::string_view to_string(compaction_type_options::scrub::mode scrub_mode) {
switch (scrub_mode) {
case compaction_type_options::scrub::mode::abort:
return "abort";
case compaction_type_options::scrub::mode::skip:
return "skip";
case compaction_type_options::scrub::mode::segregate:
return "segregate";
case compaction_type_options::scrub::mode::validate:
return "validate";
}
on_internal_error_noexcept(clogger, format("Invalid scrub mode {}", int(scrub_mode)));
return "(invalid)";
}
std::ostream& operator<<(std::ostream& os, compaction_type_options::scrub::mode scrub_mode) {
return os << to_string(scrub_mode);
}
std::string_view to_string(compaction_type_options::scrub::quarantine_mode quarantine_mode) {
switch (quarantine_mode) {
case compaction_type_options::scrub::quarantine_mode::include:
return "include";
case compaction_type_options::scrub::quarantine_mode::exclude:
return "exclude";
case compaction_type_options::scrub::quarantine_mode::only:
return "only";
}
on_internal_error_noexcept(clogger, format("Invalid scrub quarantine mode {}", int(quarantine_mode)));
return "(invalid)";
}
std::ostream& operator<<(std::ostream& os, compaction_type_options::scrub::quarantine_mode quarantine_mode) {
return os << to_string(quarantine_mode);
}
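// Pretty-prints a byte count using decimal (power-of-1000) units, truncating with
// integer division. A rough usage sketch (example values assumed, not from a test):
//
//   std::cout << pretty_printed_data_size(2'500'000);   // prints "2MB"
//   std::cout << pretty_printed_data_size(999);         // prints "999 bytes"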
std::ostream& operator<<(std::ostream& os, pretty_printed_data_size data) {
static constexpr const char* suffixes[] = { " bytes", "kB", "MB", "GB", "TB", "PB" };
unsigned exp = 0;
while ((data._size >= 1000) && (exp + 1 < std::size(suffixes))) {
exp++;
data._size /= 1000;
}
os << data._size << suffixes[exp];
return os;
}
std::ostream& operator<<(std::ostream& os, pretty_printed_throughput tp) {
uint64_t throughput = tp._duration.count() > 0 ? tp._size / tp._duration.count() : 0;
os << pretty_printed_data_size(throughput) << "/s";
return os;
}
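// Computes the "max purgeable timestamp" for a key: a tombstone can be purged by
// this compaction only if its timestamp is below the returned value, i.e. no data
// older than the tombstone can exist outside the compacting set. The result is the
// minimum of the table's minimum memtable timestamp and the minimum timestamp of
// every non-compacting sstable whose bloom filter reports it may contain the key.
// Bloom-filter false positives only make the result more conservative, never wrong.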
static api::timestamp_type get_max_purgeable_timestamp(const table_state& table_s, sstable_set::incremental_selector& selector,
const std::unordered_set<shared_sstable>& compacting_set, const dht::decorated_key& dk) {
auto timestamp = table_s.min_memtable_timestamp();
std::optional<utils::hashed_key> hk;
for (auto&& sst : boost::range::join(selector.select(dk).sstables, table_s.compacted_undeleted_sstables())) {
if (compacting_set.contains(sst)) {
continue;
}
if (!hk) {
hk = sstables::sstable::make_hashed_key(*table_s.schema(), dk.key());
}
if (sst->filter_has_key(*hk)) {
timestamp = std::min(timestamp, sst->get_stats_metadata().min_timestamp);
}
}
return timestamp;
}
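// Returns all sstables in the table (plus compacted-undeleted ones) that are not
// part of the given compacting set. Both lists are sorted by generation so the
// difference can be computed in a single linear set_difference pass.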
static std::vector<shared_sstable> get_uncompacting_sstables(const table_state& table_s, std::vector<shared_sstable> sstables) {
auto all_sstables = boost::copy_range<std::vector<shared_sstable>>(*table_s.get_sstable_set().all());
auto& compacted_undeleted = table_s.compacted_undeleted_sstables();
all_sstables.insert(all_sstables.end(), compacted_undeleted.begin(), compacted_undeleted.end());
boost::sort(all_sstables, [] (const shared_sstable& x, const shared_sstable& y) {
return x->generation() < y->generation();
});
std::sort(sstables.begin(), sstables.end(), [] (const shared_sstable& x, const shared_sstable& y) {
return x->generation() < y->generation();
});
std::vector<shared_sstable> not_compacted_sstables;
boost::set_difference(all_sstables, sstables,
std::back_inserter(not_compacted_sstables), [] (const shared_sstable& x, const shared_sstable& y) {
return x->generation() < y->generation();
});
return not_compacted_sstables;
}
class compaction;
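// Reports output-sstable write progress to the compaction strategy's backlog
// tracker: the sstable is registered as partially written when the write starts,
// and any remaining backlog charges are reverted when the monitor is destroyed.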
class compaction_write_monitor final : public sstables::write_monitor, public backlog_write_progress_manager {
sstables::shared_sstable _sst;
table_state& _table_s;
const sstables::writer_offset_tracker* _tracker = nullptr;
uint64_t _progress_seen = 0;
api::timestamp_type _maximum_timestamp;
unsigned _sstable_level;
public:
compaction_write_monitor(sstables::shared_sstable sst, table_state& table_s, api::timestamp_type max_timestamp, unsigned sstable_level)
: _sst(sst)
, _table_s(table_s)
, _maximum_timestamp(max_timestamp)
, _sstable_level(sstable_level)
{}
~compaction_write_monitor() {
if (_sst) {
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
}
}
virtual void on_write_started(const sstables::writer_offset_tracker& tracker) override {
_tracker = &tracker;
_table_s.get_compaction_strategy().get_backlog_tracker().register_partially_written_sstable(_sst, *this);
}
virtual void on_data_write_completed() override {
if (_tracker) {
_progress_seen = _tracker->offset;
_tracker = nullptr;
}
}
virtual uint64_t written() const override {
if (_tracker) {
return _tracker->offset;
}
return _progress_seen;
}
api::timestamp_type maximum_timestamp() const override {
return _maximum_timestamp;
}
unsigned level() const override {
return _sstable_level;
}
};
struct compaction_writer {
shared_sstable sst;
sstable_writer writer;
// The member order here is important: members are destroyed in reverse declaration order.
// A monitor must be destroyed before the writer it is monitoring, since it has a
// periodic timer that checks the writer.
// The writer must be destroyed before the shared_sstable, since it may depend on the sstable
// (as in the mx::writer over compressed_file_data_sink_impl case, which depends on sstables::compression).
// We use a unique_ptr for the monitor for pointer stability, and so that it can be null
// when using a noop monitor.
std::unique_ptr<compaction_write_monitor> monitor;
compaction_writer(std::unique_ptr<compaction_write_monitor> monitor, sstable_writer writer, shared_sstable sst)
: sst(std::move(sst)), writer(std::move(writer)), monitor(std::move(monitor)) {}
compaction_writer(sstable_writer writer, shared_sstable sst)
: compaction_writer(nullptr, std::move(writer), std::move(sst)) {}
};
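// Writes the stream of compacted mutation fragments out through compaction_writer
// instances. Writer creation and teardown are injected as callbacks
// (creator_func_t / stop_func_t), so the same consumer type drives both the regular
// output writer and the garbage-collection writer. A writer is created lazily on
// the first partition consumed, and torn down when the underlying writer requests a
// stop (consume_end_of_partition() returns stop_iteration::yes) or at end of stream.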
class compacted_fragments_writer {
compaction& _c;
std::optional<compaction_writer> _compaction_writer = {};
using creator_func_t = std::function<compaction_writer(const dht::decorated_key&)>;
using stop_func_t = std::function<void(compaction_writer*)>;
creator_func_t _create_compaction_writer;
stop_func_t _stop_compaction_writer;
std::optional<utils::observer<>> _stop_request_observer;
bool _unclosed_partition = false;
private:
inline void maybe_abort_compaction();
utils::observer<> make_stop_request_observer(utils::observable<>& sro) {
return sro.observe([this] () mutable {
assert(!_unclosed_partition);
consume_end_of_stream();
});
}
public:
explicit compacted_fragments_writer(compaction& c, creator_func_t cpw, stop_func_t scw)
: _c(c)
, _create_compaction_writer(std::move(cpw))
, _stop_compaction_writer(std::move(scw)) {
}
explicit compacted_fragments_writer(compaction& c, creator_func_t cpw, stop_func_t scw, utils::observable<>& sro)
: _c(c)
, _create_compaction_writer(std::move(cpw))
, _stop_compaction_writer(std::move(scw))
, _stop_request_observer(make_stop_request_observer(sro)) {
}
compacted_fragments_writer(compacted_fragments_writer&& other);
compacted_fragments_writer& operator=(const compacted_fragments_writer&) = delete;
compacted_fragments_writer(const compacted_fragments_writer&) = delete;
void consume_new_partition(const dht::decorated_key& dk);
void consume(tombstone t) { _compaction_writer->writer.consume(t); }
stop_iteration consume(static_row&& sr, tombstone, bool) {
maybe_abort_compaction();
return _compaction_writer->writer.consume(std::move(sr));
}
stop_iteration consume(static_row&& sr) {
return consume(std::move(sr), tombstone{}, bool{});
}
stop_iteration consume(clustering_row&& cr, row_tombstone, bool) {
maybe_abort_compaction();
return _compaction_writer->writer.consume(std::move(cr));
}
stop_iteration consume(clustering_row&& cr) {
return consume(std::move(cr), row_tombstone{}, bool{});
}
stop_iteration consume(range_tombstone_change&& rtc) {
maybe_abort_compaction();
return _compaction_writer->writer.consume(std::move(rtc));
}
stop_iteration consume_end_of_partition();
void consume_end_of_stream();
};
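// Creates one read monitor per input sstable. Each monitor registers its sstable
// as compacting with the backlog tracker and reports read progress, so the tracked
// backlog shrinks as compaction advances; charges are reverted when an exhausted
// sstable is removed, or on destruction if the sstable was not fully processed.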
struct compaction_read_monitor_generator final : public read_monitor_generator {
class compaction_read_monitor final : public sstables::read_monitor, public backlog_read_progress_manager {
sstables::shared_sstable _sst;
table_state& _table_s;
const sstables::reader_position_tracker* _tracker = nullptr;
uint64_t _last_position_seen = 0;
public:
virtual void on_read_started(const sstables::reader_position_tracker& tracker) override {
_tracker = &tracker;
_table_s.get_compaction_strategy().get_backlog_tracker().register_compacting_sstable(_sst, *this);
}
virtual void on_read_completed() override {
if (_tracker) {
_last_position_seen = _tracker->position;
_tracker = nullptr;
}
}
virtual uint64_t compacted() const override {
if (_tracker) {
return _tracker->position;
}
return _last_position_seen;
}
void remove_sstable() {
if (_sst) {
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
}
_sst = {};
}
compaction_read_monitor(sstables::shared_sstable sst, table_state& table_s)
: _sst(std::move(sst)), _table_s(table_s) { }
~compaction_read_monitor() {
// We failed to finish handling this SSTable, so we have to update the backlog_tracker
// about it.
if (_sst) {
_table_s.get_compaction_strategy().get_backlog_tracker().revert_charges(_sst);
}
}
friend class compaction_read_monitor_generator;
};
virtual sstables::read_monitor& operator()(sstables::shared_sstable sst) override {
auto p = _generated_monitors.emplace(sst->generation(), compaction_read_monitor(sst, _table_s));
return p.first->second;
}
explicit compaction_read_monitor_generator(table_state& table_s)
: _table_s(table_s) {}
void remove_exhausted_sstables(const std::vector<sstables::shared_sstable>& exhausted_sstables) {
for (auto& sst : exhausted_sstables) {
auto it = _generated_monitors.find(sst->generation());
if (it != _generated_monitors.end()) {
it->second.remove_sstable();
}
}
}
private:
table_state& _table_s;
std::unordered_map<int64_t, compaction_read_monitor> _generated_monitors;
};
class formatted_sstables_list {
bool _include_origin = true;
std::vector<sstring> _ssts;
public:
formatted_sstables_list() = default;
explicit formatted_sstables_list(const std::vector<shared_sstable>& ssts, bool include_origin) : _include_origin(include_origin) {
_ssts.reserve(ssts.size());
for (const auto& sst : ssts) {
*this += sst;
}
}
formatted_sstables_list& operator+=(const shared_sstable& sst) {
if (_include_origin) {
_ssts.emplace_back(format("{}:level={:d}:origin={}", sst->get_filename(), sst->get_sstable_level(), sst->get_origin()));
} else {
_ssts.emplace_back(format("{}:level={:d}", sst->get_filename(), sst->get_sstable_level()));
}
return *this;
}
friend std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst);
};
std::ostream& operator<<(std::ostream& os, const formatted_sstables_list& lst) {
os << "[";
os << boost::algorithm::join(lst._ssts, ",");
os << "]";
return os;
}
class compaction {
protected:
compaction_data& _cdata;
table_state& _table_s;
compaction_sstable_creator_fn _sstable_creator;
schema_ptr _schema;
reader_permit _permit;
std::vector<shared_sstable> _sstables;
std::vector<unsigned long> _input_sstable_generations;
// Unused sstables are tracked because, if compaction is interrupted, only they can
// safely be deleted. Deleting used sstables could potentially result in data loss.
std::unordered_set<shared_sstable> _new_partial_sstables;
std::vector<shared_sstable> _new_unused_sstables;
std::vector<shared_sstable> _all_new_sstables;
lw_shared_ptr<sstable_set> _compacting;
sstables::compaction_type _type;
uint64_t _max_sstable_size;
uint32_t _sstable_level;
uint64_t _start_size = 0;
uint64_t _end_size = 0;
uint64_t _estimated_partitions = 0;
db::replay_position _rp;
encoding_stats_collector _stats_collector;
bool _contains_multi_fragment_runs = false;
mutation_source_metadata _ms_metadata = {};
compaction_sstable_replacer_fn _replacer;
utils::UUID _run_identifier;
::io_priority_class _io_priority;
// Optional clone of the sstable set, used for expiration purposes; it is set only if expiration is enabled.
std::optional<sstable_set> _sstable_set;
// Used to incrementally calculate the max purgeable timestamp as we iterate through decorated keys.
std::optional<sstable_set::incremental_selector> _selector;
std::unordered_set<shared_sstable> _compacting_for_max_purgeable_func;
// Garbage collected sstables that are sealed but were not added to SSTable set yet.
std::vector<shared_sstable> _unused_garbage_collected_sstables;
// Garbage collected sstables that were added to SSTable set and should be eventually removed from it.
std::vector<shared_sstable> _used_garbage_collected_sstables;
utils::observable<> _stop_request_observable;
private:
compaction_data& init_compaction_data(compaction_data& cdata, const compaction_descriptor& descriptor) const {
cdata.compaction_fan_in = descriptor.fan_in();
return cdata;
}
protected:
compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
: _cdata(init_compaction_data(cdata, descriptor))
, _table_s(table_s)
, _sstable_creator(std::move(descriptor.creator))
, _schema(_table_s.schema())
, _permit(_table_s.make_compaction_reader_permit())
, _sstables(std::move(descriptor.sstables))
, _type(descriptor.options.type())
, _max_sstable_size(descriptor.max_sstable_bytes)
, _sstable_level(descriptor.level)
, _replacer(std::move(descriptor.replacer))
, _run_identifier(descriptor.run_identifier)
, _io_priority(descriptor.io_priority)
, _sstable_set(std::move(descriptor.all_sstables_snapshot))
, _selector(_sstable_set ? _sstable_set->make_incremental_selector() : std::optional<sstable_set::incremental_selector>{})
, _compacting_for_max_purgeable_func(std::unordered_set<shared_sstable>(_sstables.begin(), _sstables.end()))
{
for (auto& sst : _sstables) {
_stats_collector.update(sst->get_encoding_stats_for_compaction());
}
std::unordered_set<utils::UUID> ssts_run_ids;
_contains_multi_fragment_runs = std::any_of(_sstables.begin(), _sstables.end(), [&ssts_run_ids] (shared_sstable& sst) {
return !ssts_run_ids.insert(sst->run_identifier()).second;
});
}
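// Estimates how many partitions each output sstable should receive. A worked
// example with assumed numbers: with _start_size = 10 GB, _max_sstable_size = 1 GB
// and _estimated_partitions = 1'000'000, we expect ceil(10 / 1) = 10 output
// sstables and therefore ceil(1'000'000 / 10) = 100'000 partitions per sstable,
// unless the compaction strategy's adjust_partition_estimate() returns less.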
virtual uint64_t partitions_per_sstable() const {
// Some tests use _max_sstable_size == 0 to force many one-partition sstables.
auto max_sstable_size = std::max<uint64_t>(_max_sstable_size, 1);
uint64_t estimated_sstables = std::max(1UL, uint64_t(ceil(double(_start_size) / max_sstable_size)));
return std::min(uint64_t(ceil(double(_estimated_partitions) / estimated_sstables)),
_table_s.get_compaction_strategy().adjust_partition_estimate(_ms_metadata, _estimated_partitions));
}
void setup_new_sstable(shared_sstable& sst) {
_all_new_sstables.push_back(sst);
_new_partial_sstables.insert(sst);
for (auto ancestor : _input_sstable_generations) {
sst->add_ancestor(ancestor);
}
}
void finish_new_sstable(compaction_writer* writer) {
writer->writer.consume_end_of_stream();
writer->sst->open_data().get0();
_end_size += writer->sst->bytes_on_disk();
_new_unused_sstables.push_back(writer->sst);
_new_partial_sstables.erase(writer->sst);
}
sstable_writer_config make_sstable_writer_config(compaction_type type) {
auto s = compaction_name(type);
std::transform(s.begin(), s.end(), s.begin(), [] (char c) {
return std::tolower(c);
});
sstable_writer_config cfg = _table_s.configure_writer(std::move(s));
cfg.max_sstable_size = _max_sstable_size;
cfg.monitor = &default_write_monitor();
cfg.run_identifier = _run_identifier;
cfg.replay_position = _rp;
cfg.sstable_level = _sstable_level;
return cfg;
}
api::timestamp_type maximum_timestamp() const {
auto m = std::max_element(_sstables.begin(), _sstables.end(), [] (const shared_sstable& sst1, const shared_sstable& sst2) {
return sst1->get_stats_metadata().max_timestamp < sst2->get_stats_metadata().max_timestamp;
});
return (*m)->get_stats_metadata().max_timestamp;
}
encoding_stats get_encoding_stats() const {
return _stats_collector.get();
}
virtual compaction_completion_desc
get_compaction_completion_desc(std::vector<shared_sstable> input_sstables, std::vector<shared_sstable> output_sstables) {
return compaction_completion_desc{std::move(input_sstables), std::move(output_sstables)};
}
// Tombstone expiration is enabled based on the presence of the sstable set.
// If it's not present, we cannot purge tombstones without the risk of resurrecting data.
bool tombstone_expiration_enabled() const {
return bool(_sstable_set);
}
compaction_writer create_gc_compaction_writer() const {
auto sst = _sstable_creator(this_shard_id());
auto&& priority = _io_priority;
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstable_writer_config cfg = _table_s.configure_writer("garbage_collection");
cfg.run_identifier = _run_identifier;
cfg.monitor = monitor.get();
auto writer = sst->get_writer(*schema(), partitions_per_sstable(), cfg, get_encoding_stats(), priority);
return compaction_writer(std::move(monitor), std::move(writer), std::move(sst));
}
void stop_gc_compaction_writer(compaction_writer* c_writer) {
c_writer->writer.consume_end_of_stream();
auto sst = c_writer->sst;
sst->open_data().get0();
_unused_garbage_collected_sstables.push_back(std::move(sst));
}
// Writes a temporary sstable run containing only garbage-collected data.
// Whenever the regular compaction writer seals a new sstable, this writer flushes
// a new sstable as well, right before exhausted input sstables are released early.
// Generated sstables are temporarily added to the table to make sure that a
// compaction crash will not result in data resurrection.
// When compaction finishes, all the temporary sstables generated here are deleted
// and removed from the table's sstable set.
compacted_fragments_writer get_gc_compacted_fragments_writer() {
return compacted_fragments_writer(*this,
[this] (const dht::decorated_key&) { return create_gc_compaction_writer(); },
[this] (compaction_writer* cw) { stop_gc_compaction_writer(cw); },
_stop_request_observable);
}
// Retrieves all unused garbage-collected sstables, which will subsequently be added
// to the SSTable set, and marks them as used.
std::vector<shared_sstable> consume_unused_garbage_collected_sstables() {
auto unused = std::exchange(_unused_garbage_collected_sstables, {});
_used_garbage_collected_sstables.insert(_used_garbage_collected_sstables.end(), unused.begin(), unused.end());
return unused;
}
const std::vector<shared_sstable>& used_garbage_collected_sstables() const {
return _used_garbage_collected_sstables;
}
public:
compaction& operator=(const compaction&) = delete;
compaction(const compaction&) = delete;
compaction(compaction&& other) = delete;
compaction& operator=(compaction&& other) = delete;
virtual ~compaction() {
}
private:
// Default range sstable reader that will only return mutations that belong to the current shard.
virtual flat_mutation_reader_v2 make_sstable_reader() const = 0;
virtual sstables::sstable_set make_sstable_set_for_input() const {
return _table_s.get_compaction_strategy().make_sstable_set(_schema);
}
void setup() {
auto ssts = make_lw_shared<sstables::sstable_set>(make_sstable_set_for_input());
formatted_sstables_list formatted_msg;
auto fully_expired = _table_s.fully_expired_sstables(_sstables, gc_clock::now());
min_max_tracker<api::timestamp_type> timestamp_tracker;
for (auto& sst : _sstables) {
auto& sst_stats = sst->get_stats_metadata();
timestamp_tracker.update(sst_stats.min_timestamp);
timestamp_tracker.update(sst_stats.max_timestamp);
// Compacted sstable keeps track of its ancestors.
_input_sstable_generations.push_back(sst->generation());
_start_size += sst->bytes_on_disk();
_cdata.total_partitions += sst->get_estimated_key_count();
formatted_msg += sst;
// Do not actually compact an sstable that is fully expired and can be safely
// dropped without resurrecting old data.
if (tombstone_expiration_enabled() && fully_expired.contains(sst)) {
log_debug("Fully expired sstable {} will be dropped on compaction completion", sst->get_filename());
continue;
}
// We also capture the sstable, so we keep it alive while the read isn't done
ssts->insert(sst);
// FIXME: If the sstables have cardinality estimation bitmaps, use that
// for a better estimate for the number of partitions in the merged
// sstable than just adding up the lengths of individual sstables.
_estimated_partitions += sst->get_estimated_key_count();
// TODO:
// Note that this is not fully correct. Since we might be merging sstables that originated on
// another shard (#cpu changed), we might be comparing RPs with differing shard ids,
// which might vary in "comparable" size quite a bit. However, since the worst that happens
// is that we might miss a high water mark for the commit log replayer,
// this is kind of ok, esp. since we will hopefully not be trying to recover based on
// compacted sstables anyway (CL should be clean by then).
_rp = std::max(_rp, sst_stats.position);
}
log_info("{} {}", report_start_desc(), formatted_msg);
if (ssts->all()->size() < _sstables.size()) {
log_debug("{} out of {} input sstables are fully expired sstables that will not be actually compacted",
_sstables.size() - ssts->all()->size(), _sstables.size());
}
_compacting = std::move(ssts);
_ms_metadata.min_timestamp = timestamp_tracker.min();
_ms_metadata.max_timestamp = timestamp_tracker.max();
}
// This consumer will perform mutation compaction on the producer side using
// compacting_reader. It's useful for allowing data from different buckets
// to be compacted together.
future<> consume_without_gc_writer(gc_clock::time_point compaction_time) {
auto consumer = make_interposer_consumer([this] (flat_mutation_reader_v2 reader) mutable {
return seastar::async([this, reader = std::move(reader)] () mutable {
auto close_reader = deferred_close(reader);
auto cfc = compacted_fragments_writer(get_compacted_fragments_writer());
reader.consume_in_thread(std::move(cfc));
});
});
return consumer(make_compacting_reader(make_sstable_reader(), compaction_time, max_purgeable_func()));
}
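// Drives the actual compaction. Three paths are possible:
//  1. GC writer disabled + interposer consumer enabled: compact on the producer
//     side via consume_without_gc_writer() above.
//  2. GC writer enabled: run compact_for_compaction_v2 with two consumers, the
//     regular output writer and the GC writer for purged tombstones.
//  3. Otherwise: compact_for_compaction_v2 with a noop consumer in place of the
//     GC writer.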
future<> consume() {
auto now = gc_clock::now();
// consume_without_gc_writer(), which uses compacting_reader, is ~3% slower.
// Let's only use it when the GC writer is disabled and the interposer consumer is
// enabled, as we wouldn't like others to pay the penalty for something they don't need.
if (!enable_garbage_collected_sstable_writer() && use_interposer_consumer()) {
return consume_without_gc_writer(now);
}
auto consumer = make_interposer_consumer([this, now] (flat_mutation_reader_v2 reader) mutable
{
return seastar::async([this, reader = std::move(reader), now] () mutable {
auto close_reader = deferred_close(reader);
if (enable_garbage_collected_sstable_writer()) {
using compact_mutations = compact_for_compaction_v2<compacted_fragments_writer, compacted_fragments_writer>;
auto cfc = compact_mutations(*schema(), now,
max_purgeable_func(),
get_compacted_fragments_writer(),
get_gc_compacted_fragments_writer());
reader.consume_in_thread(std::move(cfc));
return;
}
using compact_mutations = compact_for_compaction_v2<compacted_fragments_writer, noop_compacted_fragments_consumer>;
auto cfc = compact_mutations(*schema(), now,
max_purgeable_func(),
get_compacted_fragments_writer(),
noop_compacted_fragments_consumer());
reader.consume_in_thread(std::move(cfc));
});
});
return consumer(make_sstable_reader());
}
virtual reader_consumer_v2 make_interposer_consumer(reader_consumer_v2 end_consumer) {
return _table_s.get_compaction_strategy().make_interposer_consumer(_ms_metadata, std::move(end_consumer));
}
virtual bool use_interposer_consumer() const {
return _table_s.get_compaction_strategy().use_interposer_consumer();
}
compaction_result finish(std::chrono::time_point<db_clock> started_at, std::chrono::time_point<db_clock> ended_at) {
compaction_result ret {
.new_sstables = std::move(_all_new_sstables),
.ended_at = ended_at,
.end_size = _end_size,
};
auto ratio = double(_end_size) / double(_start_size);
auto duration = std::chrono::duration<float>(ended_at - started_at);
// Don't report NaN or a negative number.
on_end_of_compaction();
formatted_sstables_list new_sstables_msg(ret.new_sstables, false);
// FIXME: there is some missing information in the log message below.
// look at CompactionTask::runMayThrow() in origin for reference.
// - add support to merge summary (message: Partition merge counts were {%s}.).
// - there is no easy way, currently, to know the exact number of total partitions.
// For the time being, we use the estimated key count.
log_info("{} {} sstables to {}. {} to {} (~{}% of original) in {}ms = {}. ~{} total partitions merged to {}.",
report_finish_desc(),
_input_sstable_generations.size(), new_sstables_msg, pretty_printed_data_size(_start_size), pretty_printed_data_size(_end_size), int(ratio * 100),
std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(), pretty_printed_throughput(_end_size, duration),
_cdata.total_partitions, _cdata.total_keys_written);
return ret;
}
void on_interrupt(std::exception_ptr ex) {
log_info("{} of {} sstables interrupted due to: {}", report_start_desc(), _input_sstable_generations.size(), ex);
delete_sstables_for_interrupted_compaction();
}
virtual std::string_view report_start_desc() const = 0;
virtual std::string_view report_finish_desc() const = 0;
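// When tombstone expiration is disabled, there is no sstable-set snapshot to
// consult, so return api::min_timestamp: no tombstone is ever considered purgeable.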
std::function<api::timestamp_type(const dht::decorated_key&)> max_purgeable_func() {
if (!tombstone_expiration_enabled()) {
return [] (const dht::decorated_key& dk) {
return api::min_timestamp;
};
}
return [this] (const dht::decorated_key& dk) {
return get_max_purgeable_timestamp(_table_s, *_selector, _compacting_for_max_purgeable_func, dk);
};
}
virtual void on_new_partition() {}
virtual void on_end_of_compaction() {};
// create a writer based on decorated key.
virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) = 0;
// stop current writer
virtual void stop_sstable_writer(compaction_writer* writer) = 0;
compacted_fragments_writer get_compacted_fragments_writer() {
return compacted_fragments_writer(*this,
[this] (const dht::decorated_key& dk) { return create_compaction_writer(dk); },
[this] (compaction_writer* cw) { stop_sstable_writer(cw); });
}
const schema_ptr& schema() const {
return _schema;
}
void delete_sstables_for_interrupted_compaction() {
// Delete either partially or fully written sstables of a compaction that
// was either stopped abruptly (e.g. out of disk space) or deliberately
// (e.g. nodetool stop COMPACTION).
for (auto& sst : boost::range::join(_new_partial_sstables, _new_unused_sstables)) {
log_debug("Deleting sstable {} of interrupted compaction for {}.{}", sst->get_filename(), _schema->ks_name(), _schema->cf_name());
sst->mark_for_deletion();
}
}
protected:
template <typename... Args>
void log(log_level level, std::string_view fmt, const Args&... args) const {
if (clogger.is_enabled(level)) {
auto msg = fmt::format(fmt::runtime(fmt), args...);
clogger.log(level, "[{} {}.{} {}] {}", _type, _schema->ks_name(), _schema->cf_name(), _cdata.compaction_uuid, msg);
}
}
template <typename... Args>
void log_error(std::string_view fmt, Args&&... args) const {
log(log_level::error, std::move(fmt), std::forward<Args>(args)...);
}
template <typename... Args>
void log_warning(std::string_view fmt, Args&&... args) const {
log(log_level::warn, std::move(fmt), std::forward<Args>(args)...);
}
template <typename... Args>
void log_info(std::string_view fmt, Args&&... args) const {
log(log_level::info, std::move(fmt), std::forward<Args>(args)...);
}
template <typename... Args>
void log_debug(std::string_view fmt, Args&&... args) const {
log(log_level::debug, std::move(fmt), std::forward<Args>(args)...);
}
template <typename... Args>
void log_trace(std::string_view fmt, Args&&... args) const {
log(log_level::trace, std::move(fmt), std::forward<Args>(args)...);
}
public:
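// Incremental compaction (the garbage-collected sstable writer) is enabled only
// when the input contains multi-fragment runs and the output size is bounded.
// With an unbounded _max_sstable_size a single output sstable is produced, so
// there would be no intermediate seals after which exhausted inputs could be
// released early.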
bool enable_garbage_collected_sstable_writer() const noexcept {
return _contains_multi_fragment_runs && _max_sstable_size != std::numeric_limits<uint64_t>::max();
}
static future<compaction_result> run(std::unique_ptr<compaction> c);
friend class compacted_fragments_writer;
};
compacted_fragments_writer::compacted_fragments_writer(compacted_fragments_writer&& other)
: _c(other._c)
, _compaction_writer(std::move(other._compaction_writer))
, _create_compaction_writer(std::move(other._create_compaction_writer))
, _stop_compaction_writer(std::move(other._stop_compaction_writer)) {
if (std::exchange(other._stop_request_observer, std::nullopt)) {
_stop_request_observer = make_stop_request_observer(_c._stop_request_observable);
}
}
void compacted_fragments_writer::maybe_abort_compaction() {
if (_c._cdata.is_stop_requested()) [[unlikely]] {
// Compaction manager will catch this exception and re-schedule the compaction.
throw compaction_stopped_exception(_c._schema->ks_name(), _c._schema->cf_name(), _c._cdata.stop_requested);
}
}
void compacted_fragments_writer::consume_new_partition(const dht::decorated_key& dk) {
maybe_abort_compaction();
if (!_compaction_writer) {
_compaction_writer = _create_compaction_writer(dk);
}
_c.on_new_partition();
_compaction_writer->writer.consume_new_partition(dk);
_c._cdata.total_keys_written++;
_unclosed_partition = true;
}
stop_iteration compacted_fragments_writer::consume_end_of_partition() {
auto ret = _compaction_writer->writer.consume_end_of_partition();
_unclosed_partition = false;
if (ret == stop_iteration::yes) {
// Stop the sstable writer currently in use.
_stop_compaction_writer(&*_compaction_writer);
_compaction_writer = std::nullopt;
}
return ret;
}
void compacted_fragments_writer::consume_end_of_stream() {
if (_compaction_writer) {
_stop_compaction_writer(&*_compaction_writer);
_compaction_writer = std::nullopt;
}
}
class reshape_compaction : public compaction {
public:
reshape_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
: compaction(table_s, std::move(descriptor), cdata) {
}
virtual sstables::sstable_set make_sstable_set_for_input() const override {
return sstables::make_partitioned_sstable_set(_schema, make_lw_shared<sstable_list>(sstable_list{}), false);
}
flat_mutation_reader_v2 make_sstable_reader() const override {
return _compacting->make_local_shard_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
_io_priority,
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no,
default_read_monitor_generator());
}
std::string_view report_start_desc() const override {
return "Reshaping";
}
std::string_view report_finish_desc() const override {
return "Reshaped";
}
virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
auto sst = _sstable_creator(this_shard_id());
setup_new_sstable(sst);
sstable_writer_config cfg = make_sstable_writer_config(compaction_type::Reshape);
return compaction_writer{sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
if (writer) {
finish_new_sstable(writer);
}
}
};
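// Regular compaction: reads the compacting set through per-sstable read monitors
// (for backlog accounting) and, when incremental compaction is enabled, replaces
// exhausted input sstables with sealed outputs while the compaction is still running.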
class regular_compaction : public compaction {
// Keeps track of monitors for input sstables, which are responsible for adjusting backlog as compaction progresses.
mutable compaction_read_monitor_generator _monitor_generator;
seastar::semaphore _replacer_lock = {1};
public:
regular_compaction(table_state& table_s, compaction_descriptor descriptor, compaction_data& cdata)
: compaction(table_s, std::move(descriptor), cdata)
, _monitor_generator(_table_s)
{
}
flat_mutation_reader_v2 make_sstable_reader() const override {
return _compacting->make_local_shard_sstable_reader(_schema,
_permit,
query::full_partition_range,
_schema->full_slice(),
_io_priority,
tracing::trace_state_ptr(),
::streamed_mutation::forwarding::no,
::mutation_reader::forwarding::no,
_monitor_generator);
}
std::string_view report_start_desc() const override {
return "Compacting";
}
std::string_view report_finish_desc() const override {
return "Compacted";
}
virtual compaction_writer create_compaction_writer(const dht::decorated_key& dk) override {
auto sst = _sstable_creator(this_shard_id());
setup_new_sstable(sst);
auto monitor = std::make_unique<compaction_write_monitor>(sst, _table_s, maximum_timestamp(), _sstable_level);
sstable_writer_config cfg = make_sstable_writer_config(_type);
cfg.monitor = monitor.get();
return compaction_writer{std::move(monitor), sst->get_writer(*_schema, partitions_per_sstable(), cfg, get_encoding_stats(), _io_priority), sst};
}
virtual void stop_sstable_writer(compaction_writer* writer) override {
if (writer) {
finish_new_sstable(writer);
maybe_replace_exhausted_sstables_by_sst(writer->sst);
}
}
void on_new_partition() override {
update_pending_ranges();
}
virtual void on_end_of_compaction() override {
replace_remaining_exhausted_sstables();
}
private:
void maybe_replace_exhausted_sstables_by_sst(shared_sstable sst) {
// Skip early replacement of exhausted sstables if the compaction works only with
// single-fragment runs, i.e. incremental compaction is disabled for this compaction.
if (!enable_garbage_collected_sstable_writer()) {
return;
}
auto permit = seastar::get_units(_replacer_lock, 1).get0();
// Replace exhausted sstable(s), if any, by new one(s) in the column family.
auto not_exhausted = [s = _schema, &dk = sst->get_last_decorated_key()] (shared_sstable& sst) {
return sst->get_last_decorated_key().tri_compare(*s, dk) > 0;