Skip to content

Commit 78159d7

Browse files
drcrallennishantmonu51
authored andcommitted
Move off-heap QTL global cache delete lock outside of subclass lock (apache#3597)
* Move off-heap QTL global cache delete lock outside of subclass lock * Make `delete` thread safe
1 parent 0799640 commit 78159d7

File tree

4 files changed

+104
-81
lines changed

4 files changed

+104
-81
lines changed

extensions-core/lookups-cached-global/src/main/java/io/druid/server/lookup/namespace/cache/NamespaceExtractionCacheManager.java

+87-65
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import io.druid.query.lookup.namespace.ExtractionNamespace;
3838
import io.druid.query.lookup.namespace.ExtractionNamespaceCacheFactory;
3939

40+
import javax.annotation.concurrent.GuardedBy;
4041
import java.util.Collection;
4142
import java.util.Map;
4243
import java.util.UUID;
@@ -49,6 +50,7 @@
4950
import java.util.concurrent.TimeUnit;
5051
import java.util.concurrent.atomic.AtomicBoolean;
5152
import java.util.concurrent.atomic.AtomicLong;
53+
import java.util.concurrent.atomic.AtomicReference;
5254

5355
/**
5456
*
@@ -71,16 +73,17 @@ public NamespaceImplData(
7173
final ListenableFuture<?> future;
7274
final ExtractionNamespace namespace;
7375
final String name;
76+
final Object changeLock = new Object();
7477
final AtomicBoolean enabled = new AtomicBoolean(false);
7578
final CountDownLatch firstRun = new CountDownLatch(1);
79+
final AtomicReference<String> latestVersion = new AtomicReference<>(null);
7680
}
7781

7882
private static final Logger log = new Logger(NamespaceExtractionCacheManager.class);
7983
private final ListeningScheduledExecutorService listeningScheduledExecutorService;
8084
protected final ConcurrentMap<String, NamespaceImplData> implData = new ConcurrentHashMap<>();
8185
protected final AtomicLong tasksStarted = new AtomicLong(0);
8286
protected final ServiceEmitter serviceEmitter;
83-
private final ConcurrentHashMap<String, String> lastVersion = new ConcurrentHashMap<>();
8487
private final Map<Class<? extends ExtractionNamespace>, ExtractionNamespaceCacheFactory<?>> namespaceFunctionFactoryMap;
8588

8689
public NamespaceExtractionCacheManager(
@@ -148,10 +151,8 @@ protected boolean waitForServiceToEnd(long time, TimeUnit unit) throws Interrupt
148151
}
149152

150153

151-
protected <T extends ExtractionNamespace> Runnable getPostRunnable(
154+
protected Runnable getPostRunnable(
152155
final String id,
153-
final T namespace,
154-
final ExtractionNamespaceCacheFactory<T> factory,
155156
final String cacheId
156157
)
157158
{
@@ -165,17 +166,20 @@ public void run()
165166
// was removed
166167
return;
167168
}
168-
synchronized (namespaceDatum.enabled) {
169-
try {
169+
try {
170+
if (!namespaceDatum.enabled.get()) {
171+
// skip because it was disabled
172+
return;
173+
}
174+
synchronized (namespaceDatum.enabled) {
170175
if (!namespaceDatum.enabled.get()) {
171-
// skip because it was disabled
172176
return;
173177
}
174178
swapAndClearCache(id, cacheId);
175179
}
176-
finally {
177-
namespaceDatum.firstRun.countDown();
178-
}
180+
}
181+
finally {
182+
namespaceDatum.firstRun.countDown();
179183
}
180184
}
181185
};
@@ -221,7 +225,10 @@ public boolean scheduleOrUpdate(
221225
if (log.isDebugEnabled()) {
222226
log.debug("Namespace [%s] needs updated to [%s]", implDatum.namespace, namespace);
223227
}
224-
removeNamespaceLocalMetadata(implDatum);
228+
// Ensure it is not changing state right now.
229+
synchronized (implDatum.changeLock) {
230+
removeNamespaceLocalMetadata(implDatum);
231+
}
225232
schedule(id, namespace);
226233
return true;
227234
}
@@ -257,59 +264,59 @@ public boolean scheduleAndWait(
257264
return success;
258265
}
259266

267+
@GuardedBy("implDatum.changeLock")
260268
private void cancelFuture(final NamespaceImplData implDatum)
261269
{
262-
synchronized (implDatum.enabled) {
263-
final CountDownLatch latch = new CountDownLatch(1);
264-
final ListenableFuture<?> future = implDatum.future;
265-
Futures.addCallback(
266-
future, new FutureCallback<Object>()
270+
final CountDownLatch latch = new CountDownLatch(1);
271+
final ListenableFuture<?> future = implDatum.future;
272+
Futures.addCallback(
273+
future, new FutureCallback<Object>()
274+
{
275+
@Override
276+
public void onSuccess(Object result)
267277
{
268-
@Override
269-
public void onSuccess(Object result)
270-
{
271-
latch.countDown();
272-
}
278+
latch.countDown();
279+
}
273280

274-
@Override
275-
public void onFailure(Throwable t)
276-
{
277-
// Expect CancellationException
278-
latch.countDown();
279-
if (!(t instanceof CancellationException)) {
280-
log.error(t, "Error in namespace [%s]", implDatum.name);
281-
}
281+
@Override
282+
public void onFailure(Throwable t)
283+
{
284+
// Expect CancellationException
285+
latch.countDown();
286+
if (!(t instanceof CancellationException)) {
287+
log.error(t, "Error in namespace [%s]", implDatum.name);
282288
}
283289
}
284-
);
285-
if (!future.isDone()
286-
&& !future.cancel(true)) { // Interrupt to make sure we don't pollute stuff after we've already cleaned up
287-
throw new ISE("Future for namespace [%s] was not able to be canceled", implDatum.name);
288-
}
289-
try {
290-
latch.await();
291-
}
292-
catch (InterruptedException e) {
293-
Thread.currentThread().interrupt();
294-
throw Throwables.propagate(e);
295-
}
290+
}
291+
);
292+
if (!future.isDone()
293+
&& !future.cancel(true)) { // Interrupt to make sure we don't pollute stuff after we've already cleaned up
294+
throw new ISE("Future for namespace [%s] was not able to be canceled", implDatum.name);
295+
}
296+
try {
297+
latch.await();
298+
}
299+
catch (InterruptedException e) {
300+
Thread.currentThread().interrupt();
301+
throw Throwables.propagate(e);
296302
}
297303
}
298304

305+
// Not thread safe
306+
@GuardedBy("implDatum.changeLock")
299307
private boolean removeNamespaceLocalMetadata(final NamespaceImplData implDatum)
300308
{
301309
if (implDatum == null) {
302310
return false;
303311
}
304-
synchronized (implDatum.enabled) {
305-
if (!implDatum.enabled.compareAndSet(true, false)) {
306-
return false;
307-
}
308-
if (!implDatum.future.isDone()) {
309-
cancelFuture(implDatum);
310-
}
311-
return implData.remove(implDatum.name, implDatum);
312+
// "Leader" election for doing the deletion
313+
if (!implDatum.enabled.compareAndSet(true, false)) {
314+
return false;
315+
}
316+
if (!implDatum.future.isDone()) {
317+
cancelFuture(implDatum);
312318
}
319+
return implData.remove(implDatum.name, implDatum);
313320
}
314321

315322
// Optimistic scheduling of updates to a namespace.
@@ -321,7 +328,7 @@ public <T extends ExtractionNamespace> ListenableFuture<?> schedule(final String
321328
throw new ISE("Cannot find factory for namespace [%s]", namespace);
322329
}
323330
final String cacheId = String.format("namespace-cache-%s-%s", id, UUID.randomUUID().toString());
324-
return schedule(id, namespace, factory, getPostRunnable(id, namespace, factory, cacheId), cacheId);
331+
return schedule(id, namespace, factory, getPostRunnable(id, cacheId), cacheId);
325332
}
326333

327334
// For testing purposes this is protected
@@ -336,7 +343,7 @@ protected <T extends ExtractionNamespace> ListenableFuture<?> schedule(
336343
log.debug("Trying to update namespace [%s]", id);
337344
final NamespaceImplData implDatum = implData.get(id);
338345
if (implDatum != null) {
339-
synchronized (implDatum.enabled) {
346+
synchronized (implDatum.changeLock) {
340347
if (implDatum.enabled.get()) {
341348
// We also check at the end of the function, but fail fast here
342349
throw new IAE("Namespace [%s] already exists! Leaving prior running", namespace.toString());
@@ -345,6 +352,8 @@ protected <T extends ExtractionNamespace> ListenableFuture<?> schedule(
345352
}
346353
final long updateMs = namespace.getPollMs();
347354
final CountDownLatch startLatch = new CountDownLatch(1);
355+
// Must be set before leader election occurs or else runnable will fail
356+
final AtomicReference<NamespaceImplData> implDataAtomicReference = new AtomicReference<>(null);
348357

349358
final Runnable command = new Runnable()
350359
{
@@ -354,8 +363,13 @@ public void run()
354363
try {
355364
startLatch.await(); // wait for "election" to leadership or cancellation
356365
if (!Thread.currentThread().isInterrupted()) {
366+
final NamespaceImplData implData = implDataAtomicReference.get();
367+
if (implData == null) {
368+
// should never happen
369+
throw new NullPointerException(String.format("No data for namespace [%s]", id));
370+
}
357371
final Map<String, String> cache = getCacheMap(cacheId);
358-
final String preVersion = lastVersion.get(id);
372+
final String preVersion = implData.latestVersion.get();
359373
final Callable<String> runnable = factory.getCachePopulator(id, namespace, preVersion, cache);
360374

361375
tasksStarted.incrementAndGet();
@@ -364,7 +378,9 @@ public void run()
364378
throw new CancellationException(String.format("Version `%s` already exists", preVersion));
365379
}
366380
if (newVersion != null) {
367-
lastVersion.put(id, newVersion);
381+
if (!implData.latestVersion.compareAndSet(preVersion, newVersion)) {
382+
log.wtf("Somehow multiple threads are updating the same implData for [%s]", id);
383+
}
368384
}
369385
postRunnable.run();
370386
log.debug("Namespace [%s] successfully updated", id);
@@ -392,7 +408,9 @@ public void run()
392408
future = listeningScheduledExecutorService.schedule(command, 0, TimeUnit.MILLISECONDS);
393409
}
394410

411+
// Do not need to synchronize here as we haven't set enabled to true yet, and haven't released startLatch
395412
final NamespaceImplData me = new NamespaceImplData(future, namespace, id);
413+
implDataAtomicReference.set(me);
396414
final NamespaceImplData other = implData.putIfAbsent(id, me);
397415
if (other != null) {
398416
if (!future.isDone() && !future.cancel(true)) {
@@ -433,8 +451,6 @@ public void run()
433451

434452
/**
435453
* Clears out resources used by the namespace such as threads. Implementations may override this and call super.delete(...) if they have resources of their own which need cleared.
436-
* <p/>
437-
* This particular method is NOT thread safe, and any impl which is intended to be thread safe should safe-guard calls to this method.
438454
*
439455
* @param ns The namespace to be deleted
440456
*
@@ -445,25 +461,31 @@ public void run()
445461
public boolean delete(final String ns)
446462
{
447463
final NamespaceImplData implDatum = implData.get(ns);
448-
final boolean deleted = removeNamespaceLocalMetadata(implDatum);
449-
// At this point we have won leader election on canceling this implDatum
450-
if (deleted) {
451-
log.info("Deleting namespace [%s]", ns);
452-
lastVersion.remove(implDatum.name);
453-
return true;
454-
} else {
455-
log.debug("Did not delete namespace [%s]", ns);
464+
if (implDatum == null) {
465+
log.debug("Found no running cache for [%s]", ns);
456466
return false;
457467
}
468+
synchronized (implDatum.changeLock) {
469+
if (removeNamespaceLocalMetadata(implDatum)) {
470+
log.info("Deleted namespace [%s]", ns);
471+
return true;
472+
} else {
473+
log.debug("Did not delete namespace [%s]", ns);
474+
return false;
475+
}
476+
}
458477
}
459478

460479
public String getVersion(String namespace)
461480
{
462481
if (namespace == null) {
463482
return null;
464-
} else {
465-
return lastVersion.get(namespace);
466483
}
484+
final NamespaceImplData implDatum = implData.get(namespace);
485+
if (implDatum == null) {
486+
return null;
487+
}
488+
return implDatum.latestVersion.get();
467489
}
468490

469491
public Collection<String> getKnownIDs()

extensions-core/lookups-cached-global/src/main/java/io/druid/server/lookup/namespace/cache/OffHeapNamespaceExtractionCacheManager.java

+10-11
Original file line numberDiff line numberDiff line change
@@ -134,22 +134,21 @@ protected boolean swapAndClearCache(String namespaceKey, String cacheKey)
134134
@Override
135135
public boolean delete(final String namespaceKey)
136136
{
137+
// `super.delete` has a synchronization in it, don't call it in the lock.
138+
if (!super.delete(namespaceKey)) {
139+
return false;
140+
}
137141
final Lock lock = nsLocks.get(namespaceKey);
138142
lock.lock();
139143
try {
140-
if (super.delete(namespaceKey)) {
141-
final String mmapDBkey = currentNamespaceCache.remove(namespaceKey);
142-
if (mmapDBkey != null) {
143-
final long pre = tmpFile.length();
144-
mmapDB.delete(mmapDBkey);
145-
log.debug("MapDB file size: pre %d post %d", pre, tmpFile.length());
146-
return true;
147-
} else {
148-
return false;
149-
}
150-
} else {
144+
final String mmapDBkey = currentNamespaceCache.remove(namespaceKey);
145+
if (mmapDBkey == null) {
151146
return false;
152147
}
148+
final long pre = tmpFile.length();
149+
mmapDB.delete(mmapDBkey);
150+
log.debug("MapDB file size: pre %d post %d", pre, tmpFile.length());
151+
return true;
153152
}
154153
finally {
155154
lock.unlock();

extensions-core/lookups-cached-global/src/main/java/io/druid/server/lookup/namespace/cache/OnHeapNamespaceExtractionCacheManager.java

+5-1
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,14 @@ public ConcurrentMap<String, String> getCacheMap(String namespaceOrCacheKey)
9393
@Override
9494
public boolean delete(final String namespaceKey)
9595
{
96+
// `super.delete` has a synchronization in it, don't call it in the lock.
97+
if (!super.delete(namespaceKey)) {
98+
return false;
99+
}
96100
final Lock lock = nsLocks.get(namespaceKey);
97101
lock.lock();
98102
try {
99-
return super.delete(namespaceKey) && mapMap.remove(namespaceKey) != null;
103+
return mapMap.remove(namespaceKey) != null;
100104
}
101105
finally {
102106
lock.unlock();

extensions-core/lookups-cached-global/src/test/java/io/druid/server/lookup/namespace/cache/NamespaceExtractionCacheManagerExecutorsTest.java

+2-4
Original file line numberDiff line numberDiff line change
@@ -123,14 +123,12 @@ lifecycle, new NoopServiceEmitter(),
123123
)
124124
{
125125
@Override
126-
protected <T extends ExtractionNamespace> Runnable getPostRunnable(
126+
protected Runnable getPostRunnable(
127127
final String id,
128-
final T namespace,
129-
final ExtractionNamespaceCacheFactory<T> factory,
130128
final String cacheId
131129
)
132130
{
133-
final Runnable runnable = super.getPostRunnable(id, namespace, factory, cacheId);
131+
final Runnable runnable = super.getPostRunnable(id, cacheId);
134132
cacheUpdateAlerts.putIfAbsent(id, new Object());
135133
final Object cacheUpdateAlerter = cacheUpdateAlerts.get(id);
136134
return new Runnable()

0 commit comments

Comments
 (0)