Skip to content

Commit

Permalink
NIFI-54096: Consolidating new model into existing List processors.
Browse files Browse the repository at this point in the history
Adds the following:

- Use separate DistributedMapCache for tracking entities to avoid
conflict with existing code
- Added more validation
- Delete listed entities from cache if reset is needed
- Support Local scope
- Added Initial Listing Target

This closes apache#2876.

Signed-off-by: Mark Payne <[email protected]>
  • Loading branch information
ijokarumawak authored and markap14 committed Jul 17, 2018
1 parent 0a493bf commit 8b9d446
Show file tree
Hide file tree
Showing 11 changed files with 705 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.state.Scope;
import org.apache.nifi.context.PropertyContext;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.list.AbstractListProcessor;
import org.apache.nifi.processor.util.list.ListedEntityTracker;
import org.apache.nifi.processors.azure.storage.utils.AzureStorageUtils;
import org.apache.nifi.processors.azure.storage.utils.BlobInfo;
import org.apache.nifi.processors.azure.storage.utils.BlobInfo.Builder;
Expand Down Expand Up @@ -92,23 +94,26 @@ public class ListAzureBlobStorage extends AbstractListProcessor<BlobInfo> {
.build();

private static final List<PropertyDescriptor> PROPERTIES = Collections.unmodifiableList(Arrays.asList(
LISTING_STRATEGY,
AzureStorageUtils.CONTAINER,
AzureStorageUtils.PROP_SAS_TOKEN,
AzureStorageUtils.ACCOUNT_NAME,
AzureStorageUtils.ACCOUNT_KEY,
PROP_PREFIX,
AzureStorageUtils.PROXY_CONFIGURATION_SERVICE));
AzureStorageUtils.PROXY_CONFIGURATION_SERVICE,
ListedEntityTracker.TRACKING_STATE_CACHE,
ListedEntityTracker.TRACKING_TIME_WINDOW,
ListedEntityTracker.INITIAL_LISTING_TARGET
));

/**
 * Exposes the property descriptors supported by this processor.
 *
 * @return the statically defined, unmodifiable {@code PROPERTIES} list
 */
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
    final List<PropertyDescriptor> supportedDescriptors = PROPERTIES;
    return supportedDescriptors;
}

@Override
protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
final Collection<ValidationResult> results = AzureStorageUtils.validateCredentialProperties(validationContext);
protected void customValidate(ValidationContext validationContext, Collection<ValidationResult> results) {
AzureStorageUtils.validateProxySpec(validationContext, results);
return results;
}

@Override
Expand Down Expand Up @@ -144,7 +149,7 @@ protected boolean isListingResetNecessary(final PropertyDescriptor property) {
}

@Override
protected Scope getStateScope(final ProcessContext context) {
protected Scope getStateScope(final PropertyContext context) {
return Scope.CLUSTER;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,4 +209,9 @@ public String getIdentifier() {
public long getTimestamp() {
    // The listing timestamp of this entity is whatever getLastModifiedTime() reports.
    final long lastModified = getLastModifiedTime();
    return lastModified;
}

/**
 * Reports the size of this entity, taken from the {@code length} field.
 *
 * @return the value of {@code length}
 */
@Override
public long getSize() {
    final long contentLength = length;
    return contentLength;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@
import org.apache.nifi.annotation.notification.PrimaryNodeState;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.state.Scope;
import org.apache.nifi.components.state.StateManager;
import org.apache.nifi.components.state.StateMap;
import org.apache.nifi.context.PropertyContext;
import org.apache.nifi.distributed.cache.client.Deserializer;
import org.apache.nifi.distributed.cache.client.DistributedMapCacheClient;
import org.apache.nifi.distributed.cache.client.Serializer;
Expand All @@ -49,6 +52,7 @@
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
Expand All @@ -62,8 +66,8 @@

/**
* <p>
* An Abstract Processor that is intended to simplify the coding required in order to perform Listing operations of remote resources.
* Those remote resources may be files, "objects", "messages", or any other sort of entity that may need to be listed in such a way that
* An Abstract Processor that is intended to simplify the coding required in order to perform Listing operations of remote or local resources.
* Those resources may be files, "objects", "messages", or any other sort of entity that may need to be listed in such a way that
* we identify the entity only once. Each of these objects, messages, etc. is referred to as an "entity" for the scope of this Processor.
* </p>
* <p>
Expand All @@ -83,6 +87,9 @@
* than the last timestamp pulled, then the entity is considered new.
* </li>
* <li>
* With 'Tracking Entities' strategy, the size of entity content is also used to determine if an entity is "new". If the size changes the entity is considered "new".
* </li>
* <li>
* Entity must have a user-readable name that can be used for logging purposes.
* </li>
* </ul>
Expand All @@ -96,18 +103,19 @@
* NOTE: This processor performs migrations of legacy state mechanisms inclusive of locally stored, file-based state and the optional utilization of the <code>Distributed Cache
* Service</code> property to the new {@link StateManager} functionality. Upon successful migration, the associated data from one or both of the legacy mechanisms is purged.
* </p>
*
* <p>
* For each new entity that is listed, the Processor will send a FlowFile to the 'success' relationship. The FlowFile will have no content but will have some set
* of attributes (defined by the concrete implementation) that can be used to fetch those remote resources or interact with them in whatever way makes sense for
* of attributes (defined by the concrete implementation) that can be used to fetch those resources or interact with them in whatever way makes sense for
* the configured dataflow.
* </p>
* <p>
* Subclasses are responsible for the following:
* </p>
* <ul>
* <li>
* Perform a listing of remote resources. The subclass will implement the {@link #performListing(ProcessContext, Long)} method, which creates a listing of all
* entities on the remote system that have timestamps later than the provided timestamp. If the entities returned have a timestamp before the provided one, those
* Perform a listing of resources. The subclass will implement the {@link #performListing(ProcessContext, Long)} method, which creates a listing of all
* entities on the target system that have timestamps later than the provided timestamp. If the entities returned have a timestamp before the provided one, those
* entities will be filtered out. It is therefore not necessary to perform the filtering of timestamps but is provided in order to give the implementation the ability
* to filter those resources on the server side rather than pulling back all of the information, if it makes sense to do so in the concrete implementation.
* </li>
Expand Down Expand Up @@ -138,9 +146,11 @@ public abstract class AbstractListProcessor<T extends ListableEntity> extends Ab

public static final PropertyDescriptor DISTRIBUTED_CACHE_SERVICE = new PropertyDescriptor.Builder()
.name("Distributed Cache Service")
.description("Specifies the Controller Service that should be used to maintain state about what has been pulled from the remote server so that if a new node "
+ "begins pulling data, it won't duplicate all of the work that has been done. If not specified, the information will not be shared across the cluster. "
+ "This property does not need to be set for standalone instances of NiFi but should be configured if NiFi is run within a cluster.")
.description("NOTE: This property is used merely for migration from old NiFi version before state management was introduced at version 0.5.0. "
+ "The stored value in the cache service will be migrated into the state when this processor is started at the first time. "
+ "The specified Controller Service was used to maintain state about what had been pulled from the remote server so that if a new node "
+ "begins pulling data, it won't duplicate all of the work that has been done. If not specified, the information was not shared across the cluster. "
+ "This property did not need to be set for standalone instances of NiFi but was supposed to be configured if NiFi had been running within a cluster.")
.required(false)
.identifiesControllerService(DistributedMapCacheClient.class)
.build();
Expand Down Expand Up @@ -169,6 +179,28 @@ public abstract class AbstractListProcessor<T extends ListableEntity> extends Ab
.description("All FlowFiles that are received are routed to success")
.build();

/**
 * Listing strategy value "timestamps" (displayed as "Tracking Timestamps"):
 * new/updated entities are determined from the latest tracked entity timestamp.
 */
public static final AllowableValue BY_TIMESTAMPS = new AllowableValue(
    "timestamps",
    "Tracking Timestamps",
    "This strategy tracks the latest timestamp of listed entity to determine new/updated entities. "
        + "Since it only tracks few timestamps, it can manage listing state efficiently. "
        + "However, any newly added, or updated entity having timestamp older than the tracked latest timestamp can not be picked by this strategy. "
        + "For example, such situation can happen in a file system if a file with old timestamp "
        + "is copied or moved into the target directory without its last modified timestamp being updated.");

/**
 * Listing strategy value "entities" (displayed as "Tracking Entities"):
 * new/updated entities are determined by tracking every entity listed within
 * the 'Entity Tracking Time Window'.
 */
public static final AllowableValue BY_ENTITIES = new AllowableValue("entities", "Tracking Entities",
    "This strategy tracks information of all the listed entities within the latest 'Entity Tracking Time Window' to determine new/updated entities." +
    // The other strategy's display name is 'Tracking Timestamps'; the text previously said 'Tracing Timestamps'.
    " This strategy can pick entities having old timestamp that can be missed with 'Tracking Timestamps'." +
    " However additional DistributedMapCache controller service is required and more JVM heap memory is used." +
    " See the description of 'Entity Tracking Time Window' property for further details on how it works.");

/**
 * Required property selecting how new/updated entities are determined.
 * Accepts {@link #BY_TIMESTAMPS} or {@link #BY_ENTITIES}; defaults to {@link #BY_TIMESTAMPS}.
 */
public static final PropertyDescriptor LISTING_STRATEGY = new PropertyDescriptor.Builder()
    .name("listing-strategy")
    .displayName("Listing Strategy")
    .required(true)
    .description("Specify how to determine new/updated entities. See each strategy descriptions for detail.")
    .allowableValues(BY_TIMESTAMPS, BY_ENTITIES)
    .defaultValue(BY_TIMESTAMPS.getValue())
    .build();

/**
* Represents the timestamp of an entity which was the latest one within those listed at the previous cycle.
* It does not necessarily mean it has been processed as well.
Expand All @@ -185,6 +217,8 @@ public abstract class AbstractListProcessor<T extends ListableEntity> extends Ab
private volatile boolean resetState = false;
private volatile List<String> latestIdentifiersProcessed = new ArrayList<>();

private volatile ListedEntityTracker<T> listedEntityTracker;

/*
* A constant used in determining an internal "yield" of processing files. Given the logic to provide a pause on the newest
* files according to timestamp, it is ensured that at least the specified millis has elapsed to avoid getting scheduled
Expand All @@ -206,14 +240,6 @@ public File getPersistenceFile() {
return new File("conf/state/" + getIdentifier());
}

/**
 * Builds the list of property descriptors supported by this abstract processor.
 *
 * @return a new mutable list holding {@code DISTRIBUTED_CACHE_SERVICE} followed by
 *         {@code TARGET_SYSTEM_TIMESTAMP_PRECISION}
 */
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
    final List<PropertyDescriptor> descriptors = new ArrayList<>();
    Collections.addAll(descriptors, DISTRIBUTED_CACHE_SERVICE, TARGET_SYSTEM_TIMESTAMP_PRECISION);
    return descriptors;
}

@Override
public void onPropertyModified(final PropertyDescriptor descriptor, final String oldValue, final String newValue) {
if (isConfigurationRestored() && isListingResetNecessary(descriptor)) {
Expand All @@ -230,6 +256,32 @@ public Set<Relationship> getRelationships() {
return relationships;
}

/**
 * Validates the listing-strategy related configuration and then gives sub-classes a chance to
 * contribute their own results.
 * <p>
 * This method is final; in order to add custom validation at sub-classes, implement the
 * {@link #customValidate(ValidationContext, Collection)} method.
 * </p>
 *
 * @param context the validation context
 * @return the validation results collected from the entity-tracking validation (only when the
 *         'Tracking Entities' strategy is selected) and from the sub-class hook
 */
@Override
protected final Collection<ValidationResult> customValidate(ValidationContext context) {
    final Collection<ValidationResult> results = new ArrayList<>();

    final String listingStrategy = context.getProperty(LISTING_STRATEGY).getValue();
    // Compare String with String (as initListedEntityTracker does) rather than relying on
    // AllowableValue.equals(Object) accepting a raw String argument.
    if (BY_ENTITIES.getValue().equals(listingStrategy)) {
        ListedEntityTracker.validateProperties(context, results, getStateScope(context));
    }

    customValidate(context, results);
    return results;
}


/**
* Sub-classes can add custom validation by implementing this method.
* It is invoked by the final {@code customValidate(ValidationContext)} method, and whatever is added to
* {@code validationResults} is returned from that method. The default implementation adds nothing.
* @param validationContext the validation context
* @param validationResults add custom validation result to this collection
*/
protected void customValidate(ValidationContext validationContext, Collection<ValidationResult> validationResults) {
// Intentionally empty: no additional validation by default.
}

@OnPrimaryNodeStateChange
public void onPrimaryNodeChange(final PrimaryNodeState newState) {
justElectedPrimaryNode = (newState == PrimaryNodeState.ELECTED_PRIMARY_NODE);
Expand Down Expand Up @@ -260,7 +312,6 @@ public final void updateState(final ProcessContext context) throws IOException {

if (resetState) {
context.getStateManager().clear(getStateScope(context));
resetState = false;
}
}

Expand Down Expand Up @@ -352,9 +403,24 @@ private EntityListing deserialize(final String serializedState) throws JsonParse
return mapper.readValue(serializedState, EntityListing.class);
}


/**
 * Dispatches the listing to the implementation matching the configured 'Listing Strategy'.
 *
 * @param context the process context providing the configured property values
 * @param session the process session handed to the selected listing implementation
 * @throws ProcessException if the configured listing strategy is not one of the known values
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {

    // A pending reset request is consumed once the processor is actually triggered.
    resetState = false;

    final String listingStrategy = context.getProperty(LISTING_STRATEGY).getValue();
    // Compare String with String (as initListedEntityTracker does) rather than relying on
    // AllowableValue.equals(Object) accepting a raw String argument.
    if (BY_TIMESTAMPS.getValue().equals(listingStrategy)) {
        listByTrackingTimestamps(context, session);

    } else if (BY_ENTITIES.getValue().equals(listingStrategy)) {
        listByTrackingEntities(context, session);

    } else {
        throw new ProcessException("Unknown listing strategy: " + listingStrategy);
    }
}

public void listByTrackingTimestamps(final ProcessContext context, final ProcessSession session) throws ProcessException {
Long minTimestampToListMillis = lastListedLatestEntryTimestampMillis;

if (this.lastListedLatestEntryTimestampMillis == null || this.lastProcessedLatestEntryTimestampMillis == null || justElectedPrimaryNode) {
Expand Down Expand Up @@ -624,7 +690,7 @@ private void resetTimeStates() {
* @param context the ProcessContext to use in order to make a determination
* @return a Scope that specifies where the state should be managed for this Processor
*/
protected abstract Scope getStateScope(final ProcessContext context);
protected abstract Scope getStateScope(final PropertyContext context);


private static class StringSerDe implements Serializer<String>, Deserializer<String> {
Expand All @@ -642,4 +708,41 @@ public void serialize(final String value, final OutputStream out) throws Seriali
out.write(value.getBytes(StandardCharsets.UTF_8));
}
}

/**
 * Prepares the entity tracker when the processor is scheduled.
 * <p>
 * An existing tracker has its listed entities cleared when a reset was requested or when the
 * 'Tracking Entities' strategy is no longer selected. Afterwards a tracker is created if that strategy
 * is selected and none exists yet; otherwise the tracker reference is dropped.
 * </p>
 *
 * @param context the process context used to read the configured listing strategy
 * @throws RuntimeException if clearing previously listed entities fails with an {@link IOException}
 */
@OnScheduled
public void initListedEntityTracker(ProcessContext context) {
    final String configuredStrategy = context.getProperty(LISTING_STRATEGY).getValue();
    final boolean trackingEntities = BY_ENTITIES.getValue().equals(configuredStrategy);

    final boolean mustClear = listedEntityTracker != null && (resetState || !trackingEntities);
    if (mustClear) {
        try {
            listedEntityTracker.clearListedEntities();
        } catch (IOException e) {
            throw new RuntimeException("Failed to reset previously listed entities due to " + e, e);
        }
    }

    if (!trackingEntities) {
        listedEntityTracker = null;
        return;
    }

    if (listedEntityTracker == null) {
        listedEntityTracker = createListedEntityTracker();
    }
}

/**
 * Creates the tracker used by the 'Tracking Entities' strategy.
 * Being protected, it can be overridden by sub-classes to supply a different tracker.
 *
 * @return a new tracker built from this component's identifier and logger
 */
protected ListedEntityTracker<T> createListedEntityTracker() {
    final ListedEntityTracker<T> tracker = new ListedEntityTracker<T>(getIdentifier(), getLogger());
    return tracker;
}

/**
 * Performs a listing using the 'Tracking Entities' strategy by delegating to the entity tracker.
 * The tracker is given a listing callback (an {@link IOException} from the listing is logged and
 * turned into an empty listing) and a callback producing the attributes for an entity.
 * The "just elected primary node" flag is cleared afterwards.
 *
 * @param context the process context
 * @param session the process session
 * @throws ProcessException declared for callers; not thrown directly by this method
 */
private void listByTrackingEntities(ProcessContext context, ProcessSession session) throws ProcessException {
    final boolean electedPrimary = justElectedPrimaryNode;
    final Scope stateScope = getStateScope(context);

    listedEntityTracker.trackEntities(context, session, electedPrimary, stateScope,
        minTimestamp -> {
            try {
                return performListing(context, minTimestamp);
            } catch (final IOException ioe) {
                getLogger().error("Failed to perform listing on remote host due to {}", ioe);
                return Collections.emptyList();
            }
        },
        listedEntity -> createAttributes(listedEntity, context));

    justElectedPrimaryNode = false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,9 @@ public interface ListableEntity {
*/
long getTimestamp();

/**
* Returns the size of this entity's content.
* NOTE(review): the unit is presumably bytes (ListedEntity documents its size field as bytes) - confirm.
*
* @return the size of the entity content.
*/
long getSize();

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processor.util.list;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

/**
 * Immutable value holder for the timestamp and size of a listed entity.
 * Instances can be created by Jackson through the annotated constructor, using the
 * JSON properties {@code timestamp} and {@code size}.
 */
public class ListedEntity {

    /** Timestamp of the entity, in milliseconds. */
    private final long timestamp;

    /** Size of the entity, in bytes. */
    private final long size;

    /**
     * @param timestamp the entity timestamp in milliseconds
     * @param size the entity size in bytes
     */
    @JsonCreator
    public ListedEntity(@JsonProperty("timestamp") long timestamp, @JsonProperty("size") long size) {
        this.size = size;
        this.timestamp = timestamp;
    }

    /**
     * @return the entity size in bytes
     */
    public long getSize() {
        return this.size;
    }

    /**
     * @return the entity timestamp in milliseconds
     */
    public long getTimestamp() {
        return this.timestamp;
    }
}
Loading

0 comments on commit 8b9d446

Please sign in to comment.