forked from apache/samza
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SAMZA-1158 Adding monitor to clean up stale local stores of tasks
- Loading branch information
Shanthoosh Venkataraman
authored and
Xinyu Liu
committed
Mar 20, 2017
1 parent
26280ca
commit bd71d60
Showing
8 changed files
with
571 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
132 changes: 132 additions & 0 deletions
132
samza-rest/src/main/java/org/apache/samza/monitor/JobsClient.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.samza.monitor; | ||
|
||
import com.google.common.base.Preconditions; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.function.Function; | ||
import org.apache.commons.httpclient.HttpClient; | ||
import org.apache.commons.httpclient.methods.GetMethod; | ||
import org.apache.samza.SamzaException; | ||
import org.apache.samza.rest.model.Job; | ||
import org.apache.samza.rest.model.JobStatus; | ||
import org.apache.samza.rest.model.Task; | ||
import org.apache.samza.rest.proxy.job.JobInstance; | ||
import org.apache.samza.rest.resources.ResourceConstants; | ||
import org.codehaus.jackson.map.ObjectMapper; | ||
import org.codehaus.jackson.type.TypeReference; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
||
/** | ||
* This is a helper class to interact with the samza-rest apis. | ||
* It contains the functionality to read the tasks associated with a samza job, to get the status of a samza job. | ||
*/ | ||
public class JobsClient { | ||
|
||
private static final Logger LOG = LoggerFactory.getLogger(JobsClient.class); | ||
|
||
private final HttpClient httpClient; | ||
|
||
// list of jobStatusServers that will be used, where each jobStatusServer is of the form Host:Port | ||
private final List<String> jobStatusServers; | ||
|
||
/** | ||
* @param jobStatusServers list of jobStatusServers, where each jobStatusServer is of the form Host:Port | ||
*/ | ||
public JobsClient(List<String> jobStatusServers) { | ||
Preconditions.checkState(!jobStatusServers.isEmpty(), "Job status servers cannot be empty."); | ||
this.jobStatusServers = new ArrayList<>(jobStatusServers); | ||
this.httpClient = new HttpClient(); | ||
} | ||
|
||
/** | ||
* This method retrieves and returns the list of tasks that are associated with a JobInstance. | ||
* @param jobInstance an instance of the samza job. | ||
* @return the list of tasks that are associated with the samza job. | ||
* @throws SamzaException if there were any problems with the http request. | ||
*/ | ||
public List<Task> getTasks(JobInstance jobInstance) { | ||
return retriableHttpGet(baseUrl -> String.format(ResourceConstants.GET_TASKS_URL, baseUrl, | ||
jobInstance.getJobName(), jobInstance.getJobId())); | ||
} | ||
|
||
/** | ||
* This method should be used to find the JobStatus of a jobInstance. | ||
* @param jobInstance a instance of the job. | ||
* @return the job status of the {@link JobInstance}. | ||
* @throws SamzaException if there are any problems with the http request. | ||
*/ | ||
public JobStatus getJobStatus(JobInstance jobInstance) { | ||
Job job = retriableHttpGet(baseUrl -> String.format(ResourceConstants.GET_JOBS_URL, baseUrl, | ||
jobInstance.getJobName(), jobInstance.getJobId())); | ||
return job.getStatus(); | ||
} | ||
|
||
/** | ||
* | ||
* This method initiates http get request to the job status servers sequentially, | ||
* returns the first response from an job status server that returns a 2xx code(success response). | ||
* When a job status server is down or returns a error response, it tries to reach out to | ||
* the next job status server in the sequence, to complete the http get request. | ||
* | ||
* @param urlMapFunction to build the request url, given job status server base url. | ||
* @param <T> return type of the http get response. | ||
* @return the response from any one of the job status server. | ||
* @throws Exception when all the job status servers are unavailable. | ||
* | ||
*/ | ||
private <T> T retriableHttpGet(Function<String, String> urlMapFunction) { | ||
Exception fetchException = null; | ||
for (String jobStatusServer : jobStatusServers) { | ||
String requestUrl = urlMapFunction.apply(jobStatusServer); | ||
try { | ||
ObjectMapper objectMapper = new ObjectMapper(); | ||
return objectMapper.readValue(httpGet(requestUrl), new TypeReference<T>() {}); | ||
} catch (Exception e) { | ||
LOG.error(String.format("Exception when fetching tasks from the url : %s", requestUrl), e); | ||
fetchException = e; | ||
} | ||
} | ||
throw new SamzaException(String.format("Exception during http get from urls : %s", jobStatusServers), | ||
fetchException); | ||
} | ||
|
||
/** | ||
* This method initiates http get request on the request url and returns the | ||
* response returned from the http get. | ||
* @param requestUrl url on which the http get request has to be performed. | ||
* @return the input stream of the http get response. | ||
* @throws IOException if there are problems with the http get request. | ||
*/ | ||
private InputStream httpGet(String requestUrl) throws IOException { | ||
GetMethod getMethod = new GetMethod(requestUrl); | ||
try { | ||
int responseCode = httpClient.executeMethod(getMethod); | ||
LOG.debug("Received response code {} for the get request on the url : {}", responseCode, requestUrl); | ||
return getMethod.getResponseBodyAsStream(); | ||
} finally { | ||
getMethod.releaseConnection(); | ||
} | ||
} | ||
} |
157 changes: 157 additions & 0 deletions
157
samza-rest/src/main/java/org/apache/samza/monitor/LocalStoreMonitor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.samza.monitor; | ||
|
||
import com.google.common.base.Preconditions; | ||
import com.google.common.base.Strings; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.net.InetAddress; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import org.apache.commons.io.filefilter.DirectoryFileFilter; | ||
import org.apache.commons.io.FileUtils; | ||
import org.apache.samza.container.TaskName; | ||
import org.apache.samza.metrics.MetricsRegistry; | ||
import org.apache.samza.rest.model.JobStatus; | ||
import org.apache.samza.rest.model.Task; | ||
import org.apache.samza.rest.proxy.job.JobInstance; | ||
import org.apache.samza.storage.TaskStorageManager; | ||
import org.apache.samza.util.Clock; | ||
import org.apache.samza.util.SystemClock; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
||
/** | ||
* This monitor class periodically checks for the presence | ||
* of stale store directories and deletes them if the | ||
* samza task that created it is not running | ||
* for more than X days. | ||
*/ | ||
public class LocalStoreMonitor implements Monitor { | ||
private static final Clock CLOCK = SystemClock.instance(); | ||
|
||
private static final Logger LOG = LoggerFactory.getLogger(LocalStoreMonitor.class); | ||
|
||
private static final String OFFSET_FILE_NAME = "OFFSET"; | ||
|
||
private final JobsClient jobsClient; | ||
|
||
private final LocalStoreMonitorConfig config; | ||
|
||
// MetricsRegistry should be used in the future to send metrics from this monitor. | ||
// Metrics from the monitor is a way to know if the monitor is alive. | ||
public LocalStoreMonitor(LocalStoreMonitorConfig config, | ||
MetricsRegistry metricsRegistry, | ||
JobsClient jobsClient) { | ||
Preconditions.checkState(!Strings.isNullOrEmpty(config.getLocalStoreBaseDir()), | ||
String.format("%s is not set in config.", LocalStoreMonitorConfig.CONFIG_LOCAL_STORE_DIR)); | ||
this.config = config; | ||
this.jobsClient = jobsClient; | ||
} | ||
|
||
/** | ||
* This monitor method is invoked periodically to delete the stale state stores | ||
* of dead jobs/tasks. | ||
* @throws Exception if there was any problem in running the monitor. | ||
*/ | ||
@Override | ||
public void monitor() throws Exception { | ||
File localStoreDir = new File(config.getLocalStoreBaseDir()); | ||
Preconditions.checkState(localStoreDir.isDirectory(), | ||
String.format("LocalStoreDir: %s is not a directory", localStoreDir.getAbsolutePath())); | ||
String localHostName = InetAddress.getLocalHost().getHostName(); | ||
for (JobInstance jobInstance : getHostAffinityEnabledJobs(localStoreDir)) { | ||
File jobDir = new File(localStoreDir, | ||
String.format("%s-%s", jobInstance.getJobName(), jobInstance.getJobId())); | ||
Preconditions.checkState(jobDir.exists(), "JobDir is null"); | ||
Preconditions.checkNotNull(jobDir , "JobDir is null"); | ||
JobStatus jobStatus = jobsClient.getJobStatus(jobInstance); | ||
for (Task task : jobsClient.getTasks(jobInstance)) { | ||
for (String storeName : jobDir.list(DirectoryFileFilter.DIRECTORY)) { | ||
LOG.info("Job: {} has the running status: {} with preferred host: {}", jobInstance, jobStatus, task.getPreferredHost()); | ||
/** | ||
* A task store is active if all of the following conditions are true: | ||
* a) If the store is amongst the active stores of the task. | ||
* b) If the job has been started. | ||
* c) If the preferred host of the task is the localhost on which the monitor is run. | ||
*/ | ||
if (jobStatus.hasBeenStarted() | ||
&& task.getStoreNames().contains(storeName) | ||
&& task.getPreferredHost().equals(localHostName)) { | ||
LOG.info(String.format("Store %s is actively used by the task: %s.", storeName, task.getTaskName())); | ||
} else { | ||
LOG.info(String.format("Store %s not used by the task: %s.", storeName, task.getTaskName())); | ||
markSweepTaskStore(TaskStorageManager.getStorePartitionDir(jobDir, storeName, new TaskName(task.getTaskName()))); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Helper method to find and return the list of host affinity enabled jobs on this NM. | ||
* @param localStoreDirFile the location in which all stores of host affinity enabled jobs are persisted. | ||
* @return the list of the host affinity enabled jobs that are installed on this NM. | ||
*/ | ||
private static List<JobInstance> getHostAffinityEnabledJobs(File localStoreDirFile) { | ||
List<JobInstance> jobInstances = new ArrayList<>(); | ||
for (File jobStore : localStoreDirFile.listFiles(File::isDirectory)) { | ||
// Name of the jobStore(jobStorePath) is of the form : ${job.name}-${job.id}. | ||
String jobStorePath = jobStore.getName(); | ||
int indexSeparator = jobStorePath.lastIndexOf("-"); | ||
if (indexSeparator != -1) { | ||
jobInstances.add(new JobInstance(jobStorePath.substring(0, indexSeparator), | ||
jobStorePath.substring(indexSeparator + 1))); | ||
} | ||
} | ||
return jobInstances; | ||
} | ||
|
||
/** | ||
* Role of this method is to garbage collect(mark-sweep) the task store. | ||
* @param taskStoreDir store directory of the task to perform garbage collection. | ||
* | ||
* This method cleans up each of the task store directory in two phases. | ||
* | ||
* Phase 1: | ||
* Delete the offset file in the task store if (curTime - lastModifiedTimeOfOffsetFile) > offsetTTL. | ||
* | ||
* Phase 2: | ||
* Delete the task store directory if the offsetFile does not exist in task store directory. | ||
* | ||
* Time interval between the two phases is controlled by this monitor scheduling | ||
* interval in milli seconds. | ||
* @throws IOException if there is an exception during the clean up of the task store files. | ||
*/ | ||
private void markSweepTaskStore(File taskStoreDir) throws IOException { | ||
String taskStorePath = taskStoreDir.getAbsolutePath(); | ||
File offsetFile = new File(taskStoreDir, OFFSET_FILE_NAME); | ||
if (!offsetFile.exists()) { | ||
LOG.info("Deleting the task store : {}, since it has no offset file.", taskStorePath); | ||
FileUtils.deleteDirectory(taskStoreDir); | ||
} else if ((CLOCK.currentTimeMillis() - offsetFile.lastModified()) >= config.getOffsetFileTTL()) { | ||
LOG.info("Deleting the offset file from the store : {}, since the last modified timestamp : {} " | ||
+ "of the offset file is older than config file ttl : {}.", | ||
taskStorePath, offsetFile.lastModified(), config.getOffsetFileTTL()); | ||
offsetFile.delete(); | ||
} | ||
} | ||
} |
Oops, something went wrong.