Skip to content

Commit af78fd7

Browse files
committed
MAPREDUCE-5488. Changed MR client to keep trying to reach the application when it sees that on attempt's AM is down. Contributed by Jian He.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1524856 13f79535-47bb-0310-9956-ffa450edef68
1 parent 6b1f507 commit af78fd7

File tree

6 files changed

+91
-11
lines changed

6 files changed

+91
-11
lines changed

hadoop-mapreduce-project/CHANGES.txt

+3
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,9 @@ Release 2.2.0 - UNRELEASED
193193
MAPREDUCE-5504. mapred queue -info inconsistent with types (Kousuke Saruta
194194
via tgraves)
195195

196+
MAPREDUCE-5488. Changed MR client to keep trying to reach the application
197+
when it sees that on attempt's AM is down. (Jian He via vinodkv)
198+
196199
Release 2.1.1-beta - 2013-09-23
197200

198201
INCOMPATIBLE CHANGES

hadoop-mapreduce-project/dev-support/findbugs-exclude.xml

+6
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,12 @@
496496
<Field name="sslFileBufferSize" />
497497
<Bug pattern="IS2_INCONSISTENT_SYNC" />
498498
</Match>
499+
500+
<Match>
501+
<Class name="org.apache.hadoop.mapred.ClientServiceDelegate" />
502+
<Method name="invoke" />
503+
<Bug pattern="SWL_SLEEP_WITH_LOCK_HELD" />
504+
</Match>
499505

500506
<Match>
501507
<Class name="org.apache.hadoop.mapreduce.util.ProcessTree" />

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ public interface MRJobConfig {
357357
public static final int DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES = 3;
358358

359359
/**
360-
* The number of client retries to the RM/HS/AM before throwing exception.
360+
* The number of client retries to the RM/HS before throwing exception.
361361
*/
362362
public static final String MR_CLIENT_MAX_RETRIES =
363363
MR_PREFIX + "client.max-retries";

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml

+2-2
Original file line numberDiff line numberDiff line change
@@ -982,15 +982,15 @@
982982

983983
<property>
984984
<name>yarn.app.mapreduce.client-am.ipc.max-retries</name>
985-
<value>1</value>
985+
<value>3</value>
986986
<description>The number of client retries to the AM - before reconnecting
987987
to the RM to fetch Application Status.</description>
988988
</property>
989989

990990
<property>
991991
<name>yarn.app.mapreduce.client.max-retries</name>
992992
<value>3</value>
993-
<description>The number of client retries to the RM/HS/AM before
993+
<description>The number of client retries to the RM/HS before
994994
throwing exception. This is a layer above the ipc.</description>
995995
</property>
996996

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/main/java/org/apache/hadoop/mapred/ClientServiceDelegate.java

+37-8
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import java.util.EnumSet;
2727
import java.util.HashMap;
2828
import java.util.List;
29+
import java.util.concurrent.atomic.AtomicBoolean;
2930

3031
import org.apache.commons.lang.StringUtils;
3132
import org.apache.commons.logging.Log;
@@ -77,6 +78,8 @@
7778
import org.apache.hadoop.yarn.security.client.ClientToAMTokenIdentifier;
7879
import org.apache.hadoop.yarn.util.ConverterUtils;
7980

81+
import com.google.common.annotations.VisibleForTesting;
82+
8083
public class ClientServiceDelegate {
8184
private static final Log LOG = LogFactory.getLog(ClientServiceDelegate.class);
8285
private static final String UNAVAILABLE = "N/A";
@@ -93,7 +96,8 @@ public class ClientServiceDelegate {
9396
private RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);
9497
private static String UNKNOWN_USER = "Unknown User";
9598
private String trackingUrl;
96-
99+
private AtomicBoolean usingAMProxy = new AtomicBoolean(false);
100+
private int maxClientRetry;
97101
private boolean amAclDisabledStatusLogged = false;
98102

99103
public ClientServiceDelegate(Configuration conf, ResourceMgrDelegate rm,
@@ -287,6 +291,7 @@ MRClientProtocol instantiateAMProxy(final InetSocketAddress serviceAddr)
287291
MRClientProtocol proxy =
288292
(MRClientProtocol) rpc.getProxy(MRClientProtocol.class,
289293
serviceAddr, conf);
294+
usingAMProxy.set(true);
290295
LOG.trace("Connected to ApplicationMaster at: " + serviceAddr);
291296
return proxy;
292297
}
@@ -301,36 +306,60 @@ private synchronized Object invoke(String method, Class argClass,
301306
} catch (NoSuchMethodException e) {
302307
throw new YarnRuntimeException("Method name mismatch", e);
303308
}
304-
int maxRetries = this.conf.getInt(
309+
maxClientRetry = this.conf.getInt(
305310
MRJobConfig.MR_CLIENT_MAX_RETRIES,
306311
MRJobConfig.DEFAULT_MR_CLIENT_MAX_RETRIES);
307312
IOException lastException = null;
308-
while (maxRetries > 0) {
313+
while (maxClientRetry > 0) {
314+
MRClientProtocol MRClientProxy = null;
309315
try {
310-
return methodOb.invoke(getProxy(), args);
316+
MRClientProxy = getProxy();
317+
return methodOb.invoke(MRClientProxy, args);
311318
} catch (InvocationTargetException e) {
312319
// Will not throw out YarnException anymore
313320
LOG.debug("Failed to contact AM/History for job " + jobId +
314321
" retrying..", e.getTargetException());
315322
// Force reconnection by setting the proxy to null.
316323
realProxy = null;
317324
// HS/AMS shut down
318-
maxRetries--;
325+
// if it's AM shut down, do not decrement maxClientRetry as we wait for
326+
// AM to be restarted.
327+
if (!usingAMProxy.get()) {
328+
maxClientRetry--;
329+
}
330+
usingAMProxy.set(false);
319331
lastException = new IOException(e.getTargetException());
320-
332+
try {
333+
Thread.sleep(100);
334+
} catch (InterruptedException ie) {
335+
LOG.warn("ClientServiceDelegate invoke call interrupted", ie);
336+
throw new YarnRuntimeException(ie);
337+
}
321338
} catch (Exception e) {
322339
LOG.debug("Failed to contact AM/History for job " + jobId
323340
+ " Will retry..", e);
324341
// Force reconnection by setting the proxy to null.
325342
realProxy = null;
326343
// RM shutdown
327-
maxRetries--;
328-
lastException = new IOException(e.getMessage());
344+
maxClientRetry--;
345+
lastException = new IOException(e.getMessage());
346+
try {
347+
Thread.sleep(100);
348+
} catch (InterruptedException ie) {
349+
LOG.warn("ClientServiceDelegate invoke call interrupted", ie);
350+
throw new YarnRuntimeException(ie);
351+
}
329352
}
330353
}
331354
throw lastException;
332355
}
333356

357+
// Only for testing
358+
@VisibleForTesting
359+
public int getMaxClientRetry() {
360+
return this.maxClientRetry;
361+
}
362+
334363
public org.apache.hadoop.mapreduce.Counters getJobCounters(JobID arg0) throws IOException,
335364
InterruptedException {
336365
org.apache.hadoop.mapreduce.v2.api.records.JobId jobID = TypeConverter.toYarn(arg0);

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/src/test/java/org/apache/hadoop/mapred/TestClientServiceDelegate.java

+42
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,48 @@ public void testRetriesOnConnectionFailure() throws Exception {
140140
any(GetJobReportRequest.class));
141141
}
142142

143+
@Test
144+
public void testRetriesOnAMConnectionFailures() throws Exception {
145+
if (!isAMReachableFromClient) {
146+
return;
147+
}
148+
149+
ResourceMgrDelegate rm = mock(ResourceMgrDelegate.class);
150+
when(rm.getApplicationReport(TypeConverter.toYarn(oldJobId).getAppId()))
151+
.thenReturn(getRunningApplicationReport("am1", 78));
152+
153+
// throw exception in 1st, 2nd, 3rd and 4th call of getJobReport, and
154+
// succeed in the 5th call.
155+
final MRClientProtocol amProxy = mock(MRClientProtocol.class);
156+
when(amProxy.getJobReport(any(GetJobReportRequest.class)))
157+
.thenThrow(new RuntimeException("11"))
158+
.thenThrow(new RuntimeException("22"))
159+
.thenThrow(new RuntimeException("33"))
160+
.thenThrow(new RuntimeException("44")).thenReturn(getJobReportResponse());
161+
Configuration conf = new YarnConfiguration();
162+
conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME);
163+
conf.setBoolean(MRJobConfig.JOB_AM_ACCESS_DISABLED,
164+
!isAMReachableFromClient);
165+
ClientServiceDelegate clientServiceDelegate =
166+
new ClientServiceDelegate(conf, rm, oldJobId, null) {
167+
@Override
168+
MRClientProtocol instantiateAMProxy(
169+
final InetSocketAddress serviceAddr) throws IOException {
170+
super.instantiateAMProxy(serviceAddr);
171+
return amProxy;
172+
}
173+
};
174+
175+
JobStatus jobStatus = clientServiceDelegate.getJobStatus(oldJobId);
176+
177+
Assert.assertNotNull(jobStatus);
178+
// assert maxClientRetry is not decremented.
179+
Assert.assertEquals(conf.getInt(MRJobConfig.MR_CLIENT_MAX_RETRIES,
180+
MRJobConfig.DEFAULT_MR_CLIENT_MAX_RETRIES), clientServiceDelegate
181+
.getMaxClientRetry());
182+
verify(amProxy, times(5)).getJobReport(any(GetJobReportRequest.class));
183+
}
184+
143185
@Test
144186
public void testHistoryServerNotConfigured() throws Exception {
145187
//RM doesn't have app report and job History Server is not configured

0 commit comments

Comments
 (0)