/** * The crux of the matter... * * @param host {@link MapHost} from which we need to shuffle available map-outputs. */ @VisibleForTesting protected void copyFromHost(MapHost host) throws IOException { // Get completed maps on 'host' List<TaskAttemptID> maps = scheduler.getMapsForHost(host); // Sanity check to catch hosts with only 'OBSOLETE' maps, // especially at the tail of large jobs if (maps.size() == 0) { return; } if (LOG.isDebugEnabled()) { LOG.debug("Fetcher " + id + " going to fetch from " + host + " for: " + maps); } // List of maps to be fetched yet Set<TaskAttemptID> remaining = new HashSet<TaskAttemptID>(maps); // Construct the url and connect DataInputStream input = null; try { URL url = getMapOutputURL(host, maps); openConnection(url); if (stopped) { abortConnect(host, remaining); return; } // generate hash of the url String msgToEncode = SecureShuffleUtils.buildMsgFrom(url); String encHash = SecureShuffleUtils.hashFromString(msgToEncode, shuffleSecretKey); // put url hash into http header connection.addRequestProperty(SecureShuffleUtils.HTTP_HEADER_URL_HASH, encHash); // set the read timeout connection.setReadTimeout(readTimeout); // put shuffle version into http header connection.addRequestProperty( ShuffleHeader.HTTP_HEADER_NAME, ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); connection.addRequestProperty( ShuffleHeader.HTTP_HEADER_VERSION, ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); connect(connection, connectionTimeout); // verify that the thread wasn't stopped during calls to connect if (stopped) { abortConnect(host, remaining); return; } input = new DataInputStream(connection.getInputStream()); // Validate response code int rc = connection.getResponseCode(); if (rc != HttpURLConnection.HTTP_OK) { throw new IOException( "Got invalid response code " + rc + " from " + url + ": " + connection.getResponseMessage()); } // get the shuffle version if (!ShuffleHeader.DEFAULT_HTTP_HEADER_NAME.equals( connection.getHeaderField(ShuffleHeader.HTTP_HEADER_NAME)) 
|| !ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION.equals( connection.getHeaderField(ShuffleHeader.HTTP_HEADER_VERSION))) { throw new IOException("Incompatible shuffle response version"); } // get the replyHash which is HMac of the encHash we sent to the server String replyHash = connection.getHeaderField(SecureShuffleUtils.HTTP_HEADER_REPLY_URL_HASH); if (replyHash == null) { throw new IOException("security validation of TT Map output failed"); } LOG.debug("url=" + msgToEncode + ";encHash=" + encHash + ";replyHash=" + replyHash); // verify that replyHash is HMac of encHash SecureShuffleUtils.verifyReply(replyHash, encHash, shuffleSecretKey); LOG.info("for url=" + msgToEncode + " sent hash and received reply"); } catch (IOException ie) { boolean connectExcpt = ie instanceof ConnectException; ioErrs.increment(1); LOG.warn("Failed to connect to " + host + " with " + remaining.size() + " map outputs", ie); // If connect did not succeed, just mark all the maps as failed, // indirectly penalizing the host scheduler.hostFailed(host.getHostName()); for (TaskAttemptID left : remaining) { scheduler.copyFailed(left, host, false, connectExcpt); } // Add back all the remaining maps, WITHOUT marking them as failed for (TaskAttemptID left : remaining) { scheduler.putBackKnownMapOutput(host, left); } return; } try { // Loop through available map-outputs and fetch them // On any error, faildTasks is not null and we exit // after putting back the remaining maps to the // yet_to_be_fetched list and marking the failed tasks. 
TaskAttemptID[] failedTasks = null; while (!remaining.isEmpty() && failedTasks == null) { failedTasks = copyMapOutput(host, input, remaining); } if (failedTasks != null && failedTasks.length > 0) { LOG.warn("copyMapOutput failed for tasks " + Arrays.toString(failedTasks)); scheduler.hostFailed(host.getHostName()); for (TaskAttemptID left : failedTasks) { scheduler.copyFailed(left, host, true, false); } } // Sanity check if (failedTasks == null && !remaining.isEmpty()) { throw new IOException( "server didn't return all expected map outputs: " + remaining.size() + " left."); } input.close(); input = null; } finally { if (input != null) { IOUtils.cleanup(LOG, input); input = null; } for (TaskAttemptID left : remaining) { scheduler.putBackKnownMapOutput(host, left); } } }
/**
 * Abandons the current fetch attempt: every map still in {@code remaining}
 * is passed to the scheduler's {@code putBackKnownMapOutput} for the given
 * host, and then the connection is closed via {@code closeConnection()}.
 *
 * @param host      host the aborted fetch was directed at
 * @param remaining maps that were scheduled for this fetch and have not
 *                  been copied
 */
private void abortConnect(MapHost host, Set<TaskAttemptID> remaining) {
  for (TaskAttemptID pending : remaining) {
    scheduler.putBackKnownMapOutput(host, pending);
  }

  closeConnection();
}