// TODO: Replace this whenever hadoop gets their act together and stops breaking with more recent
// versions of Guava
public static long unzipNoGuava(
    final Path zip,
    final Configuration configuration,
    final File outDir,
    final Progressable progressable
) throws IOException
{
  final DataPusher zipPusher = (DataPusher) RetryProxy.create(
      DataPusher.class, new DataPusher()
      {
        @Override
        public long push() throws IOException
        {
          try {
            final FileSystem fileSystem = zip.getFileSystem(configuration);
            long size = 0L;
            final byte[] buffer = new byte[1 << 13];
            progressable.progress();
            try (ZipInputStream in = new ZipInputStream(fileSystem.open(zip, 1 << 13))) {
              // Extract each entry directly under outDir; entries are assumed to be plain files.
              for (ZipEntry entry = in.getNextEntry(); entry != null; entry = in.getNextEntry()) {
                final String fileName = entry.getName();
                try (final OutputStream out = new BufferedOutputStream(
                    new FileOutputStream(outDir.getAbsolutePath() + File.separator + fileName),
                    1 << 13
                )) {
                  for (int len = in.read(buffer); len >= 0; len = in.read(buffer)) {
                    // Report progress on every read so the task is not killed for inactivity.
                    progressable.progress();
                    if (len == 0) {
                      continue;
                    }
                    size += len;
                    out.write(buffer, 0, len);
                  }
                  out.flush();
                }
              }
            }
            progressable.progress();
            return size;
          }
          catch (IOException | RuntimeException exception) {
            log.error(exception, "Exception in unzip retry loop");
            throw exception;
          }
        }
      },
      RetryPolicies.exponentialBackoffRetry(NUM_RETRIES, SECONDS_BETWEEN_RETRIES, TimeUnit.SECONDS)
  );
  return zipPusher.push();
}
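unzipNoGuava above, and writeSegmentDescriptor / serializeOutIndex further down, wrap their I/O in an anonymous DataPusher handed to Hadoop's RetryProxy so transient filesystem failures are retried with exponential backoff. The interface itself does not appear in this section; the following is a minimal sketch of its assumed shape, inferred purely from the push() overrides shown here:

// Sketch only: assumed shape of the retry callback used with RetryProxy above and below.
public interface DataPusher
{
  // Pushes data and returns the number of bytes written, or -1 when the size is not tracked.
  long push() throws IOException;
}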
public boolean execute(CommandInterpreter interpreter, CommandLine commandLine, boolean batchMode)
{
  final String localPathToDataDir = commandLine.getOptionValue('l');
  final String groupName = commandLine.getOptionValue('g');
  final String destDataDirId = commandLine.getOptionValue('d');
  final String jobIdString = commandLine.getOptionValue('j');
  final String partitionPatternString = commandLine.getOptionValue('p');
  final int partitionGroupNum = Integer.parseInt(commandLine.getOptionValue('n'));

  final String message =
      "Sending data to nodes (backgrounded):\n" +
      "\tlocalPathToDataDir=" + localPathToDataDir + "\n" +
      "\tgroupName=" + groupName + "\n" +
      "\tdestDataDirId=" + destDataDirId + "\n" +
      "\tjobIdString=" + jobIdString + "\n" +
      "\tpartitionPatternString=" + partitionPatternString + "\n" +
      "\tpartitionGroupNum=" + partitionGroupNum + "\n";
  interpreter.showMessage(message, batchMode);

  final String jobDirPostfix = DataPusher.getJobDirPostfix(jobIdString, destDataDirId);
  DataPusher.sendDataToNodes(
      console.getClusterDefinition(),
      groupName,
      jobDirPostfix,
      localPathToDataDir,
      Pattern.compile(partitionPatternString),
      partitionGroupNum,
      3,
      1
  );

  return true;
}
public static void writeSegmentDescriptor(
    final FileSystem outputFS,
    final DataSegment segment,
    final Path descriptorPath,
    final Progressable progressable
) throws IOException
{
  final DataPusher descriptorPusher = (DataPusher) RetryProxy.create(
      DataPusher.class, new DataPusher()
      {
        @Override
        public long push() throws IOException
        {
          try {
            progressable.progress();
            // Remove any descriptor left behind by a previous attempt before writing a fresh one.
            if (outputFS.exists(descriptorPath)) {
              if (!outputFS.delete(descriptorPath, false)) {
                throw new IOException(
                    String.format("Failed to delete descriptor at [%s]", descriptorPath)
                );
              }
            }
            try (final OutputStream descriptorOut = outputFS.create(
                descriptorPath,
                true,
                DEFAULT_FS_BUFFER_SIZE,
                progressable
            )) {
              HadoopDruidIndexerConfig.jsonMapper.writeValue(descriptorOut, segment);
              descriptorOut.flush();
            }
          }
          catch (RuntimeException | IOException ex) {
            log.info(ex, "Exception in descriptor pusher retry loop");
            throw ex;
          }
          return -1;
        }
      },
      RetryPolicies.exponentialBackoffRetry(NUM_RETRIES, SECONDS_BETWEEN_RETRIES, TimeUnit.SECONDS)
  );
  descriptorPusher.push();
}
public static DataSegment serializeOutIndex(
    final DataSegment segmentTemplate,
    final Configuration configuration,
    final Progressable progressable,
    final TaskAttemptID taskAttemptID,
    final File mergedBase,
    final Path segmentBasePath
) throws IOException
{
  final FileSystem outputFS = FileSystem.get(segmentBasePath.toUri(), configuration);
  // Zip into a per-attempt temporary file first, then rename it to index.zip below.
  final Path tmpPath = new Path(segmentBasePath, String.format("index.zip.%d", taskAttemptID.getId()));
  final AtomicLong size = new AtomicLong(0L);
  final DataPusher zipPusher = (DataPusher) RetryProxy.create(
      DataPusher.class, new DataPusher()
      {
        @Override
        public long push() throws IOException
        {
          try (OutputStream outputStream = outputFS.create(tmpPath, true, DEFAULT_FS_BUFFER_SIZE, progressable)) {
            size.set(zipAndCopyDir(mergedBase, outputStream, progressable));
            outputStream.flush();
          }
          catch (IOException | RuntimeException exception) {
            log.error(exception, "Exception in retry loop");
            throw exception;
          }
          return -1;
        }
      },
      RetryPolicies.exponentialBackoffRetry(NUM_RETRIES, SECONDS_BETWEEN_RETRIES, TimeUnit.SECONDS)
  );
  zipPusher.push();
  log.info("Zipped %,d bytes to [%s]", size.get(), tmpPath.toUri());

  final Path finalIndexZipFilePath = new Path(segmentBasePath, "index.zip");
  final URI indexOutURI = finalIndexZipFilePath.toUri();
  final ImmutableMap<String, Object> loadSpec;
  // Build the segment loadSpec that matches the output filesystem scheme.
  // TODO: Make this a part of Pushers or Pullers
  switch (outputFS.getScheme()) {
    case "hdfs":
      loadSpec = ImmutableMap.<String, Object>of("type", "hdfs", "path", indexOutURI.toString());
      break;
    case "s3":
    case "s3n":
      loadSpec = ImmutableMap.<String, Object>of(
          "type", "s3_zip",
          "bucket", indexOutURI.getHost(),
          "key", indexOutURI.getPath().substring(1) // remove the leading "/"
      );
      break;
    case "file":
      loadSpec = ImmutableMap.<String, Object>of("type", "local", "path", indexOutURI.getPath());
      break;
    default:
      throw new IAE("Unknown file system scheme [%s]", outputFS.getScheme());
  }

  final DataSegment finalSegment = segmentTemplate
      .withLoadSpec(loadSpec)
      .withSize(size.get())
      .withBinaryVersion(SegmentUtils.getVersionFromDir(mergedBase));

  if (!renameIndexFiles(outputFS, tmpPath, finalIndexZipFilePath)) {
    throw new IOException(
        String.format(
            "Unable to rename [%s] to [%s]",
            tmpPath.toUri().toString(),
            finalIndexZipFilePath.toUri().toString()
        )
    );
  }

  writeSegmentDescriptor(outputFS, finalSegment, new Path(segmentBasePath, "descriptor.json"), progressable);

  return finalSegment;
}
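serializeOutIndex delegates the actual compression to zipAndCopyDir, which is not shown in this section. Below is a minimal sketch of what such a helper could look like, assuming the merged segment directory is flat and only java.util.zip is used; the name matches the call above, but the body is an assumption, not the project's implementation:

// Sketch only: assumed implementation of the zipAndCopyDir helper called above.
public static long zipAndCopyDir(File baseDir, OutputStream baseOutputStream, Progressable progressable)
    throws IOException
{
  long size = 0L;
  final byte[] buffer = new byte[1 << 13];
  final File[] files = baseDir.listFiles();
  if (files == null) {
    throw new IOException(String.format("Cannot list directory [%s]", baseDir));
  }
  final ZipOutputStream zipOut = new ZipOutputStream(baseOutputStream);
  for (File file : files) {
    if (!file.isFile()) {
      continue; // assumption: merged segment directories contain no subdirectories
    }
    zipOut.putNextEntry(new ZipEntry(file.getName()));
    try (InputStream in = new FileInputStream(file)) {
      for (int len = in.read(buffer); len >= 0; len = in.read(buffer)) {
        progressable.progress();
        if (len > 0) {
          size += len;
          zipOut.write(buffer, 0, len);
        }
      }
    }
    zipOut.closeEntry();
  }
  // finish() writes the zip central directory without closing the underlying stream,
  // so the caller's try-with-resources can still flush and close it.
  zipOut.finish();
  return size;
}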