private void verifyJobOutput() throws IOException {
  final String _SUCCESS = "_SUCCESS";
  final String REDUCER_OUTPUT = "part-r-";
  boolean wasSuccessful = false;
  boolean reducerOutputExists = false;
  FileSystem fs = FileSystem.getLocal(new Configuration());
  RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(OUTPUT_PATH), false);
  while (iterator.hasNext()) {
    LocatedFileStatus fileStatus = iterator.next();
    String fileName = fileStatus.getPath().getName();
    if (fileName.contains(_SUCCESS)) {
      wasSuccessful = true;
    }
    if (fileName.contains(REDUCER_OUTPUT)) {
      reducerOutputExists = true;
    }
  }
  // verify presence of the _SUCCESS marker file
  Assert.assertTrue(wasSuccessful);
  // verify presence of reducer output (part-r-*)
  Assert.assertTrue(reducerOutputExists);
}
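// Hedged usage sketch (not from the source): how the helper above might be driven from a
// JUnit test. The "job" variable and its configuration are illustrative assumptions; only
// OUTPUT_PATH and verifyJobOutput() come from the snippet above.
@Test
public void testJobProducesExpectedOutput() throws Exception {
  // Run the (hypothetical) local MapReduce job writing to OUTPUT_PATH, then verify its output.
  Assert.assertTrue(job.waitForCompletion(true));
  verifyJobOutput(); // checks for the _SUCCESS marker and part-r-* reducer files
}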
private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset)
    throws IOException {
  RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
  Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
  // NOTE: concatenation order follows the order in which listFiles returns the part-* files.
  while (rit.hasNext()) {
    Path path = rit.next().getPath();
    String filename = path.getName();
    if (filename.startsWith("part-")) {
      long filesize = fs.getFileStatus(path).getLen();
      if (offset < filesize) {
        FSDataInputStream handle = fs.open(path);
        if (offset > 0) {
          handle.seek(offset);
        }
        fileHandleList.add(handle);
      }
      offset -= filesize;
    }
  }
  if (fileHandleList.size() == 1) {
    return fileHandleList.get(0);
  } else if (fileHandleList.size() > 1) {
    Enumeration<FSDataInputStream> enu = fileHandleList.elements();
    return new SequenceInputStream(enu);
  } else {
    System.err.println("Error: no source file loaded. Run genSeedDataset.sh first!");
    return null;
  }
}
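// Hedged usage sketch (not from the source): read data that may start inside one part-* file
// and continue into the next. The directory and offset are illustrative assumptions.
private void readFromOffsetExample(FileSystem fs) throws IOException {
  Path seedDir = new Path("/tmp/seed-dataset"); // hypothetical job output directory
  try (InputStream in = OpenMultiplePartsWithOffset(fs, seedDir, 1024L)) {
    if (in == null) {
      return; // no part files found
    }
    byte[] buffer = new byte[4096];
    int read = in.read(buffer); // SequenceInputStream crosses part-file boundaries transparently
    System.out.println("Read " + read + " bytes starting at offset 1024");
  }
}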
protected Iterable<Path> listFiles(FileSystem fs, Path basePath) throws IOException {
  List<Path> ret = new ArrayList<>();
  RemoteIterator<LocatedFileStatus> filesIt = fs.listFiles(basePath, true);
  while (filesIt.hasNext()) {
    ret.add(filesIt.next().getPath());
  }
  return ret;
}
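// Hedged usage sketch (not from the source): print every file found recursively under a base
// path. The base path is an illustrative assumption.
private void printAllFiles(FileSystem fs) throws IOException {
  for (Path p : listFiles(fs, new Path("/data/input"))) {
    System.out.println(p);
  }
}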
/**
 * Processes the input file/folder argument. If the input is a file, it is directly considered
 * for further processing by TraceBuilder. If the input is a folder, all the history logs in
 * the input folder are considered for further processing.
 *
 * <p>If isRecursive is true, the input path is recursively scanned for job history logs for
 * further processing by TraceBuilder.
 *
 * <p>NOTE: If the input represents a globbed path, it is first flattened and then the
 * individual paths represented by the globbed input path are considered for further processing.
 *
 * @param input input path, possibly globbed
 * @param conf configuration
 * @param isRecursive whether to recursively traverse the input paths to find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException if the input path does not exist
 * @throws IOException if the file system cannot be accessed
 */
static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
    throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {
      // Find the list of files in this path (recursively if the -recursive option is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();
        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file paths in this directory to the inputPaths list.
        FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }

  return inputPaths;
}
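// Hedged usage sketch (not from the source): expand a globbed history-log location into the
// individual log paths that TraceBuilder would then process. The glob is an illustrative
// assumption.
static void printHistoryLogs(Configuration conf) throws IOException {
  List<Path> historyLogs = processInputArgument("/mr-history/done/2015/*/*", conf, true);
  for (Path log : historyLogs) {
    System.out.println("history log: " + log);
  }
}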
private List<byte[]> readResults(Path outputPath, Configuration config, FileSystem fs)
    throws IOException {
  List<byte[]> ret = new ArrayList<>();
  for (RemoteIterator<LocatedFileStatus> it = fs.listFiles(outputPath, false); it.hasNext(); ) {
    Path p = it.next().getPath();
    if (p.getName().equals("_SUCCESS")) {
      fs.delete(p, false);
      continue;
    }
    // Read every (key, value) record and keep a copy of the value bytes; the reader is closed
    // even if reading fails.
    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(config, SequenceFile.Reader.file(p))) {
      LongWritable key = new LongWritable();
      BytesWritable value = new BytesWritable();
      while (reader.next(key, value)) {
        ret.add(value.copyBytes());
      }
    }
    fs.delete(p, false);
  }
  fs.delete(outputPath, false);
  if (LOG.isDebugEnabled()) {
    LOG.debug(outputPath + ": Returning " + ret.size());
  }
  return ret;
}
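// Hedged usage sketch (not from the source): collect the reducer output written as
// SequenceFile records of (LongWritable, BytesWritable). The output path is an illustrative
// assumption; note that readResults deletes the output files as it reads them.
private void consumeJobOutput(Configuration conf) throws IOException {
  FileSystem fs = FileSystem.get(conf);
  List<byte[]> payloads = readResults(new Path("/tmp/job-output"), conf, fs);
  System.out.println("Recovered " + payloads.size() + " records");
}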
/**
 * This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to produce
 * rolling files. The clock of DateTimeBucketer is set to {@link ModifyableClock} to keep the
 * time in lockstep with the processing of elements using latches.
 */
@Test
public void testDateTimeRollingStringWriter() throws Exception {
  final int NUM_ELEMENTS = 20;
  final int PARALLELISM = 2;
  final String outPath = hdfsURI + "/rolling-out";

  DateTimeBucketer.setClock(new ModifyableClock());
  ModifyableClock.setCurrentTime(0);

  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setParallelism(PARALLELISM);

  DataStream<Tuple2<Integer, String>> source =
      env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();

  // The parallel flatMap is chained to the sink, so when it has seen 5 elements it can
  // fire the latch.
  DataStream<String> mapped =
      source.flatMap(
          new RichFlatMapFunction<Tuple2<Integer, String>, String>() {
            private static final long serialVersionUID = 1L;

            int count = 0;

            @Override
            public void flatMap(Tuple2<Integer, String> value, Collector<String> out)
                throws Exception {
              out.collect(value.f1);
              count++;
              if (count >= 5) {
                if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                  latch1.trigger();
                } else {
                  latch2.trigger();
                }
                count = 0;
              }
            }
          });

  RollingSink<String> sink =
      new RollingSink<String>(outPath)
          .setBucketer(new DateTimeBucketer("ss"))
          .setPartPrefix("part")
          .setPendingPrefix("")
          .setPendingSuffix("");

  mapped.addSink(sink);

  env.execute("RollingSink String Write Test");

  RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

  // We should have 8 rolling files: 4 time intervals with a parallelism of 2.
  int numFiles = 0;
  while (files.hasNext()) {
    LocatedFileStatus file = files.next();
    numFiles++;
    if (file.getPath().toString().contains("rolling-out/00")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 0; i < 5; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/05")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 5; i < 10; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/10")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 10; i < 15; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/15")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 15; i < 20; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else {
      Assert.fail("File " + file + " does not match any expected roll pattern.");
    }
  }

  Assert.assertEquals(8, numFiles);
}
public List<DataSegment> run() throws IOException {
  final JobConf jobConf = new JobConf();
  jobConf.setKeepFailedTaskFiles(false);
  for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
    jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
  }
  final List<DataSegment> segments = converterConfig.getSegments();
  if (segments.isEmpty()) {
    throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
  }
  converterConfigIntoConfiguration(converterConfig, segments, jobConf);

  jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format.
  jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

  setJobName(jobConf, segments);

  if (converterConfig.getJobPriority() != null) {
    jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
  }

  final Job job = Job.getInstance(jobConf);

  job.setInputFormatClass(ConfigInputFormat.class);
  job.setMapperClass(ConvertingMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapSpeculativeExecution(false);
  job.setOutputFormatClass(ConvertingOutputFormat.class);

  JobHelper.setupClasspath(
      JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
      JobHelper.distributedClassPath(
          getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
      job);

  Throwable throwable = null;
  try {
    job.submit();
    log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
    final boolean success = job.waitForCompletion(true);
    if (!success) {
      final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
      if (reports != null) {
        for (final TaskReport report : reports) {
          log.error(
              "Error in task [%s] : %s",
              report.getTaskId(),
              Arrays.toString(report.getDiagnostics()));
        }
      }
      return null;
    }
    try {
      loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
      writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
    } catch (IOException ex) {
      log.error(ex, "Could not fetch counters");
    }
    final JobID jobID = job.getJobID();

    final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
    final List<Path> goodPaths = new ArrayList<>();
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      if (locatedFileStatus.isFile()) {
        final Path myPath = locatedFileStatus.getPath();
        if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
          goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
        }
      }
    }
    if (goodPaths.isEmpty()) {
      log.warn("No good data found at [%s]", jobDir);
      return null;
    }
    final List<DataSegment> returnList =
        ImmutableList.copyOf(
            Lists.transform(
                goodPaths,
                new Function<Path, DataSegment>() {
                  @Nullable
                  @Override
                  public DataSegment apply(final Path input) {
                    try {
                      if (!fs.exists(input)) {
                        throw new ISE(
                            "Somehow [%s] was found but [%s] is missing at [%s]",
                            ConvertingOutputFormat.DATA_SUCCESS_KEY,
                            ConvertingOutputFormat.DATA_FILE_KEY,
                            jobDir);
                      }
                    } catch (final IOException e) {
                      throw Throwables.propagate(e);
                    }
                    try (final InputStream stream = fs.open(input)) {
                      return HadoopDruidConverterConfig.jsonMapper.readValue(
                          stream, DataSegment.class);
                    } catch (final IOException e) {
                      throw Throwables.propagate(e);
                    }
                  }
                }));
    if (returnList.size() == segments.size()) {
      return returnList;
    } else {
      throw new ISE(
          "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
          segments.size(),
          returnList.size(),
          jobDir);
    }
  } catch (InterruptedException | ClassNotFoundException e) {
    RuntimeException exception = Throwables.propagate(e);
    throwable = exception;
    throw exception;
  } catch (Throwable t) {
    throwable = t;
    throw t;
  } finally {
    try {
      cleanup(job);
    } catch (IOException e) {
      if (throwable != null) {
        throwable.addSuppressed(e);
      } else {
        log.error(e, "Could not clean up job [%s]", job.getJobID());
      }
    }
  }
}
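// Hedged usage sketch (not from the source): submit the conversion job and inspect the
// resulting segments. Construction of converterConfig is elided because its parameters are
// not shown above; the calling method name is an illustrative assumption.
private void runConversion() throws IOException {
  final List<DataSegment> converted = run();
  if (converted == null) {
    log.warn("Conversion job failed or produced no usable output");
  } else {
    log.info("Converted %d segments", converted.size());
  }
}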