@Override
public InputFormat get() {
  // Index the pre-listed file statuses by file name so that listStatus() below can
  // resolve input paths without touching the FileSystem again.
  final Map<String, LocatedFileStatus> locationMap = Maps.newHashMap();
  for (LocatedFileStatus status : locations) {
    locationMap.put(status.getPath().getName(), status);
  }

  return new TextInputFormat() {
    @Override
    protected boolean isSplitable(FileSystem fs, Path file) {
      return false;
    }

    @Override
    protected FileStatus[] listStatus(JobConf job) throws IOException {
      Path[] dirs = getInputPaths(job);
      if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
      }
      FileStatus[] status = new FileStatus[dirs.length];
      for (int i = 0; i < dirs.length; i++) {
        status[i] = locationMap.get(dirs[i].getName());
      }
      return status;
    }
  };
}
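The supplier above resolves input paths purely by file name against the precomputed location map, so the job must be handed exactly those paths as inputs. A minimal wiring sketch, assuming the old org.apache.hadoop.mapred.FileInputFormat API and the same locations collection; this setup is an illustration, not taken from the source:

// Hypothetical job wiring (assumption): register each pre-listed file as an input path
// so that listStatus() above can look it up by name in the location map.
JobConf job = new JobConf();
for (LocatedFileStatus status : locations) {
  FileInputFormat.addInputPath(job, status.getPath());
}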
private void doWalk(
    Path path, FileStatusCallback callback, AtomicLong taskCount, SettableFuture<Void> future) {
  try (SetThreadName ignored = new SetThreadName("HiveHdfsWalker")) {
    RemoteIterator<LocatedFileStatus> iterator = getLocatedFileStatusRemoteIterator(path);

    while (iterator.hasNext()) {
      LocatedFileStatus status = getLocatedFileStatus(iterator);

      // ignore hidden files. Hive ignores files starting with _ and . as well.
      String fileName = status.getPath().getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }

      if (isDirectory(status)) {
        recursiveWalk(status.getPath(), callback, taskCount, future);
      } else {
        callback.process(status, status.getBlockLocations());
      }

      if (future.isDone()) {
        return;
      }
    }
  } catch (FileNotFoundException e) {
    future.setException(new FileNotFoundException("Partition location does not exist: " + path));
  } catch (Throwable t) {
    future.setException(t);
  } finally {
    if (taskCount.decrementAndGet() == 0) {
      future.set(null);
    }
  }
}
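The FileStatusCallback type is not shown in this example. From the callback.process(status, status.getBlockLocations()) call above, a minimal shape consistent with that call (an assumption beyond what the call site implies) would be:

// Inferred shape of the callback used by doWalk (assumption): invoked once per visible
// file with the file's status and its block locations.
interface FileStatusCallback {
  void process(LocatedFileStatus status, BlockLocation[] blockLocations);
}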
private void verifyJobOutput() throws IOException {
  final String _SUCCESS = "_SUCCESS";
  final String REDUCER_OUTPUT = "part-r-";
  boolean wasSuccessful = false;
  boolean reducerOutputExists = false;

  FileSystem fs = FileSystem.getLocal(new Configuration());
  RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(OUTPUT_PATH), false);
  LocatedFileStatus fileStatus = null;
  String fileName = null;

  while (iterator.hasNext()) {
    fileStatus = iterator.next();
    fileName = fileStatus.getPath().getName();
    if (fileName.contains(_SUCCESS)) {
      wasSuccessful = true;
    }
    if (fileName.contains(REDUCER_OUTPUT)) {
      reducerOutputExists = true;
    }
  }

  // verify presence of _SUCCESS file
  Assert.assertEquals(wasSuccessful, true);
  // verify presence of Reducer output
  Assert.assertEquals(reducerOutputExists, true);
}
/**
 * Add files in the input path recursively into the results.
 *
 * @param result The List to store all files.
 * @param fs The FileSystem.
 * @param path The input path.
 * @param inputFilter The input filter that can be used to filter files/dirs.
 * @throws IOException
 */
protected void addInputPathRecursively(
    List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter) throws IOException {
  RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
  while (iter.hasNext()) {
    LocatedFileStatus stat = iter.next();
    if (inputFilter.accept(stat.getPath())) {
      if (stat.isDirectory()) {
        addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
      } else {
        result.add(stat);
      }
    }
  }
}
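A minimal usage sketch for the helper above, assuming the standard org.apache.hadoop.fs types; the filter and method names here are hypothetical and not taken from the source:

// Hypothetical caller: collect every non-hidden file under rootDir using the recursive helper.
private static final PathFilter NON_HIDDEN_FILTER = new PathFilter() {
  @Override
  public boolean accept(Path p) {
    String name = p.getName();
    return !name.startsWith("_") && !name.startsWith(".");
  }
};

protected List<FileStatus> listAllFiles(FileSystem fs, Path rootDir) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  addInputPathRecursively(result, fs, rootDir, NON_HIDDEN_FILTER);
  return result;
}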
/**
 * Processes the input file/folder argument. If the input is a file, then it is directly
 * considered for further processing by TraceBuilder. If the input is a folder, then all the
 * history logs in the input folder are considered for further processing.
 *
 * <p>If isRecursive is true, then the input path is recursively scanned for job history logs
 * for further processing by TraceBuilder.
 *
 * <p>NOTE: If the input represents a globbed path, then it is first flattened and then the
 * individual paths represented by the globbed input path are considered for further processing.
 *
 * @param input input path, possibly globbed
 * @param conf configuration
 * @param isRecursive whether to recursively traverse the input paths to find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException
 * @throws IOException
 */
static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
    throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {
      // Find the list of files in this path (recursively if the -recursive option
      // is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();
        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file names in this path to the
        // inputPaths list
        FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }

  return inputPaths;
}
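The HistoryLogsComparator used above is not shown here. A hypothetical stand-in that orders the collected logs by modification time, oldest first, could look like the sketch below; the actual comparator in the source may sort by a different key.

// Hypothetical comparator (not the source's HistoryLogsComparator): orders FileStatus
// entries by modification time, oldest first.
static class ModificationTimeComparator implements Comparator<FileStatus> {
  @Override
  public int compare(FileStatus a, FileStatus b) {
    return Long.compare(a.getModificationTime(), b.getModificationTime());
  }
}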
private List<FileStatus> singleThreadedListStatus(
    JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (int i = 0; i < dirs.length; ++i) {
    Path p = dirs[i];
    FileSystem fs = p.getFileSystem(job.getConfiguration());
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat : matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }

  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
@Override
protected LocatedFileStatus computeNext() {
  try {
    if (remoteIterator == null) {
      remoteIterator = getLocatedFileStatusRemoteIterator(path);
    }

    while (remoteIterator.hasNext()) {
      LocatedFileStatus status = getLocatedFileStatus(remoteIterator);

      // ignore hidden files. Hive ignores files starting with _ and . as well.
      String fileName = status.getPath().getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }
      return status;
    }

    return endOfData();
  } catch (FileNotFoundException e) {
    throw new PrestoException(HIVE_FILE_NOT_FOUND, "Partition location does not exist: " + path);
  } catch (IOException e) {
    throw new PrestoException(HIVE_FILESYSTEM_ERROR, e);
  }
}
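This computeNext() implementation follows Guava's AbstractIterator contract (endOfData() marks exhaustion), so the surrounding class can be consumed as a plain Iterator. A hedged consumption sketch with assumed names:

// Hypothetical caller: drain an iterator built on the computeNext() above and collect
// the paths of all visible (non-hidden) files in the partition.
static List<Path> listVisibleFiles(Iterator<LocatedFileStatus> files) {
  List<Path> paths = new ArrayList<Path>();
  while (files.hasNext()) {
    paths.add(files.next().getPath());
  }
  return paths;
}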
/**
 * This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to produce rolling
 * files. The clock of DateTimeBucketer is set to {@link ModifyableClock} to keep the time in
 * lockstep with the processing of elements using latches.
 */
@Test
public void testDateTimeRollingStringWriter() throws Exception {
  final int NUM_ELEMENTS = 20;
  final int PARALLELISM = 2;
  final String outPath = hdfsURI + "/rolling-out";
  DateTimeBucketer.setClock(new ModifyableClock());
  ModifyableClock.setCurrentTime(0);

  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setParallelism(PARALLELISM);

  DataStream<Tuple2<Integer, String>> source =
      env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();

  // the parallel flatMap is chained to the sink, so when it has seen 5 elements it can
  // fire the latch
  DataStream<String> mapped =
      source.flatMap(
          new RichFlatMapFunction<Tuple2<Integer, String>, String>() {
            private static final long serialVersionUID = 1L;

            int count = 0;

            @Override
            public void flatMap(Tuple2<Integer, String> value, Collector<String> out)
                throws Exception {
              out.collect(value.f1);
              count++;
              if (count >= 5) {
                if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                  latch1.trigger();
                } else {
                  latch2.trigger();
                }
                count = 0;
              }
            }
          });

  RollingSink<String> sink =
      new RollingSink<String>(outPath)
          .setBucketer(new DateTimeBucketer("ss"))
          .setPartPrefix("part")
          .setPendingPrefix("")
          .setPendingSuffix("");

  mapped.addSink(sink);

  env.execute("RollingSink String Write Test");

  RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

  // we should have 8 rolling files, 4 time intervals and parallelism of 2
  int numFiles = 0;
  while (files.hasNext()) {
    LocatedFileStatus file = files.next();
    numFiles++;
    if (file.getPath().toString().contains("rolling-out/00")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 0; i < 5; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/05")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 5; i < 10; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/10")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 10; i < 15; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/15")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 15; i < 20; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else {
      Assert.fail("File " + file + " does not match any expected roll pattern.");
    }
  }

  Assert.assertEquals(8, numFiles);
}
public List<DataSegment> run() throws IOException {
  final JobConf jobConf = new JobConf();
  jobConf.setKeepFailedTaskFiles(false);
  for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
    jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
  }
  final List<DataSegment> segments = converterConfig.getSegments();
  if (segments.isEmpty()) {
    throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
  }
  converterConfigIntoConfiguration(converterConfig, segments, jobConf);

  jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
  jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

  setJobName(jobConf, segments);

  if (converterConfig.getJobPriority() != null) {
    jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
  }

  final Job job = Job.getInstance(jobConf);

  job.setInputFormatClass(ConfigInputFormat.class);
  job.setMapperClass(ConvertingMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapSpeculativeExecution(false);
  job.setOutputFormatClass(ConvertingOutputFormat.class);

  JobHelper.setupClasspath(
      JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
      JobHelper.distributedClassPath(
          getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
      job);

  Throwable throwable = null;
  try {
    job.submit();
    log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
    final boolean success = job.waitForCompletion(true);
    if (!success) {
      final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
      if (reports != null) {
        for (final TaskReport report : reports) {
          log.error(
              "Error in task [%s] : %s",
              report.getTaskId(), Arrays.toString(report.getDiagnostics()));
        }
      }
      return null;
    }
    try {
      loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
      writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
    } catch (IOException ex) {
      log.error(ex, "Could not fetch counters");
    }

    final JobID jobID = job.getJobID();
    final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
    final List<Path> goodPaths = new ArrayList<>();
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      if (locatedFileStatus.isFile()) {
        final Path myPath = locatedFileStatus.getPath();
        if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
          goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
        }
      }
    }
    if (goodPaths.isEmpty()) {
      log.warn("No good data found at [%s]", jobDir);
      return null;
    }

    final List<DataSegment> returnList =
        ImmutableList.copyOf(
            Lists.transform(
                goodPaths,
                new Function<Path, DataSegment>() {
                  @Nullable
                  @Override
                  public DataSegment apply(final Path input) {
                    try {
                      if (!fs.exists(input)) {
                        throw new ISE(
                            "Somehow [%s] was found but [%s] is missing at [%s]",
                            ConvertingOutputFormat.DATA_SUCCESS_KEY,
                            ConvertingOutputFormat.DATA_FILE_KEY,
                            jobDir);
                      }
                    } catch (final IOException e) {
                      throw Throwables.propagate(e);
                    }
                    try (final InputStream stream = fs.open(input)) {
                      return HadoopDruidConverterConfig.jsonMapper.readValue(
                          stream, DataSegment.class);
                    } catch (final IOException e) {
                      throw Throwables.propagate(e);
                    }
                  }
                }));

    if (returnList.size() == segments.size()) {
      return returnList;
    } else {
      throw new ISE(
          "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
          segments.size(), returnList.size(), jobDir);
    }
  } catch (InterruptedException | ClassNotFoundException e) {
    RuntimeException exception = Throwables.propagate(e);
    throwable = exception;
    throw exception;
  } catch (Throwable t) {
    throwable = t;
    throw t;
  } finally {
    try {
      cleanup(job);
    } catch (IOException e) {
      if (throwable != null) {
        throwable.addSuppressed(e);
      } else {
        log.error(e, "Could not clean up job [%s]", job.getJobID());
      }
    }
  }
}