// Opens the "part-*" files under the given directory as one concatenated stream,
// seeking to the requested offset across part-file boundaries.
private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset)
    throws IOException {
  RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
  Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
  while (rit.hasNext()) {
    Path path = rit.next().getPath();
    String filename = path.toString()
        .substring(path.getParent().toString().length(), path.toString().length());

    if (filename.startsWith("/part-")) {
      long filesize = fs.getFileStatus(path).getLen();
      if (offset < filesize) {
        FSDataInputStream handle = fs.open(path);
        if (offset > 0) {
          handle.seek(offset);
        }
        fileHandleList.add(handle);
      }
      offset -= filesize;
    }
  }

  if (fileHandleList.size() == 1) {
    return fileHandleList.get(0);
  } else if (fileHandleList.size() > 1) {
    Enumeration<FSDataInputStream> enu = fileHandleList.elements();
    return new SequenceInputStream(enu);
  } else {
    System.err.println("Error, no source file loaded. Run genSeedDataset.sh first!");
    return null;
  }
}
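A minimal usage sketch for the helper above, assuming it is reachable from the calling class. The directory path and offset are hypothetical, and Hadoop's org.apache.hadoop.io.IOUtils is used only to drain the stream for illustration.

void readFromPartsExample(Configuration conf) throws IOException {
  FileSystem fs = FileSystem.get(conf);
  Path partsDir = new Path("/user/example/seed-dataset"); // hypothetical directory of part-* files
  // Start reading 1 MiB into the logically concatenated part files.
  try (InputStream in = OpenMultiplePartsWithOffset(fs, partsDir, 1024L * 1024L)) {
    if (in != null) {
      IOUtils.copyBytes(in, System.out, 4096, false); // consume the combined stream
    }
  }
}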
@Override
public int run(Configuration conf, List<String> args) throws IOException {
  if (!args.isEmpty()) {
    System.err.println("Can't understand argument: " + args.get(0));
    return 1;
  }

  final DistributedFileSystem dfs = getDFS(conf);
  try {
    final TableListing listing = new TableListing.Builder()
        .addField("")
        .addField("", true)
        .wrapWidth(MAX_LINE_WIDTH)
        .hideHeaders()
        .build();
    final RemoteIterator<EncryptionZone> it = dfs.listEncryptionZones();
    while (it.hasNext()) {
      EncryptionZone ez = it.next();
      listing.addRow(ez.getPath(), ez.getKeyName());
    }
    System.out.println(listing.toString());
  } catch (IOException e) {
    System.err.println(prettifyException(e));
    return 2;
  }
  return 0;
}
private void verifyJobOutput() throws IOException {
  final String _SUCCESS = "_SUCCESS";
  final String REDUCER_OUTPUT = "part-r-";
  boolean wasSuccessful = false;
  boolean reducerOutputExists = false;

  FileSystem fs = FileSystem.getLocal(new Configuration());
  RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(OUTPUT_PATH), false);
  LocatedFileStatus fileStatus = null;
  String fileName = null;
  while (iterator.hasNext()) {
    fileStatus = iterator.next();
    fileName = fileStatus.getPath().getName();
    if (fileName.contains(_SUCCESS)) {
      wasSuccessful = true;
    }
    if (fileName.contains(REDUCER_OUTPUT)) {
      reducerOutputExists = true;
    }
  }

  // verify presence of _SUCCESS file
  Assert.assertEquals(wasSuccessful, true);

  // verify presence of Reducer output
  Assert.assertEquals(reducerOutputExists, true);
}
private void doWalk(
    Path path, FileStatusCallback callback, AtomicLong taskCount, SettableFuture<Void> future) {
  try (SetThreadName ignored = new SetThreadName("HiveHdfsWalker")) {
    RemoteIterator<LocatedFileStatus> iterator = getLocatedFileStatusRemoteIterator(path);

    while (iterator.hasNext()) {
      LocatedFileStatus status = getLocatedFileStatus(iterator);

      // ignore hidden files. Hive ignores files starting with _ and . as well.
      String fileName = status.getPath().getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }

      if (isDirectory(status)) {
        recursiveWalk(status.getPath(), callback, taskCount, future);
      } else {
        callback.process(status, status.getBlockLocations());
      }

      if (future.isDone()) {
        return;
      }
    }
  } catch (FileNotFoundException e) {
    future.setException(new FileNotFoundException("Partition location does not exist: " + path));
  } catch (Throwable t) {
    future.setException(t);
  } finally {
    if (taskCount.decrementAndGet() == 0) {
      future.set(null);
    }
  }
}
protected Iterable<Path> listFiles(FileSystem fs, Path basePath) throws IOException {
  List<Path> ret = new ArrayList<>();
  RemoteIterator<LocatedFileStatus> filesIt = fs.listFiles(basePath, true);
  while (filesIt.hasNext()) {
    ret.add(filesIt.next().getPath());
  }
  return ret;
}
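A usage sketch for the recursive listing helper above, assuming it is visible to the caller; the base path is a hypothetical placeholder.

void printAllFiles(Configuration conf) throws IOException {
  FileSystem fs = FileSystem.get(conf);
  // Recursively collect and print every file under the base directory.
  for (Path p : listFiles(fs, new Path("/data/input"))) { // hypothetical path
    System.out.println(p);
  }
}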
@Override
public FileStatus[] listStatus(Path path) throws IOException {
  List<LocatedFileStatus> list = new ArrayList<>();
  RemoteIterator<LocatedFileStatus> iterator = listLocatedStatus(path);
  while (iterator.hasNext()) {
    list.add(iterator.next());
  }
  return toArray(list, LocatedFileStatus.class);
}
private static List<FileStatus> remoteIterToList(RemoteIterator<FileStatus> rIter)
    throws IOException {
  List<FileStatus> fsList = new LinkedList<FileStatus>();
  if (rIter == null) {
    return fsList;
  }
  while (rIter.hasNext()) {
    fsList.add(rIter.next());
  }
  return fsList;
}
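A usage sketch, assuming the helper above is accessible: FileContext#listStatus only exposes a RemoteIterator, so this is a convenient way to materialize the listing into a java.util.List. The path is a hypothetical placeholder.

void listIntoList(Configuration conf) throws IOException {
  FileContext fc = FileContext.getFileContext(conf);
  RemoteIterator<FileStatus> it = fc.listStatus(new Path("/tmp")); // hypothetical path
  List<FileStatus> statuses = remoteIterToList(it);
  System.out.println("Found " + statuses.size() + " entries");
}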
private static List<FileStatus> scanDirectory(Path path, FileContext fc, PathFilter pathFilter)
    throws IOException {
  path = fc.makeQualified(path);
  List<FileStatus> jhStatusList = new ArrayList<FileStatus>();
  RemoteIterator<FileStatus> fileStatusIter = fc.listStatus(path);
  while (fileStatusIter.hasNext()) {
    FileStatus fileStatus = fileStatusIter.next();
    Path filePath = fileStatus.getPath();
    if (fileStatus.isFile() && pathFilter.accept(filePath)) {
      jhStatusList.add(fileStatus);
    }
  }
  return jhStatusList;
}
/**
 * Add files in the input path recursively into the results.
 *
 * @param result The List to store all files.
 * @param fs The FileSystem.
 * @param path The input path.
 * @param inputFilter The input filter that can be used to filter files/dirs.
 * @throws IOException
 */
protected void addInputPathRecursively(
    List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
    throws IOException {
  RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
  while (iter.hasNext()) {
    LocatedFileStatus stat = iter.next();
    if (inputFilter.accept(stat.getPath())) {
      if (stat.isDirectory()) {
        addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
      } else {
        result.add(stat);
      }
    }
  }
}
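A usage sketch, assuming the method above is reachable from the caller: recurse from a root directory while skipping hidden entries. The filter and path are hypothetical; PathFilter is a single-method interface, so a Java 8 lambda works.

void collectVisibleFiles(Configuration conf) throws IOException {
  FileSystem fs = FileSystem.get(conf);
  // Skip files and directories that Hadoop/Hive conventionally treat as hidden.
  PathFilter noHidden = p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");
  List<FileStatus> result = new ArrayList<>();
  addInputPathRecursively(result, fs, new Path("/data/input"), noHidden); // hypothetical path
}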
@Test
public void testPurge() throws IOException, InterruptedException {
  FileSystem fileSystem = FileSystem.newInstance(new Configuration());
  testTransferWindowFiles();

  RemoteIterator<LocatedFileStatus> iterator =
      fileSystem.listLocatedStatus(new Path(testMeta.applicationPath + "/bucket_data"));
  Assert.assertTrue(iterator.hasNext());

  testMeta.managedStateContext.getBucketsFileSystem().deleteTimeBucketsLessThanEqualTo(200);

  iterator = fileSystem.listLocatedStatus(new Path(testMeta.applicationPath + "/bucket_data"));
  if (iterator.hasNext()) {
    Assert.fail("All buckets should be deleted");
  }
}
/**
 * Looks for the dirs to clean. The folder structure is YYYY/MM/DD/Serial so we can use that to
 * more efficiently find the directories to clean by comparing the cutoff timestamp with the
 * timestamp from the folder structure.
 *
 * @param fc done dir FileContext
 * @param root folder for completed jobs
 * @param cutoff The cutoff for the max history age
 * @return The list of directories for cleaning
 * @throws IOException
 */
public static List<FileStatus> getHistoryDirsForCleaning(FileContext fc, Path root, long cutoff)
    throws IOException {
  List<FileStatus> fsList = new ArrayList<FileStatus>();
  Calendar cCal = Calendar.getInstance();
  cCal.setTimeInMillis(cutoff);
  int cYear = cCal.get(Calendar.YEAR);
  int cMonth = cCal.get(Calendar.MONTH) + 1;
  int cDate = cCal.get(Calendar.DATE);

  RemoteIterator<FileStatus> yearDirIt = fc.listStatus(root);
  while (yearDirIt.hasNext()) {
    FileStatus yearDir = yearDirIt.next();
    try {
      int year = Integer.parseInt(yearDir.getPath().getName());
      if (year <= cYear) {
        RemoteIterator<FileStatus> monthDirIt = fc.listStatus(yearDir.getPath());
        while (monthDirIt.hasNext()) {
          FileStatus monthDir = monthDirIt.next();
          try {
            int month = Integer.parseInt(monthDir.getPath().getName());
            // If we only checked the month here, then something like 07/2013
            // would incorrectly not pass when the cutoff is 06/2014
            if (year < cYear || month <= cMonth) {
              RemoteIterator<FileStatus> dateDirIt = fc.listStatus(monthDir.getPath());
              while (dateDirIt.hasNext()) {
                FileStatus dateDir = dateDirIt.next();
                try {
                  int date = Integer.parseInt(dateDir.getPath().getName());
                  // If we only checked the date here, then something like
                  // 07/21/2013 would incorrectly not pass when the cutoff is
                  // 08/20/2013 or 07/20/2012
                  if (year < cYear || month < cMonth || date <= cDate) {
                    fsList.addAll(remoteIterToList(fc.listStatus(dateDir.getPath())));
                  }
                } catch (NumberFormatException nfe) {
                  // the directory didn't fit the format we're looking for so
                  // skip the dir
                }
              }
            }
          } catch (NumberFormatException nfe) {
            // the directory didn't fit the format we're looking for so skip
            // the dir
          }
        }
      }
    } catch (NumberFormatException nfe) {
      // the directory didn't fit the format we're looking for so skip the dir
    }
  }
  return fsList;
}
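A usage sketch for the cleaner above, assuming it is callable from the same class. The done-dir path and the 30-day retention window are illustrative assumptions, not values from the source.

void findOldHistoryDirs(Configuration conf) throws IOException {
  FileContext fc = FileContext.getFileContext(conf);
  long cutoff = System.currentTimeMillis() - 30L * 24 * 60 * 60 * 1000; // assumed 30-day max age
  List<FileStatus> toClean =
      getHistoryDirsForCleaning(fc, new Path("/mr-history/done"), cutoff); // hypothetical root
  System.out.println("Directories eligible for cleaning: " + toClean.size());
}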
/**
 * Processes the input file/folder argument. If the input is a file, then it is directly
 * considered for further processing by TraceBuilder. If the input is a folder, then all the
 * history logs in the input folder are considered for further processing.
 *
 * <p>If isRecursive is true, then the input path is recursively scanned for job history logs
 * for further processing by TraceBuilder.
 *
 * <p>NOTE: If the input represents a globbed path, then it is first flattened and then the
 * individual paths represented by the globbed input path are considered for further processing.
 *
 * @param input input path, possibly globbed
 * @param conf configuration
 * @param isRecursive whether to recursively traverse the input paths to find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException
 * @throws IOException
 */
static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
    throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {
      // Find list of files in this path (recursively if -recursive option
      // is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();
        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file names in this path to the
        // inputPaths list
        FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }

  return inputPaths;
}
private LocatedFileStatus getLocatedFileStatus(RemoteIterator<LocatedFileStatus> iterator)
    throws IOException {
  try (TimeStat.BlockTimer timer = namenodeStats.getRemoteIteratorNext().time()) {
    return iterator.next();
  } catch (IOException | RuntimeException e) {
    namenodeStats.getRemoteIteratorNext().recordException(e);
    throw e;
  }
}
/**
 * Gets the cache directive matching the given ID. Returns null if no matching directives were
 * found.
 */
private static CacheDirectiveEntry getDirective(long directiveId) throws ImpalaRuntimeException {
  LOG.trace("Getting cache directive id: " + directiveId);
  if (!(dfs instanceof DistributedFileSystem)) {
    LOG.trace(
        "Filesystem instance is not the distributed fs - for directive id \""
            + directiveId + "\".");
    return null;
  }
  CacheDirectiveInfo filter = new CacheDirectiveInfo.Builder().setId(directiveId).build();
  try {
    RemoteIterator<CacheDirectiveEntry> itr =
        ((DistributedFileSystem) dfs).listCacheDirectives(filter);
    if (itr.hasNext()) {
      return itr.next();
    }
  } catch (IOException e) {
    // Handle connection issues with e.g. HDFS and possible not found errors
    throw new ImpalaRuntimeException(e.getMessage(), e);
  }
  throw new ImpalaRuntimeException(
      "HDFS cache directive filter returned empty result. This must not happen");
}
public void run() {
  LOG.trace("Reloading cache pool names from HDFS");

  // Map of cache pool name to CachePoolInfo. Stored in a map to allow Set operations
  // to be performed on the keys.
  Map<String, CachePoolInfo> currentCachePools = Maps.newHashMap();
  try {
    DistributedFileSystem dfs = FileSystemUtil.getDistributedFileSystem();
    RemoteIterator<CachePoolEntry> itr = dfs.listCachePools();
    while (itr.hasNext()) {
      CachePoolInfo cachePoolInfo = itr.next().getInfo();
      currentCachePools.put(cachePoolInfo.getPoolName(), cachePoolInfo);
    }
  } catch (Exception e) {
    LOG.error("Error loading cache pools: ", e);
    return;
  }

  catalogLock_.writeLock().lock();
  try {
    // Determine what has changed relative to what we have cached.
    Set<String> droppedCachePoolNames =
        Sets.difference(hdfsCachePools_.keySet(), currentCachePools.keySet());
    Set<String> createdCachePoolNames =
        Sets.difference(currentCachePools.keySet(), hdfsCachePools_.keySet());

    // Add all new cache pools.
    for (String createdCachePool : createdCachePoolNames) {
      HdfsCachePool cachePool = new HdfsCachePool(currentCachePools.get(createdCachePool));
      cachePool.setCatalogVersion(CatalogServiceCatalog.this.incrementAndGetCatalogVersion());
      hdfsCachePools_.add(cachePool);
    }

    // Remove dropped cache pools.
    for (String cachePoolName : droppedCachePoolNames) {
      hdfsCachePools_.remove(cachePoolName);
      CatalogServiceCatalog.this.incrementAndGetCatalogVersion();
    }
  } finally {
    catalogLock_.writeLock().unlock();
  }
}
private List<FileStatus> singleThreadedListStatus(
    JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (int i = 0; i < dirs.length; ++i) {
    Path p = dirs[i];
    FileSystem fs = p.getFileSystem(job.getConfiguration());
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat : matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }

  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
// Reads the BytesWritable values out of the job's sequence-file output,
// deleting each file (and finally the output directory) as it is consumed.
private List<byte[]> readResults(Path outputPath, Configuration config, FileSystem fs)
    throws IOException {
  List<byte[]> ret = new ArrayList<>();
  for (RemoteIterator<LocatedFileStatus> it = fs.listFiles(outputPath, false); it.hasNext(); ) {
    Path p = it.next().getPath();
    if (p.getName().equals("_SUCCESS")) {
      fs.delete(p, false);
      continue;
    }
    SequenceFile.Reader reader = new SequenceFile.Reader(config, SequenceFile.Reader.file(p));
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();
    while (reader.next(key, value)) {
      ret.add(value.copyBytes());
    }
    reader.close();
    fs.delete(p, false);
  }
  fs.delete(outputPath, false);
  if (LOG.isDebugEnabled()) {
    LOG.debug(outputPath + ": Returning " + ret.size());
  }
  return ret;
}
@Override
protected LocatedFileStatus computeNext() {
  try {
    if (remoteIterator == null) {
      remoteIterator = getLocatedFileStatusRemoteIterator(path);
    }

    while (remoteIterator.hasNext()) {
      LocatedFileStatus status = getLocatedFileStatus(remoteIterator);

      // ignore hidden files. Hive ignores files starting with _ and . as well.
      String fileName = status.getPath().getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }
      return status;
    }

    return endOfData();
  } catch (FileNotFoundException e) {
    throw new PrestoException(HIVE_FILE_NOT_FOUND, "Partition location does not exist: " + path);
  } catch (IOException e) {
    throw new PrestoException(HIVE_FILESYSTEM_ERROR, e);
  }
}
@Override
protected void render(Block html) {
  ContainerId containerId = verifyAndGetContainerId(html);
  NodeId nodeId = verifyAndGetNodeId(html);
  String appOwner = verifyAndGetAppOwner(html);
  LogLimits logLimits = verifyAndGetLogLimits(html);
  if (containerId == null || nodeId == null || appOwner == null
      || appOwner.isEmpty() || logLimits == null) {
    return;
  }

  ApplicationId applicationId = containerId.getApplicationAttemptId().getApplicationId();
  String logEntity = $(ENTITY_STRING);
  if (logEntity == null || logEntity.isEmpty()) {
    logEntity = containerId.toString();
  }

  if (!conf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED,
      YarnConfiguration.DEFAULT_LOG_AGGREGATION_ENABLED)) {
    html.h1()
        ._("Aggregation is not enabled. Try the nodemanager at " + nodeId)
        ._();
    return;
  }

  Path remoteRootLogDir = new Path(conf.get(
      YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
      YarnConfiguration.DEFAULT_NM_REMOTE_APP_LOG_DIR));
  Path remoteAppDir = LogAggregationUtils.getRemoteAppLogDir(
      remoteRootLogDir, applicationId, appOwner,
      LogAggregationUtils.getRemoteNodeLogDirSuffix(conf));
  RemoteIterator<FileStatus> nodeFiles;
  try {
    Path qualifiedLogDir = FileContext.getFileContext(conf).makeQualified(remoteAppDir);
    nodeFiles = FileContext.getFileContext(qualifiedLogDir.toUri(), conf)
        .listStatus(remoteAppDir);
  } catch (FileNotFoundException fnf) {
    html.h1()
        ._("Logs not available for " + logEntity
            + ". Aggregation may not be complete, "
            + "Check back later or try the nodemanager at " + nodeId)._();
    return;
  } catch (Exception ex) {
    html.h1()._("Error getting logs at " + nodeId)._();
    return;
  }

  boolean foundLog = false;
  String desiredLogType = $(CONTAINER_LOG_TYPE);
  try {
    while (nodeFiles.hasNext()) {
      AggregatedLogFormat.LogReader reader = null;
      try {
        FileStatus thisNodeFile = nodeFiles.next();
        if (!thisNodeFile.getPath().getName()
                .contains(LogAggregationUtils.getNodeString(nodeId))
            || thisNodeFile.getPath().getName()
                .endsWith(LogAggregationUtils.TMP_FILE_SUFFIX)) {
          continue;
        }
        long logUploadedTime = thisNodeFile.getModificationTime();
        reader = new AggregatedLogFormat.LogReader(conf, thisNodeFile.getPath());

        String owner = null;
        Map<ApplicationAccessType, String> appAcls = null;
        try {
          owner = reader.getApplicationOwner();
          appAcls = reader.getApplicationAcls();
        } catch (IOException e) {
          LOG.error("Error getting logs for " + logEntity, e);
          continue;
        }
        ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf);
        aclsManager.addApplication(applicationId, appAcls);

        String remoteUser = request().getRemoteUser();
        UserGroupInformation callerUGI = null;
        if (remoteUser != null) {
          callerUGI = UserGroupInformation.createRemoteUser(remoteUser);
        }
        if (callerUGI != null && !aclsManager.checkAccess(callerUGI,
            ApplicationAccessType.VIEW_APP, owner, applicationId)) {
          html.h1()
              ._("User [" + remoteUser
                  + "] is not authorized to view the logs for " + logEntity
                  + " in log file [" + thisNodeFile.getPath().getName() + "]")._();
          LOG.error("User [" + remoteUser
              + "] is not authorized to view the logs for " + logEntity);
          continue;
        }

        AggregatedLogFormat.ContainerLogsReader logReader =
            reader.getContainerLogsReader(containerId);
        if (logReader == null) {
          continue;
        }

        foundLog = readContainerLogs(html, logReader, logLimits,
            desiredLogType, logUploadedTime);
      } catch (IOException ex) {
        LOG.error("Error getting logs for " + logEntity, ex);
        continue;
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
    if (!foundLog) {
      if (desiredLogType.isEmpty()) {
        html.h1("No logs available for container " + containerId.toString());
      } else {
        html.h1("Unable to locate '" + desiredLogType
            + "' log for container " + containerId.toString());
      }
    }
  } catch (IOException e) {
    html.h1()._("Error getting logs for " + logEntity)._();
    LOG.error("Error getting logs for " + logEntity, e);
  }
}
public List<DataSegment> run() throws IOException {
  final JobConf jobConf = new JobConf();
  jobConf.setKeepFailedTaskFiles(false);
  for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
    jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
  }
  final List<DataSegment> segments = converterConfig.getSegments();
  if (segments.isEmpty()) {
    throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
  }
  converterConfigIntoConfiguration(converterConfig, segments, jobConf);

  jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
  jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

  setJobName(jobConf, segments);

  if (converterConfig.getJobPriority() != null) {
    jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
  }

  final Job job = Job.getInstance(jobConf);

  job.setInputFormatClass(ConfigInputFormat.class);
  job.setMapperClass(ConvertingMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapSpeculativeExecution(false);
  job.setOutputFormatClass(ConvertingOutputFormat.class);

  JobHelper.setupClasspath(
      JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
      JobHelper.distributedClassPath(
          getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
      job);

  Throwable throwable = null;
  try {
    job.submit();
    log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
    final boolean success = job.waitForCompletion(true);
    if (!success) {
      final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
      if (reports != null) {
        for (final TaskReport report : reports) {
          log.error(
              "Error in task [%s] : %s",
              report.getTaskId(),
              Arrays.toString(report.getDiagnostics()));
        }
      }
      return null;
    }
    try {
      loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
      writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
    } catch (IOException ex) {
      log.error(ex, "Could not fetch counters");
    }
    final JobID jobID = job.getJobID();

    final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
    final List<Path> goodPaths = new ArrayList<>();
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      if (locatedFileStatus.isFile()) {
        final Path myPath = locatedFileStatus.getPath();
        if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
          goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
        }
      }
    }
    if (goodPaths.isEmpty()) {
      log.warn("No good data found at [%s]", jobDir);
      return null;
    }
    final List<DataSegment> returnList = ImmutableList.copyOf(
        Lists.transform(
            goodPaths,
            new Function<Path, DataSegment>() {
              @Nullable
              @Override
              public DataSegment apply(final Path input) {
                try {
                  if (!fs.exists(input)) {
                    throw new ISE(
                        "Somehow [%s] was found but [%s] is missing at [%s]",
                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                        ConvertingOutputFormat.DATA_FILE_KEY,
                        jobDir);
                  }
                } catch (final IOException e) {
                  throw Throwables.propagate(e);
                }
                try (final InputStream stream = fs.open(input)) {
                  return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                } catch (final IOException e) {
                  throw Throwables.propagate(e);
                }
              }
            }));
    if (returnList.size() == segments.size()) {
      return returnList;
    } else {
      throw new ISE(
          "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
          segments.size(),
          returnList.size(),
          jobDir);
    }
  } catch (InterruptedException | ClassNotFoundException e) {
    RuntimeException exception = Throwables.propagate(e);
    throwable = exception;
    throw exception;
  } catch (Throwable t) {
    throwable = t;
    throw t;
  } finally {
    try {
      cleanup(job);
    } catch (IOException e) {
      if (throwable != null) {
        throwable.addSuppressed(e);
      } else {
        log.error(e, "Could not clean up job [%s]", job.getJobID());
      }
    }
  }
}
/**
 * This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to produce rolling
 * files. The clock of DateTimeBucketer is set to {@link ModifyableClock} to keep the time in
 * lockstep with the processing of elements using latches.
 */
@Test
public void testDateTimeRollingStringWriter() throws Exception {
  final int NUM_ELEMENTS = 20;
  final int PARALLELISM = 2;
  final String outPath = hdfsURI + "/rolling-out";
  DateTimeBucketer.setClock(new ModifyableClock());
  ModifyableClock.setCurrentTime(0);

  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setParallelism(PARALLELISM);

  DataStream<Tuple2<Integer, String>> source =
      env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();

  // the parallel flatMap is chained to the sink, so when it has seen 5 elements it can
  // fire the latch
  DataStream<String> mapped = source.flatMap(
      new RichFlatMapFunction<Tuple2<Integer, String>, String>() {
        private static final long serialVersionUID = 1L;

        int count = 0;

        @Override
        public void flatMap(Tuple2<Integer, String> value, Collector<String> out)
            throws Exception {
          out.collect(value.f1);
          count++;
          if (count >= 5) {
            if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
              latch1.trigger();
            } else {
              latch2.trigger();
            }
            count = 0;
          }
        }
      });

  RollingSink<String> sink = new RollingSink<String>(outPath)
      .setBucketer(new DateTimeBucketer("ss"))
      .setPartPrefix("part")
      .setPendingPrefix("")
      .setPendingSuffix("");

  mapped.addSink(sink);

  env.execute("RollingSink String Write Test");

  RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

  // we should have 8 rolling files, 4 time intervals and parallelism of 2
  int numFiles = 0;
  while (files.hasNext()) {
    LocatedFileStatus file = files.next();
    numFiles++;
    if (file.getPath().toString().contains("rolling-out/00")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 0; i < 5; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/05")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 5; i < 10; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/10")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 10; i < 15; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else if (file.getPath().toString().contains("rolling-out/15")) {
      FSDataInputStream inStream = dfs.open(file.getPath());
      BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
      for (int i = 15; i < 20; i++) {
        String line = br.readLine();
        Assert.assertEquals("message #" + i, line);
      }
      inStream.close();
    } else {
      Assert.fail("File " + file + " does not match any expected roll pattern.");
    }
  }

  Assert.assertEquals(8, numFiles);
}