public LinkedHashSet<Path> scan(FileSystem fs, Path filePath, Set<String> consumedFiles) { LinkedHashSet<Path> pathSet = Sets.newLinkedHashSet(); try { LOG.debug("Scanning {} with pattern {}", filePath, this.filePatternRegexp); FileStatus[] files = fs.listStatus(filePath); for (FileStatus status : files) { Path path = status.getPath(); String filePathStr = path.toString(); if (consumedFiles.contains(filePathStr)) { continue; } if (ignoredFiles.contains(filePathStr)) { continue; } if (acceptFile(filePathStr)) { LOG.debug("Found {}", filePathStr); pathSet.add(path); } else { // don't look at it again ignoredFiles.add(filePathStr); } } } catch (FileNotFoundException e) { LOG.warn("Failed to list directory {}", filePath, e); } catch (IOException e) { throw new RuntimeException(e); } return pathSet; }
protected void replay(long windowId) { // This operator can partition itself dynamically. When that happens a file can be re-hashed // to a different partition than the previous one. In order to handle this, the partition loads // all the recovery data for a window and then processes only those files which would be hashed // to it in the current run. try { Map<Integer, Object> recoveryDataPerOperator = idempotentStorageManager.load(windowId); for (Object recovery : recoveryDataPerOperator.values()) { @SuppressWarnings("unchecked") LinkedList<RecoveryEntry> recoveryData = (LinkedList<RecoveryEntry>) recovery; for (RecoveryEntry recoveryEntry : recoveryData) { if (scanner.acceptFile(recoveryEntry.file)) { // The operator may have continued processing the same file in multiple windows. // So the recovery states of subsequent windows will have an entry for that file however // the offset changes. // In this case we continue reading from previously opened stream. if (currentFile == null || !(currentFile.equals(recoveryEntry.file) && offset == recoveryEntry.startOffset)) { if (inputStream != null) { closeFile(inputStream); } processedFiles.add(recoveryEntry.file); // removing the file from failed and unfinished queues and pending set Iterator<FailedFile> failedFileIterator = failedFiles.iterator(); while (failedFileIterator.hasNext()) { FailedFile ff = failedFileIterator.next(); if (ff.path.equals(recoveryEntry.file) && ff.offset == recoveryEntry.startOffset) { failedFileIterator.remove(); break; } } Iterator<FailedFile> unfinishedFileIterator = unfinishedFiles.iterator(); while (unfinishedFileIterator.hasNext()) { FailedFile ff = unfinishedFileIterator.next(); if (ff.path.equals(recoveryEntry.file) && ff.offset == recoveryEntry.startOffset) { unfinishedFileIterator.remove(); break; } } if (pendingFiles.contains(recoveryEntry.file)) { pendingFiles.remove(recoveryEntry.file); } inputStream = retryFailedFile(new FailedFile(recoveryEntry.file, recoveryEntry.startOffset)); while (offset < recoveryEntry.endOffset) { T line = readEntity(); offset++; emit(line); } } else { while (offset < recoveryEntry.endOffset) { T line = readEntity(); offset++; emit(line); } } } } } } catch (IOException e) { throw new RuntimeException("replay", e); } }