/**
 * Tests a path against every filter held in {@code filters}.
 *
 * Evaluation stops at the first filter that rejects the path.
 *
 * @param path the path to test
 * @return {@code true} when no filter rejects the path (including when there
 *         are no filters), {@code false} otherwise
 */
public boolean accept(Path path) {
  boolean allAccepted = true;
  for (PathFilter each : filters) {
    if (each.accept(path)) {
      continue;
    }
    // One rejection is enough; do not consult the remaining filters.
    allAccepted = false;
    break;
  }
  return allAccepted;
}
@Override public List<HdfsFileStatusWithId> listLocatedHdfsStatus(FileSystem fs, Path p, PathFilter filter) throws IOException { DistributedFileSystem dfs = ensureDfs(fs); DFSClient dfsc = dfs.getClient(); final String src = p.toUri().getPath(); DirectoryListing current = dfsc.listPaths(src, org.apache.hadoop.hdfs.protocol.HdfsFileStatus.EMPTY_NAME, true); if (current == null) { // the directory does not exist throw new FileNotFoundException("File " + p + " does not exist."); } final URI fsUri = fs.getUri(); List<HdfsFileStatusWithId> result = new ArrayList<HdfsFileStatusWithId>(current.getPartialListing().length); while (current != null) { org.apache.hadoop.hdfs.protocol.HdfsFileStatus[] hfss = current.getPartialListing(); for (int i = 0; i < hfss.length; ++i) { HdfsLocatedFileStatus next = (HdfsLocatedFileStatus) (hfss[i]); if (filter != null) { Path filterPath = next.getFullPath(p).makeQualified(fsUri, null); if (!filter.accept(filterPath)) continue; } LocatedFileStatus lfs = next.makeQualifiedLocated(fsUri, p); result.add(new HdfsFileStatusWithIdImpl(lfs, next.getFileId())); } current = current.hasMore() ? dfsc.listPaths(src, current.getLastName(), true) : null; } return result; }
/**
 * Lists the direct children of {@code path} and keeps those the filter accepts.
 *
 * The file system is obtained from {@code confHadoop}.
 *
 * @param path   the directory to list
 * @param filter the filter applied to each child path
 * @return the accepted child paths, in listing order
 * @throws IOException if the file system cannot be obtained or listed
 */
private static List<Path> listFiles(Path path, PathFilter filter) throws IOException {
  final ArrayList<Path> accepted = new ArrayList<Path>();
  final FileSystem fileSystem = FileSystem.get(confHadoop);
  for (FileStatus entry : fileSystem.listStatus(path)) {
    final Path candidate = entry.getPath();
    if (!filter.accept(candidate)) {
      continue;
    }
    accepted.add(candidate);
  }
  return accepted;
}
/**
 * Lists {@code path} (after qualifying it against {@code fc}) and returns the
 * statuses of the plain files whose paths the filter accepts.
 *
 * @param path       the directory to scan
 * @param fc         the file context used to qualify and list the path
 * @param pathFilter the filter applied to each file's path
 * @return statuses of the accepted files; directories are never included
 * @throws IOException if listing fails
 */
private static List<FileStatus> scanDirectory(Path path, FileContext fc, PathFilter pathFilter)
    throws IOException {
  final Path qualified = fc.makeQualified(path);
  final List<FileStatus> accepted = new ArrayList<FileStatus>();
  for (RemoteIterator<FileStatus> entries = fc.listStatus(qualified); entries.hasNext();) {
    final FileStatus candidate = entries.next();
    if (!candidate.isFile()) {
      continue;
    }
    if (pathFilter.accept(candidate.getPath())) {
      accepted.add(candidate);
    }
  }
  return accepted;
}
/**
 * Walks {@code path} depth-first and appends every accepted non-directory
 * entry to {@code result}.
 *
 * An entry rejected by {@code inputFilter} is skipped entirely; a rejected
 * directory is therefore not descended into.
 *
 * @param result      the list that receives the accepted file statuses
 * @param fs          the file system used for listing
 * @param path        the directory to walk
 * @param inputFilter the filter applied to both files and directories
 * @throws IOException if listing fails
 */
protected void addInputPathRecursively(
    List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
    throws IOException {
  for (RemoteIterator<LocatedFileStatus> entries = fs.listLocatedStatus(path);
      entries.hasNext();) {
    final LocatedFileStatus entry = entries.next();
    if (!inputFilter.accept(entry.getPath())) {
      continue;
    }
    if (!entry.isDirectory()) {
      result.add(entry);
      continue;
    }
    addInputPathRecursively(result, fs, entry.getPath(), inputFilter);
  }
}
/**
 * Lists {@code root} and optionally narrows the result with a path filter.
 *
 * @param fc     the file context used for listing
 * @param root   the directory to list
 * @param filter the filter applied to each entry's path; when {@code null}
 *               the full listing is returned as-is
 * @return the full listing, or a new list holding only the accepted entries
 * @throws IOException if listing fails
 */
private static List<FileStatus> listFilteredStatus(FileContext fc, Path root, PathFilter filter)
    throws IOException {
  final List<FileStatus> everything = remoteIterToList(fc.listStatus(root));
  if (filter == null) {
    return everything;
  }

  final List<FileStatus> kept = new LinkedList<FileStatus>();
  for (FileStatus candidate : everything) {
    final boolean accepted = filter.accept(candidate.getPath());
    if (accepted) {
      kept.add(candidate);
    }
  }
  return kept;
}
/**
 * Expands each input path (as a glob) and collects the matching file statuses.
 *
 * A glob match that is a plain file is added directly. A match that is a
 * directory is listed one level deep; each child accepted by
 * {@code inputFilter} is added, except that child directories are walked with
 * {@code addInputPathRecursively} when {@code recursive} is set.
 *
 * Problems are collected across all input paths and reported together.
 *
 * @param job         supplies the configuration used to resolve file systems
 * @param dirs        the input paths or glob patterns
 * @param inputFilter the filter passed to the glob and applied to directory children
 * @param recursive   whether child directories are descended into
 * @return the collected statuses
 * @throws InvalidInputException if any input path is missing or matches nothing
 * @throws IOException           if a file system operation fails
 */
private List<FileStatus> singleThreadedListStatus(
    JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
  final List<FileStatus> result = new ArrayList<FileStatus>();
  final List<IOException> errors = new ArrayList<IOException>();

  for (Path dir : dirs) {
    final FileSystem fs = dir.getFileSystem(job.getConfiguration());
    final FileStatus[] globbed = fs.globStatus(dir, inputFilter);

    if (globbed == null) {
      errors.add(new IOException("Input path does not exist: " + dir));
      continue;
    }
    if (globbed.length == 0) {
      errors.add(new IOException("Input Pattern " + dir + " matches 0 files"));
      continue;
    }

    for (FileStatus match : globbed) {
      if (!match.isDirectory()) {
        result.add(match);
        continue;
      }
      // Directory match: look at its immediate children.
      for (RemoteIterator<LocatedFileStatus> children = fs.listLocatedStatus(match.getPath());
          children.hasNext();) {
        final LocatedFileStatus child = children.next();
        if (!inputFilter.accept(child.getPath())) {
          continue;
        }
        final boolean descend = recursive && child.isDirectory();
        if (descend) {
          addInputPathRecursively(result, fs, child.getPath(), inputFilter);
        } else {
          result.add(child);
        }
      }
    }
  }

  if (errors.isEmpty()) {
    return result;
  }
  throw new InvalidInputException(errors);
}
/**
 * Walks {@code location} recursively and adds the path of every
 * non-directory entry found to {@code paths}.
 *
 * A location whose path is rejected by {@code PATH_FILTER} is skipped along
 * with everything beneath it. The results of the recursive calls on children
 * are not inspected.
 *
 * @param location the file or directory to start from
 * @param job      supplies the configuration used to obtain the file system
 * @param paths    the set that receives the discovered file paths
 * @return {@code false} when the path is rejected by {@code PATH_FILTER} or a
 *         {@link FileNotFoundException} is caught for it; {@code true} otherwise
 * @throws IOException on any other file system failure
 */
static boolean getAllSubDirs(URI location, Job job, Set<Path> paths) throws IOException {
  final FileSystem fileSystem = FileSystem.get(location, job.getConfiguration());
  final Path target = new Path(location.getPath());

  if (!PATH_FILTER.accept(target)) {
    return false;
  }

  try {
    final FileStatus status = fileSystem.getFileStatus(target);
    if (!status.isDir()) {
      AvroStorageLog.details("Add input file:" + status);
      paths.add(status.getPath());
      return true;
    }
    for (FileStatus child : fileSystem.listStatus(target)) {
      getAllSubDirs(child.getPath().toUri(), job, paths);
    }
    return true;
  } catch (FileNotFoundException missing) {
    AvroStorageLog.details("getAllSubDirs: RETURN FALSE; Input path does not exist: " + target);
    AvroStorageLog.details("Input path does not exist: " + target);
    return false;
  }
}
/**
 * Asserts that the filter's verdict for each path string matches the
 * expected value.
 *
 * @param expected maps a path string to the verdict the filter should give
 * @param filter   the filter under test
 */
private void doTestPathFilter(Map<String, Boolean> expected, PathFilter filter) {
  for (Map.Entry<String, Boolean> testCase : expected.entrySet()) {
    final Boolean wanted = testCase.getValue();
    final Path candidate = new Path(testCase.getKey());
    final boolean actual = filter.accept(candidate);
    assertEquals(wanted, actual);
  }
}
public boolean accept(Path file) { try { FileSystem fs = file.getFileSystem(conf); boolean unpack = conf.getBoolean(unpackParamName, true); if (defaultIgnores.accept(file) && fs.getFileStatus(file).isDir() == false) { String URI = file.toUri().toString(); // detect whether a file is likely to be an archive // TODO extend to other known types if (unpack && URI.toLowerCase().endsWith(".zip")) { FSDataInputStream fis = null; try { fis = fs.open(file); ArchiveInputStream input = new ArchiveStreamFactory().createArchiveInputStream(new BufferedInputStream(fis)); ArchiveEntry entry = null; while ((entry = input.getNextEntry()) != null) { String name = entry.getName(); long size = entry.getSize(); byte[] content = new byte[(int) size]; input.read(content); key.set(name); // fill the values for the content object value.setUrl(name); value.setContent(content); writer.append(key, value); counter++; if (reporter != null) { reporter.incrCounter(Counters.DOC_COUNT, 1); } } } catch (ArchiveException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { fis.close(); } } else { // Hmm, kind of dangerous to do this byte[] fileBArray = new byte[(int) fs.getFileStatus(file).getLen()]; FSDataInputStream fis = null; try { fis = fs.open(file); fis.readFully(0, fileBArray); fis.close(); key.set(URI); // fill the values for the content object value.setUrl(URI); value.setContent(fileBArray); writer.append(key, value); counter++; if (reporter != null) { reporter.incrCounter(Counters.DOC_COUNT, 1); } } catch (FileNotFoundException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } } } // if it is a directory, accept it so we can possibly recurse on // it, // otherwise we don't care about actually accepting the file, // since // all the work is done in the accept method here. return fs.getFileStatus(file).isDir(); } catch (IOException e) { log.error("Exception", e); } return false; }