示例#1
0
 /**
  * Lists the entries under {@code p} by paging through {@code DFSClient.listPaths}, wrapping each
  * accepted entry together with its HDFS file id.
  *
  * @param fs file system handle; passed through {@code ensureDfs} to obtain the DFS client
  * @param p directory whose entries are listed
  * @param filter optional filter applied to each entry's qualified path; {@code null} accepts all
  * @return one {@code HdfsFileStatusWithId} per accepted entry, in listing order
  * @throws FileNotFoundException if the first listing call returns {@code null}
  * @throws IOException if a listing call fails
  */
 @Override
 public List<HdfsFileStatusWithId> listLocatedHdfsStatus(FileSystem fs, Path p, PathFilter filter)
     throws IOException {
   final DFSClient client = ensureDfs(fs).getClient();
   final String dirPath = p.toUri().getPath();
   DirectoryListing page =
       client.listPaths(dirPath, org.apache.hadoop.hdfs.protocol.HdfsFileStatus.EMPTY_NAME, true);
   if (page == null) {
     // A null first page means the directory is missing.
     throw new FileNotFoundException("File " + p + " does not exist.");
   }
   final URI fsUri = fs.getUri();
   List<HdfsFileStatusWithId> statuses =
       new ArrayList<HdfsFileStatusWithId>(page.getPartialListing().length);
   do {
     for (org.apache.hadoop.hdfs.protocol.HdfsFileStatus entry : page.getPartialListing()) {
       HdfsLocatedFileStatus located = (HdfsLocatedFileStatus) entry;
       boolean accepted =
           filter == null || filter.accept(located.getFullPath(p).makeQualified(fsUri, null));
       if (accepted) {
         LocatedFileStatus qualified = located.makeQualifiedLocated(fsUri, p);
         statuses.add(new HdfsFileStatusWithIdImpl(qualified, located.getFileId()));
       }
     }
     // Continue from the last name returned while the listing reports more entries.
     page = page.hasMore() ? client.listPaths(dirPath, page.getLastName(), true) : null;
   } while (page != null);
   return statuses;
 }
 /**
  * Accepts {@code path} only when every filter in {@code filters} accepts it; evaluation stops at
  * the first rejection.
  *
  * @param path the path to test
  * @return {@code true} if no filter rejects the path (including when there are no filters)
  */
 public boolean accept(Path path) {
   for (PathFilter candidate : filters) {
     if (candidate.accept(path)) {
       continue;
     }
     return false;
   }
   return true;
 }
 /**
  * Returns the paths of the entries listed under {@code path} that {@code filter} accepts.
  *
  * @param path the path to list
  * @param filter filter applied to each listed entry's path
  * @return the accepted paths, in listing order
  * @throws IOException if the file system cannot be obtained or listed
  */
 private static List<Path> listFiles(Path path, PathFilter filter) throws IOException {
   FileSystem fs = FileSystem.get(confHadoop);
   ArrayList<Path> accepted = new ArrayList<Path>();
   for (FileStatus entry : fs.listStatus(path)) {
     Path candidate = entry.getPath();
     if (filter.accept(candidate)) {
       accepted.add(candidate);
     }
   }
   return accepted;
 }
 /**
  * Collects the statuses of the plain files listed under {@code path} whose paths are accepted by
  * {@code pathFilter}. Sub-directories are not descended into.
  *
  * @param path the directory to scan; qualified against {@code fc} before listing
  * @param fc the file context used for qualification and listing
  * @param pathFilter filter applied to each file's path
  * @return the matching file statuses, in listing order
  * @throws IOException if the listing fails
  */
 private static List<FileStatus> scanDirectory(Path path, FileContext fc, PathFilter pathFilter)
     throws IOException {
   Path qualified = fc.makeQualified(path);
   List<FileStatus> matches = new ArrayList<FileStatus>();
   for (RemoteIterator<FileStatus> it = fc.listStatus(qualified); it.hasNext(); ) {
     FileStatus candidate = it.next();
     Path candidatePath = candidate.getPath();
     if (!candidate.isFile()) {
       continue;
     }
     if (pathFilter.accept(candidatePath)) {
       matches.add(candidate);
     }
   }
   return matches;
 }
 /**
  * Walks {@code path} depth-first and appends every accepted non-directory entry to
  * {@code result}. An entry rejected by {@code inputFilter} is skipped entirely, so a rejected
  * directory is never descended into.
  *
  * @param result the list that receives the files found
  * @param fs the file system used for listing
  * @param path the directory to walk
  * @param inputFilter filter applied to both files and directories
  * @throws IOException if a listing call fails
  */
 protected void addInputPathRecursively(
     List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
     throws IOException {
   for (RemoteIterator<LocatedFileStatus> children = fs.listLocatedStatus(path);
       children.hasNext(); ) {
     LocatedFileStatus child = children.next();
     if (!inputFilter.accept(child.getPath())) {
       continue;
     }
     if (!child.isDirectory()) {
       result.add(child);
     } else {
       addInputPathRecursively(result, fs, child.getPath(), inputFilter);
     }
   }
 }
示例#6
0
 /**
  * Lists {@code root} and keeps only the entries whose path is accepted by {@code filter}.
  *
  * @param fc the file context used for listing
  * @param root the path to list
  * @param filter filter applied to each entry's path; {@code null} returns the listing unfiltered
  * @return the full listing when {@code filter} is {@code null}, otherwise a new list holding the
  *     accepted entries in listing order
  * @throws IOException if the listing fails
  */
 private static List<FileStatus> listFilteredStatus(FileContext fc, Path root, PathFilter filter)
     throws IOException {
   List<FileStatus> listing = remoteIterToList(fc.listStatus(root));
   if (filter == null) {
     return listing;
   }
   List<FileStatus> accepted = new LinkedList<FileStatus>();
   for (FileStatus candidate : listing) {
     if (!filter.accept(candidate.getPath())) {
       continue;
     }
     accepted.add(candidate);
   }
   return accepted;
 }
  private List<FileStatus> singleThreadedListStatus(
      JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
      Path p = dirs[i];
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDirectory()) {
            RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
            while (iter.hasNext()) {
              LocatedFileStatus stat = iter.next();
              if (inputFilter.accept(stat.getPath())) {
                if (recursive && stat.isDirectory()) {
                  addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                  result.add(stat);
                }
              }
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    return result;
  }
示例#8
0
 /**
  * Recursively walks {@code location} and adds the path of every non-directory entry found to
  * {@code paths}. A path rejected by {@code PATH_FILTER} is neither added nor descended into.
  *
  * <p>The results of the recursive calls on children are not inspected; the return value only
  * reflects {@code location} itself.
  *
  * @param location the URI to start from
  * @param job supplies the configuration used to obtain the file system
  * @param paths the set that receives the file paths
  * @return {@code false} if {@code location} is rejected by {@code PATH_FILTER} or does not
  *     exist, {@code true} otherwise
  * @throws IOException if a file system call fails for a reason other than a missing path
  */
 static boolean getAllSubDirs(URI location, Job job, Set<Path> paths) throws IOException {
   FileSystem fs = FileSystem.get(location, job.getConfiguration());
   Path path = new Path(location.getPath());
   if (!PATH_FILTER.accept(path)) {
     return false;
   }
   try {
     FileStatus status = fs.getFileStatus(path);
     if (!status.isDir()) {
       AvroStorageLog.details("Add input file:" + status);
       paths.add(status.getPath());
       return true;
     }
     for (FileStatus child : fs.listStatus(path)) {
       getAllSubDirs(child.getPath().toUri(), job, paths);
     }
     return true;
   } catch (FileNotFoundException e) {
     AvroStorageLog.details("getAllSubDirs: RETURN FALSE; Input path does not exist: " + path);
     AvroStorageLog.details("Input path does not exist: " + path);
     return false;
   }
 }
 /**
  * Asserts that {@code filter} gives the expected verdict for every path in {@code expected}.
  *
  * @param expected maps a path string to the result {@code filter.accept} should return for it
  * @param filter the filter under test
  */
 private void doTestPathFilter(Map<String, Boolean> expected, PathFilter filter) {
   for (Map.Entry<String, Boolean> expectation : expected.entrySet()) {
     Path candidate = new Path(expectation.getKey());
     Boolean wanted = expectation.getValue();
     assertEquals(wanted, filter.accept(candidate));
   }
 }
    public boolean accept(Path file) {
      try {
        FileSystem fs = file.getFileSystem(conf);
        boolean unpack = conf.getBoolean(unpackParamName, true);

        if (defaultIgnores.accept(file) && fs.getFileStatus(file).isDir() == false) {
          String URI = file.toUri().toString();

          // detect whether a file is likely to be an archive
          // TODO extend to other known types
          if (unpack && URI.toLowerCase().endsWith(".zip")) {
            FSDataInputStream fis = null;
            try {
              fis = fs.open(file);
              ArchiveInputStream input =
                  new ArchiveStreamFactory().createArchiveInputStream(new BufferedInputStream(fis));
              ArchiveEntry entry = null;
              while ((entry = input.getNextEntry()) != null) {
                String name = entry.getName();
                long size = entry.getSize();
                byte[] content = new byte[(int) size];
                input.read(content);
                key.set(name);
                // fill the values for the content object
                value.setUrl(name);
                value.setContent(content);

                writer.append(key, value);
                counter++;
                if (reporter != null) {
                  reporter.incrCounter(Counters.DOC_COUNT, 1);
                }
              }

            } catch (ArchiveException e) {
              // TODO Auto-generated catch block
              e.printStackTrace();
            } finally {
              fis.close();
            }

          } else {
            // Hmm, kind of dangerous to do this
            byte[] fileBArray = new byte[(int) fs.getFileStatus(file).getLen()];
            FSDataInputStream fis = null;
            try {
              fis = fs.open(file);
              fis.readFully(0, fileBArray);
              fis.close();
              key.set(URI);
              // fill the values for the content object
              value.setUrl(URI);
              value.setContent(fileBArray);

              writer.append(key, value);
              counter++;
              if (reporter != null) {
                reporter.incrCounter(Counters.DOC_COUNT, 1);
              }
            } catch (FileNotFoundException e) {
              throw new RuntimeException(e);
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
          }
        }
        // if it is a directory, accept it so we can possibly recurse on
        // it,
        // otherwise we don't care about actually accepting the file,
        // since
        // all the work is done in the accept method here.
        return fs.getFileStatus(file).isDir();
      } catch (IOException e) {
        log.error("Exception", e);
      }
      return false;
    }