@Override
  public List<HdfsEntity> list(String path) throws IOException {
    FileStatus[] fileStatuses = fileSystem.listStatus(new Path(path));
    List<HdfsEntity> entities = new ArrayList<HdfsEntity>();

    for (FileStatus fileStatus : fileStatuses) {
      HdfsEntity entity = new HdfsEntity();

      entity.setDirectory(fileStatus.isDirectory());
      entity.setPath(fileStatus.getPath().toUri().getPath());
      entity.setModifiedAt(new Date(fileStatus.getModificationTime()));
      entity.setSize(fileStatus.getLen());

      if (fileStatus.isDirectory()) {
        Integer length = 0;
        try {
          FileStatus[] contents = fileSystem.listStatus(fileStatus.getPath());
          length = contents.length;
        } catch (org.apache.hadoop.security.AccessControlException exception) {
          // Not permitted to list this directory; leave the content count at 0.
        }
        entity.setContentCount(length);
      }

      entities.add(entity);
    }

    return entities;
  }
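The HdfsEntity type returned above is not shown on this page. A minimal sketch of the bean the listing method appears to assume (field and accessor names are inferred from the setter calls, so treat it as hypothetical) could look like this:

import java.util.Date;

// Hypothetical sketch of the HdfsEntity bean used by list(); names are
// inferred from the setter calls in the example above.
public class HdfsEntity {
  private boolean directory;
  private String path;
  private Date modifiedAt;
  private long size;
  private Integer contentCount; // only populated for directories

  public boolean isDirectory() { return directory; }
  public void setDirectory(boolean directory) { this.directory = directory; }

  public String getPath() { return path; }
  public void setPath(String path) { this.path = path; }

  public Date getModifiedAt() { return modifiedAt; }
  public void setModifiedAt(Date modifiedAt) { this.modifiedAt = modifiedAt; }

  public long getSize() { return size; }
  public void setSize(long size) { this.size = size; }

  public Integer getContentCount() { return contentCount; }
  public void setContentCount(Integer contentCount) { this.contentCount = contentCount; }
}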
Example #2
  /*
   * Test that {@link HFileOutputFormat2} creates an HFile with TIMERANGE
   * metadata used by time-restricted scans.
   */
  @Test
  public void test_TIMERANGE() throws Exception {
    Configuration conf = new Configuration(this.util.getConfiguration());
    RecordWriter<ImmutableBytesWritable, Cell> writer = null;
    TaskAttemptContext context = null;
    Path dir = util.getDataTestDir("test_TIMERANGE_present");
    LOG.info("Timerange dir writing to dir: " + dir);
    try {
      // build a record writer using HFileOutputFormat2
      Job job = new Job(conf);
      FileOutputFormat.setOutputPath(job, dir);
      context = createTestTaskAttemptContext(job);
      HFileOutputFormat2 hof = new HFileOutputFormat2();
      writer = hof.getRecordWriter(context);

      // Pass two key values with explicit timestamps.
      final byte[] b = Bytes.toBytes("b");

      // value 1 with timestamp 2000
      KeyValue kv = new KeyValue(b, b, b, 2000, b);
      KeyValue original = kv.clone();
      writer.write(new ImmutableBytesWritable(), kv);
      assertEquals(original, kv);

      // value 2 with timestamp 1000
      kv = new KeyValue(b, b, b, 1000, b);
      original = kv.clone();
      writer.write(new ImmutableBytesWritable(), kv);
      assertEquals(original, kv);

      // verify that the file has the proper FileInfo.
      writer.close(context);

      // the generated file lives 1 directory down from the attempt directory
      // and is the only file, e.g.
      // _attempt__0000_r_000000_0/b/1979617994050536795
      FileSystem fs = FileSystem.get(conf);
      Path attemptDirectory = hof.getDefaultWorkFile(context, "").getParent();
      FileStatus[] sub1 = fs.listStatus(attemptDirectory);
      FileStatus[] file = fs.listStatus(sub1[0].getPath());

      // open as HFile Reader and pull out TIMERANGE FileInfo.
      HFile.Reader rd = HFile.createReader(fs, file[0].getPath(), new CacheConfig(conf), conf);
      Map<byte[], byte[]> finfo = rd.loadFileInfo();
      byte[] range = finfo.get("TIMERANGE".getBytes());
      assertNotNull(range);

      // unmarshal and check values.
      TimeRangeTracker timeRangeTracker = new TimeRangeTracker();
      Writables.copyWritable(range, timeRangeTracker);
      LOG.info(
          timeRangeTracker.getMinimumTimestamp() + "...." + timeRangeTracker.getMaximumTimestamp());
      assertEquals(1000, timeRangeTracker.getMinimumTimestamp());
      assertEquals(2000, timeRangeTracker.getMaximumTimestamp());
      rd.close();
    } finally {
      if (writer != null && context != null) writer.close(context);
      dir.getFileSystem(conf).delete(dir, true);
    }
  }
Example #3
 /**
  * Runs through the hbase rootdir and checks all stores have only one file in them -- that is,
  * they've been major compacted. Looks at root and meta tables too.
  *
  * @param fs filesystem to use
  * @param hbaseRootDir hbase root directory
  * @return True if this hbase install is major compacted.
  * @throws IOException
  */
 public static boolean isMajorCompacted(final FileSystem fs, final Path hbaseRootDir)
     throws IOException {
   // Presumes any directory under hbase.rootdir is a table.
   FileStatus[] tableDirs = fs.listStatus(hbaseRootDir, new DirFilter(fs));
   for (int i = 0; i < tableDirs.length; i++) {
     // Skip the .log directory.  All others should be tables.  Inside a table,
     // there are compaction.dir directories to skip.  Otherwise, all else
     // should be regions.  Then in each region, should only be family
     // directories.  Under each of these, should be one file only.
     Path d = tableDirs[i].getPath();
     if (d.getName().equals(HConstants.HREGION_LOGDIR_NAME)) {
       continue;
     }
     FileStatus[] regionDirs = fs.listStatus(d, new DirFilter(fs));
     for (int j = 0; j < regionDirs.length; j++) {
       Path dd = regionDirs[j].getPath();
       if (dd.getName().equals(HConstants.HREGION_COMPACTIONDIR_NAME)) {
         continue;
       }
       // Else it's a region name.  Now look in region for families.
       FileStatus[] familyDirs = fs.listStatus(dd, new DirFilter(fs));
       for (int k = 0; k < familyDirs.length; k++) {
         Path family = familyDirs[k].getPath();
         // Now in family make sure only one file.
         FileStatus[] familyStatus = fs.listStatus(family);
         if (familyStatus.length > 1) {
           LOG.debug(family.toString() + " has " + familyStatus.length + " files.");
           return false;
         }
       }
     }
   }
   return true;
 }
Example #4
 /**
  * Runs through the hbase rootdir and checks all stores have only one file in them -- that is,
  * they've been major compacted. Looks at root and meta tables too. This version differs from
  * {@link #isMajorCompacted(FileSystem, Path)} in that it expects a pre-0.20.0 hbase layout on the
  * filesystem. Used when migrating.
  *
  * @param fs filesystem to use
  * @param hbaseRootDir hbase root directory
  * @return True if this hbase install is major compacted.
  * @throws IOException
  */
 public static boolean isMajorCompactedPre020(final FileSystem fs, final Path hbaseRootDir)
     throws IOException {
   // Presumes any directory under hbase.rootdir is a table.
   FileStatus[] tableDirs = fs.listStatus(hbaseRootDir, new DirFilter(fs));
   for (int i = 0; i < tableDirs.length; i++) {
     // Inside a table, there are compaction.dir directories to skip.
     // Otherwise, all else should be regions.  Then in each region, should
     // only be family directories.  Under each of these, should be a mapfile
     // and info directory and in these only one file.
     Path d = tableDirs[i].getPath();
     if (d.getName().equals(HConstants.HREGION_LOGDIR_NAME)) {
       continue;
     }
     FileStatus[] regionDirs = fs.listStatus(d, new DirFilter(fs));
     for (int j = 0; j < regionDirs.length; j++) {
       Path dd = regionDirs[j].getPath();
       if (dd.getName().equals(HConstants.HREGION_COMPACTIONDIR_NAME)) {
         continue;
       }
       // Else it's a region name.  Now look in region for families.
       FileStatus[] familyDirs = fs.listStatus(dd, new DirFilter(fs));
       for (int k = 0; k < familyDirs.length; k++) {
         Path family = familyDirs[k].getPath();
         FileStatus[] infoAndMapfile = fs.listStatus(family);
         // Assert that only info and mapfile in family dir.
         if (infoAndMapfile.length != 0 && infoAndMapfile.length != 2) {
           LOG.debug(
               family.toString()
                   + " has more than just info and mapfile: "
                   + infoAndMapfile.length);
           return false;
         }
         // Make sure directory named info or mapfile.
          for (int ll = 0; ll < infoAndMapfile.length; ll++) {
           if (infoAndMapfile[ll].getPath().getName().equals("info")
               || infoAndMapfile[ll].getPath().getName().equals("mapfiles")) continue;
           LOG.debug("Unexpected directory name: " + infoAndMapfile[ll].getPath());
           return false;
         }
         // Now in family, there are 'mapfile' and 'info' subdirs.  Just
         // look in the 'mapfile' subdir.
         FileStatus[] familyStatus = fs.listStatus(new Path(family, "mapfiles"));
         if (familyStatus.length > 1) {
           LOG.debug(family.toString() + " has " + familyStatus.length + " files.");
           return false;
         }
       }
     }
   }
   return true;
 }
Example #5
 /**
  * Returns all files belonging to the given region directory. Could return an empty list.
  *
  * @param fs The file system reference.
  * @param regionDir The region directory to scan.
  * @return The list of files found.
  * @throws IOException When scanning the files fails.
  */
 static List<Path> getStoreFiles(FileSystem fs, Path regionDir) throws IOException {
   List<Path> res = new ArrayList<Path>();
   PathFilter dirFilter = new FSUtils.DirFilter(fs);
   FileStatus[] familyDirs = fs.listStatus(regionDir, dirFilter);
   for (FileStatus dir : familyDirs) {
     FileStatus[] files = fs.listStatus(dir.getPath());
     for (FileStatus file : files) {
       if (!file.isDir()) {
         res.add(file.getPath());
       }
     }
   }
   return res;
 }
Example #6
 @Test
 public void testListEmptyRootDirectory() throws IOException {
   // extra sanity checks here to avoid support calls about complete loss of data
   skipIfUnsupported(TEST_ROOT_TESTS_ENABLED);
   FileSystem fs = getFileSystem();
   Path root = new Path("/");
   FileStatus[] statuses = fs.listStatus(root);
   for (FileStatus status : statuses) {
     ContractTestUtils.assertDeleted(fs, status.getPath(), true);
   }
   assertEquals(
       "listStatus on empty root-directory returned a non-empty list",
       0,
       fs.listStatus(root).length);
 }
Example #7
  private static void addFolder2(
      FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) {
    try {
      if (fs == null) return;

      Futures futures = new Futures();
      for (FileStatus file : fs.listStatus(p)) {
        Path pfs = file.getPath();
        if (file.isDir()) {
          addFolder2(fs, pfs, keys, failed);
        } else {
          long size = file.getLen();
          Key res;
          if (pfs.getName().endsWith(Extensions.JSON)) {
            throw H2O.unimpl();
          } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
            throw H2O.unimpl();
          } else {
            Key k = null;
            keys.add((k = HdfsFileVec.make(file, futures)).toString());
            Log.info("PersistHdfs: DKV.put(" + k + ")");
          }
        }
      }
    } catch (Exception e) {
      Log.err(e);
      failed.add(p.toString());
    }
  }
Example #8
  private int readFile() throws IllegalArgumentException, IOException {
    int count = 0;
    final FileSystem fs = FileSystem.get(MapReduceTestUtils.getConfiguration());
    final FileStatus[] fss =
        fs.listStatus(
            new Path(
                TestUtils.TEMP_DIR
                    + File.separator
                    + MapReduceTestEnvironment.HDFS_BASE_DIRECTORY
                    + "/t1/pairs"));
    for (final FileStatus ifs : fss) {
      if (ifs.isFile() && ifs.getPath().toString().matches(".*part-r-0000[0-9]")) {
        try (SequenceFile.Reader reader =
            new SequenceFile.Reader(
                MapReduceTestUtils.getConfiguration(), Reader.file(ifs.getPath()))) {

          final Text key = new Text();
          final Text val = new Text();

          while (reader.next(key, val)) {
            count++;
            System.err.println(key + "\t" + val);
          }
        }
      }
    }
    return count;
  }
Example #9
  /** Returns the next partition number available for appending, based on the data files already in targetDir. */
  private int getNextPartition(FileSystem fs, Path targetDir) throws IOException {

    int nextPartition = 0;
    FileStatus[] existingFiles = fs.listStatus(targetDir);
    if (existingFiles != null && existingFiles.length > 0) {
      Pattern patt = Pattern.compile("part.*-([0-9][0-9][0-9][0-9][0-9]).*");
      for (FileStatus fileStat : existingFiles) {
        if (!fileStat.isDir()) {
          String filename = fileStat.getPath().getName();
          Matcher mat = patt.matcher(filename);
          if (mat.matches()) {
            int thisPart = Integer.parseInt(mat.group(1));
            if (thisPart >= nextPartition) {
              nextPartition = thisPart;
              nextPartition++;
            }
          }
        }
      }
    }

    if (nextPartition > 0) {
      LOG.info("Using found partition " + nextPartition);
    }

    return nextPartition;
  }
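As a rough, self-contained illustration (hypothetical file names, same regular expression as above), a directory containing part-m-00003 and part-r-00007.gz would make 8 the next partition:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustration only: applies the same "part.*-(NNNNN).*" pattern as
// getNextPartition() to hypothetical file names.
public class NextPartitionDemo {
  public static void main(String[] args) {
    Pattern patt = Pattern.compile("part.*-([0-9][0-9][0-9][0-9][0-9]).*");
    String[] names = {"part-m-00003", "part-r-00007.gz", "_SUCCESS"};
    int nextPartition = 0;
    for (String name : names) {
      Matcher mat = patt.matcher(name);
      if (mat.matches()) {
        nextPartition = Math.max(nextPartition, Integer.parseInt(mat.group(1)) + 1);
      }
    }
    System.out.println(nextPartition); // prints 8
  }
}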
Example #10
  public static void run(Configuration conf, Path input, String outputFile)
      throws IOException, InstantiationException, IllegalAccessException {
    Writer writer;
    if (outputFile == null) {
      writer = new OutputStreamWriter(System.out);
    } else {
      writer =
          new OutputStreamWriter(
              new FileOutputStream(new File(outputFile)), Charset.forName("UTF-8"));
    }

    try {
      FileSystem fs = input.getFileSystem(conf);
      for (FileStatus fst : fs.listStatus(input, new DataPathFilter())) {
        Path dataPath = fst.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dataPath, conf);
        try {
          Text key = reader.getKeyClass().asSubclass(Text.class).newInstance();
          DocumentMapping value = new DocumentMapping();
          while (reader.next(key, value)) {
            String docId = value.getDocId();
            writer.write(docId + "\t" + key + "\n");
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      writer.close();
    }
  }
Example #11
 /**
  * Override this so that we don't set the targetTestRoot to any path under the root of the FS, and
  * so that we don't try to delete the test dir, but rather only its contents.
  */
 @Override
 void initializeTargetTestRoot() throws IOException {
   targetTestRoot = fHdfs.makeQualified(new Path("/"));
   for (FileStatus status : fHdfs.listStatus(targetTestRoot)) {
     fHdfs.delete(status.getPath(), true);
   }
 }
Example #12
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    // TODO Auto-generated method stub
    JobConf conf = new JobConf();
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(5);

    FileSystem fs = FileSystem.get(conf);
    Path dir = new Path(args[0]);
    FileStatus[] stats = fs.listStatus(dir);
    numFiles = stats.length;

    Job job = new Job(conf);
    job.setJarByClass(FileCombiner.class);
    job.setJobName("File Combiner");

    job.setMapperClass(FileCombinerMapper.class);
    job.setReducerClass(FileCombinerReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
  }
Example #13
  /**
   * Called after the MapReduce job has completed, to verify that the outputs generated by the
   * MapReduce job align with the expected outputs that were set with calls to {@link
   * #addExpectedOutput(String)} and {@link #addExpectedOutput(String...)}.
   *
   * @return a reference to this object
   * @throws IOException if something goes wrong
   */
  public TextIOJobBuilder verifyResults() throws IOException {

    FileStatus[] outputFiles =
        fs.listStatus(
            outputPath,
            new PathFilter() {
              @Override
              public boolean accept(final Path path) {
                return path.getName().startsWith("part");
              }
            });

    System.out.println("Output files: " + StringUtils.join(outputFiles));

    int i = 0;
    for (FileStatus file : outputFiles) {
      List<String> actualLines = FileUtils.readLines(fs, file.getPath());

      for (String actualLine : actualLines) {
        String expectedLine = expectedOutputs.get(i++);
        assertEquals(expectedLine, actualLine);
      }
    }

    assertEquals(expectedOutputs.size(), i);

    return this;
  }
Example #14
 public static Path[] getInputPaths(String rootPath) {
   try {
     Configuration conf = HBaseConfiguration.create();
     Path root = new Path(rootPath);
     ArrayList<Path> paths = new ArrayList<Path>();
     FileSystem fs = root.getFileSystem(conf);
     LinkedList<Path> list = new LinkedList<Path>();
     list.push(root);
     if (!fs.exists(root)) {
       System.out.println("path not exists: " + root.toString());
       return new Path[0];
     }
     while (!list.isEmpty()) {
       Path path = list.pop();
       if (fs.isFile(path)) {
         if (path.getName().matches("^.*part-r-\\d{5}.*$")) {
           paths.add(path);
           System.out.println("something is wrong with path" + path.toString());
         }
       } else {
         FileStatus[] statuses = fs.listStatus(path);
         for (FileStatus status : statuses) {
           if (status.isDir()) {
             list.add(status.getPath());
           } else if (status.getPath().getName().matches("^.*part-r-\\d{5}.*$")) {
             paths.add(status.getPath());
           }
         }
       }
     }
     return paths.toArray(new Path[paths.size()]);
   } catch (IOException ignored) {
     return new Path[0];
   }
 }
Example #15
  private void loadRMDTSecretManagerState(RMState rmState) throws Exception {
    FileStatus[] childNodes = fs.listStatus(rmDTSecretManagerRoot);

    for (FileStatus childNodeStatus : childNodes) {
      assert childNodeStatus.isFile();
      String childNodeName = childNodeStatus.getPath().getName();
      if (childNodeName.startsWith(DELEGATION_TOKEN_SEQUENCE_NUMBER_PREFIX)) {
        rmState.rmSecretManagerState.dtSequenceNumber =
            Integer.parseInt(childNodeName.split("_")[1]);
        continue;
      }

      Path childNodePath = getNodePath(rmDTSecretManagerRoot, childNodeName);
      byte[] childData = readFile(childNodePath, childNodeStatus.getLen());
      ByteArrayInputStream is = new ByteArrayInputStream(childData);
      DataInputStream fsIn = new DataInputStream(is);
      if (childNodeName.startsWith(DELEGATION_KEY_PREFIX)) {
        DelegationKey key = new DelegationKey();
        key.readFields(fsIn);
        rmState.rmSecretManagerState.masterKeyState.add(key);
      } else if (childNodeName.startsWith(DELEGATION_TOKEN_PREFIX)) {
        RMDelegationTokenIdentifier identifier = new RMDelegationTokenIdentifier();
        identifier.readFields(fsIn);
        long renewDate = fsIn.readLong();
        rmState.rmSecretManagerState.delegationTokenState.put(identifier, renewDate);
      } else {
        LOG.warn("Unknown file for recovering RMDelegationTokenSecretManager");
      }
      fsIn.close();
    }
  }
Example #16
 public static void runJob(String... args) throws Exception {
   Path smallFilePath = new Path(args[0]);
   Configuration conf = new Configuration();
   FileSystem fs = smallFilePath.getFileSystem(conf);
   FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);
   if (smallFilePathStatus.isDir()) {
     for (FileStatus f : fs.listStatus(smallFilePath)) {
       if (f.getPath().getName().contains(args[0])) {
         DistributedCache.addCacheFile(f.getPath().toUri(), conf);
       }
     }
   } else {
     DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
   }
   Path inputPath = new Path(args[1]);
   Path outputPath = new Path(args[2]);
   Job job = new Job(conf);
   job.setJarByClass(Main.class);
   job.setMapperClass(GenericReplicatedJoin.class);
   job.setInputFormatClass(KeyValueTextInputFormat.class);
   job.setNumReduceTasks(0);
   outputPath.getFileSystem(conf).delete(outputPath, true);
   FileInputFormat.setInputPaths(job, inputPath);
   FileOutputFormat.setOutputPath(job, outputPath);
   job.waitForCompletion(true);
 }
Example #17
  @Test
  public void testAvroOut() {
    String type = "one";
    AvroOutputFormat<String> avroOut = new AvroOutputFormat<String>(String.class);

    org.apache.hadoop.fs.Path result = new org.apache.hadoop.fs.Path(hdfsURI + "/avroTest");

    avroOut.setOutputFilePath(new Path(result.toString()));
    avroOut.setWriteMode(FileSystem.WriteMode.NO_OVERWRITE);
    avroOut.setOutputDirectoryMode(FileOutputFormat.OutputDirectoryMode.ALWAYS);

    try {
      avroOut.open(0, 2);
      avroOut.writeRecord(type);
      avroOut.close();

      avroOut.open(1, 2);
      avroOut.writeRecord(type);
      avroOut.close();

      Assert.assertTrue("No result file present", hdfs.exists(result));
      FileStatus[] files = hdfs.listStatus(result);
      Assert.assertEquals(2, files.length);
      for (FileStatus file : files) {
        Assert.assertTrue(
            "1.avro".equals(file.getPath().getName()) || "2.avro".equals(file.getPath().getName()));
      }

    } catch (IOException e) {
      e.printStackTrace();
      Assert.fail(e.getMessage());
    }
  }
Example #18
  /**
   * Return a list of all urls matching this input. If autocomplete is false, the list contains only
   * 1 element (same as getUrl()). Otherwise, it will try to return all the files beginning with
   * what is returned by getUrl().
   *
   * @param jobConf A Configuration object
   * @return the list of input url
   */
  public HashSet<URI> getAllUrls(Configuration jobConf) {

    HashSet<URI> urls = new HashSet<URI>();

    if (!isAutoComplete()) {
      urls.add(url);
    } else {
      Path basePath = new Path(url);
      String filePrefix = basePath.getName();

      try {
        FileSystem fs = basePath.getFileSystem(jobConf);

        if (!fs.exists(basePath.getParent())) {
          throw new IOException("Input directory not found: " + url);
        }

        FileStatus[] stats = fs.listStatus(basePath.getParent());

        for (int i = 0; i < stats.length; i++) {
          Path path = stats[i].getPath();
          if (fs.isFile(path) && path.getName().startsWith(filePrefix)) urls.add(path.toUri());
        }
      } catch (IOException e) {
        System.err.println("Unable to autocomplete input file");
        e.printStackTrace();
        System.exit(1);
      }
    }

    return urls;
  }
Example #19
  protected void cleanup() throws Exception {
    try {
      int rotateInterval =
          conf.getInt("chukwaCollector.rotateInterval", 1000 * 60 * 5); // defaults to 5 minutes

      Path pLocalOutputDir = new Path(localOutputDir);
      FileStatus[] files = localFs.listStatus(pLocalOutputDir);
      String fileName = null;
      for (FileStatus file : files) {
        fileName = file.getPath().getName();
        if (fileName.endsWith(".done")) {
          moveFile(localOutputDir + fileName);
        } else if (fileName.endsWith(".chukwa")) {
          long lastPeriod = System.currentTimeMillis() - rotateInterval - (2 * 60 * 1000);
          if (file.getModificationTime() < lastPeriod) {
            log.info("Moving .chukwa file over, " + localOutputDir + fileName);
            moveFile(localOutputDir + fileName);
          }
        }
      }
    } catch (Exception e) {
      log.warn("Cannot copy to the remote HDFS", e);
      throw e;
    }
  }
Example #20
 public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
   try {
     return fs.listStatus(path, filter);
   } catch (FileNotFoundException e) {
     return new FileStatus[0];
   }
 }
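A quick usage sketch for the quiet wrapper above (the path and filter here are illustrative, not from the original project): a directory that does not exist simply comes back as an empty array, so the caller needs no try/catch.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Illustrative call site: a missing directory yields an empty array
// instead of a FileNotFoundException.
FileSystem fs = FileSystem.get(new Configuration());
FileStatus[] parts =
    listStatus(fs, new Path("/tmp/maybe-missing"), new PathFilter() {
      public boolean accept(Path p) {
        return p.getName().startsWith("part-");
      }
    });
System.out.println("part files found: " + parts.length);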
Example #21
  private void loadDocumentIndex(String documentIndexPath) throws IOException {
    if (documentIndex == null) {
      documentIndex = new HashMap<String, Integer>();

      Path p = new Path(documentIndexPath);
      FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
      int index = 0;
      for (FileStatus status : fs.listStatus(p)) {
        Path currPath = status.getPath();
        if (!status.isDir() && !currPath.getName().startsWith("_")) {
          BufferedReader reader = null;
          try {
            reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
            String line = null;
            while ((line = reader.readLine()) != null) {
              documentIndex.put(line.trim(), index++);
            }
          } finally {
            if (reader != null) {
              reader.close();
            }
          }
        }
      }

      log.info("Loaded document index with size: " + documentIndex.size());
    }
  }
Example #22
  /**
   * Given a filesystem and path to a node, gets all the files which belong to a partition, replica
   * type and chunk id
   *
   * <p>Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
   *
   * @param fs Underlying filesystem
   * @param path The node directory path
   * @param partitionId The partition id for which we get the files
   * @param replicaType The replica type
   * @param chunkId The chunk id
   * @return Returns list of files of this partition, replicaType, chunkId
   * @throws IOException
   */
  public static FileStatus[] getDataChunkFiles(
      FileSystem fs, Path path, final int partitionId, final int replicaType, final int chunkId)
      throws IOException {
    return fs.listStatus(
        path,
        new PathFilter() {

          public boolean accept(Path input) {
            return input
                .getName()
                .matches(
                    "^"
                        + Integer.toString(partitionId)
                        + "_"
                        + Integer.toString(replicaType)
                        + "_"
                        + Integer.toString(chunkId)
                        + "\\.data");
          }
        });
  }
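As a hedged usage sketch (the node directory path is made up, and an already-open FileSystem handle fs is assumed), asking for partition 0, replica type 1, chunk 2 returns only files named 0_1_2.data in that directory:

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative call: matches only "0_1_2.data" under the given node directory.
FileStatus[] chunkFiles = getDataChunkFiles(fs, new Path("/voldemort/node-0"), 0, 1, 2);
for (FileStatus chunk : chunkFiles) {
  System.out.println(chunk.getPath().getName());
}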
Example #23
  /**
   * Check duplicated tweet IDs in <b>tweetIdDir</b>, and output the duplicates to stdout.
   *
   * @param tweetIdDir directory holding the tweet ID files
   * @throws Exception
   */
  public static void checkTidDuplicates(String tweetIdDir) throws Exception {
    // First change path strings to URI strings starting with 'file:' or 'hdfs:'
    tweetIdDir = MultiFileFolderWriter.getUriStrForPath(tweetIdDir);

    Set<String> tidSet = new HashSet<String>();
    Configuration conf = HBaseConfiguration.create();
    FileSystem fs = FileSystem.get(new URI(tweetIdDir), conf);
    int dupCount = 0;
    for (FileStatus srcFileStatus : fs.listStatus(new Path(tweetIdDir))) {
      String srcFileName = srcFileStatus.getPath().getName();
      if (srcFileName.endsWith(".txt") && srcFileName.contains("tweetIds")) {
        BufferedReader brTid =
            new BufferedReader(new InputStreamReader(fs.open(srcFileStatus.getPath())));
        String tid = brTid.readLine();
        while (tid != null) {
          if (tidSet.contains(tid)) {
            System.out.println("Duplicated tweet ID: " + tid);
            dupCount++;
          } else {
            tidSet.add(tid);
          }
          tid = brTid.readLine();
        }
        brTid.close();
      }
    }
    System.out.println(
        "Number of unique tweet IDs: " + tidSet.size() + ", number of duplicates: " + dupCount);
  }
Example #24
  public int run(String[] args) throws Exception {
    // TODO Auto-generated method stub

    if (args.length < 1) {
      System.out.println("missing input ");
      System.exit(1);
    }

    Configuration conf = new Configuration();
    Job job = new Job(new JobConf(conf));
    job.setJarByClass(InstallApp.class);
    job.setMapperClass(InstallMapper.class);
    job.setReducerClass(InstalleReduce.class);
    job.setJobName("installtaotaosou");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileSystem fs = FileSystem.get(URI.create(args[0]), conf);
    FileStatus[] fileList = fs.listStatus(new Path(args[0]));

    int length = fileList.length;

    for (int i = 0; i < length; i++) {

      FileInputFormat.addInputPath(job, fileList[i].getPath());
    }

    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(1);
    job.waitForCompletion(true);
    return 0;
  }
Example #25
  public static void readHiveResult(String path, OutputStreamWriter outStream, Configuration conf)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);
    Path dir = new Path(path);
    if (!fs.exists(dir)) {
      throw new IOException("can not found path:" + path);
    }
    FileStatus[] filelist = fs.listStatus(dir);

    long bytesRead = 0L;
    long maxsize = 1024L * 1024 * 1024 * 10;

    for (FileStatus f : filelist) {
      if (!f.isDir() && !f.getPath().getName().startsWith("_")) {
        FSDataInputStream in = fs.open(f.getPath());
        BufferedReader bf = new BufferedReader(new InputStreamReader(in));
        String line;
        while ((line = bf.readLine()) != null) {
          bytesRead += line.getBytes().length;
          outStream.write(line.replaceAll("\001", ",").replaceAll("\t", ","));
          outStream.write("\r\n");
          if (bytesRead >= maxsize) {
            bf.close();
            in.close();
            return;
          }
        }
        bf.close();
        in.close();
      }
    }
    return;
  }
Example #26
  /** determines which files have failed for a given job */
  private Set<String> getFailedFiles(Job job) throws IOException {
    Set<String> failedFiles = new HashSet<String>();

    Path outDir = SequenceFileOutputFormat.getOutputPath(job);
    FileSystem fs = outDir.getFileSystem(getConf());
    if (!fs.getFileStatus(outDir).isDir()) {
      throw new IOException(outDir.toString() + " is not a directory");
    }

    FileStatus[] files = fs.listStatus(outDir);

    for (FileStatus f : files) {
      Path fPath = f.getPath();
      if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
        LOG.info("opening " + fPath.toString());
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());

        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          failedFiles.add(key.toString());
        }
        reader.close();
      }
    }
    return failedFiles;
  }
Example #27
  private double[] getSparkModelInfoFromHDFS(Path location, Configuration conf) throws Exception {

    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    FileStatus[] files = fileSystem.listStatus(location);

    if (files == null) throw new Exception("Couldn't find Spark Truck ML weights at: " + location);

    ArrayList<Double> modelInfo = new ArrayList<Double>();
    for (FileStatus file : files) {

      if (file.getPath().getName().startsWith("_")) {
        continue;
      }

      InputStream stream = fileSystem.open(file.getPath());

      StringWriter writer = new StringWriter();
      IOUtils.copy(stream, writer, "UTF-8");
      String raw = writer.toString();
      for (String str : raw.split("\n")) {
        modelInfo.add(Double.valueOf(str));
      }
    }

    return Doubles.toArray(modelInfo);
  }
Example #28
  /** debugging TODO remove */
  private void readOutputFiles(String jobName, Path outDir) throws IOException {

    FileSystem fs = outDir.getFileSystem(getConf());
    if (!fs.getFileStatus(outDir).isDir()) {
      throw new IOException(outDir.toString() + " is not a directory");
    }

    FileStatus[] files = fs.listStatus(outDir);

    for (FileStatus f : files) {
      Path fPath = f.getPath();
      if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
        LOG.info("opening " + fPath.toString());
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          LOG.info("read " + f.getPath().toString());
          LOG.info("read: k=" + key.toString() + " v=" + value.toString());
        }
        LOG.info("done reading " + fPath.toString());

        reader.close();
      }
    }
  }
Example #29
    public LinkedHashSet<Path> scan(FileSystem fs, Path filePath, Set<String> consumedFiles) {
      LinkedHashSet<Path> pathSet = Sets.newLinkedHashSet();
      try {
        LOG.debug("Scanning {} with pattern {}", filePath, this.filePatternRegexp);
        FileStatus[] files = fs.listStatus(filePath);
        for (FileStatus status : files) {
          Path path = status.getPath();
          String filePathStr = path.toString();

          if (consumedFiles.contains(filePathStr)) {
            continue;
          }

          if (ignoredFiles.contains(filePathStr)) {
            continue;
          }

          if (acceptFile(filePathStr)) {
            LOG.debug("Found {}", filePathStr);
            pathSet.add(path);
          } else {
            // don't look at it again
            ignoredFiles.add(filePathStr);
          }
        }
      } catch (FileNotFoundException e) {
        LOG.warn("Failed to list directory {}", filePath, e);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return pathSet;
    }
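The scanner above delegates to acceptFile() and this.filePatternRegexp, neither of which is shown here. A minimal sketch of what such a helper might look like, assuming filePatternRegexp holds a plain java.util.regex pattern string (an assumption, not the project's actual implementation):

import java.util.regex.Pattern;

// Sketch only: assumes filePatternRegexp is a java.util.regex pattern string.
private transient Pattern filePattern;

protected boolean acceptFile(String filePathStr) {
  if (filePattern == null) {
    filePattern = Pattern.compile(this.filePatternRegexp);
  }
  return filePattern.matcher(filePathStr).matches();
}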
Example #30
 /**
  * Copy a directory to a new FS; both paths must be qualified.
  *
  * @param conf configuration
  * @param srcDirPath src dir
  * @param destDirPath dest dir
  * @return number of files copied
  * @throws IOException on any failure while listing or copying
  */
 public static int copyDirectory(Configuration conf, Path srcDirPath, Path destDirPath)
     throws IOException {
   FileSystem srcFS = FileSystem.get(srcDirPath.toUri(), conf);
   FileSystem destFS = FileSystem.get(destDirPath.toUri(), conf);
   // list all paths in the src.
   if (!srcFS.exists(srcDirPath)) {
     throw new FileNotFoundException("Source dir not found " + srcDirPath);
   }
   if (!srcFS.isDirectory(srcDirPath)) {
     throw new FileNotFoundException("Source dir not a directory " + srcDirPath);
   }
   FileStatus[] entries = srcFS.listStatus(srcDirPath);
   int srcFileCount = entries.length;
   if (srcFileCount == 0) {
     return 0;
   }
   if (!destFS.exists(destDirPath)) {
     destFS.mkdirs(destDirPath);
   }
   Path[] sourcePaths = new Path[srcFileCount];
   for (int i = 0; i < srcFileCount; i++) {
     FileStatus e = entries[i];
     Path srcFile = e.getPath();
     if (srcFS.isDirectory(srcFile)) {
       throw new IOException(
           "Configuration dir " + srcDirPath + " contains a directory " + srcFile);
     }
     log.debug("copying src conf file {}", srcFile);
     sourcePaths[i] = srcFile;
   }
   log.debug("Copying {} files from to {} to dest {}", srcFileCount, srcDirPath, destDirPath);
   FileUtil.copy(srcFS, sourcePaths, destFS, destDirPath, false, true, conf);
   return srcFileCount;
 }
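A hedged usage sketch with illustrative, fully qualified paths (not from the original project): copying a local configuration directory into HDFS with the helper above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

// Illustrative call: both paths are fully qualified, as the helper requires.
Configuration conf = new Configuration();
Path src = new Path("file:///etc/myapp/conf");
Path dest = new Path("hdfs://namenode:8020/apps/myapp/conf");
int copied = copyDirectory(conf, src, dest);
System.out.println("copied " + copied + " files");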