Example #1
  private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset)
      throws IOException {
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
      Path path = rit.next().getPath();
      // Only consider reducer output files named part-*.
      String filename = path.getName();

      if (filename.startsWith("part-")) {
        long filesize = fs.getFileStatus(path).getLen();
        if (offset < filesize) {
          FSDataInputStream handle = fs.open(path);
          if (offset > 0) {
            handle.seek(offset);
          }
          fileHandleList.add(handle);
        }
        offset -= filesize;
      }
    }
    if (fileHandleList.size() == 1) {
      return fileHandleList.get(0);
    } else if (fileHandleList.size() > 1) {
      Enumeration<FSDataInputStream> enu = fileHandleList.elements();
      return new SequenceInputStream(enu);
    } else {
      System.err.println("Error: no source file loaded. Run genSeedDataset.sh first!");
      return null;
    }
  }
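A minimal calling sketch for the helper above; the directory path and byte offset are hypothetical, and the method is assumed to live in the same class:
  // Hypothetical usage: treat the part- files under /data/seed-output as one
  // concatenated stream and start reading at byte offset 1024.
  private void readFromOffset() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    InputStream in = OpenMultiplePartsWithOffset(fs, new Path("/data/seed-output"), 1024L);
    if (in == null) {
      return; // no part- files were found under the directory
    }
    try {
      byte[] buf = new byte[4096];
      for (int n = in.read(buf); n != -1; n = in.read(buf)) {
        // consume buf[0..n)
      }
    } finally {
      in.close();
    }
  }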
Example #2
    @Override
    public int run(Configuration conf, List<String> args) throws IOException {
      if (!args.isEmpty()) {
        System.err.println("Can't understand argument: " + args.get(0));
        return 1;
      }

      final DistributedFileSystem dfs = getDFS(conf);
      try {
        final TableListing listing =
            new TableListing.Builder()
                .addField("")
                .addField("", true)
                .wrapWidth(MAX_LINE_WIDTH)
                .hideHeaders()
                .build();
        final RemoteIterator<EncryptionZone> it = dfs.listEncryptionZones();
        while (it.hasNext()) {
          EncryptionZone ez = it.next();
          listing.addRow(ez.getPath(), ez.getKeyName());
        }
        System.out.println(listing.toString());
      } catch (IOException e) {
        System.err.println(prettifyException(e));
        return 2;
      }

      return 0;
    }
Example #3
  private void verifyJobOutput() throws IOException {

    final String _SUCCESS = "_SUCCESS";
    final String REDUCER_OUTPUT = "part-r-";
    boolean wasSuccessful = false;
    boolean reducerOutputExists = false;
    FileSystem fs = FileSystem.getLocal(new Configuration());
    RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(OUTPUT_PATH), false);
    LocatedFileStatus fileStatus = null;
    String fileName = null;

    while (iterator.hasNext()) {
      fileStatus = iterator.next();
      fileName = fileStatus.getPath().getName();

      if (fileName.contains(_SUCCESS)) {
        wasSuccessful = true;
      }
      if (fileName.contains(REDUCER_OUTPUT)) {
        reducerOutputExists = true;
      }
    }

    // verify presence of the _SUCCESS marker file
    Assert.assertTrue(wasSuccessful);

    // verify presence of reducer output (part-r-*)
    Assert.assertTrue(reducerOutputExists);
  }
Example #4
  private void doWalk(
      Path path, FileStatusCallback callback, AtomicLong taskCount, SettableFuture<Void> future) {
    try (SetThreadName ignored = new SetThreadName("HiveHdfsWalker")) {
      RemoteIterator<LocatedFileStatus> iterator = getLocatedFileStatusRemoteIterator(path);

      while (iterator.hasNext()) {
        LocatedFileStatus status = getLocatedFileStatus(iterator);

        // ignore hidden files. Hive ignores files starting with _ and . as well.
        String fileName = status.getPath().getName();
        if (fileName.startsWith("_") || fileName.startsWith(".")) {
          continue;
        }
        if (isDirectory(status)) {
          recursiveWalk(status.getPath(), callback, taskCount, future);
        } else {
          callback.process(status, status.getBlockLocations());
        }
        if (future.isDone()) {
          return;
        }
      }
    } catch (FileNotFoundException e) {
      future.setException(new FileNotFoundException("Partition location does not exist: " + path));
    } catch (Throwable t) {
      future.setException(t);
    } finally {
      if (taskCount.decrementAndGet() == 0) {
        future.set(null);
      }
    }
  }
Example #5
 protected Iterable<Path> listFiles(FileSystem fs, Path basePath) throws IOException {
   List<Path> ret = new ArrayList<>();
   RemoteIterator<LocatedFileStatus> filesIt = fs.listFiles(basePath, true);
   while (filesIt.hasNext()) {
     ret.add(filesIt.next().getPath());
   }
   return ret;
 }
Example #6
 @Override
 public FileStatus[] listStatus(Path path) throws IOException {
   List<LocatedFileStatus> list = new ArrayList<>();
   RemoteIterator<LocatedFileStatus> iterator = listLocatedStatus(path);
   while (iterator.hasNext()) {
     list.add(iterator.next());
   }
   return toArray(list, LocatedFileStatus.class);
 }
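The toArray call above presumably refers to a static helper such as Guava's Iterables.toArray; with plain JDK collections, the return statement could equivalently be written as:
   return list.toArray(new LocatedFileStatus[0]);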
Example #7
 private static List<FileStatus> remoteIterToList(RemoteIterator<FileStatus> rIter)
     throws IOException {
   List<FileStatus> fsList = new LinkedList<FileStatus>();
   if (rIter == null) return fsList;
   while (rIter.hasNext()) {
     fsList.add(rIter.next());
   }
   return fsList;
 }
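The drain-to-a-list pattern above is not specific to FileStatus; a generic variant (a sketch, not taken from the original source) could look like this:
 private static <T> List<T> remoteIterToList(RemoteIterator<T> rIter) throws IOException {
   List<T> list = new ArrayList<>();
   if (rIter == null) {
     return list;
   }
   while (rIter.hasNext()) {
     list.add(rIter.next());
   }
   return list;
 }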
Example #8
 private static List<FileStatus> scanDirectory(Path path, FileContext fc, PathFilter pathFilter)
     throws IOException {
   path = fc.makeQualified(path);
   List<FileStatus> jhStatusList = new ArrayList<FileStatus>();
   RemoteIterator<FileStatus> fileStatusIter = fc.listStatus(path);
   while (fileStatusIter.hasNext()) {
     FileStatus fileStatus = fileStatusIter.next();
     Path filePath = fileStatus.getPath();
     if (fileStatus.isFile() && pathFilter.accept(filePath)) {
       jhStatusList.add(fileStatus);
     }
   }
   return jhStatusList;
 }
Example #9
 /**
  * Add files in the input path recursively into the results.
  *
  * @param result The List to store all files.
  * @param fs The FileSystem.
  * @param path The input path.
  * @param inputFilter The input filter that can be used to filter files/dirs.
  * @throws IOException
  */
 protected void addInputPathRecursively(
     List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
     throws IOException {
   RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
   while (iter.hasNext()) {
     LocatedFileStatus stat = iter.next();
     if (inputFilter.accept(stat.getPath())) {
       if (stat.isDirectory()) {
         addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
       } else {
         result.add(stat);
       }
     }
   }
 }
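Since PathFilter has a single accept(Path) method, the inputFilter argument can be supplied as a lambda. A hedged call-site sketch; the input directory and filter below are hypothetical:
 // Hypothetical call site: collect all non-hidden files under /user/data,
 // descending into subdirectories.
 FileSystem fs = FileSystem.get(new Configuration());
 List<FileStatus> result = new ArrayList<>();
 PathFilter nonHidden = p -> !p.getName().startsWith("_") && !p.getName().startsWith(".");
 addInputPathRecursively(result, fs, new Path("/user/data"), nonHidden);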
Example #10
  @Test
  public void testPurge() throws IOException, InterruptedException {
    FileSystem fileSystem = FileSystem.newInstance(new Configuration());

    testTransferWindowFiles();
    RemoteIterator<LocatedFileStatus> iterator =
        fileSystem.listLocatedStatus(new Path(testMeta.applicationPath + "/bucket_data"));
    Assert.assertTrue(iterator.hasNext());

    testMeta.managedStateContext.getBucketsFileSystem().deleteTimeBucketsLessThanEqualTo(200);

    iterator = fileSystem.listLocatedStatus(new Path(testMeta.applicationPath + "/bucket_data"));
    if (iterator.hasNext()) {
      Assert.fail("All buckets should be deleted");
    }
  }
Example #11
  /**
   * Looks for the dirs to clean. The folder structure is YYYY/MM/DD/Serial so we can use that to
   * more efficiently find the directories to clean by comparing the cutoff timestamp with the
   * timestamp from the folder structure.
   *
   * @param fc done dir FileContext
   * @param root folder for completed jobs
   * @param cutoff The cutoff for the max history age
   * @return The list of directories for cleaning
   * @throws IOException
   */
  public static List<FileStatus> getHistoryDirsForCleaning(FileContext fc, Path root, long cutoff)
      throws IOException {
    List<FileStatus> fsList = new ArrayList<FileStatus>();
    Calendar cCal = Calendar.getInstance();
    cCal.setTimeInMillis(cutoff);
    int cYear = cCal.get(Calendar.YEAR);
    int cMonth = cCal.get(Calendar.MONTH) + 1;
    int cDate = cCal.get(Calendar.DATE);

    RemoteIterator<FileStatus> yearDirIt = fc.listStatus(root);
    while (yearDirIt.hasNext()) {
      FileStatus yearDir = yearDirIt.next();
      try {
        int year = Integer.parseInt(yearDir.getPath().getName());
        if (year <= cYear) {
          RemoteIterator<FileStatus> monthDirIt = fc.listStatus(yearDir.getPath());
          while (monthDirIt.hasNext()) {
            FileStatus monthDir = monthDirIt.next();
            try {
              int month = Integer.parseInt(monthDir.getPath().getName());
              // If we only checked the month here, then something like 07/2013
              // would incorrectly not pass when the cutoff is 06/2014
              if (year < cYear || month <= cMonth) {
                RemoteIterator<FileStatus> dateDirIt = fc.listStatus(monthDir.getPath());
                while (dateDirIt.hasNext()) {
                  FileStatus dateDir = dateDirIt.next();
                  try {
                    int date = Integer.parseInt(dateDir.getPath().getName());
                    // If we only checked the date here, then something like
                    // 07/21/2013 would incorrectly not pass when the cutoff is
                    // 08/20/2013 or 07/20/2012
                    if (year < cYear || month < cMonth || date <= cDate) {
                      fsList.addAll(remoteIterToList(fc.listStatus(dateDir.getPath())));
                    }
                  } catch (NumberFormatException nfe) {
                    // the directory didn't fit the format we're looking for so
                    // skip the dir
                  }
                }
              }
            } catch (NumberFormatException nfe) {
              // the directory didn't fit the format we're looking for so skip
              // the dir
            }
          }
        }
      } catch (NumberFormatException nfe) {
        // the directory didn't fit the format we're looking for so skip the dir
      }
    }
    return fsList;
  }
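A minimal calling sketch; the done-dir path and 30-day retention below are hypothetical values:
  // Hypothetical: list history directories that fall entirely before a ~30-day cutoff.
  FileContext fc = FileContext.getFileContext(new Configuration());
  Path doneDirRoot = new Path("/mr-history/done");
  long cutoff = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(30);
  List<FileStatus> dirsToClean = getHistoryDirsForCleaning(fc, doneDirRoot, cutoff);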
Example #12
    /**
     * Processes the input file/folder argument. If the input is a file, then it is directly
     * considered for further processing by TraceBuilder. If the input is a folder, then all the
     * history logs in the input folder are considered for further processing.
     *
     * <p>If isRecursive is true, then the input path is recursively scanned for job history logs
     * for further processing by TraceBuilder.
     *
     * <p>NOTE: If the input represents a globbed path, then it is first flattened and then the
     * individual paths represented by the globbed input path are considered for further processing.
     *
     * @param input input path, possibly globbed
     * @param conf configuration
     * @param isRecursive whether to recursively traverse the input paths to find history logs
     * @return the input history log files' paths
     * @throws FileNotFoundException
     * @throws IOException
     */
    static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
        throws FileNotFoundException, IOException {
      Path inPath = new Path(input);
      FileSystem fs = inPath.getFileSystem(conf);
      FileStatus[] inStatuses = fs.globStatus(inPath);

      List<Path> inputPaths = new LinkedList<Path>();
      if (inStatuses == null || inStatuses.length == 0) {
        return inputPaths;
      }

      for (FileStatus inStatus : inStatuses) {
        Path thisPath = inStatus.getPath();
        if (inStatus.isDirectory()) {

          // Find the list of files in this path (recursively if the -recursive
          // option is specified).
          List<FileStatus> historyLogs = new ArrayList<FileStatus>();

          RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
          while (iter.hasNext()) {
            LocatedFileStatus child = iter.next();
            String fileName = child.getPath().getName();

            if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
              historyLogs.add(child);
            }
          }

          if (historyLogs.size() > 0) {
            // Add the sorted history log file names in this path to the
            // inputPaths list
            FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
            Arrays.sort(sortableNames, new HistoryLogsComparator());

            for (FileStatus historyLog : sortableNames) {
              inputPaths.add(historyLog.getPath());
            }
          }
        } else {
          inputPaths.add(thisPath);
        }
      }

      return inputPaths;
    }
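A hedged usage sketch; the glob pattern and recursion flag are illustrative only:
    // Hypothetical: gather all job history logs under a globbed path,
    // recursing into each matched directory.
    List<Path> historyFiles =
        processInputArgument("/logs/history/*", new Configuration(), true);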
Example #13
 private LocatedFileStatus getLocatedFileStatus(RemoteIterator<LocatedFileStatus> iterator)
     throws IOException {
   try (TimeStat.BlockTimer timer = namenodeStats.getRemoteIteratorNext().time()) {
     return iterator.next();
   } catch (IOException | RuntimeException e) {
     namenodeStats.getRemoteIteratorNext().recordException(e);
     throw e;
   }
 }
Example #14
 /**
  * Gets the cache directive matching the given ID. Returns null if the filesystem is not an HDFS
  * DistributedFileSystem; throws if no matching directive is found.
  */
 private static CacheDirectiveEntry getDirective(long directiveId) throws ImpalaRuntimeException {
   LOG.trace("Getting cache directive id: " + directiveId);
   if (!(dfs instanceof DistributedFileSystem)) {
     LOG.trace(
         "Filesystem instance is not the distibuted fs - for directive id \""
             + directiveId
             + "\".");
     return null;
   }
   CacheDirectiveInfo filter = new CacheDirectiveInfo.Builder().setId(directiveId).build();
   try {
     RemoteIterator<CacheDirectiveEntry> itr =
         ((DistributedFileSystem) dfs).listCacheDirectives(filter);
     if (itr.hasNext()) return itr.next();
   } catch (IOException e) {
      // Handle connection issues with HDFS as well as possible not-found errors
     throw new ImpalaRuntimeException(e.getMessage(), e);
   }
    throw new ImpalaRuntimeException(
        "HDFS cache directive filter returned an empty result. This should never happen.");
 }
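A short calling sketch; the directive ID is hypothetical, and note that the method throws rather than returning null when HDFS reports no matching directive:
  // Hypothetical: look up cache directive 42 and read back its info.
  CacheDirectiveEntry entry = getDirective(42L);
  CacheDirectiveInfo info = entry.getInfo();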
Example #15
    public void run() {
      LOG.trace("Reloading cache pool names from HDFS");
      // Map of cache pool name to CachePoolInfo. Stored in a map to allow Set operations
      // to be performed on the keys.
      Map<String, CachePoolInfo> currentCachePools = Maps.newHashMap();
      try {
        DistributedFileSystem dfs = FileSystemUtil.getDistributedFileSystem();
        RemoteIterator<CachePoolEntry> itr = dfs.listCachePools();
        while (itr.hasNext()) {
          CachePoolInfo cachePoolInfo = itr.next().getInfo();
          currentCachePools.put(cachePoolInfo.getPoolName(), cachePoolInfo);
        }
      } catch (Exception e) {
        LOG.error("Error loading cache pools: ", e);
        return;
      }

      catalogLock_.writeLock().lock();
      try {
        // Determine what has changed relative to what we have cached.
        Set<String> droppedCachePoolNames =
            Sets.difference(hdfsCachePools_.keySet(), currentCachePools.keySet());
        Set<String> createdCachePoolNames =
            Sets.difference(currentCachePools.keySet(), hdfsCachePools_.keySet());
        // Add all new cache pools.
        for (String createdCachePool : createdCachePoolNames) {
          HdfsCachePool cachePool = new HdfsCachePool(currentCachePools.get(createdCachePool));
          cachePool.setCatalogVersion(CatalogServiceCatalog.this.incrementAndGetCatalogVersion());
          hdfsCachePools_.add(cachePool);
        }
        // Remove dropped cache pools.
        for (String cachePoolName : droppedCachePoolNames) {
          hdfsCachePools_.remove(cachePoolName);
          CatalogServiceCatalog.this.incrementAndGetCatalogVersion();
        }
      } finally {
        catalogLock_.writeLock().unlock();
      }
    }
Example #16
  private List<FileStatus> singleThreadedListStatus(
      JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
      Path p = dirs[i];
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDirectory()) {
            RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
            while (iter.hasNext()) {
              LocatedFileStatus stat = iter.next();
              if (inputFilter.accept(stat.getPath())) {
                if (recursive && stat.isDirectory()) {
                  addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                  result.add(stat);
                }
              }
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    return result;
  }
Example #17
 private List<byte[]> readResults(Path outputPath, Configuration config, FileSystem fs)
     throws IOException {
   List<byte[]> ret = new ArrayList<>();
   for (RemoteIterator<LocatedFileStatus> it = fs.listFiles(outputPath, false); it.hasNext(); ) {
     Path p = it.next().getPath();
     if (p.getName().equals("_SUCCESS")) {
       fs.delete(p, false);
       continue;
     }
      // Use try-with-resources so the reader is closed even if reading fails.
      try (SequenceFile.Reader reader =
          new SequenceFile.Reader(config, SequenceFile.Reader.file(p))) {
        LongWritable key = new LongWritable();
        BytesWritable value = new BytesWritable();
        while (reader.next(key, value)) {
          ret.add(value.copyBytes());
        }
      }
      fs.delete(p, false);
   }
   fs.delete(outputPath, false);
   if (LOG.isDebugEnabled()) {
     LOG.debug(outputPath + ": Returning " + ret.size());
   }
   return ret;
 }
Example #18
  @Override
  protected LocatedFileStatus computeNext() {
    try {
      if (remoteIterator == null) {
        remoteIterator = getLocatedFileStatusRemoteIterator(path);
      }

      while (remoteIterator.hasNext()) {
        LocatedFileStatus status = getLocatedFileStatus(remoteIterator);

        // ignore hidden files. Hive ignores files starting with _ and . as well.
        String fileName = status.getPath().getName();
        if (fileName.startsWith("_") || fileName.startsWith(".")) {
          continue;
        }
        return status;
      }
      return endOfData();
    } catch (FileNotFoundException e) {
      throw new PrestoException(HIVE_FILE_NOT_FOUND, "Partition location does not exist: " + path);
    } catch (IOException e) {
      throw new PrestoException(HIVE_FILESYSTEM_ERROR, e);
    }
  }
Example #19
  @Override
  protected void render(Block html) {
    ContainerId containerId = verifyAndGetContainerId(html);
    NodeId nodeId = verifyAndGetNodeId(html);
    String appOwner = verifyAndGetAppOwner(html);
    LogLimits logLimits = verifyAndGetLogLimits(html);
    if (containerId == null || nodeId == null || appOwner == null
        || appOwner.isEmpty() || logLimits == null) {
      return;
    }

    ApplicationId applicationId = containerId.getApplicationAttemptId()
        .getApplicationId();
    String logEntity = $(ENTITY_STRING);
    if (logEntity == null || logEntity.isEmpty()) {
      logEntity = containerId.toString();
    }

    if (!conf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED,
        YarnConfiguration.DEFAULT_LOG_AGGREGATION_ENABLED)) {
      html.h1()
          ._("Aggregation is not enabled. Try the nodemanager at " + nodeId)
          ._();
      return;
    }

    Path remoteRootLogDir = new Path(conf.get(
        YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
        YarnConfiguration.DEFAULT_NM_REMOTE_APP_LOG_DIR));
    Path remoteAppDir = LogAggregationUtils.getRemoteAppLogDir(
        remoteRootLogDir, applicationId, appOwner,
        LogAggregationUtils.getRemoteNodeLogDirSuffix(conf));
    RemoteIterator<FileStatus> nodeFiles;
    try {
      Path qualifiedLogDir =
          FileContext.getFileContext(conf).makeQualified(
            remoteAppDir);
      nodeFiles =
          FileContext.getFileContext(qualifiedLogDir.toUri(), conf)
            .listStatus(remoteAppDir);
    } catch (FileNotFoundException fnf) {
      html.h1()
          ._("Logs not available for " + logEntity
              + ". Aggregation may not be complete. "
              + "Check back later or try the nodemanager at " + nodeId)._();
      return;
    } catch (Exception ex) {
      html.h1()
          ._("Error getting logs at " + nodeId)._();
      return;
    }

    boolean foundLog = false;
    String desiredLogType = $(CONTAINER_LOG_TYPE);
    try {
      while (nodeFiles.hasNext()) {
        AggregatedLogFormat.LogReader reader = null;
        try {
          FileStatus thisNodeFile = nodeFiles.next();
          if (!thisNodeFile.getPath().getName()
            .contains(LogAggregationUtils.getNodeString(nodeId))
              || thisNodeFile.getPath().getName()
                .endsWith(LogAggregationUtils.TMP_FILE_SUFFIX)) {
            continue;
          }
          long logUploadedTime = thisNodeFile.getModificationTime();
          reader =
              new AggregatedLogFormat.LogReader(conf, thisNodeFile.getPath());

          String owner = null;
          Map<ApplicationAccessType, String> appAcls = null;
          try {
            owner = reader.getApplicationOwner();
            appAcls = reader.getApplicationAcls();
          } catch (IOException e) {
            LOG.error("Error getting logs for " + logEntity, e);
            continue;
          }
          ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf);
          aclsManager.addApplication(applicationId, appAcls);

          String remoteUser = request().getRemoteUser();
          UserGroupInformation callerUGI = null;
          if (remoteUser != null) {
            callerUGI = UserGroupInformation.createRemoteUser(remoteUser);
          }
          if (callerUGI != null && !aclsManager.checkAccess(callerUGI,
              ApplicationAccessType.VIEW_APP, owner, applicationId)) {
            html.h1()
                ._("User [" + remoteUser
                    + "] is not authorized to view the logs for " + logEntity
                    + " in log file [" + thisNodeFile.getPath().getName() + "]")._();
            LOG.error("User [" + remoteUser
              + "] is not authorized to view the logs for " + logEntity);
            continue;
          }

          AggregatedLogFormat.ContainerLogsReader logReader = reader
            .getContainerLogsReader(containerId);
          if (logReader == null) {
            continue;
          }

          foundLog = readContainerLogs(html, logReader, logLimits,
              desiredLogType, logUploadedTime);
        } catch (IOException ex) {
          LOG.error("Error getting logs for " + logEntity, ex);
          continue;
        } finally {
          if (reader != null)
            reader.close();
        }
      }
      if (!foundLog) {
        if (desiredLogType.isEmpty()) {
          html.h1("No logs available for container " + containerId.toString());
        } else {
          html.h1("Unable to locate '" + desiredLogType
              + "' log for container " + containerId.toString());
        }
      }
    } catch (IOException e) {
      html.h1()._("Error getting logs for " + logEntity)._();
      LOG.error("Error getting logs for " + logEntity, e);
    }
  }
Example #20
  public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
      jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
      throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
      jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
        JobHelper.distributedClassPath(
            getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
        job);

    Throwable throwable = null;
    try {
      job.submit();
      log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
      final boolean success = job.waitForCompletion(true);
      if (!success) {
        final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
        if (reports != null) {
          for (final TaskReport report : reports) {
            log.error(
                "Error in task [%s] : %s",
                report.getTaskId(), Arrays.toString(report.getDiagnostics()));
          }
        }
        return null;
      }
      try {
        loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
        writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
      } catch (IOException ex) {
        log.error(ex, "Could not fetch counters");
      }
      final JobID jobID = job.getJobID();

      final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
      final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
      final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
      final List<Path> goodPaths = new ArrayList<>();
      while (it.hasNext()) {
        final LocatedFileStatus locatedFileStatus = it.next();
        if (locatedFileStatus.isFile()) {
          final Path myPath = locatedFileStatus.getPath();
          if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
            goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
          }
        }
      }
      if (goodPaths.isEmpty()) {
        log.warn("No good data found at [%s]", jobDir);
        return null;
      }
      final List<DataSegment> returnList =
          ImmutableList.copyOf(
              Lists.transform(
                  goodPaths,
                  new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                      try {
                        if (!fs.exists(input)) {
                          throw new ISE(
                              "Somehow [%s] was found but [%s] is missing at [%s]",
                              ConvertingOutputFormat.DATA_SUCCESS_KEY,
                              ConvertingOutputFormat.DATA_FILE_KEY,
                              jobDir);
                        }
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                      try (final InputStream stream = fs.open(input)) {
                        return HadoopDruidConverterConfig.jsonMapper.readValue(
                            stream, DataSegment.class);
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                    }
                  }));
      if (returnList.size() == segments.size()) {
        return returnList;
      } else {
        throw new ISE(
            "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
            segments.size(), returnList.size(), jobDir);
      }
    } catch (InterruptedException | ClassNotFoundException e) {
      RuntimeException exception = Throwables.propagate(e);
      throwable = exception;
      throw exception;
    } catch (Throwable t) {
      throwable = t;
      throw t;
    } finally {
      try {
        cleanup(job);
      } catch (IOException e) {
        if (throwable != null) {
          throwable.addSuppressed(e);
        } else {
          log.error(e, "Could not clean up job [%s]", job.getJobID());
        }
      }
    }
  }
Example #21
  /**
   * This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to produce rolling
   * files. The bucketer's clock is set to {@link ModifyableClock} so that time advances in
   * lockstep with the processing of elements, coordinated via latches.
   */
  @Test
  public void testDateTimeRollingStringWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/rolling-out";
    DateTimeBucketer.setClock(new ModifyableClock());
    ModifyableClock.setCurrentTime(0);

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    DataStream<Tuple2<Integer, String>> source =
        env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();

    // the parallel flatMap is chained to the sink, so when it has seen 5 elements it can
    // fire the latch
    DataStream<String> mapped =
        source.flatMap(
            new RichFlatMapFunction<Tuple2<Integer, String>, String>() {
              private static final long serialVersionUID = 1L;

              int count = 0;

              @Override
              public void flatMap(Tuple2<Integer, String> value, Collector<String> out)
                  throws Exception {
                out.collect(value.f1);
                count++;
                if (count >= 5) {
                  if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                    latch1.trigger();
                  } else {
                    latch2.trigger();
                  }
                  count = 0;
                }
              }
            });

    RollingSink<String> sink =
        new RollingSink<String>(outPath)
            .setBucketer(new DateTimeBucketer("ss"))
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");

    mapped.addSink(sink);

    env.execute("RollingSink String Write Test");

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    // we should have 8 rolling files: 4 time intervals times a parallelism of 2
    int numFiles = 0;
    while (files.hasNext()) {
      LocatedFileStatus file = files.next();
      numFiles++;
      if (file.getPath().toString().contains("rolling-out/00")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 0; i < 5; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else if (file.getPath().toString().contains("rolling-out/05")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 5; i < 10; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else if (file.getPath().toString().contains("rolling-out/10")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 10; i < 15; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else if (file.getPath().toString().contains("rolling-out/15")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 15; i < 20; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else {
        Assert.fail("File " + file + " does not match any expected roll pattern.");
      }
    }

    Assert.assertEquals(8, numFiles);
  }