@Override
        public InputFormat get() {
          final Map<String, LocatedFileStatus> locationMap = Maps.newHashMap();
          for (LocatedFileStatus status : locations) {
            locationMap.put(status.getPath().getName(), status);
          }

          return new TextInputFormat() {
            @Override
            protected boolean isSplitable(FileSystem fs, Path file) {
              return false;
            }

            @Override
            protected FileStatus[] listStatus(JobConf job) throws IOException {
              Path[] dirs = getInputPaths(job);
              if (dirs.length == 0) {
                throw new IOException("No input paths specified in job");
              }
              FileStatus[] status = new FileStatus[dirs.length];
              for (int i = 0; i < dirs.length; i++) {
                status[i] = locationMap.get(dirs[i].getName());
              }
              return status;
            }
          };
        }
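A hypothetical usage sketch (not part of the original code): with the old mapred API, the JobConf's input paths only need to resolve by their final name component against the captured locationMap, and because isSplitable() returns false, getSplits() yields one split per file. The names jobConf and inputFormatSupplier below are assumptions.

// Hypothetical caller; assumes the org.apache.hadoop.mapred classes (JobConf, FileInputFormat, InputFormat, InputSplit).
JobConf jobConf = new JobConf();
List<Path> paths = new ArrayList<>();
for (LocatedFileStatus status : locations) {
  paths.add(status.getPath());
}
FileInputFormat.setInputPaths(jobConf, paths.toArray(new Path[0]));

InputFormat inputFormat = inputFormatSupplier.get();
// One split per listed file, since isSplitable() above returns false.
InputSplit[] splits = inputFormat.getSplits(jobConf, paths.size());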
  private void doWalk(
      Path path, FileStatusCallback callback, AtomicLong taskCount, SettableFuture<Void> future) {
    try (SetThreadName ignored = new SetThreadName("HiveHdfsWalker")) {
      RemoteIterator<LocatedFileStatus> iterator = getLocatedFileStatusRemoteIterator(path);

      while (iterator.hasNext()) {
        LocatedFileStatus status = getLocatedFileStatus(iterator);

        // ignore hidden files. Hive ignores files starting with _ and . as well.
        String fileName = status.getPath().getName();
        if (fileName.startsWith("_") || fileName.startsWith(".")) {
          continue;
        }
        if (isDirectory(status)) {
          recursiveWalk(status.getPath(), callback, taskCount, future);
        } else {
          callback.process(status, status.getBlockLocations());
        }
        if (future.isDone()) {
          return;
        }
      }
    } catch (FileNotFoundException e) {
      future.setException(new FileNotFoundException("Partition location does not exist: " + path));
    } catch (Throwable t) {
      future.setException(t);
    } finally {
      if (taskCount.decrementAndGet() == 0) {
        future.set(null);
      }
    }
  }
  private void verifyJobOutput() throws IOException {

    final String _SUCCESS = "_SUCCESS";
    final String REDUCER_OUTPUT = "part-r-";
    boolean wasSuccessful = false;
    boolean reducerOutputExists = false;
    FileSystem fs = FileSystem.getLocal(new Configuration());
    RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(OUTPUT_PATH), false);
    LocatedFileStatus fileStatus = null;
    String fileName = null;

    while (iterator.hasNext()) {
      fileStatus = iterator.next();
      fileName = fileStatus.getPath().getName();

      if (fileName.contains(_SUCCESS)) {
        wasSuccessful = true;
      }
      if (fileName.contains(REDUCER_OUTPUT)) {
        reducerOutputExists = true;
      }
    }

    // verify presence of the _SUCCESS file
    Assert.assertTrue(wasSuccessful);

    // verify presence of the Reducer output
    Assert.assertTrue(reducerOutputExists);
  }
 /**
  * Add files in the input path recursively into the results.
  *
  * @param result The List to store all files.
  * @param fs The FileSystem.
  * @param path The input path.
  * @param inputFilter The input filter that can be used to filter files/dirs.
  * @throws IOException if listing the path or one of its subdirectories fails.
  */
 protected void addInputPathRecursively(
     List<FileStatus> result, FileSystem fs, Path path, PathFilter inputFilter)
     throws IOException {
   RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path);
   while (iter.hasNext()) {
     LocatedFileStatus stat = iter.next();
     if (inputFilter.accept(stat.getPath())) {
       if (stat.isDirectory()) {
         addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
       } else {
         result.add(stat);
       }
     }
   }
 }
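A minimal caller sketch for the helper above; the root path, the filter, and the surrounding context are illustrative assumptions. It pairs the recursion with the usual hidden-file PathFilter that skips names starting with "_" or ".".

// Hypothetical caller: collect every visible file under an assumed input directory.
Path root = new Path("/data/input");
FileSystem fs = root.getFileSystem(new Configuration());
PathFilter hiddenFileFilter = p -> {
  String name = p.getName();
  return !name.startsWith("_") && !name.startsWith(".");
};

List<FileStatus> result = new ArrayList<>();
addInputPathRecursively(result, fs, root, hiddenFileFilter);
for (FileStatus status : result) {
  System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
}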
Example #5
    /**
     * Processes the input file/folder argument. If the input is a file, then it is directly
     * considered for further processing by TraceBuilder. If the input is a folder, then all the
     * history logs in the input folder are considered for further processing.
     *
     * <p>If isRecursive is true, then the input path is recursively scanned for job history logs
     * for further processing by TraceBuilder.
     *
     * <p>NOTE: If the input represents a globbed path, then it is first flattened and then the
     * individual paths represented by the globbed input path are considered for further processing.
     *
     * @param input input path, possibly globbed
     * @param conf configuration
     * @param isRecursive whether to recursively traverse the input paths to find history logs
     * @return the input history log files' paths
     * @throws FileNotFoundException
     * @throws IOException
     */
    static List<Path> processInputArgument(String input, Configuration conf, boolean isRecursive)
        throws FileNotFoundException, IOException {
      Path inPath = new Path(input);
      FileSystem fs = inPath.getFileSystem(conf);
      FileStatus[] inStatuses = fs.globStatus(inPath);

      List<Path> inputPaths = new LinkedList<Path>();
      if (inStatuses == null || inStatuses.length == 0) {
        return inputPaths;
      }

      for (FileStatus inStatus : inStatuses) {
        Path thisPath = inStatus.getPath();
        if (inStatus.isDirectory()) {

          // Find the list of files in this path (recursively, if the -recursive
          // option is specified).
          List<FileStatus> historyLogs = new ArrayList<FileStatus>();

          RemoteIterator<LocatedFileStatus> iter = fs.listFiles(thisPath, isRecursive);
          while (iter.hasNext()) {
            LocatedFileStatus child = iter.next();
            String fileName = child.getPath().getName();

            if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
              historyLogs.add(child);
            }
          }

          if (historyLogs.size() > 0) {
            // Add the sorted history log file names in this path to the
            // inputPaths list
            FileStatus[] sortableNames = historyLogs.toArray(new FileStatus[historyLogs.size()]);
            Arrays.sort(sortableNames, new HistoryLogsComparator());

            for (FileStatus historyLog : sortableNames) {
              inputPaths.add(historyLog.getPath());
            }
          }
        } else {
          inputPaths.add(thisPath);
        }
      }

      return inputPaths;
    }
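A short driver sketch for processInputArgument; the glob and the recursion flag are illustrative assumptions (the method is package-visible, so a real caller would live alongside TraceBuilder).

// Hypothetical driver: expand a globbed history directory and list the log files
// that TraceBuilder would go on to process.
Configuration conf = new Configuration();
// The glob is flattened first; each matched directory is then scanned
// (recursively here, because the flag is true) for history logs.
List<Path> historyLogs = processInputArgument("/history/done/2016-*", conf, true);
for (Path log : historyLogs) {
  System.out.println(log);
}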
  private List<FileStatus> singleThreadedListStatus(
      JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
      Path p = dirs[i];
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDirectory()) {
            RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
            while (iter.hasNext()) {
              LocatedFileStatus stat = iter.next();
              if (inputFilter.accept(stat.getPath())) {
                if (recursive && stat.isDirectory()) {
                  addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                  result.add(stat);
                }
              }
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    return result;
  }
Example #7
  @Override
  protected LocatedFileStatus computeNext() {
    try {
      if (remoteIterator == null) {
        remoteIterator = getLocatedFileStatusRemoteIterator(path);
      }

      while (remoteIterator.hasNext()) {
        LocatedFileStatus status = getLocatedFileStatus(remoteIterator);

        // ignore hidden files. Hive ignores files starting with _ and . as well.
        String fileName = status.getPath().getName();
        if (fileName.startsWith("_") || fileName.startsWith(".")) {
          continue;
        }
        return status;
      }
      return endOfData();
    } catch (FileNotFoundException e) {
      throw new PrestoException(HIVE_FILE_NOT_FOUND, "Partition location does not exist: " + path);
    } catch (IOException e) {
      throw new PrestoException(HIVE_FILESYSTEM_ERROR, e);
    }
  }
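For context, a minimal skeleton of the kind of Guava AbstractIterator this computeNext() belongs to; the class name, the direct listLocatedStatus() call, and the plain RuntimeExceptions are assumptions standing in for the Presto-specific helpers and exception types.

import java.io.FileNotFoundException;
import java.io.IOException;
import com.google.common.collect.AbstractIterator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

// Hypothetical skeleton: lazily lists a directory and skips hidden files,
// in the same style as the computeNext() above.
class HiddenFileFilteringIterator extends AbstractIterator<LocatedFileStatus> {
  private final FileSystem fileSystem;
  private final Path path;
  private RemoteIterator<LocatedFileStatus> remoteIterator;

  HiddenFileFilteringIterator(FileSystem fileSystem, Path path) {
    this.fileSystem = fileSystem;
    this.path = path;
  }

  @Override
  protected LocatedFileStatus computeNext() {
    try {
      if (remoteIterator == null) {
        remoteIterator = fileSystem.listLocatedStatus(path);
      }
      while (remoteIterator.hasNext()) {
        LocatedFileStatus status = remoteIterator.next();
        String fileName = status.getPath().getName();
        // Skip files Hive treats as hidden (leading "_" or ".").
        if (fileName.startsWith("_") || fileName.startsWith(".")) {
          continue;
        }
        return status;
      }
      return endOfData();
    } catch (FileNotFoundException e) {
      throw new RuntimeException("Partition location does not exist: " + path, e);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
}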
  /**
   * This uses {@link org.apache.flink.streaming.connectors.fs.DateTimeBucketer} to produce rolling
   * files. The clock of DateTimeBucketer is set to {@link ModifyableClock} to keep the time in
   * lockstep with the processing of elements using latches.
   */
  @Test
  public void testDateTimeRollingStringWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/rolling-out";
    DateTimeBucketer.setClock(new ModifyableClock());
    ModifyableClock.setCurrentTime(0);

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    DataStream<Tuple2<Integer, String>> source =
        env.addSource(new WaitingTestSourceFunction(NUM_ELEMENTS)).broadcast();

    // the parallel flatMap is chained to the sink, so when it has seen 5 elements it can
    // fire the latch
    DataStream<String> mapped =
        source.flatMap(
            new RichFlatMapFunction<Tuple2<Integer, String>, String>() {
              private static final long serialVersionUID = 1L;

              int count = 0;

              @Override
              public void flatMap(Tuple2<Integer, String> value, Collector<String> out)
                  throws Exception {
                out.collect(value.f1);
                count++;
                if (count >= 5) {
                  if (getRuntimeContext().getIndexOfThisSubtask() == 0) {
                    latch1.trigger();
                  } else {
                    latch2.trigger();
                  }
                  count = 0;
                }
              }
            });

    RollingSink<String> sink =
        new RollingSink<String>(outPath)
            .setBucketer(new DateTimeBucketer("ss"))
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");

    mapped.addSink(sink);

    env.execute("RollingSink String Write Test");

    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

    // we should have 8 rolling files: 4 time intervals times a parallelism of 2
    int numFiles = 0;
    while (files.hasNext()) {
      LocatedFileStatus file = files.next();
      numFiles++;
      if (file.getPath().toString().contains("rolling-out/00")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 0; i < 5; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else if (file.getPath().toString().contains("rolling-out/05")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 5; i < 10; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else if (file.getPath().toString().contains("rolling-out/10")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 10; i < 15; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else if (file.getPath().toString().contains("rolling-out/15")) {
        FSDataInputStream inStream = dfs.open(file.getPath());

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 15; i < 20; i++) {
          String line = br.readLine();
          Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
      } else {
        Assert.fail("File " + file + " does not match any expected roll pattern.");
      }
    }

    Assert.assertEquals(8, numFiles);
  }
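For reference, a sketch of the settable clock the test wires into DateTimeBucketer, matching the lockstep behaviour described in the Javadoc; its exact shape is an assumption, modelled on the flink-connector-filesystem Clock interface (a single currentTimeMillis() method).

// Assumed shape of the test's settable clock: DateTimeBucketer reads the current time
// from it, so the test can advance time explicitly between latches and force the sink
// into a new "ss" bucket for every batch of five elements.
public static class ModifyableClock implements Clock {

  private static volatile long currentTime = 0;

  public static void setCurrentTime(long time) {
    currentTime = time;
  }

  @Override
  public long currentTimeMillis() {
    return currentTime;
  }
}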
  public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
      jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
      throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
      jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
        JobHelper.distributedClassPath(
            getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
        job);

    Throwable throwable = null;
    try {
      job.submit();
      log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
      final boolean success = job.waitForCompletion(true);
      if (!success) {
        final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
        if (reports != null) {
          for (final TaskReport report : reports) {
            log.error(
                "Error in task [%s] : %s",
                report.getTaskId(), Arrays.toString(report.getDiagnostics()));
          }
        }
        return null;
      }
      try {
        loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
        writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
      } catch (IOException ex) {
        log.error(ex, "Could not fetch counters");
      }
      final JobID jobID = job.getJobID();

      final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
      final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
      final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
      final List<Path> goodPaths = new ArrayList<>();
      while (it.hasNext()) {
        final LocatedFileStatus locatedFileStatus = it.next();
        if (locatedFileStatus.isFile()) {
          final Path myPath = locatedFileStatus.getPath();
          if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
            goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
          }
        }
      }
      if (goodPaths.isEmpty()) {
        log.warn("No good data found at [%s]", jobDir);
        return null;
      }
      final List<DataSegment> returnList =
          ImmutableList.copyOf(
              Lists.transform(
                  goodPaths,
                  new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                      try {
                        if (!fs.exists(input)) {
                          throw new ISE(
                              "Somehow [%s] was found but [%s] is missing at [%s]",
                              ConvertingOutputFormat.DATA_SUCCESS_KEY,
                              ConvertingOutputFormat.DATA_FILE_KEY,
                              jobDir);
                        }
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                      try (final InputStream stream = fs.open(input)) {
                        return HadoopDruidConverterConfig.jsonMapper.readValue(
                            stream, DataSegment.class);
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                    }
                  }));
      if (returnList.size() == segments.size()) {
        return returnList;
      } else {
        throw new ISE(
            "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
            segments.size(), returnList.size(), jobDir);
      }
    } catch (InterruptedException | ClassNotFoundException e) {
      RuntimeException exception = Throwables.propagate(e);
      throwable = exception;
      throw exception;
    } catch (Throwable t) {
      throwable = t;
      throw t;
    } finally {
      try {
        cleanup(job);
      } catch (IOException e) {
        if (throwable != null) {
          throwable.addSuppressed(e);
        } else {
          log.error(e, "Could not clean up job [%s]", job.getJobID());
        }
      }
    }
  }