Пример #1
0
  /**
   * Get latest water mark from previous work unit states.
   *
   * @param state Source state
   * @return latest water mark (high water mark)
   */
  private long getLatestWatermarkFromMetadata(SourceState state) {
    LOG.debug("Get latest watermark from the previous run");
    long latestWaterMark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

    List<WorkUnitState> previousWorkUnitStates =
        Lists.newArrayList(state.getPreviousWorkUnitStates());
    List<Long> previousWorkUnitStateHighWatermarks = Lists.newArrayList();
    List<Long> previousWorkUnitLowWatermarks = Lists.newArrayList();

    if (previousWorkUnitStates.isEmpty()) {
      LOG.info(
          "No previous work unit states found; Latest watermark - Default watermark: "
              + latestWaterMark);
      return latestWaterMark;
    }

    boolean hasFailedRun = false;
    boolean isCommitOnFullSuccess = false;
    boolean isDataProcessedInPreviousRun = false;

    JobCommitPolicy commitPolicy =
        JobCommitPolicy.forName(
            state.getProp(
                ConfigurationKeys.JOB_COMMIT_POLICY_KEY,
                ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY));
    if (commitPolicy == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS) {
      isCommitOnFullSuccess = true;
    }

    for (WorkUnitState workUnitState : previousWorkUnitStates) {
      long processedRecordCount = 0;
      LOG.info(
          "State of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getWorkingState());
      if (workUnitState.getWorkingState() == WorkingState.FAILED
          || workUnitState.getWorkingState() == WorkingState.CANCELLED
          || workUnitState.getWorkingState() == WorkingState.RUNNING
          || workUnitState.getWorkingState() == WorkingState.PENDING) {
        hasFailedRun = true;
      } else {
        processedRecordCount =
            workUnitState.getPropAsLong(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED);
        if (processedRecordCount != 0) {
          isDataProcessedInPreviousRun = true;
        }
      }

      LOG.info(
          "Low watermark of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getWorkunit().getLowWaterMark());
      LOG.info(
          "High watermark of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getHighWaterMark());
      LOG.info("Record count of the previous task: " + processedRecordCount + "\n");

      // Consider high water mark of the previous work unit, if it is
      // extracted any data
      if (processedRecordCount != 0) {
        previousWorkUnitStateHighWatermarks.add(workUnitState.getHighWaterMark());
      }

      previousWorkUnitLowWatermarks.add(this.getLowWatermarkFromWorkUnit(workUnitState));
    }

    // If commit policy is full and it has failed run, get latest water mark
    // as
    // minimum of low water marks from previous states.
    if (isCommitOnFullSuccess && hasFailedRun) {
      long previousLowWatermark = Collections.min(previousWorkUnitLowWatermarks);

      WorkUnitState previousState = previousWorkUnitStates.get(0);
      ExtractType extractType =
          ExtractType.valueOf(
              previousState
                  .getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE)
                  .toUpperCase());

      // add backup seconds only for snapshot extracts but not for appends
      if (extractType == ExtractType.SNAPSHOT) {
        int backupSecs =
            previousState.getPropAsInt(
                ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0);
        String watermarkType =
            previousState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
        latestWaterMark = this.addBackedUpSeconds(previousLowWatermark, backupSecs, watermarkType);
      } else {
        latestWaterMark = previousLowWatermark;
      }

      LOG.info(
          "Previous job was COMMIT_ON_FULL_SUCCESS but it was failed; Latest watermark - "
              + "Min watermark from WorkUnits: "
              + latestWaterMark);
    }

    // If commit policy is full and there are no failed tasks or commit
    // policy is partial,
    // get latest water mark as maximum of high water marks from previous
    // tasks.
    else {
      if (isDataProcessedInPreviousRun) {
        latestWaterMark = Collections.max(previousWorkUnitStateHighWatermarks);
        LOG.info(
            "Previous run was successful. Latest watermark - Max watermark from WorkUnitStates: "
                + latestWaterMark);
      } else {
        latestWaterMark = Collections.min(previousWorkUnitLowWatermarks);
        LOG.info(
            "Previous run was successful but no data found. Latest watermark - Min watermark from WorkUnitStates: "
                + latestWaterMark);
      }
    }

    return latestWaterMark;
  }
Пример #2
0
  protected void publishData(
      WorkUnitState workUnitState, int branchId, Set<Path> writerOutputPathsMoved)
      throws IOException {
    // Get a ParallelRunner instance for moving files in parallel
    ParallelRunner parallelRunner = this.getParallelRunner(this.fileSystemByBranches.get(branchId));

    // The directory where the workUnitState wrote its output data.
    Path writerOutputDir =
        WriterUtils.getWriterOutputDir(workUnitState, this.numBranches, branchId);

    if (writerOutputPathsMoved.contains(writerOutputDir)) {
      // This writer output path has already been moved for another task of the same extract
      return;
    }

    if (!this.fileSystemByBranches.get(branchId).exists(writerOutputDir)) {
      LOG.warn(
          String.format(
              "Branch %d of WorkUnit %s produced no data", branchId, workUnitState.getId()));
      return;
    }

    // The directory where the final output directory for this job will be placed.
    // It is a combination of DATA_PUBLISHER_FINAL_DIR and WRITER_FILE_PATH.
    Path publisherOutputDir = getPublisherOutputDir(workUnitState, branchId);

    if (this.fileSystemByBranches.get(branchId).exists(publisherOutputDir)) {
      // The final output directory already exists, check if the job is configured to replace it.
      boolean replaceFinalOutputDir =
          this.getState()
              .getPropAsBoolean(
                  ForkOperatorUtils.getPropertyNameForBranch(
                      ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR,
                      this.numBranches,
                      branchId));

      // If the final output directory is not configured to be replaced, put new data to the
      // existing directory.
      if (!replaceFinalOutputDir) {
        addWriterOutputToExistingDir(
            writerOutputDir, publisherOutputDir, workUnitState, branchId, parallelRunner);
        writerOutputPathsMoved.add(writerOutputDir);
        return;
      }
      // Delete the final output directory if it is configured to be replaced
      this.fileSystemByBranches.get(branchId).delete(publisherOutputDir, true);
    } else {
      // Create the parent directory of the final output directory if it does not exist
      WriterUtils.mkdirsWithRecursivePermission(
          this.fileSystemByBranches.get(branchId),
          publisherOutputDir.getParent(),
          this.permissions.get(branchId));
    }

    LOG.info(String.format("Moving %s to %s", writerOutputDir, publisherOutputDir));
    parallelRunner.renamePath(
        writerOutputDir,
        publisherOutputDir,
        this.publisherFinalDirOwnerGroupsByBranches.get(branchId));
    writerOutputPathsMoved.add(writerOutputDir);
  }