Пример #1
0
 /**
  * Prepares the {@code taskState} fixture from a fresh work unit state that carries a job ID and a
  * task ID.
  */
 @BeforeClass
 public void setUp() {
   WorkUnitState seedState = new WorkUnitState();
   seedState.setProp(ConfigurationKeys.JOB_ID_KEY, "Job-1");
   seedState.setProp(ConfigurationKeys.TASK_ID_KEY, "Task-1");

   this.taskState = new TaskState(seedState);
 }
  /**
   * Converts an Avro record into a single {@link RestEntry} by filling templates with the record's
   * field values.
   *
   * <p>The record's {@code toString()} output is parsed as JSON and used as the substitution
   * source for two optional templates read from the work unit state:
   *
   * <ul>
   *   <li>Resource key template (optional, defaults to the empty string). Example: template
   *       {@code /sobject/account/memberId/${member_id}} with Avro record
   *       {@code {"sf_account_id":{"string":"0016000000UiCYHAA3"},"member_id":{"long":296458833}}}
   *       gives the resource {@code /sobject/account/memberId/296458833}, which is later used to
   *       form the end point, e.g.
   *       {@code http://www.server.com:9090/sobject/account/memberId/296458833}.
   *   <li>JSON entry HOCON template. Example: template
   *       {@code AccountId=${sf_account_id},Member_Id__c=${member_id}} with the same record gives
   *       {@code {"AccountId":"0016000000UiCYHAA3","Member_Id__c":296458833}}. When this template
   *       is empty, the record's own JSON rendering is used as the entry body.
   * </ul>
   *
   * <p>Because the output is template driven, it may be nested even when the Avro record is flat
   * (or vice versa).
   *
   * <p>{@inheritDoc}
   *
   * @see gobblin.converter.Converter#convertRecord(java.lang.Object, java.lang.Object,
   *     gobblin.configuration.WorkUnitState)
   */
  @Override
  public Iterable<RestEntry<JsonObject>> convertRecord(
      Void outputSchema, GenericRecord inputRecord, WorkUnitState workUnit)
      throws DataConversionException {

    // The record's JSON rendering supplies the values for every ${...} substitution below.
    ConfigParseOptions jsonSyntax = ConfigParseOptions.defaults().setSyntax(ConfigSyntax.JSON);
    Config recordValues = ConfigFactory.parseString(inputRecord.toString(), jsonSyntax);

    String resourceKey = workUnit.getProp(CONVERTER_AVRO_REST_ENTRY_RESOURCE_KEY, "");
    if (!StringUtils.isEmpty(resourceKey)) {
      // Wrap the template under a throwaway key so it can be parsed and resolved as a config value.
      final String placeholderKey = "DUMMY";
      resourceKey =
          ConfigFactory.parseString(placeholderKey + "=" + resourceKey)
              .resolveWith(recordValues)
              .getString(placeholderKey);
    }

    String entryTemplate = workUnit.getProp(CONVERTER_AVRO_REST_JSON_ENTRY_TEMPLATE);
    JsonObject entryJson;
    if (StringUtils.isEmpty(entryTemplate)) {
      // No template configured: pass the record's JSON through unchanged.
      entryJson = parser.parse(inputRecord.toString()).getAsJsonObject();
    } else {
      Config resolvedEntry = ConfigFactory.parseString(entryTemplate).resolveWith(recordValues);
      String renderedEntry = resolvedEntry.root().render(ConfigRenderOptions.concise());
      entryJson = parser.parse(renderedEntry).getAsJsonObject();
    }

    return new SingleRecordIterable<>(new RestEntry<>(resourceKey, entryJson));
  }
Пример #3
0
  /**
   * Moves every entry listed under {@code writerOutputDir} into the already existing
   * {@code publisherOutputDir}.
   *
   * <p>When the branch-specific {@code SOURCE_FILEBASED_PRESERVE_FILE_NAME} property is true
   * (default false), the target name is read from the branch-specific
   * {@code DATA_PUBLISHER_FINAL_NAME} property; otherwise the entry keeps its current name.
   *
   * @param writerOutputDir directory the writer produced its output in
   * @param publisherOutputDir existing directory to move the output into
   * @param workUnitState state the branch properties are read from
   * @param branchId branch whose file system and properties are used
   * @param parallelRunner runner that performs the renames
   * @throws IOException if listing the writer output directory fails
   */
  protected void addWriterOutputToExistingDir(
      Path writerOutputDir,
      Path publisherOutputDir,
      WorkUnitState workUnitState,
      int branchId,
      ParallelRunner parallelRunner)
      throws IOException {
    String preserveNameKey =
        ForkOperatorUtils.getPropertyNameForBranch(
            ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, this.numBranches, branchId);
    boolean preserveFileName = workUnitState.getPropAsBoolean(preserveNameKey, false);

    // Walk the writer output and relocate each entry into the publisher directory.
    for (FileStatus status : this.fileSystemByBranches.get(branchId).listStatus(writerOutputDir)) {
      Path sourcePath = status.getPath();

      Path finalOutputPath;
      if (preserveFileName) {
        // Target name comes from the configured final name for this branch.
        String finalName =
            workUnitState.getProp(
                ForkOperatorUtils.getPropertyNameForBranch(
                    ConfigurationKeys.DATA_PUBLISHER_FINAL_NAME, this.numBranches, branchId));
        finalOutputPath = new Path(publisherOutputDir, finalName);
      } else {
        // Target name is the name the writer gave the file.
        finalOutputPath = new Path(publisherOutputDir, sourcePath.getName());
      }

      LOG.info(String.format("Moving %s to %s", sourcePath, finalOutputPath));
      parallelRunner.renamePath(sourcePath, finalOutputPath, Optional.<String>absent());
    }
  }
Пример #4
0
 public TaskState(WorkUnitState workUnitState) {
   // Since getWorkunit() returns an immutable WorkUnit object,
   // the WorkUnit object in this object is also immutable.
   super(workUnitState.getWorkunit());
   addAll(workUnitState);
   this.jobId = workUnitState.getProp(ConfigurationKeys.JOB_ID_KEY);
   this.taskId = workUnitState.getProp(ConfigurationKeys.TASK_ID_KEY);
   this.setId(this.taskId);
 }
Пример #5
0
 public KafkaAvroExtractor(WorkUnitState state) {
   super(state);
   this.schemaRegistry =
       state.contains(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)
           ? Optional.of(KafkaSchemaRegistry.<K, Schema>get(state.getProperties()))
           : Optional.<KafkaSchemaRegistry<K, Schema>>absent();
   this.schema = getExtractorSchema();
   if (this.schema.isPresent()) {
     this.reader = Optional.of(new GenericDatumReader<Record>(this.schema.get()));
   } else {
     log.error(
         String.format(
             "Cannot find latest schema for topic %s. This topic will be skipped",
             this.topicName));
     this.reader = Optional.absent();
   }
 }
Пример #6
0
  /**
   * Publishes the output of every branch of every given work unit state, then marks each state as
   * {@code COMMITTED}.
   *
   * <p>NOTE(review): the loop does not check a state's prior working state before marking it
   * committed; a state is skipped only if publishing one of its branches throws.
   *
   * @param states work unit states whose writer output should be published
   * @throws IOException if publishing any branch fails
   */
  @Override
  public void publishData(Collection<? extends WorkUnitState> states) throws IOException {

    // Tasks of the same extract share a writer output directory by default, so remember which
    // directories were already moved to avoid moving them twice.
    Set<Path> movedWriterOutputDirs = Sets.newHashSet();

    for (WorkUnitState taskState : states) {
      int branch = 0;
      while (branch < this.numBranches) {
        publishData(taskState, branch, movedWriterOutputDirs);
        branch++;
      }

      // All branches were published without an exception; record the commit on this state
      // (relevant to the COMMIT_ON_PARTIAL_SUCCESS policy).
      taskState.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
    }
  }
Пример #7
0
  /** Test for {@link ForkOperatorUtils#getPropertyNameForBranch(WorkUnitState, String)}. */
  @Test
  public void testGetPropertyNameForBranchWithWorkUnitState() {
    WorkUnitState state = new WorkUnitState();
    state.setProp(PROPERTY_FOO, PATH_FOO);

    // No fork branch id configured: the property name comes back untouched.
    String nameWithoutBranchId = ForkOperatorUtils.getPropertyNameForBranch(state, PROPERTY_FOO);
    Assert.assertEquals(nameWithoutBranchId, PROPERTY_FOO);

    // Fork branch id of -1: the property name still comes back untouched.
    state.setProp(ConfigurationKeys.FORK_BRANCH_ID_KEY, -1);
    String nameForNegativeBranch = ForkOperatorUtils.getPropertyNameForBranch(state, PROPERTY_FOO);
    Assert.assertEquals(nameForNegativeBranch, PROPERTY_FOO);

    // Fork branch id of 0: the branch id is appended to the property name.
    state.setProp(ConfigurationKeys.FORK_BRANCH_ID_KEY, 0);
    String nameForBranchZero = ForkOperatorUtils.getPropertyNameForBranch(state, PROPERTY_FOO);
    Assert.assertEquals(nameForBranchZero, PROPERTY_FOO + ".0");
  }
Пример #8
0
 /**
  * Serializes this object: job ID, task ID, start time, end time and duration, followed by the
  * superclass's own serialized form. The field order must match {@code readFields}.
  *
  * @param out sink to write to
  * @throws IOException if writing fails
  */
 @Override
 public void write(DataOutput out) throws IOException {
   Text jobIdText = new Text();
   jobIdText.set(this.jobId);
   jobIdText.write(out);

   Text taskIdText = new Text();
   taskIdText.set(this.taskId);
   taskIdText.write(out);

   out.writeLong(this.startTime);
   out.writeLong(this.endTime);
   out.writeLong(this.duration);

   // The superclass payload goes last.
   super.write(out);
 }
Пример #9
0
 /**
  * Restores this object from its serialized form: job ID, task ID (which also becomes the ID),
  * start time, end time and duration, followed by the superclass's fields.
  *
  * @param in source to read from
  * @throws IOException if reading fails
  */
 @Override
 public void readFields(DataInput in) throws IOException {
   Text jobIdText = new Text();
   jobIdText.readFields(in);
   this.jobId = jobIdText.toString();

   Text taskIdText = new Text();
   taskIdText.readFields(in);
   this.taskId = taskIdText.toString();
   setId(this.taskId);

   this.startTime = in.readLong();
   this.endTime = in.readLong();
   this.duration = in.readLong();

   // The superclass payload comes last in the stream.
   super.readFields(in);
 }
  /**
   * Writes a small stream through {@link FileAwareInputStreamDataWriter}, commits it, and checks
   * that the file found under the writer output directory has the same contents.
   */
  @Test
  public void testWrite() throws Exception {
    String streamString = "testContents";

    // Copyable file owned by the owner/group of the temp path, with full permissions.
    FileStatus status = fs.getFileStatus(testTempPath);
    OwnerAndPermission ownerAndPermission =
        new OwnerAndPermission(
            status.getOwner(),
            status.getGroup(),
            new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission);

    CopyableDatasetMetadata metadata =
        new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source")));

    // Writer configuration: staging and output directories live under the temp path.
    WorkUnitState state = new WorkUnitState();
    state.setProp(
        ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString());
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString());
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));
    CopySource.serializeCopyEntity(state, cf);
    CopySource.serializeCopyableDataset(state, metadata);

    FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0);

    FileAwareInputStream fileAwareInputStream =
        new FileAwareInputStream(
            cf, StreamUtils.convertStream(IOUtils.toInputStream(streamString)));
    dataWriter.write(fileAwareInputStream);
    dataWriter.commit();

    // Expected location: <output dir>/<dataset-and-partition identifier>/<destination>.
    Path writtenFilePath =
        new Path(
            new Path(
                state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR),
                cf.getDatasetAndPartition(metadata).identifier()),
            cf.getDestination());

    // Close the verification stream so the test does not leak a file handle.
    try (FileInputStream writtenContents = new FileInputStream(writtenFilePath.toString())) {
      Assert.assertEquals(IOUtils.toString(writtenContents), streamString);
    }
  }
Пример #11
0
  /**
   * Get low water mark from the given work unit state, moved back by the delta that
   * {@link WatermarkPredicate#getDeltaNumForNextWatermark()} reports for the watermark type.
   *
   * <p>A low water mark equal to {@link ConfigurationKeys#DEFAULT_WATERMARK_VALUE} is returned
   * unchanged. For {@code SIMPLE} watermarks the delta is subtracted numerically; for every other
   * type the value is read as a {@code yyyyMMddHHmmss} timestamp and the delta is subtracted as
   * seconds.
   *
   * @param workUnitState Work unit state
   * @return latest low water mark
   */
  private long getLowWatermarkFromWorkUnit(WorkUnitState workUnitState) {
    String watermarkType =
        workUnitState.getProp(
            ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE,
            ConfigurationKeys.DEFAULT_WATERMARK_TYPE);
    long lowWaterMark = workUnitState.getWorkunit().getLowWaterMark();

    if (lowWaterMark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
      return lowWaterMark;
    }

    // Upper-case with a fixed locale: the default-locale variant can produce characters that do
    // not match the enum constant names (e.g. dotted capital I under a Turkish locale).
    WatermarkType wmType =
        WatermarkType.valueOf(watermarkType.toUpperCase(java.util.Locale.ROOT));
    int deltaNum = new WatermarkPredicate(wmType).getDeltaNumForNextWatermark();

    switch (wmType) {
      case SIMPLE:
        return lowWaterMark - deltaNum;
      default:
        Date lowWaterMarkDate = Utils.toDate(lowWaterMark, "yyyyMMddHHmmss");
        return Long.parseLong(
            Utils.dateToString(
                Utils.addSecondsToDate(lowWaterMarkDate, -deltaNum), "yyyyMMddHHmmss"));
    }
  }
Пример #12
0
  /**
   * Get latest water mark from previous work unit states.
   *
   * <p>Decision rules, in order:
   *
   * <ol>
   *   <li>No previous work unit states: return {@link ConfigurationKeys#DEFAULT_WATERMARK_VALUE}.
   *   <li>Commit policy is {@code COMMIT_ON_FULL_SUCCESS} and at least one previous state is
   *       FAILED, CANCELLED, RUNNING or PENDING: return the minimum adjusted low water mark of the
   *       previous states; for {@code SNAPSHOT} extracts the configured backup seconds are applied
   *       to it through {@code addBackedUpSeconds}.
   *   <li>Otherwise, if any non-failed previous state reported a non-zero expected row count:
   *       return the maximum high water mark among the states that reported data.
   *   <li>Otherwise: return the minimum adjusted low water mark of the previous states.
   * </ol>
   *
   * @param state Source state
   * @return latest water mark (high water mark)
   */
  private long getLatestWatermarkFromMetadata(SourceState state) {
    LOG.debug("Get latest watermark from the previous run");
    long latestWaterMark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

    List<WorkUnitState> previousWorkUnitStates =
        Lists.newArrayList(state.getPreviousWorkUnitStates());
    List<Long> previousWorkUnitStateHighWatermarks = Lists.newArrayList();
    List<Long> previousWorkUnitLowWatermarks = Lists.newArrayList();

    if (previousWorkUnitStates.isEmpty()) {
      LOG.info(
          "No previous work unit states found; Latest watermark - Default watermark: "
              + latestWaterMark);
      return latestWaterMark;
    }

    boolean hasFailedRun = false;
    boolean isCommitOnFullSuccess = false;
    boolean isDataProcessedInPreviousRun = false;

    JobCommitPolicy commitPolicy =
        JobCommitPolicy.forName(
            state.getProp(
                ConfigurationKeys.JOB_COMMIT_POLICY_KEY,
                ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY));
    if (commitPolicy == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS) {
      isCommitOnFullSuccess = true;
    }

    for (WorkUnitState workUnitState : previousWorkUnitStates) {
      long processedRecordCount = 0;
      LOG.info(
          "State of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getWorkingState());
      // Any state that did not run to completion counts as a failed run.
      if (workUnitState.getWorkingState() == WorkingState.FAILED
          || workUnitState.getWorkingState() == WorkingState.CANCELLED
          || workUnitState.getWorkingState() == WorkingState.RUNNING
          || workUnitState.getWorkingState() == WorkingState.PENDING) {
        hasFailedRun = true;
      } else {
        processedRecordCount =
            workUnitState.getPropAsLong(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED);
        if (processedRecordCount != 0) {
          isDataProcessedInPreviousRun = true;
        }
      }

      LOG.info(
          "Low watermark of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getWorkunit().getLowWaterMark());
      LOG.info(
          "High watermark of the previous task: "
              + workUnitState.getId()
              + ":"
              + workUnitState.getHighWaterMark());
      LOG.info("Record count of the previous task: " + processedRecordCount + "\n");

      // Only consider the high water mark of a previous work unit that extracted data.
      if (processedRecordCount != 0) {
        previousWorkUnitStateHighWatermarks.add(workUnitState.getHighWaterMark());
      }

      previousWorkUnitLowWatermarks.add(this.getLowWatermarkFromWorkUnit(workUnitState));
    }

    // If commit policy is full and it has failed run, get latest water mark as the minimum of
    // low water marks from previous states.
    if (isCommitOnFullSuccess && hasFailedRun) {
      long previousLowWatermark = Collections.min(previousWorkUnitLowWatermarks);

      WorkUnitState previousState = previousWorkUnitStates.get(0);
      // Upper-case with a fixed locale so the lookup of the enum constant does not depend on the
      // JVM's default locale.
      ExtractType extractType =
          ExtractType.valueOf(
              previousState
                  .getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE)
                  .toUpperCase(java.util.Locale.ROOT));

      // add backup seconds only for snapshot extracts but not for appends
      if (extractType == ExtractType.SNAPSHOT) {
        int backupSecs =
            previousState.getPropAsInt(
                ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0);
        String watermarkType =
            previousState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
        latestWaterMark = this.addBackedUpSeconds(previousLowWatermark, backupSecs, watermarkType);
      } else {
        latestWaterMark = previousLowWatermark;
      }

      LOG.info(
          "Previous job was COMMIT_ON_FULL_SUCCESS but it was failed; Latest watermark - "
              + "Min watermark from WorkUnits: "
              + latestWaterMark);
    }

    // If commit policy is full and there are no failed tasks, or commit policy is partial,
    // get latest water mark as the maximum of high water marks from previous tasks.
    else {
      if (isDataProcessedInPreviousRun) {
        // Non-empty here: the flag is only set together with a non-zero record count, which also
        // adds that state's high water mark to the list.
        latestWaterMark = Collections.max(previousWorkUnitStateHighWatermarks);
        LOG.info(
            "Previous run was successful. Latest watermark - Max watermark from WorkUnitStates: "
                + latestWaterMark);
      } else {
        latestWaterMark = Collections.min(previousWorkUnitLowWatermarks);
        LOG.info(
            "Previous run was successful but no data found. Latest watermark - Min watermark from WorkUnitStates: "
                + latestWaterMark);
      }
    }

    return latestWaterMark;
  }
Пример #13
0
  /**
   * Publishes the writer output of one branch of a work unit state into its publisher output
   * directory.
   *
   * <p>Nothing happens when the writer output directory was already moved (it is in
   * {@code writerOutputPathsMoved}) or does not exist. If the publisher output directory exists
   * and the branch-specific {@code DATA_PUBLISHER_REPLACE_FINAL_DIR} property is false, the
   * writer output is added to the existing directory; if it is true, the existing directory is
   * deleted first. If the publisher output directory does not exist, its parent is created. In
   * the last two cases the writer output directory is then renamed to the publisher output
   * directory.
   *
   * @param workUnitState state whose output is published
   * @param branchId branch whose output is published
   * @param writerOutputPathsMoved writer output directories already handled; updated on a move
   * @throws IOException if a file system operation fails
   */
  protected void publishData(
      WorkUnitState workUnitState, int branchId, Set<Path> writerOutputPathsMoved)
      throws IOException {
    // Runner used to move files in parallel on this branch's file system.
    ParallelRunner runner = this.getParallelRunner(this.fileSystemByBranches.get(branchId));

    // Where the work unit wrote its output for this branch.
    Path writerOutputDir =
        WriterUtils.getWriterOutputDir(workUnitState, this.numBranches, branchId);

    // Another task of the same extract already moved this directory.
    if (writerOutputPathsMoved.contains(writerOutputDir)) {
      return;
    }

    boolean writerOutputExists = this.fileSystemByBranches.get(branchId).exists(writerOutputDir);
    if (!writerOutputExists) {
      LOG.warn(
          String.format(
              "Branch %d of WorkUnit %s produced no data", branchId, workUnitState.getId()));
      return;
    }

    // Final output location: a combination of DATA_PUBLISHER_FINAL_DIR and WRITER_FILE_PATH.
    Path publisherOutputDir = getPublisherOutputDir(workUnitState, branchId);

    if (!this.fileSystemByBranches.get(branchId).exists(publisherOutputDir)) {
      // Make sure the parent of the final output directory is there before the rename.
      WriterUtils.mkdirsWithRecursivePermission(
          this.fileSystemByBranches.get(branchId),
          publisherOutputDir.getParent(),
          this.permissions.get(branchId));
    } else {
      String replaceKey =
          ForkOperatorUtils.getPropertyNameForBranch(
              ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR, this.numBranches, branchId);

      if (!this.getState().getPropAsBoolean(replaceKey)) {
        // Not configured to replace: merge the new data into the existing directory.
        addWriterOutputToExistingDir(
            writerOutputDir, publisherOutputDir, workUnitState, branchId, runner);
        writerOutputPathsMoved.add(writerOutputDir);
        return;
      }

      // Configured to replace: drop the existing final output directory first.
      this.fileSystemByBranches.get(branchId).delete(publisherOutputDir, true);
    }

    LOG.info(String.format("Moving %s to %s", writerOutputDir, publisherOutputDir));
    runner.renamePath(
        writerOutputDir,
        publisherOutputDir,
        this.publisherFinalDirOwnerGroupsByBranches.get(branchId));
    writerOutputPathsMoved.add(writerOutputDir);
  }
  /**
   * Commits a pre-staged file through {@link FileAwareInputStreamDataWriter} and checks where it
   * ends up, its owner, group and permissions, the permissions of its parent, that a directory
   * that existed before keeps its permissions, and that the staging directory is gone.
   */
  @Test
  public void testCommit() throws IOException {

    String existingToken = "destination";
    String additionalTokens = "path";
    String fileName = "file";

    // Assemble the destination path, with and without the leading separator
    Path destination =
        new Path(new Path(new Path("/", existingToken), additionalTokens), fileName);
    Path relativeDestination = new Path(new Path(existingToken, additionalTokens), fileName);

    // Temp directory that hosts everything this test creates
    File tempRoot = Files.createTempDir();
    tempRoot.deleteOnExit();
    Path tempRootPath = new Path(tempRoot.getAbsolutePath());

    // Origin file
    Path originFile = new Path(tempRootPath, fileName);
    this.fs.createNewFile(originFile);

    // Staging directory
    Path stagingDir = new Path(tempRootPath, "staging");
    this.fs.mkdirs(stagingDir);

    // Output directory
    Path outputDir = new Path(tempRootPath, "output");
    this.fs.mkdirs(outputDir);

    // Copyable file describing the origin and its intended destination
    FileStatus originStatus = this.fs.getFileStatus(originFile);
    FsPermission readWrite =
        new FsPermission(FsAction.READ_WRITE, FsAction.READ_WRITE, FsAction.READ_WRITE);
    FsPermission dirReadWrite =
        new FsPermission(FsAction.ALL, FsAction.READ_WRITE, FsAction.READ_WRITE);
    OwnerAndPermission ownerAndPermission =
        new OwnerAndPermission(originStatus.getOwner(), originStatus.getGroup(), readWrite);
    List<OwnerAndPermission> ancestorOwnerAndPermissions = Lists.newArrayList();
    for (int i = 0; i < 4; i++) {
      ancestorOwnerAndPermissions.add(ownerAndPermission);
    }

    Properties properties = new Properties();
    properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");

    CopyConfiguration copyConfiguration =
        CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties)
            .publishDir(new Path("/target"))
            .preserve(PreserveAttributes.fromMnemonicString(""))
            .build();
    CopyableFile cf =
        CopyableFile.fromOriginAndDestination(
                this.fs, originStatus, destination, copyConfiguration)
            .destinationOwnerAndPermission(ownerAndPermission)
            .ancestorsOwnerAndPermission(ancestorOwnerAndPermissions)
            .build();

    // Work unit state the writer is configured from
    WorkUnitState state = new WorkUnitState();
    state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, stagingDir.toUri().getPath());
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, outputDir.toUri().getPath());
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));
    CopyableDatasetMetadata metadata =
        new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source")));
    CopySource.serializeCopyEntity(state, cf);
    CopySource.serializeCopyableDataset(state, metadata);

    // Writer under test
    FileAwareInputStreamDataWriter writer = new FileAwareInputStreamDataWriter(state, 1, 0);

    // Stand in for the output of writer.write by creating the staged file directly
    Path stagedFile = writer.getStagingFilePath(cf);
    this.fs.mkdirs(stagedFile.getParent());
    this.fs.createNewFile(stagedFile);

    // Pre-create part of the destination hierarchy in the writer output
    Path outputRoot =
        FileAwareInputStreamDataWriter.getPartitionOutputRoot(
            outputDir, cf.getDatasetAndPartition(metadata));
    Path existingOutputPath = new Path(outputRoot, existingToken);
    this.fs.mkdirs(existingOutputPath);
    FileStatus observed = this.fs.getFileStatus(existingOutputPath);
    FsPermission existingPathPermission = observed.getPermission();

    // Initial state: the pre-created directory exists and is empty
    Assert.assertTrue(this.fs.exists(existingOutputPath));
    Assert.assertEquals(this.fs.listStatus(existingOutputPath).length, 0);

    writer.actualProcessedCopyableFile = Optional.of(cf);

    // Commit
    writer.commit();

    // The file landed at the expected location with the requested owner, group and permission
    Path expectedOutputPath = new Path(outputRoot, relativeDestination);
    Assert.assertTrue(this.fs.exists(expectedOutputPath));
    observed = this.fs.getFileStatus(expectedOutputPath);
    Assert.assertEquals(observed.getOwner(), ownerAndPermission.getOwner());
    Assert.assertEquals(observed.getGroup(), ownerAndPermission.getGroup());
    Assert.assertEquals(observed.getPermission(), readWrite);

    // The newly created parent has the expected directory permission
    observed = this.fs.getFileStatus(expectedOutputPath.getParent());
    Assert.assertEquals(observed.getPermission(), dirReadWrite);

    // A path that existed before the commit keeps its permission
    observed = this.fs.getFileStatus(existingOutputPath);
    Assert.assertEquals(observed.getPermission(), existingPathPermission);

    // The staging directory is removed by the commit
    Assert.assertFalse(this.fs.exists(writer.stagingDir));
  }
 /**
  * Sets the actual high watermark of the given work unit state to the expected high watermark of
  * its work unit, read as a {@link LongWatermark}.
  *
  * @param wus work unit state to update
  */
 @Override
 public void setActualHighWatermark(WorkUnitState wus) {
   wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(LongWatermark.class));
 }