Пример #1
0
 /**
  * Returns the KijiDataRequest that describes which input columns need to be available.
  *
  * <p>The request is obtained from the configured producer and validated before being returned,
  * so that a misbehaving producer fails with a descriptive configuration error.
  *
  * @return the data request supplied by the producer; never null and never empty.
  * @throws JobConfigurationException if the producer returns a null or an empty data request.
  */
 public KijiDataRequest getDataRequest() {
   final KijiDataRequest dataRequest = mProducer.getDataRequest();
   // Guard against null explicitly: calling isEmpty() on it would otherwise surface as a bare
   // NullPointerException that does not name the offending producer.
   if (null == dataRequest) {
     throw new JobConfigurationException(
         mProducer.getClass().getName()
             + " returned a null KijiDataRequest, which is not allowed.");
   }
   if (dataRequest.isEmpty()) {
     throw new JobConfigurationException(
         mProducer.getClass().getName()
             + " returned an empty KijiDataRequest, which is not allowed.");
   }
   return dataRequest;
 }
  /**
   * Runs a bulk-import job with {@code JSONBulkImporter} over a temporary text input file and
   * checks both the job counters and the rows written to the table.
   *
   * @throws Exception on any unexpected failure while preparing input or running the job.
   */
  @Test
  public void testJSONBulkImporter() throws Exception {
    // Prepare input file:
    File inputFile = File.createTempFile("TestJSONImportInput", ".txt", getLocalTempDir());
    TestingResources.writeTextFile(
        inputFile, TestingResources.get(BulkImporterTestUtils.JSON_IMPORT_DATA));

    Configuration conf = getConf();
    conf.set(
        DescribedInputTextBulkImporter.CONF_FILE,
        BulkImporterTestUtils.localResource(BulkImporterTestUtils.FOO_IMPORT_DESCRIPTOR));

    // Run the bulk-import:
    final KijiMapReduceJob job =
        KijiBulkImportJobBuilder.create()
            .withConf(conf)
            .withBulkImporter(JSONBulkImporter.class)
            .withInput(MapReduceJobInputs.newTextMapReduceJobInput(new Path(inputFile.toString())))
            .withOutput(new DirectKijiTableMapReduceJobOutput(mTable.getURI()))
            .build();
    assertTrue(job.run());

    // Expected counters: 3 records processed, 1 incomplete, 0 rejected.
    final Counters counters = job.getHadoopJob().getCounters();
    assertEquals(
        3, counters.findCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_PROCESSED).getValue());
    assertEquals(
        1, counters.findCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_INCOMPLETE).getValue());
    assertEquals(
        0, counters.findCounter(JobHistoryCounters.BULKIMPORTER_RECORDS_REJECTED).getValue());

    // Validate output. The scanner is closed in a finally block so that it is not leaked when
    // row validation fails.
    final KijiRowScanner scanner = mReader.getScanner(KijiDataRequest.create("info"));
    try {
      BulkImporterTestUtils.validateImportedRows(scanner, false);
    } finally {
      scanner.close();
    }
  }
Пример #3
0
 /**
  * Builds the data request for this component: a fixed list of qualifiers in the "data" family,
  * with the columns definition configured for a maximum of one version.
  *
  * @return the assembled data request.
  */
 @Override
 public KijiDataRequest getDataRequest() {
   // Qualifiers of the "data" family to request, in the order they are added.
   final String[] qualifiers = {
     "player_data",
     "dire_towers_status",
     "radiant_towers_status",
     "dire_barracks_status",
     "radiant_barracks_status",
     "cluster",
     "season",
     "game_mode",
     "match_seq_num",
     "league_id",
     "first_blood_time",
     "negative_votes",
     "duration",
     "radiant_win",
     "positive_votes",
     "lobby_type",
     "human_players",
   };

   final KijiDataRequestBuilder requestBuilder = KijiDataRequest.builder();
   final KijiDataRequestBuilder.ColumnsDef columns = requestBuilder.newColumnsDef();
   columns.withMaxVersions(1);

   // Thread each add() through the value returned by the previous call, exactly as a fluent
   // call chain would.
   KijiDataRequestBuilder.ColumnsDef chain = columns;
   for (final String qualifier : qualifiers) {
     chain = chain.add(new KijiColumnName("data", qualifier));
   }

   // NOTE(review): the columns definition was obtained from this same builder via
   // newColumnsDef(); confirm that also passing it to addColumns() is intended.
   return requestBuilder.addColumns(columns).build();
 }
  /**
   * Builds a Kiji instance pre-populated with two rows through {@code InstanceBuilder} and checks
   * that the values can be read back at the timestamps they were written with.
   *
   * @throws Exception on any unexpected failure while building or reading the instance.
   */
  @Test
  public void testBuilder() throws Exception {
    final KijiTableLayout layout =
        KijiTableLayout.newLayout(KijiTableLayouts.getLayout(KijiTableLayouts.SIMPLE));

    final Kiji kiji =
        new InstanceBuilder()
            .withTable("table", layout)
            .withRow("row1")
            .withFamily("family")
            .withQualifier("column")
            .withValue(1, "foo1")
            .withValue(2, "foo2")
            .withRow("row2")
            .withFamily("family")
            .withQualifier("column")
            .withValue(100, "foo3")
            .build();

    // Each resource is released in its own finally block so that a failing assertion (or a
    // failure while opening a later resource) does not leak the ones already acquired.
    try {
      final KijiTable table = kiji.openTable("table");
      try {
        final KijiTableReader reader = table.openTableReader();
        try {
          // Verify the first row.
          final KijiDataRequest req = KijiDataRequest.create("family", "column");
          final KijiRowData row1 = reader.get(table.getEntityId("row1"), req);
          assertEquals("foo2", row1.getValue("family", "column", 2).toString());

          // Verify the second row.
          final KijiRowData row2 = reader.get(table.getEntityId("row2"), req);
          assertEquals("foo3", row2.getValue("family", "column", 100).toString());
        } finally {
          ResourceUtils.closeOrLog(reader);
        }
      } finally {
        ResourceUtils.releaseOrLog(table);
      }
    } finally {
      ResourceUtils.releaseOrLog(kiji);
    }
  }
Пример #5
0
  /**
   * Reads a single row of the table named by the entity spec and populates the given entity
   * from it.
   *
   * <p>The data request is built with the time range {@code startTime}/{@code endTime} and the
   * column requests contributed by the spec. The table and the reader opened here are always
   * released before this method returns.
   *
   * @param spec specification supplying the table name, the column requests and the population
   *     logic.
   * @param entity entity instance to populate.
   * @param startTime first argument passed to the data request's time range.
   * @param endTime second argument passed to the data request's time range.
   * @param entityIdComponents components from which the row's entity id is built.
   * @param <T> type of the entity.
   * @return the value returned by the spec after populating the entity from the row.
   * @throws IOException if reading from the table fails.
   */
  private <T> T populateFromRow(
      EntitySpec<T> spec, T entity, long startTime, long endTime, Object... entityIdComponents)
      throws IOException {

    // TODO: Use a pool of tables and/or table readers
    final KijiTable kijiTable = mKiji.openTable(spec.getTableName());
    try {
      final KijiTableReader tableReader = kijiTable.openTableReader();
      try {
        // Assemble the request: time range first, then whatever columns the spec asks for.
        final KijiDataRequestBuilder requestBuilder = KijiDataRequest.builder();
        requestBuilder.withTimeRange(startTime, endTime);
        spec.populateColumnRequests(requestBuilder);
        final KijiDataRequest request = requestBuilder.build();

        final EntityId rowKey = kijiTable.getEntityId(entityIdComponents);
        final KijiRowData rowData = tableReader.get(rowKey, request);

        final T populated;
        try {
          populated = spec.populateEntityFromRow(entity, rowData);
        } catch (IllegalAccessException accessFailure) {
          // Surface the checked access failure as an unchecked exception, keeping the cause.
          throw new RuntimeException(accessFailure);
        }
        return populated;
      } finally {
        tableReader.close();
      }
    } finally {
      kijiTable.release();
    }
  }
Пример #6
0
 /** Verifies that a freshness policy can mask the key value stores of its producer. */
 @Test
 public void testKVMasking() throws IOException {
   // The policy is told where the text-file-backed kv-store lives.
   final String kvStoreLocation = "file:" + new File(getLocalTempDir(), KV_FILENAME);
   final KijiFreshnessPolicy shadowingPolicy = new ShadowingFreshening(kvStoreLocation);

   // Register the Freshener on info:name of the "user" table.
   final KijiFreshnessManager freshnessManager = KijiFreshnessManager.create(getKiji());
   try {
     freshnessManager.registerFreshener(
         "user",
         new KijiColumnName("info", "name"),
         shadowingPolicy,
         new UnconfiguredScoreFunction(),
         Collections.<String, String>emptyMap(),
         true,
         false);
   } finally {
     freshnessManager.close();
   }

   final KijiTable table = getKiji().openTable("user");
   try {
     final FreshKijiTableReader freshReader =
         FreshKijiTableReader.Builder.create().withTable(table).withTimeout(10000).build();
     try {
       // Reading through the fresh reader should yield the updated user name.
       final KijiRowData row =
           freshReader.get(table.getEntityId("felix"), KijiDataRequest.create("info", "name"));
       final String userName = row.getMostRecentValue("info", "name").toString();
       assertEquals("Old Gumbie Cat", userName);
     } finally {
       freshReader.close();
     }
   } finally {
     table.release();
   }
 }
Пример #7
0
  /** Verifies that producers run inside of freshening can access key value stores. */
  @Test
  public void testSimpleKVStore() throws IOException {
    // Hand the location of the kv-store file to the score function through its parameters.
    final File kvStoreFile = new File(getLocalTempDir(), KV_FILENAME);
    final String kvStorePath = new Path("file:" + kvStoreFile).toString();
    final Map<String, String> scoreFunctionParams = Maps.newHashMap();
    scoreFunctionParams.put(SimpleKVScoreFunction.PARAMETER_KEY, kvStorePath);

    // Register the Freshener on info:name of the "user" table, by class name.
    final KijiFreshnessManager freshnessManager = KijiFreshnessManager.create(getKiji());
    try {
      freshnessManager.registerFreshener(
          "user",
          new KijiColumnName("info", "name"),
          AlwaysFreshen.class.getName(),
          SimpleKVScoreFunction.class.getName(),
          scoreFunctionParams,
          true,
          false,
          false);
    } finally {
      freshnessManager.close();
    }

    final KijiTable table = getKiji().openTable("user");
    try {
      final FreshKijiTableReader freshReader =
          FreshKijiTableReader.Builder.create().withTable(table).withTimeout(10000).build();
      try {
        // Reading through the fresh reader should yield the updated user name.
        final KijiRowData row =
            freshReader.get(table.getEntityId("felix"), KijiDataRequest.create("info", "name"));
        final String userName = row.getMostRecentValue("info", "name").toString();
        assertEquals("Railway Cat", userName);
      } finally {
        freshReader.close();
      }
    } finally {
      table.release();
    }
  }
Пример #8
0
  /**
   * Registers a {@code KVStoreInIsFreshPolicy} pointing at the text-file-backed kv-store and
   * checks the value read back through a fresh reader.
   */
  @Test
  public void testKVStoreInIsFresh() throws IOException {
    // The policy is told where the text-file-backed kv-store lives.
    final String kvStoreLocation = "file:" + new File(getLocalTempDir(), KV_FILENAME);
    final KijiFreshnessPolicy isFreshPolicy = new KVStoreInIsFreshPolicy(kvStoreLocation);

    // Register the Freshener on info:name of the "user" table.
    final KijiFreshnessManager freshnessManager = KijiFreshnessManager.create(getKiji());
    try {
      freshnessManager.registerFreshener(
          "user",
          new KijiColumnName("info", "name"),
          isFreshPolicy,
          new UnconfiguredScoreFunction(),
          Collections.<String, String>emptyMap(),
          true,
          false);
    } finally {
      freshnessManager.close();
    }

    // Declared outside the try so the finally block can release whatever was acquired.
    KijiTable table = null;
    FreshKijiTableReader reader = null;
    try {
      table = getKiji().openTable("user");
      reader = FreshKijiTableReader.Builder.create().withTable(table).withTimeout(10000).build();

      final KijiRowData row =
          reader.get(table.getEntityId("felix"), KijiDataRequest.create("info", "name"));
      // IsFresh should have returned true, so nothing should be written.
      final String userName = row.getMostRecentValue("info", "name").toString();
      assertEquals("Felis", userName);
    } finally {
      ResourceUtils.closeOrLog(reader);
      ResourceUtils.releaseOrLog(table);
    }
  }
/**
 * Builder for KijiMapReduceJobs which run ScoreFunction implementations across all rows of a table.
 *
 * <p>A ScoreFunction MapReduce job runs a ScoreFunction against all rows within the specified range
 * of a table. It runs the ScoreFunction as if it was attached with an {@link
 * org.kiji.scoring.lib.AlwaysFreshen} policy which provides no additional parameters or
 * KeyValueStores.
 *
 * <p>ScoreFunction MapReduce jobs require that all information available to a ScoreFunction via the
 * FreshenerContext be specified during construction of the job. This information includes:
 *
 * <ul>
 *   <li>attached column (normally this would be the column where the Freshener is attached)
 *   <li>string-string parameter mapping (defaults to an empty map)
 *   <li>client data request (normally this would be the request which triggered the run of the
 *       Freshener) (defaults to an empty data request)
 *   <li>KeyValueStores will be constructed from the return value of the ScoreFunction's
 *       getRequiredStores method optionally overridden by KeyValueStores specified to {@link
 *       #withKeyValueStoreOverrides(java.util.Map)}. This optional overriding makes up for the lack
 *       of overrides normally provided by the KijiFreshnessPolicy. (defaults to an empty map)
 * </ul>
 *
 * <p>A ScoreFunction class, an attached column and a Kiji table output are mandatory; the output
 * table must be the same as the input table.
 *
 * <p>Example usage:
 *
 * <pre>
 *     final KijiMapReduceJob sfJob = ScoreFunctionJobBuilder.create()
 *         .withConf(conf)
 *         .withInputTable(inputTableURI)
 *         .withAttachedColumn(new KijiColumnName("family:qualifier"))
 *         .withScoreFunctionClass(MyScoreFunction.class)
 *         .withOutput(MapReduceJobOutputs.newDirectKijiTableMapReduceJobOutput(inputTableURI))
 *         .build();
 *     sfJob.run();
 * </pre>
 */
public final class ScoreFunctionJobBuilder
    extends KijiTableInputJobBuilder<ScoreFunctionJobBuilder> {

  /** Configuration key under which the ScoreFunction class is stored. */
  public static final String SCORE_FUNCTION_CLASS_CONF_KEY =
      "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_class_conf_key";
  /** Configuration key under which the JSON-serialized parameters are stored. */
  public static final String SCORE_FUNCTION_PARAMETERS_CONF_KEY =
      "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_parameters_conf_key";
  /** Configuration key under which the name of the attached column is stored. */
  public static final String SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY =
      "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_attached_column_conf_key";
  /** Configuration key under which the serialized, Base64-encoded client request is stored. */
  public static final String SCORE_FUNCTION_CLIENT_DATA_REQUEST_CONF_KEY =
      "org.kiji.scoring.batch.ScoreFunctionJobBuilder.sf_client_data_request_conf_key";

  /** Serializer used to write the parameters map into the job configuration. */
  private static final Gson GSON = new Gson();
  /** Client data request used when none is specified. */
  private static final KijiDataRequest DEFAULT_CLIENT_REQUEST = KijiDataRequest.builder().build();
  /**
   * Parameters used when none are specified.
   *
   * <p>NOTE(review): this is a single mutable map shared by every builder that falls back to the
   * default; it relies on no consumer mutating it.
   */
  private static final Map<String, String> DEFAULT_PARAMETERS = Maps.newHashMap();
  /** Number of threads per mapper used when none is specified. */
  private static final int DEFAULT_NUM_THREADS_PER_MAPPER = 1;

  private Class<? extends ScoreFunction<?>> mScoreFunctionClass = null;
  private KijiTableMapReduceJobOutput mJobOutput = null;
  /** Instance of the ScoreFunction class; created in {@link #configureJob(Job)}. */
  private ScoreFunction<?> mScoreFunction = null;
  /** Mapper for the job; created in {@link #configureJob(Job)}. */
  private KijiMapper<?, ?, ?, ?> mMapper = null;
  /** Reducer for the job; created in {@link #configureJob(Job)}. */
  private KijiReducer<?, ?, ?, ?> mReducer = null;
  /** Data request returned by the ScoreFunction; computed in {@link #configureJob(Job)}. */
  private KijiDataRequest mScoreFunctionDataRequest = null;
  private int mNumThreadsPerMapper = DEFAULT_NUM_THREADS_PER_MAPPER;
  private KijiDataRequest mClientDataRequest = null;
  private KijiColumnName mAttachedColumn = null;
  private Map<String, String> mParameters = null;
  private Map<String, KeyValueStore<?, ?>> mKeyValueStoreOverrides = null;

  /** Private constructor. Use {@link #create()}. */
  private ScoreFunctionJobBuilder() {}

  /**
   * Create a new ScoreFunctionJobBuilder.
   *
   * @return a new ScoreFunctionJobBuilder.
   */
  public static ScoreFunctionJobBuilder create() {
    return new ScoreFunctionJobBuilder();
  }

  /**
   * Configure the Job to run the given ScoreFunction implementation to generate scores.
   *
   * @param scoreFunctionClass class of the ScoreFunction implementation with which to generate
   *     scores.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withScoreFunctionClass(
      final Class<? extends ScoreFunction<?>> scoreFunctionClass) {
    mScoreFunctionClass = scoreFunctionClass;
    return this;
  }

  /**
   * Configure the Job to output using the given KijiTableMapReduceJobOutput. The output table must
   * match the input table.
   *
   * @param jobOutput KijiTableMapReduceJobOutput which defines the output from this mapreduce job.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withOutput(final KijiTableMapReduceJobOutput jobOutput) {
    mJobOutput = jobOutput;
    return super.withOutput(jobOutput);
  }

  /**
   * {@inheritDoc}
   *
   * <p>Only KijiTableMapReduceJobOutput instances are accepted; any other output type is rejected
   * with a RuntimeException.
   */
  @Override
  public ScoreFunctionJobBuilder withOutput(final MapReduceJobOutput jobOutput) {
    if (jobOutput instanceof KijiTableMapReduceJobOutput) {
      return withOutput((KijiTableMapReduceJobOutput) jobOutput);
    } else {
      throw new RuntimeException(
          "jobOutput parameter of ScoreFunctionJobBuilder.withOutput() must "
              + "be a KijiTableMapReduceJobOutput.");
    }
  }

  /**
   * Sets the number of threads to use for running the ScoreFunction in parallel.
   *
   * <p>You may use this setting to run multiple instances of your ScoreFunction in parallel within
   * each map task of the job. This may be useful for increasing your throughput when your
   * ScoreFunction is not CPU bound.
   *
   * @param numThreads the number of ScoreFunctions which will be run in parallel per mapper. Must
   *     be positive.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withNumThreadsPerMapper(final int numThreads) {
    // Guava's Preconditions message templates only substitute %s placeholders.
    Preconditions.checkArgument(0 < numThreads, "numThreads must be positive, got %s", numThreads);
    mNumThreadsPerMapper = numThreads;
    return this;
  }

  /**
   * Configure the Job to include the given client data request. This request will be visible to the
   * ScoreFunction via {@link org.kiji.scoring.FreshenerContext#getClientRequest()}. If unspecified,
   * an empty data request will be used.
   *
   * @param clientDataRequest KijiDataRequest which will be visible to the ScoreFunction.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withClientDataRequest(final KijiDataRequest clientDataRequest) {
    mClientDataRequest = clientDataRequest;
    return this;
  }

  /**
   * Configure the Job to include the given attached column. This column will be visible to the
   * ScoreFunction via {@link org.kiji.scoring.FreshenerContext#getAttachedColumn()} and will be
   * used as the output column for values written by the ScoreFunction. The schema of this column
   * should be compatible with the schema of values output by the ScoreFunction.
   *
   * @param attachedColumn column to which to write ScoreFunction return values.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withAttachedColumn(final KijiColumnName attachedColumn) {
    mAttachedColumn = attachedColumn;
    return this;
  }

  /**
   * Configure the Job to include the given parameters. These parameters should be the equivalent of
   * merging request and attachment time parameters from the real time execution of a Freshener.
   *
   * @param parameters parameters which will be available to the ScoreFunction via the
   *     FreshenerContext.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withParameters(final Map<String, String> parameters) {
    mParameters = parameters;
    return this;
  }

  /**
   * Configures the Job to use the given KeyValueStores in preference to those requested by the
   * ScoreFunction when there are name conflicts. This mirrors the KeyValueStore override behavior
   * provided by a KijiFreshnessPolicy. These KeyValueStores will only replace KeyValueStores
   * requested by the ScoreFunction with the same name. KeyValueStores requested by the
   * ScoreFunction whose names are not shadowed in this map will be available as normal.
   *
   * @param kvStoreOverrides KeyValueStores which will take precedence over stores requested by the
   *     ScoreFunction.
   * @return this builder.
   */
  public ScoreFunctionJobBuilder withKeyValueStoreOverrides(
      final Map<String, KeyValueStore<?, ?>> kvStoreOverrides) {
    mKeyValueStoreOverrides = kvStoreOverrides;
    return this;
  }

  /**
   * {@inheritDoc}
   *
   * <p>Validates the builder state, applies defaults for the optional client data request and
   * parameters, writes the ScoreFunction settings into the job configuration, and instantiates the
   * ScoreFunction in order to obtain its data request.
   *
   * @throws JobConfigurationException if the ScoreFunction class, the attached column or the
   *     output is missing, or if the output table differs from the input table.
   */
  @Override
  protected void configureJob(final Job job) throws IOException {
    if (null == mScoreFunctionClass) {
      throw new JobConfigurationException("Must specify a ScoreFunction class.");
    }
    if (null == mClientDataRequest) {
      mClientDataRequest = DEFAULT_CLIENT_REQUEST;
    }
    if (null == mAttachedColumn) {
      throw new JobConfigurationException("Must specify an attached column.");
    }
    if (null == mParameters) {
      mParameters = DEFAULT_PARAMETERS;
    }
    // The output is dereferenced below; fail with a configuration error rather than a bare
    // NullPointerException when withOutput() was never called.
    if (null == mJobOutput) {
      throw new JobConfigurationException("Must specify a KijiTableMapReduceJobOutput.");
    }

    final Configuration conf = job.getConfiguration();
    conf.setClass(SCORE_FUNCTION_CLASS_CONF_KEY, mScoreFunctionClass, ScoreFunction.class);
    if (!getInputTableURI().equals(mJobOutput.getOutputTableURI())) {
      throw new JobConfigurationException(
          String.format(
              "Output table must be the same as the input table. Got input: %s output: %s",
              getInputTableURI(), mJobOutput.getOutputTableURI()));
    }
    conf.set(SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY, mAttachedColumn.getName());
    conf.set(SCORE_FUNCTION_PARAMETERS_CONF_KEY, GSON.toJson(mParameters, Map.class));
    conf.set(
        SCORE_FUNCTION_CLIENT_DATA_REQUEST_CONF_KEY,
        Base64.encodeBase64String(SerializationUtils.serialize(mClientDataRequest)));
    mMapper = new ScoreFunctionMapper();
    mReducer = new IdentityReducer<Object, Object>();
    job.setJobName("Kiji ScoreFunction: " + mScoreFunctionClass.getSimpleName());
    // The ScoreFunction must be instantiated before getRequiredStores() is called below, since
    // that method reads mScoreFunction.
    mScoreFunction = ReflectionUtils.newInstance(mScoreFunctionClass, conf);
    final FreshenerContext context =
        InternalFreshenerContext.create(
            mClientDataRequest,
            mAttachedColumn,
            mParameters,
            Maps.<String, String>newHashMap(),
            KeyValueStoreReaderFactory.create(getRequiredStores()));
    mScoreFunctionDataRequest = mScoreFunction.getDataRequest(context);

    super.configureJob(job);
  }

  /**
   * {@inheritDoc}
   *
   * <p>When more than one thread per mapper is requested, the job's mapper class is replaced by
   * KijiMultithreadedMapper, configured to run the actual mapper class on the requested number of
   * threads.
   */
  @Override
  protected void configureMapper(final Job job) throws IOException {
    super.configureMapper(job);

    if (mNumThreadsPerMapper > 1) {
      @SuppressWarnings("unchecked")
      Class<? extends Mapper<EntityId, KijiRowData, Object, Object>> childMapperClass =
          (Class<? extends Mapper<EntityId, KijiRowData, Object, Object>>) mMapper.getClass();
      KijiMultithreadedMapper.setMapperClass(job, childMapperClass);
      KijiMultithreadedMapper.setNumberOfThreads(job, mNumThreadsPerMapper);
      job.setMapperClass(KijiMultithreadedMapper.class);
    }
  }

  /**
   * {@inheritDoc}
   *
   * <p>Returns the stores requested by the ScoreFunction, with any stores given to {@link
   * #withKeyValueStoreOverrides(java.util.Map)} replacing same-named entries. This reads the
   * ScoreFunction instance created in {@link #configureJob(Job)}.
   */
  @Override
  protected Map<String, KeyValueStore<?, ?>> getRequiredStores() {
    final FreshenerContext context = InternalFreshenerContext.create(mAttachedColumn, mParameters);
    final Map<String, KeyValueStore<?, ?>> combinedStores = Maps.newHashMap();
    combinedStores.putAll(mScoreFunction.getRequiredStores(context));
    // Overrides are put last so that they win on name conflicts.
    if (null != mKeyValueStoreOverrides) {
      combinedStores.putAll(mKeyValueStoreOverrides);
    }
    return combinedStores;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiDataRequest getDataRequest() {
    return mScoreFunctionDataRequest;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiMapReduceJob build(final Job job) {
    return KijiMapReduceJob.create(job);
  }

  /** {@inheritDoc} */
  @Override
  protected KijiMapper<?, ?, ?, ?> getMapper() {
    return mMapper;
  }

  /**
   * {@inheritDoc}
   *
   * <p>This builder never uses a combiner, so this always returns null.
   */
  @Override
  protected KijiReducer<?, ?, ?, ?> getCombiner() {
    return null;
  }

  /** {@inheritDoc} */
  @Override
  protected KijiReducer<?, ?, ?, ?> getReducer() {
    return mReducer;
  }

  /** {@inheritDoc} */
  @Override
  protected Class<?> getJarClass() {
    return mScoreFunctionClass;
  }
}
Пример #10
0
 /**
  * Returns a data request built without adding any columns to it.
  *
  * @param context the Freshener context; not consulted by this implementation.
  * @return a data request produced by a builder on which nothing was configured.
  */
 @Override
 public KijiDataRequest getDataRequest(FreshenerContext context) {
   final KijiDataRequest emptyRequest = KijiDataRequest.builder().build();
   return emptyRequest;
 }