コード例 #1
0
  /**
   * {@inheritDoc}
   *
   * <p>Assembles the "Weblog Analysis" plan: three file sources (documents, ranks, visits) are
   * each filtered by a map, the filtered documents and ranks are joined on field 0, the join
   * result is co-grouped with the filtered visits on field 0 (anti join), and the outcome is
   * written to a file sink.
   *
   * <p>Positional arguments, all optional:
   *
   * <ol>
   *   <li>{@code args[0]} - degree of parallelism applied to every contract (default {@code 1});
   *       must be parseable by {@link Integer#parseInt(String)}
   *   <li>{@code args[1]} - input for the documents relation (default {@code ""})
   *   <li>{@code args[2]} - input for the ranks relation (default {@code ""})
   *   <li>{@code args[3]} - input for the visits relation (default {@code ""})
   *   <li>{@code args[4]} - output for the result sink (default {@code ""})
   * </ol>
   *
   * @throws NumberFormatException if {@code args[0]} is present but is not a valid integer
   */
  @Override
  public Plan getPlan(String... args) {

    // Parse job parameters; every missing argument falls back to a default.
    int noSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String docsInput = (args.length > 1 ? args[1] : "");
    String ranksInput = (args.length > 2 ? args[2] : "");
    String visitsInput = (args.length > 3 ? args[3] : "");
    String output = (args.length > 4 ? args[4] : "");

    /*
     * Output Format:
     * 0: URL
     * 1: DOCUMENT_TEXT
     */
    // Create DataSourceContract for documents relation
    FileDataSource docs = new FileDataSource(RecordInputFormat.class, docsInput, "Docs Input");
    docs.setDegreeOfParallelism(noSubTasks);
    // Hint to the compiler that field 0 is unique in this source.
    docs.getCompilerHints().setUniqueField(new FieldSet(0));
    RecordInputFormat.configureRecordFormat(docs)
        .recordDelimiter('\n')
        .fieldDelimiter('|')
        .field(VarLengthStringParser.class, 0)
        .field(VarLengthStringParser.class, 1);

    /*
     * Output Format:
     * 0: URL
     * 1: RANK
     * 2: AVG_DURATION
     */
    // Create DataSourceContract for ranks relation
    // NOTE(review): the integer passed to field(...) presumably selects the column of the input
    // text, so the string column 1 becomes record field 0 (URL) - confirm against
    // RecordInputFormat.
    FileDataSource ranks = new FileDataSource(RecordInputFormat.class, ranksInput, "Ranks input");
    ranks.setDegreeOfParallelism(noSubTasks);
    RecordInputFormat.configureRecordFormat(ranks)
        .recordDelimiter('\n')
        .fieldDelimiter('|')
        .field(VarLengthStringParser.class, 1)
        .field(DecimalTextIntParser.class, 0)
        .field(DecimalTextIntParser.class, 2);

    /*
     * Output Format:
     * 0: URL
     * 1: DATE
     */
    // Create DataSourceContract for visits relation
    FileDataSource visits =
        new FileDataSource(RecordInputFormat.class, visitsInput, "Visits input");
    visits.setDegreeOfParallelism(noSubTasks);
    RecordInputFormat.configureRecordFormat(visits)
        .recordDelimiter('\n')
        .fieldDelimiter('|')
        .field(VarLengthStringParser.class, 1)
        .field(VarLengthStringParser.class, 2);

    // Create MapContract for filtering the entries from the documents
    // relation
    MapContract filterDocs =
        MapContract.builder(FilterDocs.class).input(docs).name("Filter Docs").build();
    filterDocs.setDegreeOfParallelism(noSubTasks);
    filterDocs.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.15f);
    filterDocs.getCompilerHints().setAvgBytesPerRecord(60);
    filterDocs.getCompilerHints().setAvgNumRecordsPerDistinctFields(new FieldSet(new int[] {0}), 1);

    // Create MapContract for filtering the entries from the ranks relation
    MapContract filterRanks =
        MapContract.builder(FilterRanks.class).input(ranks).name("Filter Ranks").build();
    filterRanks.setDegreeOfParallelism(noSubTasks);
    filterRanks.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.25f);
    filterRanks
        .getCompilerHints()
        .setAvgNumRecordsPerDistinctFields(new FieldSet(new int[] {0}), 1);

    // Create MapContract for filtering the entries from the visits relation
    MapContract filterVisits =
        MapContract.builder(FilterVisits.class).input(visits).name("Filter Visits").build();
    filterVisits.setDegreeOfParallelism(noSubTasks);
    filterVisits.getCompilerHints().setAvgBytesPerRecord(60);
    filterVisits.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.2f);

    // Create MatchContract to join the filtered documents and ranks
    // relation (key: field 0 of both inputs, typed as PactString)
    MatchContract joinDocsRanks =
        MatchContract.builder(JoinDocRanks.class, PactString.class, 0, 0)
            .input1(filterDocs)
            .input2(filterRanks)
            .name("Join Docs Ranks")
            .build();
    joinDocsRanks.setDegreeOfParallelism(noSubTasks);

    // Create CoGroupContract to realize an anti join between the joined
    // documents and ranks relation and the filtered visits relation
    CoGroupContract antiJoinVisits =
        CoGroupContract.builder(AntiJoinVisits.class, PactString.class, 0, 0)
            .input1(joinDocsRanks)
            .input2(filterVisits)
            .name("Antijoin DocsVisits")
            .build();
    antiJoinVisits.setDegreeOfParallelism(noSubTasks);
    antiJoinVisits.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.8f);

    // Create DataSinkContract for writing the result of the OLAP query
    FileDataSink result =
        new FileDataSink(RecordOutputFormat.class, output, antiJoinVisits, "Result");
    result.setDegreeOfParallelism(noSubTasks);
    RecordOutputFormat.configureRecordFormat(result)
        .recordDelimiter('\n')
        .fieldDelimiter('|')
        .lenient(true)
        .field(PactInteger.class, 1)
        .field(PactString.class, 0)
        .field(PactInteger.class, 2);

    // Return the PACT plan
    return new Plan(result, "Weblog Analysis");
  }
コード例 #2
0
 /**
  * Creates a configuration builder over the parameters of the given data sink, so that the
  * record format's settings can be specified in a fluent fashion.
  *
  * <p>The builder is constructed from {@code target.getParameters()}; presumably values set
  * through the builder end up in that parameter object (confirm against {@code ConfigBuilder}).
  *
  * @param target The data sink whose parameters are handed to the returned builder.
  * @return A config builder for setting parameters.
  */
 public static ConfigBuilder configureRecordFormat(FileDataSink target) {
   return new ConfigBuilder(target.getParameters());
 }