/** {@inheritDoc} */ @Override public Plan getPlan(String... args) { // parse job parameters int noSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String docsInput = (args.length > 1 ? args[1] : ""); String ranksInput = (args.length > 2 ? args[2] : ""); String visitsInput = (args.length > 3 ? args[3] : ""); String output = (args.length > 4 ? args[4] : ""); /* * Output Format: * 0: URL * 1: DOCUMENT_TEXT */ // Create DataSourceContract for documents relation FileDataSource docs = new FileDataSource(RecordInputFormat.class, docsInput, "Docs Input"); docs.setDegreeOfParallelism(noSubTasks); docs.getCompilerHints().setUniqueField(new FieldSet(0)); RecordInputFormat.configureRecordFormat(docs) .recordDelimiter('\n') .fieldDelimiter('|') .field(VarLengthStringParser.class, 0) .field(VarLengthStringParser.class, 1); /* * Output Format: * 0: URL * 1: RANK * 2: AVG_DURATION */ // Create DataSourceContract for ranks relation FileDataSource ranks = new FileDataSource(RecordInputFormat.class, ranksInput, "Ranks input"); ranks.setDegreeOfParallelism(noSubTasks); RecordInputFormat.configureRecordFormat(ranks) .recordDelimiter('\n') .fieldDelimiter('|') .field(VarLengthStringParser.class, 1) .field(DecimalTextIntParser.class, 0) .field(DecimalTextIntParser.class, 2); /* * Output Format: * 0: URL * 1: DATE */ // Create DataSourceContract for visits relation FileDataSource visits = new FileDataSource(RecordInputFormat.class, visitsInput, "Visits input:q"); visits.setDegreeOfParallelism(noSubTasks); RecordInputFormat.configureRecordFormat(visits) .recordDelimiter('\n') .fieldDelimiter('|') .field(VarLengthStringParser.class, 1) .field(VarLengthStringParser.class, 2); // Create MapContract for filtering the entries from the documents // relation MapContract filterDocs = MapContract.builder(FilterDocs.class).input(docs).name("Filter Docs").build(); filterDocs.setDegreeOfParallelism(noSubTasks); filterDocs.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.15f); 
filterDocs.getCompilerHints().setAvgBytesPerRecord(60); filterDocs.getCompilerHints().setAvgNumRecordsPerDistinctFields(new FieldSet(new int[] {0}), 1); // Create MapContract for filtering the entries from the ranks relation MapContract filterRanks = MapContract.builder(FilterRanks.class).input(ranks).name("Filter Ranks").build(); filterRanks.setDegreeOfParallelism(noSubTasks); filterRanks.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.25f); filterRanks .getCompilerHints() .setAvgNumRecordsPerDistinctFields(new FieldSet(new int[] {0}), 1); // Create MapContract for filtering the entries from the visits relation MapContract filterVisits = MapContract.builder(FilterVisits.class).input(visits).name("Filter Visits").build(); filterVisits.setDegreeOfParallelism(noSubTasks); filterVisits.getCompilerHints().setAvgBytesPerRecord(60); filterVisits.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.2f); // Create MatchContract to join the filtered documents and ranks // relation MatchContract joinDocsRanks = MatchContract.builder(JoinDocRanks.class, PactString.class, 0, 0) .input1(filterDocs) .input2(filterRanks) .name("Join Docs Ranks") .build(); joinDocsRanks.setDegreeOfParallelism(noSubTasks); // Create CoGroupContract to realize a anti join between the joined // documents and ranks relation and the filtered visits relation CoGroupContract antiJoinVisits = CoGroupContract.builder(AntiJoinVisits.class, PactString.class, 0, 0) .input1(joinDocsRanks) .input2(filterVisits) .name("Antijoin DocsVisits") .build(); antiJoinVisits.setDegreeOfParallelism(noSubTasks); antiJoinVisits.getCompilerHints().setAvgRecordsEmittedPerStubCall(0.8f); // Create DataSinkContract for writing the result of the OLAP query FileDataSink result = new FileDataSink(RecordOutputFormat.class, output, antiJoinVisits, "Result"); result.setDegreeOfParallelism(noSubTasks); RecordOutputFormat.configureRecordFormat(result) .recordDelimiter('\n') .fieldDelimiter('|') .lenient(true) 
.field(PactInteger.class, 1) .field(PactString.class, 0) .field(PactInteger.class, 2); // Return the PACT plan return new Plan(result, "Weblog Analysis"); }
/**
 * Creates a configuration builder that can be used to set this format's parameters on the
 * given data sink's configuration in a fluent fashion.
 *
 * <p>The builder is constructed around the object returned by {@code target.getParameters()}.
 * Note that the target is a data sink, so the parameters concern how records are written
 * (the previous wording spoke of an "input format").
 *
 * @param target The data sink whose parameters are handed to the builder.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureRecordFormat(FileDataSink target) {
    return new ConfigBuilder(target.getParameters());
}