@Test
public void testCrossValidationOneSchema() throws TupleMRException, IOException {
  Configuration conf = getConf();
  int maxIndex = SCHEMA.getFields().size() - 1;

  for (int randomSchema = 0; randomSchema < MAX_RANDOM_SCHEMAS; randomSchema++) {
    Schema schema = permuteSchema(SCHEMA);
    OrderBy sortCriteria = createRandomSortCriteria(schema, maxIndex + 1);
    // TODO could we get empty group fields ??
    String[] groupFields = getFirstFields(sortCriteria,
        1 + random.nextInt(sortCriteria.getElements().size() - 1));
    ITuple[] tuples = new ITuple[] { new Tuple(schema), new Tuple(schema) };
    for (ITuple tuple : tuples) {
      fillTuple(false, tuple, 0, maxIndex);
    }

    for (int minIndex = maxIndex; minIndex >= 0; minIndex--) {
      /* trick for speeding up the tests */
      DCUtils.cleanupTemporaryInstanceCache(conf, "comparator.dat");

      TupleMRConfigBuilder builder = new TupleMRConfigBuilder();
      builder.addIntermediateSchema(schema);
      builder.setGroupByFields(groupFields);
      builder.setOrderBy(sortCriteria);
      TupleMRConfig tupleMRConf = builder.buildConf();
      TupleMRConfig.set(tupleMRConf, conf);
      // tupleMRConf has changed -> we need a new Serialization object
      ser = new HadoopSerialization(conf);

      SortComparator sortComparator = new SortComparator();
      GroupComparator groupComparator = new GroupComparator();
      sortComparator.setConf(conf);
      groupComparator.setConf(conf);

      for (ITuple tuple : tuples) {
        fillTuple(true, tuple, minIndex, maxIndex);
      }

      for (int indexTuple1 = 0; indexTuple1 < tuples.length; indexTuple1++) {
        for (int indexTuple2 = indexTuple1 + 1; indexTuple2 < tuples.length; indexTuple2++) {
          ITuple tuple1 = tuples[indexTuple1];
          ITuple tuple2 = tuples[indexTuple2];
          assertSameComparison("Sort comparator", sortComparator, tuple1, tuple2);
          assertOppositeOrEqualsComparison(sortComparator, tuple1, tuple2);
          assertSameComparison("Group comparator", groupComparator, tuple1, tuple2);
          assertOppositeOrEqualsComparison(groupComparator, tuple1, tuple2);
        }
      }
    }
  }
}
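// NOTE: hypothetical sketch, not the actual Pangool test helper. The assertion helpers used
// above (assertSameComparison, assertOppositeOrEqualsComparison) are not shown in this listing.
// This version only assumes SortComparator/GroupComparator can be treated as an object-level
// java.util.Comparator over the tuples, and checks antisymmetry: swapping the arguments must
// flip the sign of the comparison (or keep it at zero for equal tuples).
protected static <T> void assertOppositeOrEqualsComparison(java.util.Comparator<T> comparator,
    T tuple1, T tuple2) {
  int forward = comparator.compare(tuple1, tuple2);
  int backward = comparator.compare(tuple2, tuple1);
  org.junit.Assert.assertEquals(Integer.signum(forward), -Integer.signum(backward));
}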
protected static String[] getFirstFields(OrderBy sortCriteria, int numFields) {
  String[] result = new String[numFields];
  for (int i = 0; i < numFields; i++) {
    SortElement element = sortCriteria.getElements().get(i);
    result[i] = element.getName();
  }
  return result;
}
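// NOTE: hypothetical sketch of the permuteSchema helper referenced by the test above, not the
// original Pangool code. It assumes Schema exposes getName() and getFields() and can be rebuilt
// from a field list, and simply returns a copy of the schema with the field order shuffled,
// reusing the test's `random` instance.
protected static Schema permuteSchema(Schema schema) {
  java.util.List<Field> fields = new java.util.ArrayList<Field>(schema.getFields());
  java.util.Collections.shuffle(fields, random);
  return new Schema(schema.getName(), fields);
}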
@Override
public int run(String[] args) throws Exception {
  // Validate params etc
  JCommander jComm = new JCommander(this);
  jComm.setProgramName("Splout Page Counts example");
  try {
    jComm.parse(args);
  } catch (ParameterException e) {
    System.err.println(e.getMessage());
    jComm.usage();
    System.exit(-1);
  }

  boolean generate = !noGenerate; // just for clarity
  if (generateTupleFiles && deploy) {
    System.err.println("Can't run a 'dry' TupleFile generation and deploy it.");
    jComm.usage();
    System.exit(-1);
  }

  Path outPath = new Path(outputPath);
  FileSystem outFs = outPath.getFileSystem(getConf());

  if (!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
    File nativeLibs = new File("native");
    if (nativeLibs.exists()) {
      SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
    }
  }

  if (generate) {
    Path inputPath = new Path(this.inputPath);
    FileSystem inputFileSystem = inputPath.getFileSystem(conf);
    FileStatus[] fileStatuses = inputFileSystem.listStatus(inputPath);

    // define the schema that the resultant table will have: date, hour, pagename, pageviews
    final Schema tableSchema = new Schema("pagecounts",
        Fields.parse("date:string, hour:string, pagename:string, pageviews:int"));
    // define the schema of the input files: projectcode, pagename, pageviews, bytes
    Schema fileSchema = new Schema("pagecountsfile",
        Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

    // instantiate a TableBuilder
    TableBuilder tableBuilder = new TableBuilder(tableSchema);

    // for every input file...
    for (FileStatus fileStatus : fileStatuses) {
      String fileName = fileStatus.getPath().getName().toString();
      // strip the date and the hour from the file name
      String fileDate = fileName.split("-")[1];
      String fileHour = fileName.split("-")[2].substring(0, 2);
      // instantiate a custom RecordProcessor to process the records of this file
      PageCountsRecordProcessor recordProcessor =
          new PageCountsRecordProcessor(tableSchema, fileDate, fileHour);
      // use the tableBuilder method for adding each of the files to the mix
      tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ',
          TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
          false, false, TupleTextInputFormat.NO_NULL_STRING, fileSchema, recordProcessor);
    }

    // partition the dataset by pagename - which should give a fairly even distribution
tableBuilder.partitionBy("pagename"); // create a compound index on pagename, date so that typical queries for the dataset will be // fast tableBuilder.createIndex("pagename", "date"); long nonExactPageSize = memoryForIndexing / 32000; // number of pages int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2))); Log.info( "Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing + "] and there are 32000 pages."); tableBuilder.initialSQL("pragma page_size=" + pageSize); // insertion order is very important for optimizing query speed because it makes data be // co-located in disk tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc")); // instantiate a TablespaceBuilder TablespaceBuilder tablespaceBuilder = new TablespaceBuilder(); // we will partition this dataset in as many partitions as: tablespaceBuilder.setNPartitions(nPartitions); tablespaceBuilder.add(tableBuilder.build()); // we turn a specific SQLite pragma on for making autocomplete queries fast tablespaceBuilder.initStatements("pragma case_sensitive_like=true;"); HadoopUtils.deleteIfExists(outFs, outPath); // finally, instantiate a TablespaceGenerator and execute it TablespaceGenerator tablespaceViewBuilder; if (generateTupleFiles) { // we subclass TablespaceGenerator to be able to run the generation without outputting the // SQLite stores, for // benchmark comparisons. // In the future this feature may be useful in general for debugging store creation. tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath) { @Override public void generateView( Configuration conf, SamplingType samplingType, SamplingOptions samplingOptions) throws Exception { prepareOutput(conf); final int nPartitions = tablespace.getnPartitions(); if (nPartitions > 1) { partitionMap = sample(nPartitions, conf, samplingType, samplingOptions); } else { partitionMap = PartitionMap.oneShardOpenedMap(); } writeOutputMetadata(conf); TupleMRBuilder builder = createMRBuilder(nPartitions, conf); // Set a TupleOutput here instead of SQLiteOutput builder.setOutput( new Path(outputPath, OUT_STORE), new TupleOutputFormat(tableSchema), ITuple.class, NullWritable.class); Job job = builder.createJob(); executeViewGeneration(job); } }; } else { // ... otherwise a standard TablespaceGenerator is used. tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath); } tablespaceViewBuilder.generateView( getConf(), SamplingType.RESERVOIR, new TupleSampler.DefaultSamplingOptions()); } if (deploy) { // use StoreDeployerTool for deploying the already generated dataset StoreDeployerTool deployer = new StoreDeployerTool(qnode, getConf()); ArrayList<TablespaceDepSpec> deployments = new ArrayList<TablespaceDepSpec>(); deployments.add(new TablespaceDepSpec("pagecounts", outPath.toString(), repFactor, null)); deployer.deploy(deployments); } return 1; }