@Test
  public void testCrossValidationOneSchema() throws TupleMRException, IOException {
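    // cross-validates SortComparator and GroupComparator: for a number of randomly permuted
    // schemas and random sort criteria, every pair of random tuples must compare consistently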
    Configuration conf = getConf();

    int maxIndex = SCHEMA.getFields().size() - 1;

    for (int randomSchema = 0; randomSchema < MAX_RANDOM_SCHEMAS; randomSchema++) {
      Schema schema = permuteSchema(SCHEMA);
      OrderBy sortCriteria = createRandomSortCriteria(schema, maxIndex + 1);
      // TODO could we get empty group fields ??
      String[] groupFields =
          getFirstFields(sortCriteria, 1 + random.nextInt(sortCriteria.getElements().size() - 1));
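      // the group-by fields must be a non-empty prefix of the order-by criteria, so we take
      // between 1 and (size - 1) of its leading elements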
      ITuple[] tuples = new ITuple[] {new Tuple(schema), new Tuple(schema)};
      for (ITuple tuple : tuples) {
        fillTuple(false, tuple, 0, maxIndex);
      }

      for (int minIndex = maxIndex; minIndex >= 0; minIndex--) {
        /* trick for speeding up the tests */
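        /* presumably the serialized comparator is cached under this file name, so it has to be
         * evicted for each new TupleMRConfig to be picked up */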
        DCUtils.cleanupTemporaryInstanceCache(conf, "comparator.dat");
        TupleMRConfigBuilder builder = new TupleMRConfigBuilder();
        builder.addIntermediateSchema(schema);
        builder.setGroupByFields(groupFields);
        builder.setOrderBy(sortCriteria);

        TupleMRConfig tupleMRConf = builder.buildConf();
        TupleMRConfig.set(tupleMRConf, conf);

        // tupleMRConf has changed -> we need a new Serialization object
        ser = new HadoopSerialization(conf);

        SortComparator sortComparator = new SortComparator();
        GroupComparator groupComparator = new GroupComparator();

        sortComparator.setConf(conf);
        groupComparator.setConf(conf);

        for (ITuple tuple : tuples) {
          fillTuple(true, tuple, minIndex, maxIndex);
        }
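        // compare every distinct pair of tuples; the helpers presumably check that serialized
        // and object-based comparison agree and that compare(a, b) mirrors compare(b, a)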
        for (int indexTuple1 = 0; indexTuple1 < tuples.length; indexTuple1++) {
          for (int indexTuple2 = indexTuple1 + 1; indexTuple2 < tuples.length; indexTuple2++) {
            ITuple tuple1 = tuples[indexTuple1];
            ITuple tuple2 = tuples[indexTuple2];
            assertSameComparison("Sort comparator", sortComparator, tuple1, tuple2);
            assertOppositeOrEqualsComparison(sortComparator, tuple1, tuple2);
            assertSameComparison("Group comparator", groupComparator, tuple1, tuple2);
            assertOppositeOrEqualsComparison(groupComparator, tuple1, tuple2);
          }
        }
      }
    }
  }
  protected static String[] getFirstFields(OrderBy sortCriteria, int numFields) {
    String[] result = new String[numFields];
    for (int i = 0; i < numFields; i++) {
      SortElement element = sortCriteria.getElements().get(i);
      result[i] = element.getName();
    }
    return result;
  }
Example #3
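  // Driver for the Splout SQL "page counts" example: it generates a tablespace out of
  // Wikipedia pagecounts files and (optionally) deploys it to a Splout cluster.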
  @Override
  public int run(String[] args) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Page Counts example");
    try {
      jComm.parse(args);
    } catch (ParameterException e) {
      System.err.println(e.getMessage());
      jComm.usage();
      System.exit(-1);
    }

    boolean generate = !noGenerate; // invert the negated flag for readability

    if (generateTupleFiles && deploy) {
      System.err.println("Can't run a 'dry' TupleFile generation and deploy it.");
      jComm.usage();
      System.exit(-1);
    }

    Configuration conf = getConf();
    Path outPath = new Path(outputPath);
    FileSystem outFs = outPath.getFileSystem(conf);

    if (!FileSystem.getLocal(conf).equals(FileSystem.get(conf))) {
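      // we are running against a non-local (cluster) filesystem: ship the sqlite4java native
      // libraries through the DistributedCache if they are present in the local "native" folder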
      File nativeLibs = new File("native");
      if (nativeLibs.exists()) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
      }
    }

    if (generate) {
      Path inputPath = new Path(this.inputPath);
      FileSystem inputFileSystem = inputPath.getFileSystem(conf);

      FileStatus[] fileStatuses = inputFileSystem.listStatus(inputPath);

      // define the schema that the resultant table will have: date, hour, pagename, pageviews
      final Schema tableSchema =
          new Schema(
              "pagecounts",
              Fields.parse("date:string, hour:string, pagename:string, pageviews:int"));
      // define the schema of the input files: projectcode, pagename, pageviews, bytes
      Schema fileSchema =
          new Schema(
              "pagecountsfile",
              Fields.parse("projectcode:string, pagename:string, pageviews:int, bytes:long"));

      // instantiate a TableBuilder
      TableBuilder tableBuilder = new TableBuilder(tableSchema);

      // for every input file...
      for (FileStatus fileStatus : fileStatuses) {
        String fileName = fileStatus.getPath().getName();
        // strip the date and the hour from the file name
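        // (e.g. a file named "pagecounts-20090430-230000.gz" yields date "20090430" and hour
        // "23", assuming the standard Wikipedia pagecounts naming)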
        String fileDate = fileName.split("-")[1];
        String fileHour = fileName.split("-")[2].substring(0, 2);
        // instantiate a custom RecordProcessor to process the records of this file
        PageCountsRecordProcessor recordProcessor =
            new PageCountsRecordProcessor(tableSchema, fileDate, fileHour);
        // use the tableBuilder method for adding each of the files to the mix
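        // (fields are space-separated, with no quote or escape characters and no null-string
        // token, per the NO_* constants below)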
        tableBuilder.addCSVTextFile(
            fileStatus.getPath(),
            ' ',
            TupleTextInputFormat.NO_QUOTE_CHARACTER,
            TupleTextInputFormat.NO_ESCAPE_CHARACTER,
            false,
            false,
            TupleTextInputFormat.NO_NULL_STRING,
            fileSchema,
            recordProcessor);
      }

      // partition the dataset by pagename, which should give a fairly even distribution.
      tableBuilder.partitionBy("pagename");
      // create a compound index on (pagename, date) so that typical queries over the dataset
      // are fast
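      // (e.g. it serves queries such as: SELECT * FROM pagecounts WHERE pagename = ? AND date >= ?)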
      tableBuilder.createIndex("pagename", "date");

      long nonExactPageSize = memoryForIndexing / 32000; // bytes per page, assuming 32000 pages
      int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
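      // e.g. assuming memoryForIndexing = 100 MB: 104857600 / 32000 = 3276 bytes, whose
      // nearest power of two is 2^12 = 4096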
      Log.info(
          "Pagesize = "
              + pageSize
              + " as memory for indexing was ["
              + memoryForIndexing
              + "] and there are 32000 pages.");

      tableBuilder.initialSQL("pragma page_size=" + pageSize);
      // insertion order is very important for optimizing query speed because it co-locates
      // related data on disk
      tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));

      // instantiate a TablespaceBuilder
      TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();

      // we will split this dataset into as many partitions as:
      tablespaceBuilder.setNPartitions(nPartitions);
      tablespaceBuilder.add(tableBuilder.build());
      // we turn on a specific SQLite pragma to make autocomplete (prefix) queries fast
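      // (with case-sensitive LIKE, SQLite can use the pagename index for prefix queries such as
      // LIKE 'foo%')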
      tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");

      HadoopUtils.deleteIfExists(outFs, outPath);

      // finally, instantiate a TablespaceGenerator and execute it
      TablespaceGenerator tablespaceViewBuilder;

      if (generateTupleFiles) {
        // we subclass TablespaceGenerator so that generation can run without writing out the
        // SQLite stores, for benchmark comparisons. In the future this feature may be useful
        // in general for debugging store creation.
        tablespaceViewBuilder =
            new TablespaceGenerator(tablespaceBuilder.build(), outPath) {

              @Override
              public void generateView(
                  Configuration conf, SamplingType samplingType, SamplingOptions samplingOptions)
                  throws Exception {

                prepareOutput(conf);
                final int nPartitions = tablespace.getnPartitions();
                if (nPartitions > 1) {
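                  // sample the input to decide the partition boundaries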
                  partitionMap = sample(nPartitions, conf, samplingType, samplingOptions);
                } else {
                  partitionMap = PartitionMap.oneShardOpenedMap();
                }
                writeOutputMetadata(conf);

                TupleMRBuilder builder = createMRBuilder(nPartitions, conf);
                // Set a TupleOutput here instead of SQLiteOutput
                builder.setOutput(
                    new Path(outputPath, OUT_STORE),
                    new TupleOutputFormat(tableSchema),
                    ITuple.class,
                    NullWritable.class);
                Job job = builder.createJob();
                executeViewGeneration(job);
              }
            };
      } else {
        // ... otherwise a standard TablespaceGenerator is used.
        tablespaceViewBuilder = new TablespaceGenerator(tablespaceBuilder.build(), outPath);
      }

      tablespaceViewBuilder.generateView(
          getConf(), SamplingType.RESERVOIR, new TupleSampler.DefaultSamplingOptions());
    }

    if (deploy) {
      // use StoreDeployerTool for deploying the already generated dataset
      StoreDeployerTool deployer = new StoreDeployerTool(qnode, getConf());
      ArrayList<TablespaceDepSpec> deployments = new ArrayList<TablespaceDepSpec>();
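      // TablespaceDepSpec arguments are, presumably: tablespace name, path of the generated
      // store, replication factor, and optional init statements (none here)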
      deployments.add(new TablespaceDepSpec("pagecounts", outPath.toString(), repFactor, null));
      deployer.deploy(deployments);
    }
    return 0; // by Tool convention, 0 signals success
  }