@Test
  public void testTextLoad() {
    DataFrame df1 =
        context
            .read()
            .text(
                Thread.currentThread()
                    .getContextClassLoader()
                    .getResource("text-suite.txt")
                    .toString());
    Assert.assertEquals(4L, df1.count());

    DataFrame df2 =
        context
            .read()
            .text(
                Thread.currentThread()
                    .getContextClassLoader()
                    .getResource("text-suite.txt")
                    .toString(),
                Thread.currentThread()
                    .getContextClassLoader()
                    .getResource("text-suite2.txt")
                    .toString());
    Assert.assertEquals(5L, df2.count());
  }
 @Test
 public void testFrequentItems() {
   DataFrame df = context.table("testData2");
   String[] cols = {"a"};
   DataFrame results = df.stat().freqItems(cols, 0.2);
   Assert.assertTrue(results.collect()[0].getSeq(0).contains(1));
 }
  @Test
  public void testBloomFilter() {
    DataFrame df = context.range(1000);

    BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03);
    Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter1.mightContain(i));
    }

    BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
    Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter2.mightContain(i * 3));
    }

    BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5);
    Assert.assertTrue(filter3.bitSize() == 64 * 5);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter3.mightContain(i));
    }

    BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
    Assert.assertTrue(filter4.bitSize() == 64 * 5);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter4.mightContain(i * 3));
    }
  }
 @Ignore
 public void testShow() {
   // This test case is intended ignored, but to make sure it compiles correctly
   DataFrame df = context.table("testData");
   df.show();
   df.show(1000);
 }
 void validateDataFrameWithBeans(Bean bean, DataFrame df) {
   StructType schema = df.schema();
   Assert.assertEquals(
       new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a"));
   Assert.assertEquals(
       new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()),
       schema.apply("b"));
   ArrayType valueType = new ArrayType(DataTypes.IntegerType, false);
   MapType mapType = new MapType(DataTypes.StringType, valueType, true);
   Assert.assertEquals(new StructField("c", mapType, true, Metadata.empty()), schema.apply("c"));
   Assert.assertEquals(
       new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()),
       schema.apply("d"));
   Row first = df.select("a", "b", "c", "d").first();
   Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
   // Now Java lists and maps are converted to Scala Seq's and Map's. Once we get a Seq below,
   // verify that it has the expected length, and contains expected elements.
   Seq<Integer> result = first.getAs(1);
   Assert.assertEquals(bean.getB().length, result.length());
   for (int i = 0; i < result.length(); i++) {
     Assert.assertEquals(bean.getB()[i], result.apply(i));
   }
   @SuppressWarnings("unchecked")
   Seq<Integer> outputBuffer = (Seq<Integer>) first.getJavaMap(2).get("hello");
   Assert.assertArrayEquals(
       bean.getC().get("hello"),
       Ints.toArray(JavaConverters.seqAsJavaListConverter(outputBuffer).asJava()));
   Seq<String> d = first.getAs(3);
   Assert.assertEquals(bean.getD().size(), d.length());
   for (int i = 0; i < d.length(); i++) {
     Assert.assertEquals(bean.getD().get(i), d.apply(i));
   }
 }
Beispiel #6
0
  public Matrix createVector(BitVector selector) {
    int rows = selector != null ? selector.countOnBits() : frame.size();
    Matrix m = new Matrix(rows, 1);

    for (int i = 0, j = 0; j < frame.size(); j++) {
      if (selector == null || selector.isOn(j)) {
        M rowValue = frame.object(j);

        try {
          Number numValue = (Number) numericField.get(rowValue);
          m.set(i, 0, numValue.doubleValue());

        } catch (IllegalAccessException e) {
          e.printStackTrace();
          throw new IllegalStateException(
              String.format(
                  "Couldn't access field %s: %s", numericField.getName(), e.getMessage()));
        }

        i++;
      }
    }

    return m;
  }
 @Test
 public void testCreateDataFromFromList() {
   StructType schema = createStructType(Arrays.asList(createStructField("i", IntegerType, true)));
   List<Row> rows = Arrays.asList(RowFactory.create(0));
   DataFrame df = context.createDataFrame(rows, schema);
   Row[] result = df.collect();
   Assert.assertEquals(1, result.length);
 }
 @Test
 public void testSampleBy() {
   DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
   DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
   Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
   Row[] expected = {RowFactory.create(0, 5), RowFactory.create(1, 8)};
   Assert.assertArrayEquals(expected, actual);
 }
 @Test
 public void testSampleBy() {
   DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
   DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
   Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
   Assert.assertEquals(0, actual[0].getLong(0));
   Assert.assertTrue(0 <= actual[0].getLong(1) && actual[0].getLong(1) <= 8);
   Assert.assertEquals(1, actual[1].getLong(0));
   Assert.assertTrue(2 <= actual[1].getLong(1) && actual[1].getLong(1) <= 13);
 }
Beispiel #10
0
  @Test
  public void testSave() {
    DataFrame df = (new XmlReader()).withRowTag(booksFileTag).xmlFile(sqlContext, booksFile);
    TestUtils.deleteRecursively(new File(tempDir));
    df.select("price", "description").save(tempDir, "com.databricks.spark.xml");

    DataFrame newDf = (new XmlReader()).xmlFile(sqlContext, tempDir);
    int result = newDf.select("price").collect().length;
    Assert.assertEquals(result, numBooks);
  }
Beispiel #11
0
  @Test
  public void testLoad() {
    HashMap<String, String> options = new HashMap<String, String>();
    options.put("rowTag", booksFileTag);
    options.put("path", booksFile);

    DataFrame df = sqlContext.load("com.databricks.spark.xml", options);
    int result = df.select("description").collect().length;
    Assert.assertEquals(result, numBooks);
  }
 @Test
 public void testCrosstab() {
   DataFrame df = context.table("testData2");
   DataFrame crosstab = df.stat().crosstab("a", "b");
   String[] columnNames = crosstab.schema().fieldNames();
   Assert.assertEquals("a_b", columnNames[0]);
   Assert.assertEquals("2", columnNames[1]);
   Assert.assertEquals("1", columnNames[2]);
   Row[] rows = crosstab.collect();
   Arrays.sort(rows, crosstabRowComparator);
   Integer count = 1;
   for (Row row : rows) {
     Assert.assertEquals(row.get(0).toString(), count.toString());
     Assert.assertEquals(1L, row.getLong(1));
     Assert.assertEquals(1L, row.getLong(2));
     count++;
   }
 }
  @Test
  public void pivot() {
    DataFrame df = context.table("courseSales");
    Row[] actual =
        df.groupBy("year")
            .pivot("course", Arrays.<Object>asList("dotNET", "Java"))
            .agg(sum("earnings"))
            .orderBy("year")
            .collect();

    Assert.assertEquals(2012, actual[0].getInt(0));
    Assert.assertEquals(15000.0, actual[0].getDouble(1), 0.01);
    Assert.assertEquals(20000.0, actual[0].getDouble(2), 0.01);

    Assert.assertEquals(2013, actual[1].getInt(0));
    Assert.assertEquals(48000.0, actual[1].getDouble(1), 0.01);
    Assert.assertEquals(30000.0, actual[1].getDouble(2), 0.01);
  }
Beispiel #14
0
  public Matrix createVector() {
    int rows = frame.size();
    Matrix m = new Matrix(rows, 1);

    for (int i = 0; i < rows; i++) {
      M rowValue = frame.object(i);
      try {
        Number numValue = (Number) numericField.get(rowValue);
        m.set(i, 0, numValue.doubleValue());

      } catch (IllegalAccessException e) {
        e.printStackTrace();
        throw new IllegalStateException(
            String.format("Couldn't access field %s: %s", numericField.getName(), e.getMessage()));
      }
    }

    return m;
  }
  @Test
  public void testCountMinSketch() {
    DataFrame df = context.range(1000);

    CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42);
    Assert.assertEquals(sketch1.totalCount(), 1000);
    Assert.assertEquals(sketch1.depth(), 10);
    Assert.assertEquals(sketch1.width(), 20);

    CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42);
    Assert.assertEquals(sketch2.totalCount(), 1000);
    Assert.assertEquals(sketch2.depth(), 10);
    Assert.assertEquals(sketch2.width(), 20);

    CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42);
    Assert.assertEquals(sketch3.totalCount(), 1000);
    Assert.assertEquals(sketch3.relativeError(), 0.001, 1e-4);
    Assert.assertEquals(sketch3.confidence(), 0.99, 5e-3);

    CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
    Assert.assertEquals(sketch4.totalCount(), 1000);
    Assert.assertEquals(sketch4.relativeError(), 0.001, 1e-4);
    Assert.assertEquals(sketch4.confidence(), 0.99, 5e-3);
  }
Beispiel #16
0
  @Override
  public int run(SparkConf conf, CommandLine cli) throws Exception {

    long startMs = System.currentTimeMillis();

    conf.set("spark.ui.enabled", "false");

    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    long diffMs = (System.currentTimeMillis() - startMs);
    System.out.println(">> took " + diffMs + " ms to create SQLContext");

    Map<String, String> options = new HashMap<>();
    options.put("zkhost", "localhost:9983");
    options.put("collection", "ml20news");
    options.put("query", "content_txt:[* TO *]");
    options.put("fields", "content_txt");

    DataFrame solrData = sqlContext.read().format("solr").options(options).load();
    DataFrame sample = solrData.sample(false, 0.1d, 5150).select("content_txt");
    List<Row> rows = sample.collectAsList();
    System.out.println(">> loaded " + rows.size() + " docs to classify");

    StructType schema = sample.schema();

    CrossValidatorModel cvModel = CrossValidatorModel.load("ml-pipeline-model");
    PipelineModel bestModel = (PipelineModel) cvModel.bestModel();

    int r = 0;
    startMs = System.currentTimeMillis();
    for (Row next : rows) {
      Row oneRow = RowFactory.create(next.getString(0));
      DataFrame oneRowDF =
          sqlContext.createDataFrame(Collections.<Row>singletonList(oneRow), schema);
      DataFrame scored = bestModel.transform(oneRowDF);
      Row scoredRow = scored.collect()[0];
      String predictedLabel = scoredRow.getString(scoredRow.fieldIndex("predictedLabel"));

      // an acutal app would save the predictedLabel
      // System.out.println(">> for row["+r+"], model returned "+scoredRows.length+" rows,
      // "+scoredRows[0]);

      r++;
    }
    diffMs = (System.currentTimeMillis() - startMs);
    System.out.println(">> took " + diffMs + " ms to score " + rows.size() + " docs");

    return 0;
  }
Beispiel #17
0
  public Predicted(DataFrame<M> f, String pfn) {
    frame = f;
    numericField = null;

    Class<M> mcls = frame.getModelClass();

    try {
      Field mf = mcls.getField(pfn);
      Class t = mf.getType();
      if (Model.isSubclass(t, Number.class)) {
        numericField = mf;
      } else {
        throw new IllegalArgumentException(
            String.format("Field %s is not numeric", pfn, t.getName()));
      }

    } catch (NoSuchFieldException e) {
      e.printStackTrace();
      throw new IllegalArgumentException(String.format("Unknown field name: %s", pfn));
    }
  }
 @Test
 public void testCovariance() {
   DataFrame df = context.table("testData2");
   Double result = df.stat().cov("a", "b");
   Assert.assertTrue(Math.abs(result) < 1.0e-6);
 }
Beispiel #19
0
 @Test
 public void testXmlParser() {
   DataFrame df = (new XmlReader()).withRowTag(booksFileTag).xmlFile(sqlContext, booksFile);
   int result = df.select("id").collect().length;
   Assert.assertEquals(result, numBooks);
 }
  /** See SPARK-5904. Abstract vararg methods defined in Scala do not work in Java. */
  @Test
  public void testVarargMethods() {
    DataFrame df = context.table("testData");

    df.toDF("key1", "value1");

    df.select("key", "value");
    df.select(col("key"), col("value"));
    df.selectExpr("key", "value + 1");

    df.sort("key", "value");
    df.sort(col("key"), col("value"));
    df.orderBy("key", "value");
    df.orderBy(col("key"), col("value"));

    df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
    df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
    df.agg(first("key"), sum("value"));

    df.groupBy().avg("key");
    df.groupBy().mean("key");
    df.groupBy().max("key");
    df.groupBy().min("key");
    df.groupBy().sum("key");

    // Varargs in column expressions
    df.groupBy().agg(countDistinct("key", "value"));
    df.groupBy().agg(countDistinct(col("key"), col("value")));
    df.select(coalesce(col("key")));

    // Varargs with mathfunctions
    DataFrame df2 = context.table("testData2");
    df2.select(exp("a"), exp("b"));
    df2.select(exp(log("a")));
    df2.select(pow("a", "a"), pow("b", 2.0));
    df2.select(pow(col("a"), col("b")), exp("b"));
    df2.select(sin("a"), acos("b"));

    df2.select(rand(), acos("b"));
    df2.select(col("*"), randn(5L));
  }
 @Test
 public void testCollectAndTake() {
   DataFrame df = context.table("testData").filter("key = 1 or key = 2 or key = 3");
   Assert.assertEquals(3, df.select("key").collectAsList().size());
   Assert.assertEquals(2, df.select("key").takeAsList(2).size());
 }
 @Test
 public void testExecution() {
   DataFrame df = context.table("testData").filter("key = 1");
   Assert.assertEquals(1, df.select("key").collect()[0].get(0));
 }
 @Test
 public void testCorrelation() {
   DataFrame df = context.table("testData2");
   Double pearsonCorr = df.stat().corr("a", "b", "pearson");
   Assert.assertTrue(Math.abs(pearsonCorr) < 1.0e-6);
 }