/**
 * Runs {@code freqItems} over column "a" of the "testData2" table with a support of 0.2
 * and checks that the value 1 appears among the items reported in the first result row.
 */
@Test
public void testFrequentItems() {
  DataFrame frequentItems =
      context.table("testData2").stat().freqItems(new String[] {"a"}, 0.2);

  // The first column of the first collected row holds the sequence of frequent items.
  Row firstRow = frequentItems.collect()[0];
  boolean containsOne = firstRow.getSeq(0).contains(1);
  Assert.assertTrue(containsOne);
}
/**
 * Builds a DataFrame from an in-memory list holding a single row and verifies that
 * collecting it yields exactly one row.
 */
@Test
public void testCreateDataFromFromList() {
  // Schema with one nullable integer column named "i".
  StructType singleIntColumn =
      createStructType(Arrays.asList(createStructField("i", IntegerType, true)));
  List<Row> input = Arrays.asList(RowFactory.create(0));

  Row[] collected = context.createDataFrame(input, singleIntColumn).collect();

  Assert.assertEquals(1, collected.length);
}
@Override public int run(SparkConf conf, CommandLine cli) throws Exception { long startMs = System.currentTimeMillis(); conf.set("spark.ui.enabled", "false"); JavaSparkContext jsc = new JavaSparkContext(conf); SQLContext sqlContext = new SQLContext(jsc); long diffMs = (System.currentTimeMillis() - startMs); System.out.println(">> took " + diffMs + " ms to create SQLContext"); Map<String, String> options = new HashMap<>(); options.put("zkhost", "localhost:9983"); options.put("collection", "ml20news"); options.put("query", "content_txt:[* TO *]"); options.put("fields", "content_txt"); DataFrame solrData = sqlContext.read().format("solr").options(options).load(); DataFrame sample = solrData.sample(false, 0.1d, 5150).select("content_txt"); List<Row> rows = sample.collectAsList(); System.out.println(">> loaded " + rows.size() + " docs to classify"); StructType schema = sample.schema(); CrossValidatorModel cvModel = CrossValidatorModel.load("ml-pipeline-model"); PipelineModel bestModel = (PipelineModel) cvModel.bestModel(); int r = 0; startMs = System.currentTimeMillis(); for (Row next : rows) { Row oneRow = RowFactory.create(next.getString(0)); DataFrame oneRowDF = sqlContext.createDataFrame(Collections.<Row>singletonList(oneRow), schema); DataFrame scored = bestModel.transform(oneRowDF); Row scoredRow = scored.collect()[0]; String predictedLabel = scoredRow.getString(scoredRow.fieldIndex("predictedLabel")); // an acutal app would save the predictedLabel // System.out.println(">> for row["+r+"], model returned "+scoredRows.length+" rows, // "+scoredRows[0]); r++; } diffMs = (System.currentTimeMillis() - startMs); System.out.println(">> took " + diffMs + " ms to score " + rows.size() + " docs"); return 0; }
/**
 * Computes the crosstab of columns "a" and "b" of the "testData2" table and checks both
 * the generated column names and the contents of every row.
 *
 * <p>After sorting with {@code crosstabRowComparator}, the first column of the rows is
 * expected to read "1", "2", ... in order, and both count columns are expected to be 1.
 */
@Test
public void testCrosstab() {
  DataFrame df = context.table("testData2");
  DataFrame crosstab = df.stat().crosstab("a", "b");

  String[] columnNames = crosstab.schema().fieldNames();
  Assert.assertEquals("a_b", columnNames[0]);
  Assert.assertEquals("2", columnNames[1]);
  Assert.assertEquals("1", columnNames[2]);

  Row[] rows = crosstab.collect();
  // Sort so the row order is deterministic before comparing against the running counter.
  Arrays.sort(rows, crosstabRowComparator);

  int count = 1;
  for (Row row : rows) {
    // Expected value goes first so that a failure message reports the values correctly.
    Assert.assertEquals(String.valueOf(count), row.get(0).toString());
    Assert.assertEquals(1L, row.getLong(1));
    Assert.assertEquals(1L, row.getLong(2));
    count++;
  }
}