/**
 * Prints the first {@code n} rows of the incoming DataFrame to the workflow
 * log, then passes the (unmodified) DataFrame on to the next node.
 */
@Override
public void execute(
    JavaSparkContext ctx, SQLContext sqlContext, WorkflowContext workflowContext, DataFrame df) {
  // Trace which node is running so workflow output can be attributed.
  workflowContext.out("Executing NodePrintFirstNRows : " + id);
  // take(n) brings at most the first n rows back to the driver.
  for (Row row : df.take(n)) {
    workflowContext.out(row.toString());
  }
  // Continue the workflow chain with the same DataFrame.
  super.execute(ctx, sqlContext, workflowContext, df);
}
/**
 * Loads a JSON fixture of tweets, registers it as a temp table, and returns
 * the 10 most-retweeted tweets rendered as strings.
 *
 * @param jc job context supplying the {@link SQLContext}
 * @return string renderings of the top-10 rows (text, retweetCount)
 */
@Override
public ArrayList<String> call(JobContext jc) {
  String inputFile = "src/test/resources/testweet.json";
  SQLContext sqlctx = jc.sqlctx();
  DataFrame input = sqlctx.jsonFile(inputFile);
  input.registerTempTable("tweets");
  // BUG FIX: the previous query sorted ascending, so "topTweets" actually
  // held the 10 LEAST-retweeted tweets. DESC yields the true top 10.
  DataFrame topTweets =
      sqlctx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount DESC LIMIT 10");
  // Presize: the query is capped at 10 rows.
  ArrayList<String> tweetList = new ArrayList<>(10);
  for (Row r : topTweets.collect()) {
    tweetList.add(r.toString());
  }
  return tweetList;
}