public static void main(String[] args) {
    if (!parseArgs(args)) {
        throw new IllegalArgumentException("Failed to parse arguments.");
    }
    SparkConf conf = new SparkConf().setAppName("Logistic Regression with SGD");
    SparkContext sc = new SparkContext(conf);
    // inputFile and outputFile are fields populated by parseArgs(args).
    JavaRDD<String> data = sc.textFile(inputFile, 1).toJavaRDD();
    // Parse each CSV line into a LabeledPoint; malformed lines map to null and are filtered out below.
    JavaRDD<LabeledPoint> training = data.map(
            new Function<String, LabeledPoint>() {
                public LabeledPoint call(String line) {
                    String[] splits = line.split(",");
                    double[] features = new double[3];
                    try {
                        features[0] = Double.valueOf(splits[1]);
                        features[1] = Double.valueOf(splits[2]);
                        features[2] = Double.valueOf(splits[3]);
                        // The label is assumed to be the first CSV column; the remaining columns are features.
                        return new LabeledPoint(Double.valueOf(splits[0]), Vectors.dense(features));
                    } catch (NumberFormatException e) {
                        return null; // Malformed line; dropped by the filter below.
                    }
                }
            })
            .filter(
                    new Function<LabeledPoint, Boolean>() {
                        public Boolean call(LabeledPoint p) {
                            return p != null;
                        }
                    })
            .cache();
    LogisticRegressionWithSGD lrs = new LogisticRegressionWithSGD();
    LogisticRegressionModel model = lrs.run(training.rdd());
    model.save(sc, outputFile);
    sc.stop();
}
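// A minimal usage sketch (not part of the original source): the model saved above could later be
// reloaded and applied to new feature vectors. The method name and the example feature values
// are assumptions for illustration; LogisticRegressionModel.load and predict are standard MLlib calls.
public static void scoreExample(SparkContext sc, String modelPath) {
    LogisticRegressionModel model = LogisticRegressionModel.load(sc, modelPath);
    double prediction = model.predict(Vectors.dense(1.0, 2.0, 3.0)); // hypothetical feature vector
    System.out.println("Predicted label: " + prediction);
}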
public static void create(final Configuration configuration) {
    final SparkConf sparkConf = new SparkConf();
    // Copy every property from the supplied configuration into the Spark configuration.
    configuration
            .getKeys()
            .forEachRemaining(key -> sparkConf.set(key, configuration.getProperty(key).toString()));
    sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
    CONTEXT = SparkContext.getOrCreate(sparkConf);
}
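// A minimal usage sketch, assuming the Configuration argument is an Apache Commons Configuration;
// the property keys and values below are examples only.
public static void createExample() {
    BaseConfiguration config = new BaseConfiguration();
    config.setProperty("spark.master", "local[4]");
    config.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    create(config); // every property is forwarded into the SparkConf before the context is created
}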
public static void main(String[] args) throws FileNotFoundException {
    // Set up contexts.
    SparkContext sc = SparkUtils.getSparkContext();
    SQLContext sqlContext = SparkUtils.getSqlContext(sc);

    String workingDirectory = args[0];

    // Read the positive data set: cases where the PDB ID occurs in a sentence of the primary citation.
    String positivesIFileName = workingDirectory + "/" + "PositivesI.parquet";
    DataFrame positivesI = sqlContext.read().parquet(positivesIFileName);
    sqlContext.registerDataFrameAsTable(positivesI, "positivesI");
    DataFrame positives = sqlContext.sql(
            "SELECT pdb_id, match_type, deposition_year, pmc_id, pm_id, publication_year, "
                    + "CAST(primary_citation AS double) AS label, sentence, blinded_sentence FROM positivesI");
    positives.show(10);

    String positivesIIFileName = workingDirectory + "/" + "PositivesII.parquet";
    DataFrame positivesII = sqlContext.read().parquet(positivesIIFileName);
    // System.out.println("Sampling 16% of positivesII");
    // positivesII = positivesII.sample(false, 0.16, 1);
    sqlContext.registerDataFrameAsTable(positivesII, "positivesII");
    DataFrame negatives = sqlContext.sql(
            "SELECT pdb_id, match_type, deposition_year, pmc_id, pm_id, publication_year, "
                    + "CAST(primary_citation AS double) AS label, sentence, blinded_sentence FROM positivesII "
                    + "WHERE sentence NOT LIKE '%deposited%' AND sentence NOT LIKE '%submitted%'");
    negatives.show(10);

    long start = System.nanoTime();

    String metricFileName = workingDirectory + "/" + "PdbPrimaryCitationMetrics.txt";
    PrintWriter writer = new PrintWriter(metricFileName);
    writer.println("PDB Primary Citation Classification: Logistic Regression Results");

    String modelFileName = workingDirectory + "/" + "PdbPrimaryCitationModel.ser";
    writer.println(train(sqlContext, positives, negatives, modelFileName));
    writer.close();

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.stop();
}
public static void refresh() {
    if (null == CONTEXT)
        throw new IllegalStateException("The Spark context has not been created.");
    final Set<String> keepNames = new HashSet<>();
    // Record the names of all RDDs that Spark still keeps persisted.
    for (final RDD<?> rdd : JavaConversions.asJavaIterable(CONTEXT.persistentRdds().values())) {
        if (null != rdd.name()) {
            keepNames.add(rdd.name());
            NAME_TO_RDD.put(rdd.name(), rdd);
        }
    }
    // Remove all stale names from the NAME_TO_RDD map.
    NAME_TO_RDD
            .keySet()
            .stream()
            .filter(key -> !keepNames.contains(key))
            .collect(Collectors.toList())
            .forEach(NAME_TO_RDD::remove);
}
/** Converts a POLoad operator into a NewHadoopRDD backed by PigInputFormat. */
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad) throws IOException {
    if (predecessorRdds.size() != 0) {
        throw new RuntimeException(
                "Should not have predecessorRdds for Load. Got: " + predecessorRdds);
    }
    JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
    configureLoader(physicalPlan, poLoad, loadJobConf);
    RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
            poLoad.getLFile().getFileName(),
            PigInputFormat.class, // InputFormat class
            Text.class,           // K class
            Tuple.class,          // V class
            loadJobConf);
    // Map to keep only the value, yielding RDD<Tuple>.
    return hadoopRDD.map(TO_TUPLE_FUNCTION, ScalaUtil.getClassTag(Tuple.class));
}
public static void create(final String master) {
    final SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
    sparkConf.setMaster(master);
    CONTEXT = SparkContext.getOrCreate(sparkConf);
}
public static void close() {
    NAME_TO_RDD.clear();
    if (null != CONTEXT)
        CONTEXT.stop();
    CONTEXT = null;
}
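// A minimal lifecycle sketch, assuming the create/refresh/close methods above live on a single
// helper class (called Spark here, as in TinkerPop's Spark-Gremlin utility); the master URL is
// an example value.
public static void lifecycleExample() {
    Spark.create("local[4]"); // build or reuse the shared SparkContext
    Spark.refresh();          // re-sync NAME_TO_RDD with the RDDs Spark still persists
    Spark.close();            // stop the context and clear the cached names
}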
@BeforeClass
public static void setUp() {
    SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkIT");
    javaSparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
}
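// A minimal sketch of a test that could follow the setup above; the test name and the sample
// data are assumptions for illustration.
@Test
public void countsParallelizedElements() {
    JavaRDD<Integer> rdd = javaSparkContext.parallelize(Arrays.asList(1, 2, 3, 4));
    assertEquals(4L, rdd.count());
}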