public static void create(final Configuration configuration) {
  final SparkConf sparkConf = new SparkConf();
  // Copy every property from the supplied configuration onto the SparkConf.
  configuration
      .getKeys()
      .forEachRemaining(key -> sparkConf.set(key, configuration.getProperty(key).toString()));
  sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
  CONTEXT = SparkContext.getOrCreate(sparkConf);
}
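/*
 * Hypothetical usage sketch for the create(Configuration) helper above. It assumes an Apache
 * Commons Configuration implementation (BaseConfiguration) is on the classpath; the property
 * keys shown are ordinary Spark settings chosen for illustration, not values taken from the
 * original code.
 */
public static void createLocalContextExample() {
  final BaseConfiguration configuration = new BaseConfiguration();
  configuration.setProperty("spark.master", "local[4]");
  configuration.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  // Every key in the configuration is copied onto the SparkConf before the context is created.
  create(configuration);
}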
public SparkMapReduce(
    final SparkConf conf,
    final String name,
    final IMapperFunction<KEYIN, VALUEIN, K, V> pMapper,
    final IReducerFunction<K, V, KOUT, VOUT> pReducer,
    IPartitionFunction<K> pPartitioner,
    IKeyValueConsumer<KOUT, VOUT>... pConsumer) {
  // Wire up the map, reduce, and partition functions for this job.
  setMap(pMapper);
  setReduce(pReducer);
  setPartitioner(pPartitioner);
  // Register every supplied key/value consumer.
  for (IKeyValueConsumer<KOUT, VOUT> consumer : pConsumer) {
    addConsumer(consumer);
  }
  conf.setAppName(name);
}
private static void init() {
  SparkConf conf = new SparkConf();
  conf.setAppName("binend countByValue");
  conf.setMaster("spark://localhost:7077");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  // Ship the application jar to the standalone cluster so the workers can load our classes.
  jsc.addJar(
      "/home/titanic/soft/intelijWorkspace/github-spark/com-hadoop-spark/target/com-hadoop-spark-1.0-SNAPSHOT.jar");

  // Build a small list of integers, parallelize it, and count how often each value occurs.
  List<Integer> list = new ArrayList<Integer>();
  for (int x = 0; x <= 10; x++) {
    list.add(x);
  }
  JavaRDD<Integer> rdd = jsc.parallelize(list);
  Map<Integer, Long> map = rdd.countByValue();
  System.out.println(map);
}
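/*
 * A minimal sketch of an equivalent way to compute the same per-value counts by hand, using
 * only standard JavaRDD operations (mapToPair, reduceByKey, collectAsMap). It assumes a local
 * master ("local[*]") so no extra jar needs to be shipped; the method name is illustrative.
 */
private static void countByValueManually() {
  SparkConf conf = new SparkConf().setAppName("manual countByValue").setMaster("local[*]");
  JavaSparkContext jsc = new JavaSparkContext(conf);
  List<Integer> list = new ArrayList<Integer>();
  for (int x = 0; x <= 10; x++) {
    list.add(x);
  }
  // Pair each value with 1, then sum the counts per value and collect them to the driver.
  Map<Integer, Long> counts =
      jsc.parallelize(list)
          .mapToPair(x -> new Tuple2<Integer, Long>(x, 1L))
          .reduceByKey(Long::sum)
          .collectAsMap();
  System.out.println(counts);
  jsc.stop();
}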
public static void main(String[] args) throws Exception {
  Schema schema =
      new Schema.Builder()
          .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width")
          .addColumnInteger("Species")
          .build();

  SparkConf conf = new SparkConf();
  conf.setMaster("local[*]");
  conf.setAppName("DataVec Example");
  JavaSparkContext sc = new JavaSparkContext(conf);

  String directory =
      new ClassPathResource("IrisData/iris.txt")
          .getFile()
          .getParent(); // Normally just define your directory like "file:/..." or "hdfs:/..."
  JavaRDD<String> stringData = sc.textFile(directory);

  // We first need to parse this comma-delimited (CSV) format; we can do this using CSVRecordReader:
  RecordReader rr = new CSVRecordReader();
  JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));

  int maxHistogramBuckets = 10;
  DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets);
  System.out.println(dataAnalysis);

  // We can get statistics on a per-column basis:
  DoubleAnalysis da = (DoubleAnalysis) dataAnalysis.getColumnAnalysis("Sepal length");
  double minValue = da.getMin();
  double maxValue = da.getMax();
  double mean = da.getMean();

  HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html"));

  // To write to HDFS instead:
  // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
  // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here", htmlAnalysisFileContents, sc);
}
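/*
 * A small follow-on sketch, not part of the original example: given the Schema and DataAnalysis
 * produced above, it prints the analysis for every column instead of only "Sepal length".
 * Schema#getColumnNames and DataAnalysis#getColumnAnalysis are the DataVec calls assumed here.
 */
private static void printPerColumnAnalysis(Schema schema, DataAnalysis dataAnalysis) {
  for (String columnName : schema.getColumnNames()) {
    // Each column type (Double, Integer, ...) is reported by its own analysis subclass.
    System.out.println(columnName + ": " + dataAnalysis.getColumnAnalysis(columnName));
  }
}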
private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
  if (contextOptions.getUsesProvidedSparkContext()) {
    LOG.info("Using a provided Spark Context");
    JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
    if (jsc == null || jsc.sc().isStopped()) {
      LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
      throw new RuntimeException("The provided Spark context was not created or was stopped");
    }
    return jsc;
  } else {
    LOG.info("Creating a brand new Spark Context.");
    SparkConf conf = new SparkConf();
    if (!conf.contains("spark.master")) {
      // Set master if not set.
      conf.setMaster(contextOptions.getSparkMaster());
    }
    conf.setAppName(contextOptions.getAppName());
    // Register immutable collections serializers because the SDK uses them.
    conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
    conf.set("spark.serializer", KryoSerializer.class.getName());
    return new JavaSparkContext(conf);
  }
}
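/*
 * Hypothetical caller sketch for the provided-context branch above, assuming Beam's
 * PipelineOptionsFactory and the SparkContextOptions setters; the master URL and app name are
 * illustrative values only.
 */
private static JavaSparkContext createFromProvidedContextExample() {
  JavaSparkContext jsc = new JavaSparkContext("local[2]", "provided-context-example");
  SparkContextOptions options = PipelineOptionsFactory.as(SparkContextOptions.class);
  options.setUsesProvidedSparkContext(true);
  options.setProvidedSparkContext(jsc);
  // createSparkContext returns the provided, still-running context rather than building a new one.
  return createSparkContext(options);
}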
/** Main method for performing the random-partition-based model ensembler evaluation. */
public static void main(String[] args) {
  // Construct the Spark configuration
  SparkConf sparkConf = new SparkConf();
  sparkConf.setMaster("local[4]");
  sparkConf.setAppName("JavaLR");
  sparkConf.set("spark.executor.memory", "4G");

  // Create the Spark context
  sc = new JavaSparkContext(sparkConf); // equivalent to new JavaSparkContext("local[4]", "JavaLR")

  // Load train and test data
  JavaRDD<String> trainingData =
      readData("/Users/erangap/Documents/ML_Project/datasets/trainImputedNormalized.csv", "Id")
          .sample(false, 0.1, 11L);
  JavaRDD<String> testdata =
      readData("/Users/erangap/Documents/ML_Project/datasets/testImputedNormalized.csv", "Id")
          .sample(false, 0.1, 11L);
  // trainingData.saveAsTextFile("/Users/erangap/Documents/ML_Project/datasets/reduced.csv");
  JavaRDD<LabeledPoint> points = trainingData.map(new ParsePoint());
  // points.persist(StorageLevel.MEMORY_AND_DISK());
  // System.out.println(points.first().features());
  JavaRDD<LabeledPoint> testPoints = testdata.map(new ParsePoint());
  // testPoints.persist(StorageLevel.MEMORY_AND_DISK());
  System.out.println("Total number of records -> " + points.count());

  RandomPartitionedEnSembler ensembler = new RandomPartitionedEnSembler();
  ensembler.setNoofModels(32);
  ensembler.setThreshold(0.499999);

  // Perform the training
  DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
  Date trainStartTime = Calendar.getInstance().getTime();
  String trainStart = dateFormat.format(trainStartTime);
  ensembler.train(points);
  Date trainEndTime = Calendar.getInstance().getTime();
  String trainEnd = dateFormat.format(trainEndTime);

  // Training time calculations and console print
  long trainElapsed = (trainEndTime.getTime() - trainStartTime.getTime()) / 1000;
  System.out.println("Training Started at -> " + trainStart);
  System.out.println("Training Ended at -> " + trainEnd);
  System.out.println("Time Taken to Train -> " + trainElapsed + " Sec.");

  // Prepare data for testing
  JavaRDD<Double> testingLabels =
      testPoints
          .map(
              new Function<LabeledPoint, Double>() {
                private static final long serialVersionUID = -6597374940461185814L;

                public Double call(LabeledPoint dataPoint) throws Exception {
                  return dataPoint.label();
                }
              })
          .cache();
  List<Double> classLabels = testingLabels.collect();

  // Perform the predictions
  Date predictStartTime = Calendar.getInstance().getTime();
  String predictStart = dateFormat.format(predictStartTime);
  List<Double> predictedLabels = ensembler.voteAndPredit(testPoints).collect();
  Date predictEndTime = Calendar.getInstance().getTime();
  String predictEnd = dateFormat.format(predictEndTime);

  // Prediction time calculations and console print
  long predictElapsed = (predictEndTime.getTime() - predictStartTime.getTime()) / 1000;
  System.out.println("Prediction Started at -> " + predictStart);
  System.out.println("Prediction Ended at -> " + predictEnd);
  System.out.println("Time Taken to Predict -> " + predictElapsed + " Sec.");

  // Calculate and display the accuracy
  System.out.println("Testing accuracy (%): " + Metrics.accuracy(classLabels, predictedLabels));
  BinaryClassificationMetrics binaryClassificationMetrics =
      getBinaryClassificationMatrix(ensembler, testPoints);
  System.out.println("Area under the curve -> " + binaryClassificationMetrics.areaUnderROC());
}
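/*
 * A minimal sketch of what a helper like getBinaryClassificationMatrix could look like; the
 * voteAndPredit call and the zip-based pairing of predictions with labels are assumptions,
 * while BinaryClassificationMetrics itself is the standard MLlib class built from
 * (prediction, label) pairs.
 */
private static BinaryClassificationMetrics getBinaryClassificationMatrixSketch(
    RandomPartitionedEnSembler ensembler, JavaRDD<LabeledPoint> testPoints) {
  JavaRDD<Double> predictions = ensembler.voteAndPredit(testPoints);
  JavaRDD<Double> labels =
      testPoints.map(
          new Function<LabeledPoint, Double>() {
            public Double call(LabeledPoint dataPoint) throws Exception {
              return dataPoint.label();
            }
          });
  // Pair each prediction with its true label; zip assumes both RDDs preserve the same ordering.
  JavaRDD<Tuple2<Object, Object>> scoreAndLabels =
      predictions
          .zip(labels)
          .map(
              new Function<Tuple2<Double, Double>, Tuple2<Object, Object>>() {
                public Tuple2<Object, Object> call(Tuple2<Double, Double> pair) throws Exception {
                  return new Tuple2<Object, Object>(pair._1(), pair._2());
                }
              });
  return new BinaryClassificationMetrics(scoreAndLabels.rdd());
}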
public static void create(final String master) {
  final SparkConf sparkConf = new SparkConf();
  sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
  sparkConf.setMaster(master);
  // Reuse an existing SparkContext if one is already running, otherwise create a new one.
  CONTEXT = SparkContext.getOrCreate(sparkConf);
}