Exemplo n.º 1
0
 public static void create(final Configuration configuration) {
   final SparkConf sparkConf = new SparkConf();
   configuration
       .getKeys()
       .forEachRemaining(key -> sparkConf.set(key, configuration.getProperty(key).toString()));
   sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
   CONTEXT = SparkContext.getOrCreate(sparkConf);
 }
Exemplo n.º 2
0
  public SparkMapReduce(
      final SparkConf conf,
      final String name,
      final IMapperFunction<KEYIN, VALUEIN, K, V> pMapper,
      final IReducerFunction<K, V, KOUT, VOUT> pRetucer,
      IPartitionFunction<K> pPartitioner,
      IKeyValueConsumer<KOUT, VOUT>... pConsumer) {
    setMap(pMapper);
    setReduce(pRetucer);
    setPartitioner(pPartitioner);

    for (int i = 0; i < pConsumer.length; i++) {
      IKeyValueConsumer<KOUT, VOUT> cns = pConsumer[i];
      addConsumer(cns);
    }
    conf.setAppName(name);
  }
Exemplo n.º 3
0
  private static void init() {
    SparkConf conf = new SparkConf();
    conf.setAppName("binend countByValue");
    conf.setMaster("spark://localhost:7077");

    JavaSparkContext jsc = new JavaSparkContext(conf);
    jsc.addJar(
        "/home/titanic/soft/intelijWorkspace/github-spark/com-hadoop-spark/target/com-hadoop-spark-1.0-SNAPSHOT.jar");

    List<Integer> list = new ArrayList<Integer>();

    for (int x = 0; x <= 10; x++) {
      list.add(x);
    }

    JavaRDD<Integer> rdd = jsc.parallelize(list);
    Map<Integer, Long> map = rdd.countByValue();
    System.out.println(map);
  }
Exemplo n.º 4
0
  public static void main(String[] args) throws Exception {

    Schema schema =
        new Schema.Builder()
            .addColumnsDouble("Sepal length", "Sepal width", "Petal length", "Petal width")
            .addColumnInteger("Species")
            .build();

    SparkConf conf = new SparkConf();
    conf.setMaster("local[*]");
    conf.setAppName("DataVec Example");

    JavaSparkContext sc = new JavaSparkContext(conf);

    String directory =
        new ClassPathResource("IrisData/iris.txt")
            .getFile()
            .getParent(); // Normally just define your directory like "file:/..." or "hdfs:/..."
    JavaRDD<String> stringData = sc.textFile(directory);

    // We first need to parse this comma-delimited (CSV) format; we can do this using
    // CSVRecordReader:
    RecordReader rr = new CSVRecordReader();
    JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));

    int maxHistogramBuckets = 10;
    DataAnalysis dataAnalysis = AnalyzeSpark.analyze(schema, parsedInputData, maxHistogramBuckets);

    System.out.println(dataAnalysis);

    // We can get statistics on a per-column basis:
    DoubleAnalysis da = (DoubleAnalysis) dataAnalysis.getColumnAnalysis("Sepal length");
    double minValue = da.getMin();
    double maxValue = da.getMax();
    double mean = da.getMean();

    HtmlAnalysis.createHtmlAnalysisFile(dataAnalysis, new File("DataVecIrisAnalysis.html"));

    // To write to HDFS instead:
    // String htmlAnalysisFileContents = HtmlAnalysis.createHtmlAnalysisString(dataAnalysis);
    // SparkUtils.writeStringToFile("hdfs://your/hdfs/path/here",htmlAnalysisFileContents,sc);
  }
 private static JavaSparkContext createSparkContext(SparkContextOptions contextOptions) {
   if (contextOptions.getUsesProvidedSparkContext()) {
     LOG.info("Using a provided Spark Context");
     JavaSparkContext jsc = contextOptions.getProvidedSparkContext();
     if (jsc == null || jsc.sc().isStopped()) {
       LOG.error("The provided Spark context " + jsc + " was not created or was stopped");
       throw new RuntimeException("The provided Spark context was not created or was stopped");
     }
     return jsc;
   } else {
     LOG.info("Creating a brand new Spark Context.");
     SparkConf conf = new SparkConf();
     if (!conf.contains("spark.master")) {
       // set master if not set.
       conf.setMaster(contextOptions.getSparkMaster());
     }
     conf.setAppName(contextOptions.getAppName());
     // register immutable collections serializers because the SDK uses them.
     conf.set("spark.kryo.registrator", BeamSparkRunnerRegistrator.class.getName());
     conf.set("spark.serializer", KryoSerializer.class.getName());
     return new JavaSparkContext(conf);
   }
 }
Exemplo n.º 6
0
  /** Main method for performing the random partition based model ensembler evaluation */
  public static void main(String[] args) {

    // Construction of Spark Configuration
    SparkConf sContext = new SparkConf();
    sContext.setMaster("local[4]");
    sContext.setAppName("JavaLR");
    sContext.set("spark.executor.memory", "4G");

    // Creates the spark context
    sc = new JavaSparkContext(sContext); // "local[4]", "JavaLR");

    // Load train and test data
    JavaRDD<String> trainingData =
        readData("/Users/erangap/Documents/ML_Project/datasets/trainImputedNormalized.csv", "Id")
            .sample(false, 0.1, 11L);
    JavaRDD<String> testdata =
        readData("/Users/erangap/Documents/ML_Project/datasets/testImputedNormalized.csv", "Id")
            .sample(false, 0.1, 11L);

    // trainingData.saveAsTextFile("/Users/erangap/Documents/ML_Project/datasets/reduced.csv");
    JavaRDD<LabeledPoint> points = trainingData.map(new ParsePoint());
    // points.persist(StorageLevel.MEMORY_AND_DISK());
    // System.out.println(points.first().features());
    JavaRDD<LabeledPoint> testPoints = testdata.map(new ParsePoint());
    // testPoints.persist(StorageLevel.MEMORY_AND_DISK());

    System.out.println("Total number of records -> " + points.count());

    RandomPartitionedEnSembler ensembler = new RandomPartitionedEnSembler();
    ensembler.setNoofModels(32);
    ensembler.setThreshold(0.499999);

    // Perform the training
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    Date trainStartTime = Calendar.getInstance().getTime();
    String trainStart = dateFormat.format(trainStartTime);
    ensembler.train(points);
    Date trainEndTime = Calendar.getInstance().getTime();
    String trainEnd = dateFormat.format(trainEndTime);

    // Training time calculations and console print
    long trainElapsed = (trainEndTime.getTime() - trainStartTime.getTime()) / 1000;
    System.out.println("Training Started at -> " + trainStart);
    System.out.println("Training Ended at -> " + trainEnd);
    System.out.println("Time Taken to Train -> " + trainElapsed + " Sec.");

    // Prepare data for testing
    JavaRDD<Double> testingLabels =
        testPoints
            .map(
                new Function<LabeledPoint, Double>() {

                  private static final long serialVersionUID = -6597374940461185814L;

                  public Double call(LabeledPoint dataPoint) throws Exception {
                    return dataPoint.label();
                  }
                })
            .cache();
    List<Double> classLabels = testingLabels.toArray();

    // Perform the predictions
    Date predictStartTime = Calendar.getInstance().getTime();
    String predictStart = dateFormat.format(predictStartTime);
    List<Double> predictedLabels = ensembler.voteAndPredit(testPoints).toArray();
    Date predictEndTime = Calendar.getInstance().getTime();
    String predictEnd = dateFormat.format(predictEndTime);

    // Predict time calculations and console print
    long preditElapsed = (predictEndTime.getTime() - predictStartTime.getTime()) / 1000;
    System.out.println("Prediction Started at -> " + predictStart);
    System.out.println("Prediction Ended at -> " + predictEnd);
    System.out.println("Time Taken to Predit -> " + preditElapsed + " Sec.");

    // Calculate and Display the accuracy
    System.out.println("Testing accuracy (%): " + Metrics.accuracy(classLabels, predictedLabels));
    BinaryClassificationMetrics binaryClassificationMetrics =
        getBinaryClassificationMatrix(ensembler, testPoints);
    System.out.println("Area under the curve -> " + binaryClassificationMetrics.areaUnderROC());
  }
Exemplo n.º 7
0
 public static void create(final String master) {
   final SparkConf sparkConf = new SparkConf();
   sparkConf.setAppName("Apache TinkerPop's Spark-Gremlin");
   sparkConf.setMaster(master);
   CONTEXT = SparkContext.getOrCreate(sparkConf);
 }