@After
 public void tearDown() throws Exception {
   if (sc != null) {
     sc.stop();
     sc.close();
   }
 }
Example #2
  public static void trainModel(String filteredDataPath, String modelpath) throws IOException {

    String line = "";
    String combline = "";
    // read and process raw data
    BufferedReader br = new BufferedReader(new FileReader(filteredDataPath));

    while ((line = br.readLine()) != null) combline = combline + " " + line;
    br.close();

    List<String> words = Lists.newArrayList(combline.split(" "));
    List<List<String>> localDoc = Lists.newArrayList(words, words);

    // build a context object
    JavaSparkContext sc = new JavaSparkContext("local", "Word2VecSuite");
    JavaRDD<List<String>> doc = sc.parallelize(localDoc);

    // training settings
    Word2Vec word2vec = new Word2Vec().setVectorSize(100).setMinCount(50).setSeed(42L);

    // train
    Word2VecModel model = word2vec.fit(doc);

    // save model
    SparkContext sc1 = JavaSparkContext.toSparkContext(sc);
    model.save(sc1, modelpath);
    System.out.println("Model has been saved in folder: " + modelpath);
  }
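
A counterpart that reloads the saved model is not shown in this example; below is a minimal sketch, assuming the standard MLlib Word2VecModel.load / findSynonyms API and a hypothetical query word.

  public static void querySavedModel(String modelpath, String queryWord) {
    JavaSparkContext sc = new JavaSparkContext("local", "Word2VecQuery");
    // Reload the model written by trainModel() above.
    Word2VecModel model = Word2VecModel.load(JavaSparkContext.toSparkContext(sc), modelpath);
    // Print the 5 nearest neighbours of the query word in the embedding space.
    for (Tuple2<String, Object> synonym : model.findSynonyms(queryWord, 5)) {
      System.out.println(synonym._1() + " : " + synonym._2());
    }
    sc.close();
  }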
Example #3
  public static void main(String[] args) {
    String logFile;

    if (args.length != 0) logFile = args[0];
    else logFile = "/media/gf/Java/spark-1.4.0-bin-hadoop2.6/README.md";

    final SparkConf conf = new SparkConf().setAppName("Simple Application");

    final JavaSparkContext sc = new JavaSparkContext(conf);
    final JavaRDD<String> logData = sc.textFile(logFile).cache();

    final String[] check = getFilterSet();

    System.out.println("Start: " + new Date());
    for (int i = 0; i < check.length; i++) {
      final int post = i;
      long count =
          logData
              .filter(
                  new Function<String, Boolean>() {
                    public Boolean call(String s) {
                      return s.contains(check[post]);
                    }
                  })
              .count();

      System.out.println("Lines with " + check[i] + ": " + count);
    }
    System.out.println("End: " + new Date());

    sc.close();
  }
Example #4
  public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(javaSparkContext);

    System.out.println("=== Data source: RDD ===");
    // Load a text file and convert each line to a Java Bean.
    JavaRDD<Person> people =
        javaSparkContext
            .textFile("people.txt")
            .map(
                (line) -> {
                  String[] parts = line.split(",");
                  Person person = new Person();
                  person.setName(parts[0]);
                  person.setAge(Integer.parseInt(parts[1].trim()));
                  return person;
                });

    // Apply a schema to an RDD of Java Beans and register it as a table.
    DataFrame dataFrame = sqlContext.createDataFrame(people, Person.class);
    dataFrame.registerTempTable("people");

    // SQL can be run over RDDs that have been registered as tables.
    DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

    // Collect the result values from the DataFrame's underlying RDD.
    List<String> teenagersName =
        teenagers.toJavaRDD().map((row) -> "Name : " + row.getString(0)).collect();

    teenagersName.forEach(
        (name) -> {
          System.out.println(name);
        });
  }
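
Example #4 relies on a Person JavaBean that is not shown; a minimal sketch of what it would need (assumed, not taken from the source) so that createDataFrame(people, Person.class) can infer the schema:

  public static class Person implements java.io.Serializable {
    private String name;
    private int age;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
  }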
Example #5
  public static void main(String[] args) {
    // Create a java spark context
    SparkConf conf = new SparkConf().setAppName("Accumulators");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Create an accumulator to keep track of number of blank lines in callSigns.txt
    final Accumulator<Integer> blankLines = sc.accumulator(0);

    JavaRDD<String> input = sc.textFile("src/main/resources/callSigns.txt");

    JavaRDD<String> callSigns =
        input.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                if (s.equals("")) {
                  blankLines.add(1);
                }
                return Arrays.asList(s.split(" "));
              }
            });

    callSigns.saveAsTextFile("Chapter5-Output");
    System.out.println("Number of blank lines present in text file : " + blankLines);
  }
Example #6
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: KMeansMP <input_file> <results>");
      System.exit(1);
    }
    String inputFile = args[0];
    String results_path = args[1];
    JavaPairRDD<Integer, Iterable<String>> results;
    int k = 4;
    int iterations = 100;
    int runs = 1;
    long seed = 0;
    final KMeansModel model;

    SparkConf sparkConf = new SparkConf().setAppName("KMeans MP");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<String> lines = sc.textFile(inputFile);

    JavaRDD<Vector> points = lines.map(new ParsePoint());
    JavaRDD<String> titles = lines.map(new ParseTitle());

    model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), 0);

    results = titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey();

    results.saveAsTextFile(results_path);

    sc.stop();
  }
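
The ParsePoint, ParseTitle, and ClusterCars helpers used above are not included in the snippet; the following is a rough sketch of the shapes they would need, assuming a CSV layout where the title is the first field and the numeric features follow (a hypothetical format, not confirmed by the source):

  static class ParsePoint implements Function<String, Vector> {
    @Override
    public Vector call(String line) {
      String[] tok = line.split(",");
      double[] features = new double[tok.length - 1];
      for (int i = 1; i < tok.length; i++) features[i - 1] = Double.parseDouble(tok[i]);
      return Vectors.dense(features);
    }
  }

  static class ParseTitle implements Function<String, String> {
    @Override
    public String call(String line) {
      return line.split(",")[0];
    }
  }

  static class ClusterCars implements PairFunction<Tuple2<String, Vector>, Integer, String> {
    private final KMeansModel model;

    ClusterCars(KMeansModel model) { this.model = model; }

    @Override
    public Tuple2<Integer, String> call(Tuple2<String, Vector> t) {
      // Key each title by the cluster its feature vector is assigned to.
      return new Tuple2<>(model.predict(t._2()), t._1());
    }
  }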
Example #7
  public static void main(String args[]) {
    if (args.length < 2) {
      System.out.println("JavaHBaseDistributedScan {master} {tableName}");
      return;
    }

    String master = args[0];
    String tableName = args[1];

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseDistributedScan");
    jsc.addJar("SparkHBase.jar");

    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    Scan scan = new Scan();
    scan.setCaching(100);

    JavaRDD<Tuple2<byte[], List<Tuple3<byte[], byte[], byte[]>>>> javaRdd =
        hbaseContext.hbaseRDD(tableName, scan);

    List<Tuple2<byte[], List<Tuple3<byte[], byte[], byte[]>>>> results = javaRdd.collect();

    System.out.println("Rows scanned: " + results.size());
  }
Example #8
  public void run(String master) {
    JavaSparkContext sc =
        new JavaSparkContext(
            master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    FlatMapFunction<Iterator<Integer>, AvgCount> setup =
        new FlatMapFunction<Iterator<Integer>, AvgCount>() {
          @Override
          public Iterable<AvgCount> call(Iterator<Integer> input) {
            AvgCount a = new AvgCount(0, 0);
            while (input.hasNext()) {
              a.total_ += input.next();
              a.num_ += 1;
            }
            ArrayList<AvgCount> ret = new ArrayList<AvgCount>();
            ret.add(a);
            return ret;
          }
        };
    Function2<AvgCount, AvgCount, AvgCount> combine =
        new Function2<AvgCount, AvgCount, AvgCount>() {
          @Override
          public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
          }
        };

    AvgCount result = rdd.mapPartitions(setup).reduce(combine);
    System.out.println(result.avg());
  }
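
The AvgCount helper used in this example is not shown; a minimal sketch consistent with the fields and the avg() call above:

  public static class AvgCount implements java.io.Serializable {
    int total_;
    int num_;

    AvgCount(int total, int num) {
      total_ = total;
      num_ = num;
    }

    double avg() {
      return total_ / (double) num_;
    }
  }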
Example #9
  public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaLogQuery");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    JavaRDD<String> dataSet =
        (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted =
        dataSet.mapToPair(
            new PairFunction<String, Tuple3<String, String, String>, Stats>() {
              @Override
              public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
                return new Tuple2<Tuple3<String, String, String>, Stats>(
                    extractKey(s), extractStats(s));
              }
            });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts =
        extracted.reduceByKey(
            new Function2<Stats, Stats, Stats>() {
              @Override
              public Stats call(Stats stats, Stats stats2) {
                return stats.merge(stats2);
              }
            });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?, ?> t : output) {
      System.out.println(t._1() + "\t" + t._2());
    }
    jsc.stop();
  }
Example #10
  public static void main(String args[]) {
    if (args.length < 3) {
      System.out.println("JavaHBaseBulkPutExample {master} {tableName} {columnFamily}");
      return;
    }

    String master = args[0];
    String tableName = args[1];
    String columnFamily = args[2];

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkPutExample");

    List<String> list = new ArrayList<String>();
    list.add("1," + columnFamily + ",a,1");
    list.add("2," + columnFamily + ",a,2");
    list.add("3," + columnFamily + ",a,3");
    list.add("4," + columnFamily + ",a,4");
    list.add("5," + columnFamily + ",a,5");
    JavaRDD<String> rdd = jsc.parallelize(list);

    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/opt/hadoop-2.6.0/etc/hadoop/core-site.xml"));
    conf.addResource(new Path("/opt/hbase/conf/hbase-site.xml"));

    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    hbaseContext.bulkPut(rdd, tableName, new PutFunction(), true);
  }
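
The PutFunction passed to bulkPut is not shown; below is a sketch of one possible implementation, assuming the "rowKey,columnFamily,qualifier,value" record layout built above (addColumn is the HBase 1.x method name; older clients use Put.add):

  public static class PutFunction implements Function<String, Put> {
    @Override
    public Put call(String v) {
      String[] cells = v.split(",");
      Put put = new Put(Bytes.toBytes(cells[0]));
      put.addColumn(Bytes.toBytes(cells[1]), Bytes.toBytes(cells[2]), Bytes.toBytes(cells[3]));
      return put;
    }
  }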
Example #11
 public SparkRuntime(
     SparkPipeline pipeline,
     JavaSparkContext sparkContext,
     Configuration conf,
     Map<PCollectionImpl<?>, Set<Target>> outputTargets,
     Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize,
     Map<PCollection<?>, StorageLevel> toCache,
     Map<PipelineCallable<?>, Set<Target>> allPipelineCallables) {
   this.pipeline = pipeline;
   this.sparkContext = sparkContext;
   this.conf = conf;
   this.counters =
       sparkContext.accumulator(
           Maps.<String, Map<String, Long>>newHashMap(), new CounterAccumulatorParam());
   this.ctxt =
       new SparkRuntimeContext(
           sparkContext.appName(),
           counters,
           sparkContext.broadcast(WritableUtils.toByteArray(conf)));
   this.outputTargets = Maps.newTreeMap(DEPTH_COMPARATOR);
   this.outputTargets.putAll(outputTargets);
   this.toMaterialize = toMaterialize;
   this.toCache = toCache;
   this.allPipelineCallables = allPipelineCallables;
   this.activePipelineCallables = allPipelineCallables.keySet();
   this.status.set(Status.READY);
   this.monitorThread =
       new Thread(
           new Runnable() {
             @Override
             public void run() {
               monitorLoop();
             }
           });
 }
Example #12
  public static void main(String[] args) {
    String logFile = "YOUR_SPARK_HOME/README.md"; // Should be some file on your system
    SparkConf conf = new SparkConf().setAppName("Simple Application");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> logData = sc.textFile(logFile).cache();

    long numAs =
        logData
            .filter(
                new Function<String, Boolean>() {
                  public Boolean call(String s) {
                    return s.contains("a");
                  }
                })
            .count();

    long numBs =
        logData
            .filter(
                new Function<String, Boolean>() {
                  public Boolean call(String s) {
                    return s.contains("b");
                  }
                })
            .count();

    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
  }
Example #13
 @Test
 public void testJavaSparkContextFunctions() throws Exception {
   SparkContext mockSparkContext = mock(SparkContext.class);
   JavaSparkContext mockJavaSparkContext = mock(JavaSparkContext.class);
   when(mockJavaSparkContext.sc()).thenReturn(mockSparkContext);
   GemFireJavaSparkContextFunctions wrapper = javaFunctions(mockJavaSparkContext);
   assertTrue(mockSparkContext == wrapper.sc);
 }
Example #14
 @Test
 public void testJavaFunctions1() throws Exception {
   SparkContext sc = mock(SparkContext.class);
   JavaSparkContext jsc = mock(JavaSparkContext.class);
   when(jsc.sc()).thenReturn(sc);
   SparkContextJavaFunctions scjf = javaFunctions(jsc);
   assertThat(scjf.sparkContext, is(jsc.sc()));
 }
Example #15
  /**
   * @param sqlctx
   * @param mb
   * @param containsID
   * @param schema
   * @return
   * @throws DMLRuntimeException
   */
  @SuppressWarnings("resource")
  private DataFrame createDataFrame(
      SQLContext sqlctx, MatrixBlock mb, boolean containsID, ValueType[] schema)
      throws DMLRuntimeException {
    // create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;

    for (int i = 0; i < mb.getNumRows(); i++) {
      Object[] row = new Object[clen];
      if (containsID) row[0] = i + 1;
      for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
        if (schema[j2] != ValueType.OBJECT) {
          row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
        } else {
          double[] tmp =
              DataConverter.convertToDoubleVector(
                  mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
          row[j2 + off] = new DenseVector(tmp);
          j += colsVector - 1;
        }
      }
      list.add(RowFactory.create(row));
    }

    // create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
      fields.add(
          DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
      DataType dt = null;
      switch (schema[j]) {
        case STRING:
          dt = DataTypes.StringType;
          break;
        case DOUBLE:
          dt = DataTypes.DoubleType;
          break;
        case INT:
          dt = DataTypes.LongType;
          break;
        case OBJECT:
          dt = new VectorUDT();
          break;
        default:
          throw new RuntimeException("Unsupported value type.");
      }
      fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);

    // create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sqlctx.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sqlctx.createDataFrame(rowRDD, dfSchema);
  }
Example #16
  public static void main(String[] args) {

    if (args.length == 0) {
      System.err.println("Usage: Main <file>");
      System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> lines = sc.textFile(args[0]);

    JavaPairRDD<String, Double> dayArrivalDelayPair =
        lines.flatMapToPair(
            line -> {
              String[] splitLine = line.split(SPLIT_PATTERN);
              String key = splitLine.length == 0 ? "" : splitLine[0];
              Double value = splitLine.length < 2 ? 0.0 : Double.valueOf(splitLine[1]);
              return Arrays.asList(new Tuple2<>(key, value));
            });

    JavaPairRDD<String, AverageWrapper> dayAverageWrapper =
        dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1));

    JavaPairRDD<String, AverageWrapper> daysValueCount =
        dayAverageWrapper.reduceByKey(
            (aw1, aw2) ->
                new AverageWrapper(
                    aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount()));

    Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap();
    List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>();
    listResults.addAll(resultMap.entrySet());
    Collections.sort(
        listResults,
        (entry1, entry2) ->
            Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue()));

    for (Map.Entry<String, AverageWrapper> entry : listResults) {
      System.out.printf(
          "%s -> (%f, %d)\n",
          entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount());
    }

    //        JavaPairRDD<String, Double> resultRDD =
    //                daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() /
    // averageWrapper.getCount());
    //
    //        Map<String, Double> results = resultRDD.collectAsMap();

    //        List<Map.Entry<String, Double>> listResults = new ArrayList<>();
    //        listResults.addAll(results.entrySet());
    //        Collections.sort(listResults, (entry1, entry2) ->
    // entry1.getValue().compareTo(entry2.getValue()));
    //
    //        for (Map.Entry<String, Double> entry : listResults) {
    //            System.out.printf("%s:\t%f\n", entry.getKey(), entry.getValue());
    //        }
  }
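
AverageWrapper is referenced but not defined in this example; a minimal sketch matching the constructor and accessors used above:

  public static class AverageWrapper implements java.io.Serializable {
    private final double value;
    private final int count;

    public AverageWrapper(double value, int count) {
      this.value = value;
      this.count = count;
    }

    public double getValue() { return value; }
    public int getCount() { return count; }
  }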
Example #17
  public static void main(String[] args) {
    String master = args[0];
    String appName = args[1];
    String path = args[2];

    SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> lines =
        sc.textFile(path)
            .filter(
                new Function<String, Boolean>() {
                  @Override
                  public Boolean call(String s) throws Exception {
                    return !s.isEmpty() && !s.contains("Total");
                  }
                });

    JavaRDD<String> usOnly =
        lines.filter(
            new Function<String, Boolean>() {
              @Override
              public Boolean call(String s) throws Exception {
                return s.contains("United States");
              }
            });

    JavaPairRDD<String, Integer> yearAndMedals =
        usOnly.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) throws Exception {
                String[] fields = s.split(",");
                return new Tuple2<String, Integer>(fields[3], Integer.parseInt(fields[8]));
              }
            });

    JavaPairRDD<String, Integer> reduced =
        yearAndMedals.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer accumulator, Integer currentValue) throws Exception {
                return accumulator + currentValue;
              }
            });

    JavaPairRDD<String, Integer> result =
        reduced.filter(
            new Function<Tuple2<String, Integer>, Boolean>() {
              @Override
              public Boolean call(Tuple2<String, Integer> tuple) throws Exception {
                return tuple._2 < 200;
              }
            });

    System.out.println();
    System.out.println(result.collect());
  }
Example #18
  public static boolean SpatialRangeQuery(
      String InputLocation1, String InputLocation2, String OutputLocation) {

    SparkConf sparkConfiguration = new SparkConf().setAppName("Group22-RangeQuery");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConfiguration);
    boolean result = getRangeQuery(InputLocation1, InputLocation2, OutputLocation, sparkContext);
    sparkContext.close();
    return result;
  }
Example #19
  /**
   * @param pfid
   * @param itervar
   * @param matrixvar
   * @param program
   * @param resultFile
   * @param input
   * @param ec
   * @param dpf
   * @param oi
   * @param tSparseCol
   * @param enableCPCaching
   * @param numReducers
   * @return
   * @throws DMLRuntimeException
   * @throws DMLUnsupportedOperationException
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String itervar,
      String matrixvar,
      String program,
      String resultFile,
      MatrixObject input,
      ExecutionContext ec,
      PDataPartitionFormat dpf,
      OutputInfo oi,
      boolean tSparseCol, // config params
      boolean enableCPCaching,
      int numReducers) // opt params
      throws DMLRuntimeException, DMLUnsupportedOperationException {
    String jobname = "ParFor-DPESP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaSparkContext sc = sec.getSparkContext();

    // prepare input parameters
    MatrixDimensionsMetaData md = (MatrixDimensionsMetaData) input.getMetaData();
    MatrixCharacteristics mc = md.getMatrixCharacteristics();
    InputInfo ii = InputInfo.BinaryBlockInputInfo;

    // initialize accumulators for tasks/iterations
    Accumulator<Integer> aTasks = sc.accumulator(0);
    Accumulator<Integer> aIters = sc.accumulator(0);

    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf);
    RemoteDPParForSparkWorker efun =
        new RemoteDPParForSparkWorker(
            program, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
    List<Tuple2<Long, String>> out =
        in.flatMapToPair(dpfun) // partition the input blocks
            .groupByKey(numReducers) // group partition blocks 		
            .mapPartitionsToPair(efun) // execute parfor tasks, incl cleanup
            .collect(); // get output handles

    // de-serialize results
    LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
    int numTasks = aTasks.value(); // get accumulator value
    int numIters = aIters.value(); // get accumulator value

    // create output symbol table entries
    RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);

    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
      Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }

    return ret;
  }
Example #20
  /** Load the data from the json file and return an RDD of Tweet. */
  public JavaRDD<Tweet> loadData() {
    // create spark configuration and spark context
    SparkConf conf = new SparkConf().setAppName("Tweet mining").setMaster("local[*]");

    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<Tweet> tweets = sc.textFile(pathToFile).map(line -> Parse.parseJsonToTweet(line));

    return tweets;
  }
Example #21
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "JavaAPISuite");

    JavaRDD<String> lines = sc.textFile("log.txt");
    JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")));
    JavaPairRDD<String, Integer> counts =
        words.mapToPair(w -> new Tuple2<String, Integer>(w, 1)).reduceByKey((x, y) -> x + y);

    counts.collect().forEach(t -> System.out.println("Key:" + t._1() + " Value:" + t._2()));
  }
Example #22
  public static void main(String[] args) {

    // Handle invalid arguments..
    if (args.length < 2) {
      System.out.println("Usage: ConvexHull arg1 arg2");
      System.out.println("arg1: input dataset A file path [points]");
      System.out.println("arg2: output file name and path");
      System.exit(1);
    }

    // Creating and setting sparkconf
    SparkConf sparkConf = new SparkConf().setAppName("Group3-edu.asu.cse512.ConvexHull");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Adding external jars
    // sc.addJar("lib/jts-1.13.jar");

    JavaRDD<String> lines = sc.textFile(args[0]);
    // Using mapPartitions function to find convex hull points in distributed environment
    JavaRDD<Coordinate> hullPointsRDD = lines.mapPartitions(new ConvexH());
    List<Coordinate> hullPointsList = hullPointsRDD.collect();
    Coordinate[] inputArray = new Coordinate[hullPointsList.size()];
    int j = 0;
    for (Coordinate c : hullPointsList) {
      inputArray[j] = c;
      j++;
    }
    // Finding convex hull points on the final subset of points retrieved from distributed
    // environment
    GeometryFactory geoFactory1 = new GeometryFactory();
    MultiPoint mPoint1 = geoFactory1.createMultiPoint(inputArray);
    Geometry geo1 = mPoint1.convexHull();
    Coordinate[] convexHullResult = geo1.getCoordinates();
    int length = convexHullResult.length;
    Coordinate[] convexHullFinalResult = Arrays.copyOf(convexHullResult, length - 1);
    Arrays.sort(convexHullFinalResult);

    // Converting the list of coordinates into Coordinate RDD
    JavaRDD<Coordinate> convexHullResultRDD =
        sc.parallelize(Arrays.asList(convexHullFinalResult), 1);
    JavaRDD<String> convexHullResultString =
        convexHullResultRDD
            .repartition(1)
            .map(
                new Function<Coordinate, String>() {
                  public String call(Coordinate hullPoint) throws Exception {

                    return hullPoint.x + "," + hullPoint.y;
                  }
                });
    // Save the String RDD into text file. Using repartition(1) to preserve the order of coordinates
    convexHullResultString.repartition(1).saveAsTextFile(args[1]);
  }
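
The ConvexH function passed to mapPartitions is not shown; a sketch of one plausible implementation, assuming each input line is a comma-separated x,y pair (hypothetical layout) and using JTS to compute a partition-local hull so only candidate points reach the driver:

  static class ConvexH implements FlatMapFunction<Iterator<String>, Coordinate> {
    @Override
    public Iterable<Coordinate> call(Iterator<String> lines) {
      List<Coordinate> points = new ArrayList<>();
      while (lines.hasNext()) {
        String[] parts = lines.next().split(",");
        points.add(new Coordinate(Double.parseDouble(parts[0]), Double.parseDouble(parts[1])));
      }
      // Convex hull of this partition's points only; the driver merges the partial hulls.
      GeometryFactory factory = new GeometryFactory();
      Geometry hull = factory.createMultiPoint(points.toArray(new Coordinate[0])).convexHull();
      return Arrays.asList(hull.getCoordinates());
    }
  }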
Example #23
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local", "test-java-client");
    //        JavaSparkContext sc = new JavaSparkContext("spark://192.168.181.23:7077",
    // "test-java-client");
    JavaRDD<String> textFile =
        sc.textFile("/home/congsl/apps/storm/dockerfile-repository/nginx/Dockerfile");
    Map<String, Integer> result =
        textFile
            .flatMap(
                new FlatMapFunction<String, Object>() {
                  @Override
                  public Iterable<Object> call(String s) throws Exception {
                    System.out.println(s);
                    return Arrays.asList(s.split(" "));
                  }
                })
            .map(
                new Function<Object, Map<String, Integer>>() {
                  @Override
                  public Map<String, Integer> call(Object v1) throws Exception {
                    System.out.println(v1);
                    Map<String, Integer> map = new HashMap<String, Integer>();
                    map.put(v1.toString(), 1);
                    return map;
                  }
                })
            .reduce(
                new Function2<Map<String, Integer>, Map<String, Integer>, Map<String, Integer>>() {
                  @Override
                  public Map<String, Integer> call(Map<String, Integer> v1, Map<String, Integer> v2)
                      throws Exception {
                    System.out.println("v1:" + v1);
                    System.out.println("v2:" + v2);
                    for (String key : v2.keySet()) {
                      if (v1.get(key) == null) {
                        v1.put(key, v2.get(key));
                      } else {
                        v1.put(key, v1.get(key) + v2.get(key));
                      }
                    }
                    return v1;
                  }
                });

    System.out.println(result);
    System.out.println(textFile.count());
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame df =
        sqlContext.jdbc(
            "jdbc:mysql://localhost:3306/activiti?user=root&password=admin", "ACT_ID_INFO");
    df.show();
    //        JavaSQLContext sqlContext = new JavaSQLContext(sparkContext);
  }
Example #24
 public static void main(String args[]) {
   SparkConf conf = new SparkConf().setAppName("KeyValueTest").setMaster("local");
   JavaSparkContext jsc = new JavaSparkContext(conf);
   JavaRDD<String> lines = jsc.textFile("/home/piyushm/samplejson.json");
   List<Person> persons = lines.mapPartitions(new ParseJson()).collect();
   JavaRDD<Person> personJavaRDD = jsc.parallelize(persons);
   JavaRDD<String> csvFileContent = jsc.textFile("/opt/sample.csv");
   System.out.println(csvFileContent.map(new ParseLine()).collect());
   System.out.println(persons);
   System.out.println(personJavaRDD.mapPartitions(new WriteJson()).collect());
   jsc.stop();
 }
  /** "Hello World" spark job that counts the number of each word in a word list. */
  @Test
  public void helloWorld() {
    JavaSparkContext sc = subject.getContext();
    Map<String, Integer> content =
        sc.parallelize(Arrays.asList("one", "two", "two", "three", "three", "three"))
            .mapToPair(new ToItemCounterPair())
            .reduceByKey(new Sum())
            .collectAsMap();

    assertThat(content.get("one"), is(1));
    assertThat(content.get("two"), is(2));
    assertThat(content.get("three"), is(3));
  }
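
The ToItemCounterPair and Sum helpers are not part of the snippet; minimal sketches consistent with how they are used above:

  static class ToItemCounterPair implements PairFunction<String, String, Integer> {
    @Override
    public Tuple2<String, Integer> call(String item) {
      return new Tuple2<>(item, 1);
    }
  }

  static class Sum implements Function2<Integer, Integer, Integer> {
    @Override
    public Integer call(Integer a, Integer b) {
      return a + b;
    }
  }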
Example #26
 public static void main(String[] args) {
   String file = "";
   SparkConf conf = new SparkConf().setAppName("app");
   JavaSparkContext sc = new JavaSparkContext(conf);
   JavaRDD<String> lines = sc.textFile(file);
   JavaRDD<String> errors =
       lines.filter(
           new Function<String, Boolean>() {
             public Boolean call(String x) {
               return x.contains("error");
             }
           });
 }
Example #27
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Distinct");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<Integer> nums =
        sc.parallelize(
            Arrays.asList(1, 2, 3, 4, 5, 1, 3, 2, 2, 1, 3, 4, 5, 5, 4, 3, 1, 2, 3, 2, 6, 8, 0));
    JavaRDD<Integer> distinct = nums.distinct();

    System.out.println(StringUtils.join(distinct.collect(), ","));

    sc.close();
  }
Example #28
  public static void main(String[] args) throws IOException {

    SparkConf config = new SparkConf().setAppName("003-distributed-matrices").setMaster("local[*]");

    try (JavaSparkContext sc = new JavaSparkContext(config)) {

      /* Create a RowMatrix */
      List<Vector> vectors = new ArrayList<>(10);
      for (int i = 0; i < 10; i++) {
        vectors.add(Vectors.dense(getVectorElements()));
      }

      JavaRDD<Vector> rowsRDD = sc.parallelize(vectors, 4);

      RowMatrix rowMatrix = new RowMatrix(rowsRDD.rdd());
      System.out.println(rowMatrix.toString());

      /* Create an IndexedRowMatrix */
      JavaRDD<IndexedRow> indexedRows =
          sc.parallelize(
              Arrays.asList(new IndexedRow(0, vectors.get(0)), new IndexedRow(1, vectors.get(1))));
      IndexedRowMatrix indexedRowMatrix = new IndexedRowMatrix(indexedRows.rdd());
      System.out.println(indexedRowMatrix);

      /* convert */
      JavaRDD<IndexedRow> indexedRowsFromRowMatrix =
          rowMatrix
              .rows()
              .toJavaRDD()
              .zipWithIndex()
              .map((Tuple2<Vector, Long> t) -> new IndexedRow(t._2(), t._1()));
      IndexedRowMatrix indexedRowMatrixFromRowMatrix =
          new IndexedRowMatrix(indexedRowsFromRowMatrix.rdd());
      System.out.println(indexedRowMatrixFromRowMatrix);

      /* Create a CoordinateMatrix
       *     M = [ 5 0 1
       *           0 3 4 ]
       */
      JavaRDD<MatrixEntry> matrixEntries =
          sc.parallelize(
              Arrays.asList(
                  new MatrixEntry(0, 0, 5.),
                  new MatrixEntry(1, 1, 3.),
                  new MatrixEntry(2, 0, 1.),
                  new MatrixEntry(2, 1, 4.)));
      CoordinateMatrix coordMatrix = new CoordinateMatrix(matrixEntries.rdd());
      System.out.println(coordMatrix);
      printSeparator();
    }
  }
Example #29
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "The number of arguments is incorrect. Usage:\n"
              + " <configuration file (conf.xml) path> <job file (.analysis.xml) path> [properties file path]\n"
              + "Got: "
              + Arrays.toString(args));
    }

    final SparkConf conf = new SparkConf().setAppName("DataCleaner-spark");
    final JavaSparkContext sparkContext = new JavaSparkContext(conf);

    final URI confXmlPath = URI.create(args[0]);
    final URI analysisJobXmlPath = URI.create(args[1]);

    final URI propertiesPath;
    if (args.length > 2) {
      propertiesPath = URI.create(args[2]);
    } else {
      propertiesPath = null;
    }

    final SparkJobContext sparkJobContext =
        new SparkJobContext(confXmlPath, analysisJobXmlPath, propertiesPath, sparkContext);

    final ServiceLoader<SparkJobLifeCycleListener> listenerLoaders =
        ServiceLoader.load(SparkJobLifeCycleListener.class);

    for (SparkJobLifeCycleListener listener : listenerLoaders) {
      sparkJobContext.addSparkJobLifeCycleListener(listener);
    }

    final SparkAnalysisRunner sparkAnalysisRunner =
        new SparkAnalysisRunner(sparkContext, sparkJobContext);
    try {
      final AnalysisResultFuture result = sparkAnalysisRunner.run();

      result.await();

      if (sparkJobContext.isResultEnabled()) {
        final Resource resultResource =
            ResultFilePathUtils.getResultResource(sparkContext, sparkJobContext);
        logger.info("DataCleaner result will be written to: {}", resultResource);
        saveResult(result, resultResource);
      } else {
        logger.info("DataCleaner result will not be written - disabled");
      }
    } finally {
      sparkContext.stop();
    }
  }
Example #30
  public static void main(String[] args) {

    JavaSparkContext javaSparkContext = SparkConfSetup.getJavaSparkContext();

    CassandraConnector connector = SparkConfSetup.getCassandraConnector();

    basicCassandraSession(connector);

    writePeopleToCassandra(javaSparkContext);

    readPeopleFromCassandra(javaSparkContext);

    javaSparkContext.stop();
  }
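
The helper methods called here belong to the surrounding example class and are not shown; a hedged sketch of what basicCassandraSession and writePeopleToCassandra might look like with the spark-cassandra-connector Java API, assuming a "test.people" table and a Person bean (both hypothetical), with javaFunctions/mapToRow statically imported from CassandraJavaUtil:

  private static void basicCassandraSession(CassandraConnector connector) {
    // Run a raw CQL statement through the connector-managed session.
    try (Session session = connector.openSession()) {
      session.execute(
          "CREATE TABLE IF NOT EXISTS test.people (id INT PRIMARY KEY, name TEXT)");
    }
  }

  private static void writePeopleToCassandra(JavaSparkContext sc) {
    JavaRDD<Person> people =
        sc.parallelize(Arrays.asList(new Person(1, "Alice"), new Person(2, "Bob")));
    // Save the RDD to the (assumed) test.people table.
    javaFunctions(people)
        .writerBuilder("test", "people", mapToRow(Person.class))
        .saveToCassandra();
  }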