public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaLogQuery");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    JavaRDD<String> dataSet =
        (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted =
        dataSet.mapToPair(
            new PairFunction<String, Tuple3<String, String, String>, Stats>() {
              @Override
              public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
                return new Tuple2<Tuple3<String, String, String>, Stats>(
                    extractKey(s), extractStats(s));
              }
            });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts =
        extracted.reduceByKey(
            new Function2<Stats, Stats, Stats>() {
              @Override
              public Stats call(Stats stats, Stats stats2) {
                return stats.merge(stats2);
              }
            });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?, ?> t : output) {
      System.out.println(t._1() + "\t" + t._2());
    }
    jsc.stop();
  }
 @After
 public void tearDown() throws Exception {
   if (sc != null) {
     sc.stop();
     sc.close();
   }
 }
Example #3
 @Override
 public void kill() throws InterruptedException {
   if (started) {
     sparkContext.stop();
     set(PipelineResult.EMPTY);
   }
 }
Example #4
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: KMeansMP <input_file> <results>");
      System.exit(1);
    }
    String inputFile = args[0];
    String results_path = args[1];
    JavaPairRDD<Integer, Iterable<String>> results;
    int k = 4;
    int iterations = 100;
    int runs = 1;
    long seed = 0;
    final KMeansModel model;

    SparkConf sparkConf = new SparkConf().setAppName("KMeans MP");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<String> lines = sc.textFile(inputFile);

    JavaRDD<Vector> points = lines.map(new ParsePoint());
    JavaRDD<String> titles = lines.map(new ParseTitle());

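    // Train a k-means model with k clusters using random initialization and a fixed seed.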
    model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), seed);

    results = titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey();

    results.saveAsTextFile(results_path);

    sc.stop();
  }
 @AfterClass
 public static void clean() throws Exception {
   if (sc != null) {
     sc.stop();
     // wait for jetty & spark to properly shutdown
     Thread.sleep(TimeUnit.SECONDS.toMillis(2));
   }
 }
  public static void setJsscStartFlag() {
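    // Stop the plain batch JavaSparkContext (m_jsc) if it is still running before
    // flagging the streaming context as started.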

    if (jscStartFlag) {
      m_jsc.stop();
      jscStartFlag = false;
      m_jsc = null;
    }
    jsscStartFlag = true;
  }
 public static void main(String args[]) {
   SparkConf conf = new SparkConf().setAppName("KeyValueTest").setMaster("local");
   JavaSparkContext jsc = new JavaSparkContext(conf);
   JavaRDD<String> lines = jsc.textFile("/home/piyushm/samplejson.json");
   List<Person> persons = lines.mapPartitions(new ParseJson()).collect();
   JavaRDD<Person> personJavaRDD = jsc.parallelize(persons);
   JavaRDD<String> csvFileContent = jsc.textFile("/opt/sample.csv");
   System.out.println(csvFileContent.map(new ParseLine()).collect());
   System.out.println(persons);
   System.out.println(personJavaRDD.mapPartitions(new WriteJson()).collect());
   jsc.stop();
 }
Example #8
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "The number of arguments is incorrect. Usage:\n"
              + " <configuration file (conf.xml) path> <job file (.analysis.xml) path> [properties file path]\n"
              + "Got: "
              + Arrays.toString(args));
    }

    final SparkConf conf = new SparkConf().setAppName("DataCleaner-spark");
    final JavaSparkContext sparkContext = new JavaSparkContext(conf);

    final URI confXmlPath = URI.create(args[0]);
    final URI analysisJobXmlPath = URI.create(args[1]);

    final URI propertiesPath;
    if (args.length > 2) {
      propertiesPath = URI.create(args[2]);
    } else {
      propertiesPath = null;
    }

    final SparkJobContext sparkJobContext =
        new SparkJobContext(confXmlPath, analysisJobXmlPath, propertiesPath, sparkContext);

    final ServiceLoader<SparkJobLifeCycleListener> listenerLoaders =
        ServiceLoader.load(SparkJobLifeCycleListener.class);

    for (SparkJobLifeCycleListener listener : listenerLoaders) {
      sparkJobContext.addSparkJobLifeCycleListener(listener);
    }

    final SparkAnalysisRunner sparkAnalysisRunner =
        new SparkAnalysisRunner(sparkContext, sparkJobContext);
    try {
      final AnalysisResultFuture result = sparkAnalysisRunner.run();

      result.await();

      if (sparkJobContext.isResultEnabled()) {
        final Resource resultResource =
            ResultFilePathUtils.getResultResource(sparkContext, sparkJobContext);
        logger.info("DataCleaner result will be written to: {}", resultResource);
        saveResult(result, resultResource);
      } else {
        logger.info("DataCleaner result will not be written - disabled");
      }
    } finally {
      sparkContext.stop();
    }
  }
  public static void main(String[] args) {

    JavaSparkContext javaSparkContext = SparkConfSetup.getJavaSparkContext();

    CassandraConnector connector = SparkConfSetup.getCassandraConnector();

    basicCassandraSession(connector);

    writePeopleToCassandra(javaSparkContext);

    readPeopleFromCassandra(javaSparkContext);

    javaSparkContext.stop();
  }
Example #10
  public static void main(String[] args) {

    // create spark and sql context
    JavaSparkContext ctx = CreateSparkContext.create(args);

    SQLContext sqlContext = new SQLContext(ctx);

    WorkflowContext workflowContext = new WorkflowContext();

    chartswf(ctx, sqlContext, workflowContext);

    // stop the context
    ctx.stop();
  }
Example #11
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      System.err.println("Usage: JavaWordCount <input_file> <output_file>");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterator<String> call(String s) {
                return Arrays.asList(SPACE.split(s)).iterator();
              }
            });

    JavaPairRDD<String, Integer> ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
              }
            });

    JavaPairRDD<String, Integer> counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
              }
            });

    /*
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?,?> tuple : output) {
      System.out.println(tuple._1() + ": " + tuple._2());
    }
    */
    counts.saveAsTextFile(args[1]);
    ctx.stop();
  }
  public static void main(String[] args) {
    // Create a Spark Context.
    SparkConf conf = new SparkConf().setAppName("Activity").set("spark.eventLog.enabled", "true");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(sc, STREAM_INTERVAL);
    String TOPIC = "activityevent";
    String zkQuorum = "localhost:2181";
    String group = "1";
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put(TOPIC, 1);

    JavaPairReceiverInputDStream<String, String> messages =
        KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);
    // messages.print();
    JavaDStream<String> activitydatastream =
        messages.map(
            new Function<Tuple2<String, String>, String>() {
              @Override
              public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
              }
            });

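    // Aggregate each activity's x-axis values into fixed one-minute windows keyed by (activity, window start).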
    final Long teamWindowDurationMs = Durations.minutes(1).milliseconds();
    JavaDStream<Activity> ActivityEntryDStream = activitydatastream.map(Activity::parseFromLine);
    JavaPairDStream<WithTimestamp<String>, Double> ActivityWindowDStream =
        ActivityEntryDStream.mapToPair(
                windows ->
                    new Tuple2<>(
                        WithTimestamp.create(
                            windows.getActivity(),
                            // Apply Fixed Window by rounding the timestamp down to the nearest
                            // multiple of the window size
                            (convertMillsecs(windows.getTimestamp()) / teamWindowDurationMs)
                                * teamWindowDurationMs),
                        windows.getXaxis()))
            .reduceByKey(SUM_REDUCER);

    ActivityWindowDStream.print();

    jssc.start();
    jssc.awaitTermination();
    // jssc.close();
    sc.stop();
    sc.close();
  }
  public JavaApiDemo() {
    DemoApp demoApp = DemoApp$.MODULE$.apply();
    JavaSparkContext sc = new JavaSparkContext(demoApp.sc());

    // Here we are going to save some data to Cassandra...
    List<Person> people =
        Arrays.asList(
            Person.newInstance(1, "John", new Date()),
            Person.newInstance(2, "Anna", new Date()),
            Person.newInstance(3, "Andrew", new Date()));
    JavaRDD<Person> rdd = sc.parallelize(people);
    javaFunctions(rdd, Person.class).saveToCassandra("test", "people");

    // then, we want to read that data as an RDD of CassandraRows and convert them to strings...
    JavaRDD<String> cassandraRowsRDD =
        javaFunctions(sc)
            .cassandraTable("test", "people")
            .toJavaRDD()
            .map(
                new Function<CassandraRow, String>() {
                  @Override
                  public String call(CassandraRow cassandraRow) throws Exception {
                    return cassandraRow.toString();
                  }
                });
    System.out.println(
        "Data as CassandraRows: \n" + StringUtils.join("\n", cassandraRowsRDD.toArray()));

    // finally, we want to read that data as an RDD of Person beans and also convert them to
    // strings...
    JavaRDD<String> rdd2 =
        javaFunctions(sc)
            .cassandraTable("test", "people", Person.class)
            .toJavaRDD()
            .map(
                new Function<Person, String>() {
                  @Override
                  public String call(Person person) throws Exception {
                    return person.toString();
                  }
                });
    System.out.println("Data as Person beans: \n" + StringUtils.join("\n", rdd2.toArray()));

    sc.stop();
  }
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: NaiveBayesExample <training_data> <test_data>");
      System.exit(1);
    }
    String training_data_path = args[0];
    // https://class.coursera.org/cloudapplications-001/forum/thread?thread_id=1387
    // String test_data_path = args[0];
    String test_data_path = args[1];

    SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<LabeledPoint> train = sc.textFile(training_data_path).map(new DataToPoint());
    // JavaRDD<LabeledPoint> test = sc.textFile(training_data_path).map(new DataToPoint());
    JavaRDD<LabeledPoint> test = sc.textFile(test_data_path).map(new DataToPoint());

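    // Train a multinomial Naive Bayes model with additive (Laplace) smoothing lambda = 1.0.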
    final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0);

    JavaPairRDD<Double, Double> predictionAndLabel =
        test.mapToPair(
            new PairFunction<LabeledPoint, Double, Double>() {
              public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
              }
            });

    double accuracy =
        predictionAndLabel
                .filter(
                    new Function<Tuple2<Double, Double>, Boolean>() {
                      public Boolean call(Tuple2<Double, Double> pl) {
                        return pl._1().equals(pl._2());
                      }
                    })
                .count()
            / (double) test.count();

    System.out.println(accuracy);

    sc.stop();
  }
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaNormalizerExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // $example on$
    DataFrame dataFrame = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

    // Normalize each Vector using $L^1$ norm.
    Normalizer normalizer =
        new Normalizer().setInputCol("features").setOutputCol("normFeatures").setP(1.0);

    DataFrame l1NormData = normalizer.transform(dataFrame);
    l1NormData.show();

    // Normalize each Vector using $L^\infty$ norm.
    DataFrame lInfNormData =
        normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
    lInfNormData.show();
    // $example off$
    jsc.stop();
  }
Example #16
  public static void main(String[] args) {
    SparkConf sparkconf =
        new SparkConf()
            .setAppName("Simple Application")
            .setMaster("spark://1.245.77.10:7077")
            .set(
                "spark.driver.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set(
                "spark.executor.extraClassPath",
                "E:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/*")
            .set("fs.default.name", "file:///");
    JavaSparkContext sc = new JavaSparkContext(sparkconf);
    Configuration hadoopConfig = sc.hadoopConfiguration();
    hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw-0.2.jar");
    // sc.addJar("e:/installprogram/spark-1.5.2-bin-hadoop2.4/libthirdparty/jmatrw4spark-0.2.jar");

    /*JavaRDD<Double> matrdd2 = sc.parallelize(Arrays.asList(1.0, 3.0, 2.0));
    System.out.println("Start counting parallelize...");
    long values = matrdd2.count();
    System.out.println("Value count of parallelize is " + values);*/

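    // Read the .mat file through JMATFileInputFormat (from the jmatrw/jmatrw4spark jars referenced above)
    // as (Long, Double) key/value records.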
    JavaPairRDD<Long, Double> matrdd =
        sc.newAPIHadoopFile(
            "e:/tmp/vecRow03_x256.mat",
            JMATFileInputFormat.class,
            Long.class,
            Double.class,
            hadoopConfig);
    System.out.println("Start job...");
    long values = matrdd.count();
    System.out.println("Value count of hadoop is " + values);

    sc.stop();
    sc.close();
  }
Example #17
  public static void main(String[] args) throws Exception {

    SparkConf sparkConf = new SparkConf().setAppName("ShopJsonParse");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    JavaRDD<String> ciku = ctx.textFile("hdfs://hadoop119:9000/ciku/ciku_zhuyu.txt", 1);
    JavaRDD<String> zhuyu =
        ciku.map(
                new Function<String, String>() {
                  @Override
                  public String call(String s) throws Exception {
                    String[] str = s.split(" ");
                    if (str[1].equals("1")) return str[0];
                    else return "kaer";
                  }
                })
            .distinct()
            .cache();
    JavaRDD<String> haoping =
        ciku.map(
                new Function<String, String>() {
                  @Override
                  public String call(String s) throws Exception {
                    String[] str = s.split(" ");
                    if (str[1].equals("2")) return str[0];
                    else return "kaer";
                  }
                })
            .distinct()
            .cache();
    JavaRDD<String> chaping =
        ciku.map(
                new Function<String, String>() {
                  @Override
                  public String call(String s) throws Exception {
                    String[] str = s.split(" ");
                    if (str[1].equals("3")) return str[0];
                    else return "kaer";
                  }
                })
            .distinct()
            .cache();
    final List<String> zhuyulist = zhuyu.collect();
    final List<String> hplist = haoping.collect();
    final List<String> cplist = chaping.collect();

    JavaRDD<String> mongoratedata = ctx.textFile("hdfs://hadoop119:9000/shopdata/ratelist.json");

    JavaRDD<Map<String, Object>> mongorateall =
        mongoratedata.map(
            new Function<String, Map<String, Object>>() {
              @Override
              public Map<String, Object> call(String line) throws Exception {
                return ParseLineToMap(line);
              }

              private Map<String, Object> ParseLineToMap(String line) {
                Map<String, Object> documentMap = new HashMap<String, Object>();
                try {
                  JSONObject jsonline = new JSONObject(line);
                  documentMap.put("PlatformItemId", jsonline.get("nid").toString());

                  Gson gson = new Gson();
                  rate rate = gson.fromJson(jsonline.get("rate").toString(), rate.class);
                  documentMap.put("ratelist", rate.parsemod());
                } catch (JSONException e) {
                  e.printStackTrace();
                }
                return documentMap;
              }
            });

    JavaPairRDD<String, String> Rates =
        mongorateall.flatMapToPair(
            new PairFlatMapFunction<Map<String, Object>, String, String>() {
              @Override
              public Iterable<Tuple2<String, String>> call(Map<String, Object> map)
                  throws Exception {
                ArrayList<Tuple2<String, String>> flatmaps =
                    new ArrayList<Tuple2<String, String>>();

                String itemid = (String) map.get("PlatformItemId");
                String itempro = "";

                Map<String, String> ratelist = (Map<String, String>) map.get("ratelist");
                if (ratelist == null) {
                  itempro = "null";
                  flatmaps.add(new Tuple2<String, String>(itemid, itempro));
                } else {
                  for (String value : ratelist.values()) {
                    itempro = value;
                    flatmaps.add(new Tuple2<String, String>(itemid, itempro));
                  }
                }
                return flatmaps;
              }
            });

    final Pattern SPACES = Pattern.compile("\\s+");
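    // Split each rate string on whitespace so every non-empty fragment becomes its own (item id, fragment) pair.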
    JavaPairRDD<String, String> sentences =
        Rates.flatMapValues(
            new Function<String, Iterable<String>>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                ArrayList<String> list = new ArrayList<String>();
                if (s.contains(" ")) {
                  String[] str = SPACES.split(s);
                  int num = 0;
                  while (num < str.length) {
                    if (!str[num].equals("")) list.add(str[num]);
                    num++;
                  }
                } else {
                  list.add(s);
                }
                return list;
              }
            });

    String filter = "的 也 很 都 了 非常 有些 还 是 点 些 就 看起来 看上去 更 呢 哦 确实 什么的 较 太 啊 吧 得 那么 什么 挺";
    final String[] list = filter.split(" ");
    JavaPairRDD<String, String> words =
        sentences.mapValues(
            new Function<String, String>() {
              @Override
              public String call(String s) throws Exception {
                if (s.length() < 3) {
                  return s + " " + "kaer";
                }
                for (int i = 0; i < zhuyulist.size(); i++) {
                  String zhuyu = zhuyulist.get(i);
                  if (s.contains(zhuyu)) {
                    s = s.replace(zhuyu, " ");
                    int size = s.length();
                    int tap = s.lastIndexOf(" ");
                    String ss = "kaer";
                    if (tap + 1 < size) {
                      ss = s.substring(tap + 1, size);
                    } else {
                      if (tap - 1 > 0) ss = s.substring(0, tap - 1);
                    }
                    for (String tem : list) {
                      if (ss.contains(tem)) ss = ss.replace(tem, "");
                    }
                    return zhuyu + " " + ss;
                  }
                }
                return "long null";
              }
            });

    JavaPairRDD<String, String> filterwords =
        words
            .mapValues(
                new Function<String, String>() {
                  @Override
                  public String call(String s) throws Exception {
                    String tempstr;
                    if (s.contains("kaer")) {
                      tempstr = s.substring(0, s.indexOf(" "));
                      for (int i = 0; i < cplist.size(); i++) {
                        if (tempstr.equals(cplist.get(i))) return "差评 " + "," + tempstr;
                      }
                      for (int i = 0; i < hplist.size(); i++) {
                        if (tempstr.equals(hplist.get(i))) return "好评 " + "," + tempstr;
                      }
                      return "中评 " + "," + tempstr;
                    } else if (s.contains("null")) {
                      return s + ",null";
                    } else {
                      if (s.endsWith(" ")) return "long null,null";
                      tempstr = s.split(" ")[1];
                      for (int i = 0; i < cplist.size(); i++) {
                        if (tempstr.equals(cplist.get(i)))
                          return "差评 " + s.split(" ")[0] + "," + tempstr;
                      }
                      for (int i = 0; i < hplist.size(); i++) {
                        if (tempstr.equals(hplist.get(i)))
                          return "好评 " + s.split(" ")[0] + "," + tempstr;
                      }
                      return "中评 " + s.split(" ")[0] + "," + tempstr;
                    }
                  }
                })
            .filter(
                new Function<Tuple2<String, String>, Boolean>() {
                  @Override
                  public Boolean call(Tuple2<String, String> line) throws Exception {
                    if (line._2.contains("null")) return false;
                    else return true;
                  }
                });

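    // Encode each classified phrase as a one-hot count string: 好评 -> "1,0,0", 中评 -> "0,1,0", 差评 -> "0,0,1".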
    JavaPairRDD<String, String> ones =
        filterwords.mapToPair(
            new PairFunction<Tuple2<String, String>, String, String>() {
              @Override
              public Tuple2<String, String> call(Tuple2<String, String> line) throws Exception {
                String key = line._1();
                String value = "0,0,0", ll = line._2;
                if (ll.startsWith("好评")) value = "1,0,0";
                else if (ll.startsWith("中评")) value = "0,1,0";
                else if (ll.startsWith("差评")) value = "0,0,1";
                return new Tuple2<String, String>(key, value);
              }
            });

    JavaPairRDD<String, String> result =
        ones.reduceByKey(
            new Function2<String, String, String>() {
              @Override
              public String call(String s1, String s2) throws Exception {
                double h1 = Double.parseDouble(s1.split(",")[0]),
                    h2 = Double.parseDouble(s1.split(",")[1]),
                    h3 = Double.parseDouble(s1.split(",")[2]);
                double hh1 = Double.parseDouble(s2.split(",")[0]),
                    hh2 = Double.parseDouble(s2.split(",")[1]),
                    hh3 = Double.parseDouble(s2.split(",")[2]);
                return (h1 + hh1) + "," + (h2 + hh2) + "," + (h3 + hh3);
              }
            });

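    // Convert the summed counts into a 0-100 score: positive / (positive + negative) * 100,
    // defaulting to 50 when there are no rated phrases.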
    JavaPairRDD<String, Integer> rateresult =
        result.mapValues(
            new Function<String, Integer>() {
              @Override
              public Integer call(String s1) throws Exception {
                double h1 = Double.parseDouble(s1.split(",")[0]),
                    h2 = Double.parseDouble(s1.split(",")[1]),
                    h3 = Double.parseDouble(s1.split(",")[2]);
                if (h1 + h3 == 0) return 50;
                else {
                  return (int) (h1 / (h1 + h3) * 100);
                }
              }
            });

    JavaRDD<String> mongocontentdata =
        ctx.textFile("hdfs://hadoop119:9000/shopdata/ProductContent.json");

    JavaRDD<Map<String, Object>> mongocontentall =
        mongocontentdata.map(
            new Function<String, Map<String, Object>>() {
              @Override
              public Map<String, Object> call(String line) throws Exception {
                return new ShopParse().ParseLine(line);
              }
            });

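    // Key each parsed product-content map by its PlatformItemId so it can be joined with the product list below.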
    JavaPairRDD<String, Map<String, Object>> content =
        mongocontentall.mapToPair(
            new PairFunction<Map<String, Object>, String, Map<String, Object>>() {
              @Override
              public Tuple2<String, Map<String, Object>> call(Map<String, Object> map)
                  throws Exception {
                return new Tuple2<String, Map<String, Object>>(
                    map.get("PlatformItemId").toString(), map);
              }
            });

    JavaRDD<String> mongoproListdata =
        ctx.textFile("hdfs://hadoop119:9000/shopdata/productList.json");

    JavaRDD<Map<String, Object>> mongoproListall =
        mongoproListdata.map(
            new Function<String, Map<String, Object>>() {
              @Override
              public Map<String, Object> call(String line) throws Exception {
                return new ShopParse().ParseproList(line);
              }
            });
    System.out.println("mongoproListall counts :" + mongoproListall.count());

    JavaPairRDD<String, Map<String, Object>> proList =
        mongoproListall
            .mapToPair(
                new PairFunction<Map<String, Object>, String, Map<String, Object>>() {
                  @Override
                  public Tuple2<String, Map<String, Object>> call(Map<String, Object> map)
                      throws Exception {
                    return new Tuple2<String, Map<String, Object>>(
                        map.get("PlatformItemId").toString(), map);
                  }
                })
            .filter(
                new Function<Tuple2<String, Map<String, Object>>, Boolean>() {
                  @Override
                  public Boolean call(Tuple2<String, Map<String, Object>> line) throws Exception {
                    if (line._2.get("isdownloads").toString().equals("true")) return true;
                    else return false;
                  }
                });
    System.out.println("proList counts :" + proList.count());

    JavaRDD<Map<String, Object>> ContJoinPro =
        content
            .join(proList)
            .map(
                new Function<
                    Tuple2<String, Tuple2<Map<String, Object>, Map<String, Object>>>,
                    Map<String, Object>>() {
                  @Override
                  public Map<String, Object> call(
                      Tuple2<String, Tuple2<Map<String, Object>, Map<String, Object>>> line)
                      throws Exception {
                    Map<String, Object> mapprod = line._2._1;
                    mapprod.put("Name", line._2._2.get("Name"));
                    mapprod.put("Photo", line._2._2.get("Photo"));
                    mapprod.put("SellerId", line._2._2.get("SellerId"));
                    mapprod.put("StoreName", line._2._2.get("StoreName"));
                    mapprod.put("Url", line._2._2.get("Url"));
                    mapprod.put("TaokeUrl", line._2._2.get("TaokeUrl"));
                    return mapprod;
                  }
                });

    JavaPairRDD<String, String> Messages =
        ContJoinPro.mapToPair(
            new PairFunction<Map<String, Object>, String, String>() {
              @Override
              public Tuple2<String, String> call(Map<String, Object> map) throws Exception {
                String itemid = (String) map.get("PlatformItemId");
                String itempro = "";
                String From = (String) map.get("isTmall");
                if (From.equals("true")) From = "2";
                else From = "1";
                String Quantity = (String) map.get("Quantity");
                String CmtCount = (String) map.get("ratecount");
                String ImgPath = (String) map.get("detailmessage");
                String[] ImgPaths = ImgPath.split("@=@=@"); // 1-5
                String mobprice = (String) map.get("mobmessage");
                String pcprice = (String) map.get("pcpricemessage");
                String minmaxPrice = (String) map.get("MaxMinPrice");
                String OriginalPrice = (String) map.get("OriginalPrice");
                double p1 = Double.parseDouble(mobprice);
                double p2 = Double.parseDouble(pcprice.split("@=@=@")[0]);
                double min = Double.parseDouble(minmaxPrice.split(",")[0]);
                double max = Double.parseDouble(minmaxPrice.split(",")[1]);
                double origin = Double.parseDouble(OriginalPrice);
                double Price = p1;
                if (Price > p2) Price = p2;
                if (Price == 100000.00) Price = min;
                if (origin < max) OriginalPrice = max + "";

                String IsPost = "0";
                if (!pcprice.endsWith("@=@=@") && pcprice.split("@=@=@")[1].startsWith("0.00"))
                  IsPost = "1";

                String Name = (String) map.get("Name");
                String SellerId = (String) map.get("SellerId");
                String StoreName = (String) map.get("StoreName");
                String Photo = (String) map.get("Photo");
                String Url = (String) map.get("Url");
                String TaokeUrl = (String) map.get("TaokeUrl");

                DecimalFormat ddf = new DecimalFormat("#0.00");
                String Discount = ddf.format(Price / Double.parseDouble(OriginalPrice)) + "";

                SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                String AddTime = df.format(new Date());
                String IsSell = "1";
                String Type = "2";
                String IsChangeImgPath = "0";
                String HotKeyId = "0";
                String OpenIid = "0";

                itempro =
                    From
                        + "@=@=@"
                        + Quantity
                        + "@=@=@"
                        + CmtCount
                        + "@=@=@"
                        + ImgPaths[0]
                        + "@=@=@"
                        + ImgPaths[1]
                        + "@=@=@"
                        + ImgPaths[2]
                        + "@=@=@"
                        + ImgPaths[3]
                        + "@=@=@"
                        + ImgPaths[4]
                        + "@=@=@"
                        + Price
                        + "@=@=@"
                        + IsPost
                        + "@=@=@"
                        + Name
                        + "@=@=@"
                        + SellerId
                        + "@=@=@"
                        + StoreName
                        + "@=@=@"
                        + OriginalPrice
                        + "@=@=@"
                        + Photo
                        + "@=@=@"
                        + Url
                        + "@=@=@"
                        + Discount
                        + "@=@=@"
                        + AddTime
                        + "@=@=@"
                        + IsSell
                        + "@=@=@"
                        + Type
                        + "@=@=@"
                        + IsChangeImgPath
                        + "@=@=@"
                        + HotKeyId
                        + "@=@=@"
                        + TaokeUrl
                        + "@=@=@"
                        + OpenIid;
                return new Tuple2<String, String>(itemid, itempro);
              }
            });

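    // Attach the computed rating score to each item's message; items without a score default to 50.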
    JavaRDD<String> MessagesAll =
        Messages.leftOuterJoin(rateresult)
            .map(
                new Function<Tuple2<String, Tuple2<String, Optional<Integer>>>, String>() {
                  @Override
                  public String call(Tuple2<String, Tuple2<String, Optional<Integer>>> line)
                      throws Exception {
                    Optional<Integer> possible = line._2._2;
                    int fenshu = 50;
                    if (possible.isPresent()) fenshu = line._2._2.get();
                    return line._1 + "@=@=@" + line._2._1 + "@=@=@" + fenshu;
                  }
                });

    List<String> messages = MessagesAll.collect();
    new MessageToMysql().insert(messages);

    ctx.stop();
  }
 @After
 public void tearDown() {
   sc.stop();
   sc = null;
   System.clearProperty("spark.driver.port");
 }
 static synchronized void stopSparkContext(JavaSparkContext context) {
   if (!Boolean.getBoolean(TEST_REUSE_SPARK_CONTEXT)) {
     context.stop();
   }
 }
Example #20
  public static void main(String[] args) {

    // STEP 1: create a SparkConf object
    if (args.length < 1) {
      log.fatal("Syntax Error: there must be one argument (a file name or a directory)");
      throw new RuntimeException();
    }

    // STEP 2: create a SparkConf object
    SparkConf sparkConf = new SparkConf().setAppName("Trending Topic");

    // STEP 3: create a Java Spark context
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    // STEP 4: read lines of files
    JavaRDD<String> lines = sparkContext.textFile(args[0]);

    JavaRDD<String> words;
    words =
        lines.flatMap(
            new FlatMapFunction<String, String>() {
              @Override
              public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split("\t")[2].split(" "));
              }
            });

    JavaPairRDD<String, Integer> ones;
    ones =
        words.mapToPair(
            new PairFunction<String, String, Integer>() {
              @Override
              public Tuple2<String, Integer> call(String string) {
                return new Tuple2<>(string, 1);
              }
            });

    JavaPairRDD<String, Integer> counts;
    counts =
        ones.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
              @Override
              public Integer call(Integer integer, Integer integer2) throws Exception {
                return integer + integer2;
              }
            });

    // The tuples need to be swapped because we cannot sort by value, only by key
    JavaPairRDD<Integer, String> swapped;
    swapped =
        counts.mapToPair(
            new PairFunction<Tuple2<String, Integer>, Integer, String>() {
              @Override
              public Tuple2<Integer, String> call(Tuple2<String, Integer> tupla) throws Exception {
                return tupla.swap();
              }
            });

    // STEP 7: sort the results by key (descending, so the most frequent words come first)
    List<Tuple2<Integer, String>> output = swapped.sortByKey(false).collect();

    // The exercise asks us to remove words that add nothing. To do that we could put them in a
    // file, read them, and then skip those; we leave that part out since the idea is clear and it
    // is not the point of the exercise.
    List<String> excluyentes = new LinkedList<>();
    excluyentes.add("rt");
    excluyentes.add("http");
    excluyentes.add("https");
    excluyentes.add("www");

    // Removing from the collected list while iterating over it would throw a
    // ConcurrentModificationException (and collect() may return an immutable list),
    // so build a filtered copy instead.
    List<Tuple2<Integer, String>> filtered = new LinkedList<>();
    for (Tuple2<Integer, String> t : output) {
      if (!excluyentes.contains(t._2)) {
        filtered.add(t);
      }
    }

    // STEP 8: print the results
    for (int i = 0; i < 10 && i < filtered.size(); i++) {
      Tuple2<Integer, String> tuple = filtered.get(i);
      System.out.println(tuple._2() + ": " + tuple._1());
    }

    // STEP 9: stop the spark context
    sparkContext.stop();
  }
  public static void main(String[] args) {
    // parse the arguments
    Params params = parse(args);
    SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    // configure the base classifier
    LogisticRegression classifier =
        new LogisticRegression()
            .setMaxIter(params.maxIter)
            .setTol(params.tol)
            .setFitIntercept(params.fitIntercept);

    if (params.regParam != null) {
      classifier.setRegParam(params.regParam);
    }
    if (params.elasticNetParam != null) {
      classifier.setElasticNetParam(params.elasticNetParam);
    }

    // instantiate the One Vs Rest Classifier
    OneVsRest ovr = new OneVsRest().setClassifier(classifier);

    String input = params.input;
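    // Load the LIBSVM-formatted input as an RDD of LabeledPoints.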
    RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
    RDD<LabeledPoint> train;
    RDD<LabeledPoint> test;

    // compute the train/ test split: if testInput is not provided use part of input
    String testInput = params.testInput;
    if (testInput != null) {
      train = inputData;
      // compute the number of features in the training set.
      int numFeatures = inputData.first().features().size();
      test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
    } else {
      double f = params.fracTest;
      RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[] {1 - f, f}, 12345);
      train = tmp[0];
      test = tmp[1];
    }

    // train the multiclass model
    DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
    OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());

    // score the model on test data
    DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
    DataFrame predictions = ovrModel.transform(testDataFrame.cache()).select("prediction", "label");

    // obtain metrics
    MulticlassMetrics metrics = new MulticlassMetrics(predictions);
    StructField predictionColSchema = predictions.schema().apply("prediction");
    Integer numClasses = (Integer) MetadataUtils.getNumClasses(predictionColSchema).get();

    // compute the false positive rate per label
    StringBuilder results = new StringBuilder();
    results.append("label\tfpr\n");
    for (int label = 0; label < numClasses; label++) {
      results.append(label);
      results.append("\t");
      results.append(metrics.falsePositiveRate((double) label));
      results.append("\n");
    }

    Matrix confusionMatrix = metrics.confusionMatrix();
    // output the Confusion Matrix
    System.out.println("Confusion Matrix");
    System.out.println(confusionMatrix);
    System.out.println();
    System.out.println(results);

    jsc.stop();
  }
 @After
 public void tearDown() {
   sc.stop();
   sc = null;
 }
 @After
 public void stopSparkContext() {
   jsc.stop();
   jsc = null;
   jsql = null;
 }
Example #24
  public static void main(String[] args) {

    // Results path (location of the saved model)
    String pathResults = "results";

    String pathToCategories = "values.txt";
    String pathToWords = "words.txt";
    File file = new File(pathToWords);

    HashMap<Double, String> categoriesDict = new HashMap<>();
    HashMap<String, String> resultado = new HashMap<>();

    FileInputStream fis = null;
    try {
      fis = new FileInputStream(pathToCategories);
      // Construct BufferedReader from InputStreamReader
      BufferedReader br = new BufferedReader(new InputStreamReader(fis));

      String line = null;
      while ((line = br.readLine()) != null) {
        String[] words = line.split(" ");
        categoriesDict.put(Double.valueOf(words[0]), words[1]);
      }
      br.close();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }

    // Path where the categories are located
    String pathCategories = "src/main/resources/categoriestest/";

    // Basic application configuration
    SparkConf sparkConf = new SparkConf().setAppName("NaiveBayesTest").setMaster("local[*]");

    // Create the context
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    NaiveBayesModel model = NaiveBayesModel.load(jsc.sc(), pathResults);

    HashMap<String, String> dictionary = loadDictionary();

    JavaRDD<String> fileWords = null;

    if (file.exists()) {
      JavaRDD<String> input = jsc.textFile(pathToWords);
      fileWords =
          input.flatMap(
              new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                  return Arrays.asList(s.split(" "));
                }
              });
    } else {
      System.out.println("Error: there are no words");
      System.exit(-1);
    }
    List<String> aFileWords = fileWords.collect();

    // Take the directory where the category files are located
    File dir = new File(pathCategories);
    for (File f : dir.listFiles()) {
      JavaRDD<String> input = jsc.textFile(f.getPath());
      JavaRDD<String> words =
          input.flatMap(
              new FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> call(String s) throws Exception {
                  return Arrays.asList(s.split(" "));
                }
              });
      JavaPairRDD<String, Double> wordCount = Reducer.parseWords(words, dictionary);
      List<Tuple2<String, Double>> total = wordCount.collect();
      List<Tuple2<String, Double>> elementsRemoved = new ArrayList<>();
      for (Tuple2<String, Double> t : total) {
        if (!t._1.equals("")) {
          elementsRemoved.add(new Tuple2<>(t._1, t._2 / wordCount.count()));
        }
      }
      ArrayList<Tuple2<String, Double>> freqFinal = new ArrayList<>();
      for (String s : aFileWords) {
        boolean found = false;
        for (Tuple2<String, Double> t : elementsRemoved) {
          if (t._1.equals(s)) {
            found = true;
            freqFinal.add(t);
            break;
          }
        }
        if (!found) {
          freqFinal.add(new Tuple2<String, Double>(s, 0.0));
        }
      }
      double[] v = new double[freqFinal.size()];
      for (int i = 0; i < freqFinal.size(); i++) {
        Tuple2<String, Double> t = freqFinal.get(i);
        v[i] = t._2;
      }
      org.apache.spark.mllib.linalg.Vector vector = Vectors.dense(v);
      double d = model.predict(vector);
      System.out.println(categoriesDict.get(d));
      resultado.put(f.getName(), categoriesDict.get(d));
    }
    jsc.stop();
    try {
      Thread.sleep(2000);
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
    for (String key : resultado.keySet()) {
      System.out.println(key + " - " + resultado.get(key));
    }
  }
  public void run() throws IOException, URISyntaxException {

    final SparkConf conf = new SparkConf().setAppName("FixMipmapUrlClient");
    final JavaSparkContext sparkContext = new JavaSparkContext(conf);

    final String sparkAppId = sparkContext.getConf().getAppId();
    final String executorsJson = LogUtilities.getExecutorsApiJson(sparkAppId);

    LOG.info("run: appId is {}, executors data is {}", sparkAppId, executorsJson);

    final RenderDataClient sourceDataClient =
        new RenderDataClient(parameters.baseDataUrl, parameters.owner, parameters.project);

    final List<Double> zValues =
        sourceDataClient.getStackZValues(parameters.stack, parameters.minZ, parameters.maxZ);

    if (zValues.size() == 0) {
      throw new IllegalArgumentException("source stack does not contain any matching z values");
    }

    final RenderDataClient targetDataClient =
        new RenderDataClient(
            parameters.baseDataUrl, parameters.getTargetOwner(), parameters.getTargetProject());

    final StackMetaData targetStackMetaData =
        targetDataClient.getStackMetaData(parameters.getTargetStack());
    if (!targetStackMetaData.isLoading()) {
      throw new IllegalArgumentException(
          "target stack must be in the loading state, meta data is " + targetStackMetaData);
    }

    final LinkedHashMap<Pattern, String> replacementData =
        loadReplacementData(parameters.replacementDataFile);

    final JavaRDD<Double> rddZValues = sparkContext.parallelize(zValues);

    final Function<Double, Integer> transformFunction =
        new Function<Double, Integer>() {

          @Override
          public Integer call(final Double z) throws Exception {

            LogUtilities.setupExecutorLog4j("z " + z);
            // get the source client
            final RenderDataClient sourceDataClient =
                new RenderDataClient(parameters.baseDataUrl, parameters.owner, parameters.project);
            // get the target client(which can be the same as the source)
            final RenderDataClient targetDataClient =
                new RenderDataClient(
                    parameters.baseDataUrl,
                    parameters.getTargetOwner(),
                    parameters.getTargetProject());

            final ResolvedTileSpecCollection sourceCollection =
                sourceDataClient.getResolvedTiles(parameters.stack, z);

            final boolean fixImage =
                UrlType.BOTH.equals(parameters.urlType) || UrlType.IMAGE.equals(parameters.urlType);
            final boolean fixMask =
                UrlType.BOTH.equals(parameters.urlType) || UrlType.MASK.equals(parameters.urlType);

            boolean fixedAtLeastOneSpec = false;
            ImageAndMask imageAndMask;
            ImageAndMask fixedImageAndMask;
            String imageUrl;
            String maskUrl;
            for (final TileSpec tileSpec : sourceCollection.getTileSpecs()) {
              final Map.Entry<Integer, ImageAndMask> maxEntry =
                  tileSpec.getFloorMipmapEntry(Integer.MAX_VALUE);
              if (maxEntry != null) {
                for (int level = maxEntry.getKey(); level >= 0; level--) {
                  imageAndMask = tileSpec.getMipmap(level);
                  if (imageAndMask != null) {

                    if (fixImage) {
                      imageUrl = imageAndMask.getImageUrl();
                      for (final Pattern p : replacementData.keySet()) {
                        imageUrl = fixUrl(p, imageUrl, replacementData.get(p));
                      }
                    } else {
                      imageUrl = imageAndMask.getImageUrl();
                    }

                    if (fixMask) {
                      maskUrl = imageAndMask.getMaskUrl();
                      for (final Pattern p : replacementData.keySet()) {
                        maskUrl = fixUrl(p, maskUrl, replacementData.get(p));
                      }
                    } else {
                      maskUrl = imageAndMask.getMaskUrl();
                    }

                    fixedImageAndMask = new ImageAndMask(imageUrl, maskUrl);
                    fixedImageAndMask.validate();

                    final boolean imagePathChanged =
                        fixImage && (!imageUrl.equals(imageAndMask.getImageUrl()));
                    final boolean maskPathChanged =
                        fixMask && (!maskUrl.equals(imageAndMask.getMaskUrl()));
                    if (imagePathChanged || maskPathChanged) {
                      fixedAtLeastOneSpec = true;
                      tileSpec.putMipmap(level, fixedImageAndMask);
                    }
                  }
                }
              }
            }

            if (fixedAtLeastOneSpec) {
              targetDataClient.saveResolvedTiles(sourceCollection, parameters.getTargetStack(), z);
            } else {
              LOG.info("no changes necessary for z {}", z);
            }

            return sourceCollection.getTileCount();
          }
        };

    // assign a transformation to the RDD
    final JavaRDD<Integer> rddTileCounts = rddZValues.map(transformFunction);

    // use an action to get the results
    final List<Integer> tileCountList = rddTileCounts.collect();
    long total = 0;

    for (final Integer tileCount : tileCountList) {
      total += tileCount;
    }

    LOG.info("run: collected stats");
    LOG.info("run: saved {} tiles and transforms", total);

    sparkContext.stop();
  }