@SuppressWarnings("serial")
@Override
public SortedCounts<String> execute(final JavaSparkContext spark) {
  // Load the raw tweet JSON records, one per line.
  final JavaRDD<String> rawTweets = spark.textFile(inputFile);

  // Parse each JSON record into a Status and emit its space-separated tokens.
  final JavaRDD<String> tokens =
      rawTweets.flatMap(
          new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(final String rawJSON) throws TwitterException {
              final Status tweet = TwitterObjectFactory.createStatus(rawJSON);
              return Arrays.asList(tweet.getText().split(" "));
            }
          });

  // Lower-case every token and pair it with an initial count of one.
  final JavaPairRDD<String, Integer> tokenOnes =
      tokens.mapToPair(
          new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(final String token) {
              return new Tuple2<String, Integer>(token.toLowerCase(), 1);
            }
          });

  // Sum the per-token counts.
  final JavaPairRDD<String, Integer> tokenCounts =
      tokenOnes.reduceByKey(
          new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(final Integer left, final Integer right) {
              return left + right;
            }
          });

  // Wrap the counts in the project's sorted result type.
  return SortedCounts.create(tokenCounts);
}
public static void main(String[] args) {
  // Positional arguments: Spark master URL, application name, input path.
  String master = args[0];
  String appName = args[1];
  String path = args[2];

  SparkConf conf = new SparkConf().setAppName(appName).setMaster(master);
  JavaSparkContext sc = new JavaSparkContext(conf);

  // Drop blank rows and the "Total" summary row.
  JavaRDD<String> lines =
      sc.textFile(path)
          .filter(
              new Function<String, Boolean>() {
                @Override
                public Boolean call(String row) throws Exception {
                  return !row.isEmpty() && !row.contains("Total");
                }
              });

  // Restrict to United States rows.
  JavaRDD<String> usOnly =
      lines.filter(
          new Function<String, Boolean>() {
            @Override
            public Boolean call(String row) throws Exception {
              return row.contains("United States");
            }
          });

  // Pair CSV column 3 with the integer in column 8.
  // NOTE(review): column meanings (year -> medal count) are inferred from the
  // indices only — confirm against the actual data file.
  JavaPairRDD<String, Integer> yearAndMedals =
      usOnly.mapToPair(
          new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String row) throws Exception {
              String[] columns = row.split(",");
              return new Tuple2<String, Integer>(columns[3], Integer.parseInt(columns[8]));
            }
          });

  // Sum the counts per key.
  JavaPairRDD<String, Integer> reduced =
      yearAndMedals.reduceByKey(
          new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer left, Integer right) throws Exception {
              return left + right;
            }
          });

  // Keep only keys whose total is below 200.
  JavaPairRDD<String, Integer> result =
      reduced.filter(
          new Function<Tuple2<String, Integer>, Boolean>() {
            @Override
            public Boolean call(Tuple2<String, Integer> entry) throws Exception {
              return entry._2 < 200;
            }
          });

  // Print a separating blank line, then the collected results.
  System.out.println();
  System.out.println(result.collect());
}
public static void main(String[] args) { if (args.length == 0) { System.err.println("Usage: Main <file>"); System.exit(1); } SparkConf conf = new SparkConf().setAppName("Days of the week by on-time arrival performance"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> lines = sc.textFile(args[0]); JavaPairRDD<String, Double> dayArrivalDelayPair = lines.flatMapToPair( line -> { String[] splitLine = line.split(SPLIT_PATTERN); String key = splitLine.length == 0 ? "" : splitLine[0]; Double value = splitLine.length < 2 ? value = 0.0 : Double.valueOf(splitLine[1]); return Arrays.asList(new Tuple2<>(key, value)); }); JavaPairRDD<String, AverageWrapper> dayAverageWrapper = dayArrivalDelayPair.mapValues(value -> new AverageWrapper(value, 1)); JavaPairRDD<String, AverageWrapper> daysValueCount = dayAverageWrapper.reduceByKey( (aw1, aw2) -> new AverageWrapper( aw1.getValue() + aw2.getValue(), aw1.getCount() + aw2.getCount())); Map<String, AverageWrapper> resultMap = daysValueCount.collectAsMap(); List<Map.Entry<String, AverageWrapper>> listResults = new ArrayList<>(); listResults.addAll(resultMap.entrySet()); Collections.sort( listResults, (entry1, entry2) -> Double.valueOf(entry1.getValue().getValue()).compareTo(entry2.getValue().getValue())); for (Map.Entry<String, AverageWrapper> entry : listResults) { System.out.printf( "%s -> (%f, %d)\n", entry.getKey(), entry.getValue().getValue(), entry.getValue().getCount()); } // JavaPairRDD<String, Double> resultRDD = // daysValueCount.mapValues(averageWrapper -> averageWrapper.getValue() / // averageWrapper.getCount()); // // Map<String, Double> results = resultRDD.collectAsMap(); // List<Map.Entry<String, Double>> listResults = new ArrayList<>(); // listResults.addAll(results.entrySet()); // Collections.sort(listResults, (entry1, entry2) -> // entry1.getValue().compareTo(entry2.getValue())); // // for (Map.Entry<String, Double> entry : listResults) { // System.out.printf("%s:\t%f\n", entry.getKey(), 
entry.getValue()); // } }
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: JavaWordCount <input_file> <output_file>");
    System.exit(1);
  }

  SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
  JavaSparkContext ctx = new JavaSparkContext(sparkConf);

  // Read the input as a single-partition RDD of lines.
  JavaRDD<String> lines = ctx.textFile(args[0], 1);

  // Word-count pipeline: tokenize on the shared SPACE pattern, pair each
  // token with 1, then sum per token.
  JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(SPACE.split(line)).iterator());
  JavaPairRDD<String, Integer> ones = words.mapToPair(word -> new Tuple2<>(word, 1));
  JavaPairRDD<String, Integer> counts = ones.reduceByKey((a, b) -> a + b);

  // Persist the (word, count) pairs to the requested output location.
  counts.saveAsTextFile(args[1]);
  ctx.stop();
}
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaWordCount <master> <file>");
    System.exit(1);
  }

  // Legacy constructor form: master URL, app name, Spark home, and the jar
  // containing this class, shipped to the executors.
  JavaSparkContext ctx =
      new JavaSparkContext(
          args[0],
          "JavaWordCount",
          System.getenv("SPARK_HOME"),
          JavaSparkContext.jarOfClass(JavaWordCount.class));

  JavaRDD<String> lines = ctx.textFile(args[1], 1);

  // Split each line on the shared SPACE pattern.
  JavaRDD<String> words =
      lines.flatMap(
          new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) {
              return Arrays.asList(SPACE.split(s));
            }
          });

  // Pair every word with an initial count of one.
  JavaPairRDD<String, Integer> ones =
      words.mapToPair(
          new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) {
              return new Tuple2<String, Integer>(s, 1);
            }
          });

  // Sum the per-word counts.
  JavaPairRDD<String, Integer> counts =
      ones.reduceByKey(
          new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) {
              return i1 + i2;
            }
          });

  // Print every (word, count) pair on the driver.
  List<Tuple2<String, Integer>> output = counts.collect();
  for (Tuple2<?, ?> tuple : output) {
    System.out.println(tuple._1 + ": " + tuple._2);
  }

  // FIX: shut the Spark context down cleanly before exiting; the original
  // called System.exit(0) with the context still running, skipping its cleanup.
  ctx.stop();
  System.exit(0);
}
public static void main(String[] args) throws Exception {
  if (args.length < 1) {
    System.err.println("Usage: JavaWordCount <file>");
    System.exit(1);
  }

  SparkSession spark = SparkSession.builder().appName("JavaWordCount").getOrCreate();

  // Read the file through the SQL reader, then drop back to an RDD of lines.
  JavaRDD<String> lines = spark.read().text(args[0]).javaRDD();

  // Word-count pipeline: tokenize on the shared SPACE pattern, pair each
  // token with 1, then sum per token.
  JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(SPACE.split(line)).iterator());
  JavaPairRDD<String, Integer> ones = words.mapToPair(word -> new Tuple2<>(word, 1));
  JavaPairRDD<String, Integer> counts = ones.reduceByKey((a, b) -> a + b);

  // Print every (word, count) pair on the driver, then shut down.
  List<Tuple2<String, Integer>> output = counts.collect();
  for (Tuple2<?, ?> tuple : output) {
    System.out.println(tuple._1() + ": " + tuple._2());
  }
  spark.stop();
}
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setMaster("local").setAppName("My App");
  JavaSparkContext sc = new JavaSparkContext(conf);

  JavaRDD<String> lines = sc.textFile("src/main/resources/data.txt");

  // Word-count pipeline. Lambdas replace the original anonymous classes, so
  // no @SuppressWarnings("serial") is needed on each step.
  JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")));
  JavaPairRDD<String, Integer> ones = words.mapToPair(word -> new Tuple2<>(word, 1));
  JavaPairRDD<String, Integer> counts = ones.reduceByKey((a, b) -> a + b);

  // Print each word with its count, e.g. "foo-> 3".
  List<Tuple2<String, Integer>> output = counts.collect();
  for (Tuple2<?, ?> tuple : output) {
    System.out.println(tuple._1() + "-> " + tuple._2());
  }
  sc.close();
}
public static void main(String[] args) { // STEP 1: create a SparkConf object if (args.length < 1) { log.fatal("Syntax Error: there must be one argument (a file name or a directory)"); throw new RuntimeException(); } // STEP 2: create a SparkConf object SparkConf sparkConf = new SparkConf().setAppName("Trending Topic"); // STEP 3: create a Java Spark context JavaSparkContext sparkContext = new JavaSparkContext(sparkConf); // STEP 4: read lines of files JavaRDD<String> lines = sparkContext.textFile(args[0]); JavaRDD<String> words; words = lines.flatMap( new FlatMapFunction<String, String>() { @Override public Iterable call(String s) throws Exception { return Arrays.asList(s.split("\t")[2].split(" ")); } }); JavaPairRDD<String, Integer> ones; ones = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String string) { return new Tuple2<>(string, 1); } }); JavaPairRDD<String, Integer> counts; counts = ones.reduceByKey( new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer integer, Integer integer2) throws Exception { return integer + integer2; } }); // Es necesario invertir las tuplas ya que no podemos ordenar por valor, sino por clave JavaPairRDD<Integer, String> swapped; swapped = counts.mapToPair( new PairFunction<Tuple2<String, Integer>, Integer, String>() { @Override public Tuple2<Integer, String> call(Tuple2<String, Integer> tupla) throws Exception { return tupla.swap(); } }); // STEP 7: sort the results by key List<Tuple2<Integer, String>> output = swapped.sortByKey().collect(); // El ejercicio dice que quitemos las palabras que no aportan nada. Para ello podríamos ponerlas // en un fichero y leerlas y luego obviar esas. 
Vamos a obviar esa parte ya que se entiende y no // es el caso del ejercicio List<String> excluyentes = new LinkedList<>(); excluyentes.add("rt"); excluyentes.add("http"); excluyentes.add("https"); excluyentes.add("www"); for (Tuple2<Integer, String> t : output) { if (excluyentes.contains(t._2)) { output.remove(t); } } // STEP 8: print the results for (int i = 0; i < 10; i++) { Tuple2<Integer, String> tuple; tuple = output.get(i); System.out.println(tuple._2() + ": " + tuple._1()); } // STEP 9: stop the spark context sparkContext.stop(); }
public static void main(String[] args) throws Exception {
  SparkConf sparkConf = new SparkConf().setAppName("ShopJsonParse");
  JavaSparkContext ctx = new JavaSparkContext(sparkConf);

  // Dictionary file: lines of "<word> <tag>". From the branches below the tags
  // appear to mean 1 = subject word (zhuyu), 2 = positive word (haoping),
  // 3 = negative word (chaping) — TODO confirm against the actual file.
  JavaRDD<String> ciku = ctx.textFile("hdfs://hadoop119:9000/ciku/ciku_zhuyu.txt", 1);

  // Subject words: keep entries tagged "1"; everything else collapses to the
  // sentinel "kaer", which distinct() reduces to a single harmless entry.
  JavaRDD<String> zhuyu =
      ciku.map(
              new Function<String, String>() {
                @Override
                public String call(String s) throws Exception {
                  String[] str = s.split(" ");
                  if (str[1].equals("1")) return str[0];
                  else return "kaer";
                }
              })
          .distinct()
          .cache();

  // Positive-sentiment words (tag "2"), same sentinel scheme.
  JavaRDD<String> haoping =
      ciku.map(
              new Function<String, String>() {
                @Override
                public String call(String s) throws Exception {
                  String[] str = s.split(" ");
                  if (str[1].equals("2")) return str[0];
                  else return "kaer";
                }
              })
          .distinct()
          .cache();

  // Negative-sentiment words (tag "3"), same sentinel scheme.
  JavaRDD<String> chaping =
      ciku.map(
              new Function<String, String>() {
                @Override
                public String call(String s) throws Exception {
                  String[] str = s.split(" ");
                  if (str[1].equals("3")) return str[0];
                  else return "kaer";
                }
              })
          .distinct()
          .cache();

  // Materialize the dictionaries on the driver so the closures below can
  // capture them.
  final List<String> zhuyulist = zhuyu.collect();
  final List<String> hplist = haoping.collect();
  final List<String> cplist = chaping.collect();

  // Raw review JSON, one record per line.
  JavaRDD<String> mongoratedata = ctx.textFile("hdfs://hadoop119:9000/shopdata/ratelist.json");

  // Parse each JSON line into {"PlatformItemId": <nid>, "ratelist": <parsed rate>}.
  // Parse failures are only logged and yield a map missing those keys.
  JavaRDD<Map<String, Object>> mongorateall =
      mongoratedata.map(
          new Function<String, Map<String, Object>>() {
            @Override
            public Map<String, Object> call(String line) throws Exception {
              return ParseLineToMap(line);
            }

            // Extracts the item id ("nid") and the rate payload from one JSON line.
            private Map<String, Object> ParseLineToMap(String line) {
              Map<String, Object> documentMap = new HashMap<String, Object>();
              try {
                JSONObject jsonline = new JSONObject(line);
                documentMap.put("PlatformItemId", jsonline.get("nid").toString());
                Gson gson = new Gson();
                rate rate = gson.fromJson(jsonline.get("rate").toString(), rate.class);
                documentMap.put("ratelist", rate.parsemod());
              } catch (JSONException e) {
                e.printStackTrace();
              }
              return documentMap;
            }
          });

  // Flatten to (itemId, reviewText) pairs; items with no ratelist emit the
  // literal value "null" so downstream stages can filter them.
  JavaPairRDD<String, String> Rates =
      mongorateall.flatMapToPair(
          new PairFlatMapFunction<Map<String, Object>, String, String>() {
            @Override
            public Iterable<Tuple2<String, String>> call(Map<String, Object> map)
                throws Exception {
              ArrayList<Tuple2<String, String>> flatmaps =
                  new ArrayList<Tuple2<String, String>>();
              String itemid = (String) map.get("PlatformItemId");
              String itempro = "";
              Map<String, String> ratelist = (Map<String, String>) map.get("ratelist");
              if (ratelist == null) {
                itempro = "null";
                flatmaps.add(new Tuple2<String, String>(itemid, itempro));
              } else {
                for (String value : ratelist.values()) {
                  itempro = value;
                  flatmaps.add(new Tuple2<String, String>(itemid, itempro));
                }
              }
              return flatmaps;
            }
          });

  final Pattern SPACES = Pattern.compile("\\s+");

  // Split multi-fragment review values on whitespace, dropping empty fragments;
  // values without a space pass through unchanged.
  JavaPairRDD<String, String> sentences =
      Rates.flatMapValues(
          new Function<String, Iterable<String>>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
              ArrayList<String> list = new ArrayList<String>();
              if (s.contains(" ")) {
                String[] str = SPACES.split(s);
                int num = 0;
                while (num < str.length) {
                  if (!str[num].equals("")) list.add(str[num]);
                  num++;
                }
              } else {
                list.add(s);
              }
              return list;
            }
          });

  // Chinese stop words (particles/adverbs) stripped from opinion fragments.
  String filter = "的 也 很 都 了 非常 有些 还 是 点 些 就 看起来 看上去 更 呢 哦 确实 什么的 较 太 啊 吧 得 那么 什么 挺";
  final String[] list = filter.split(" ");

  // For each fragment, locate the first known subject word and return
  // "<subject> <opinion>"; fragments shorter than 3 chars return "<s> kaer",
  // and fragments with no known subject return "long null".
  JavaPairRDD<String, String> words =
      sentences.mapValues(
          new Function<String, String>() {
            @Override
            public String call(String s) throws Exception {
              if (s.length() < 3) {
                return s + " " + "kaer";
              }
              for (int i = 0; i < zhuyulist.size(); i++) {
                String zhuyu = zhuyulist.get(i);
                if (s.contains(zhuyu)) {
                  // Blank out the subject, then take the text after the last
                  // blank (or before it, when nothing follows) as the opinion.
                  s = s.replace(zhuyu, " ");
                  int size = s.length();
                  int tap = s.lastIndexOf(" ");
                  String ss = "kaer";
                  if (tap + 1 < size) {
                    ss = s.substring(tap + 1, size);
                  } else {
                    if (tap - 1 > 0) ss = s.substring(0, tap - 1);
                  }
                  // Remove stop words from the opinion fragment.
                  for (String tem : list) {
                    if (ss.contains(tem)) ss = ss.replace(tem, "");
                  }
                  return zhuyu + " " + ss;
                }
              }
              return "long null";
            }
          });

  // Classify every "<subject> <opinion>" value using the sentiment
  // dictionaries — prefixes are 好评 (positive), 中评 (neutral), 差评
  // (negative) — then drop anything still containing "null".
  JavaPairRDD<String, String> filterwords =
      words
          .mapValues(
              new Function<String, String>() {
                @Override
                public String call(String s) throws Exception {
                  String tempstr;
                  if (s.contains("kaer")) {
                    // No usable opinion text: classify the subject token itself.
                    tempstr = s.substring(0, s.indexOf(" "));
                    for (int i = 0; i < cplist.size(); i++) {
                      if (tempstr.equals(cplist.get(i))) return "差评 " + "," + tempstr;
                    }
                    for (int i = 0; i < hplist.size(); i++) {
                      if (tempstr.equals(hplist.get(i))) return "好评 " + "," + tempstr;
                    }
                    return "中评 " + "," + tempstr;
                  } else if (s.contains("null")) {
                    return s + ",null";
                  } else {
                    if (s.endsWith(" ")) return "long null,null";
                    // Classify by the opinion token that follows the subject.
                    tempstr = s.split(" ")[1];
                    for (int i = 0; i < cplist.size(); i++) {
                      if (tempstr.equals(cplist.get(i)))
                        return "差评 " + s.split(" ")[0] + "," + tempstr;
                    }
                    for (int i = 0; i < hplist.size(); i++) {
                      if (tempstr.equals(hplist.get(i)))
                        return "好评 " + s.split(" ")[0] + "," + tempstr;
                    }
                    return "中评 " + s.split(" ")[0] + "," + tempstr;
                  }
                }
              })
          .filter(
              new Function<Tuple2<String, String>, Boolean>() {
                @Override
                public Boolean call(Tuple2<String, String> line) throws Exception {
                  if (line._2.contains("null")) return false;
                  else return true;
                }
              });

  // Encode each classification as a "pos,neu,neg" one-hot count vector.
  JavaPairRDD<String, String> ones =
      filterwords.mapToPair(
          new PairFunction<Tuple2<String, String>, String, String>() {
            @Override
            public Tuple2<String, String> call(Tuple2<String, String> line) throws Exception {
              String key = line._1();
              String value = "0,0,0", ll = line._2;
              if (ll.startsWith("好评")) value = "1,0,0";
              else if (ll.startsWith("中评")) value = "0,1,0";
              else if (ll.startsWith("差评")) value = "0,0,1";
              return new Tuple2<String, String>(key, value);
            }
          });

  // Sum the per-item count vectors component-wise.
  JavaPairRDD<String, String> result =
      ones.reduceByKey(
          new Function2<String, String, String>() {
            @Override
            public String call(String s1, String s2) throws Exception {
              double h1 = Double.parseDouble(s1.split(",")[0]),
                  h2 = Double.parseDouble(s1.split(",")[1]),
                  h3 = Double.parseDouble(s1.split(",")[2]);
              double hh1 = Double.parseDouble(s2.split(",")[0]),
                  hh2 = Double.parseDouble(s2.split(",")[1]),
                  hh3 = Double.parseDouble(s2.split(",")[2]);
              return (h1 + hh1) + "," + (h2 + hh2) + "," + (h3 + hh3);
            }
          });

  // Score = positive / (positive + negative) as a truncated percentage; items
  // with no polarized reviews default to 50.
  JavaPairRDD<String, Integer> rateresult =
      result.mapValues(
          new Function<String, Integer>() {
            @Override
            public Integer call(String s1) throws Exception {
              double h1 = Double.parseDouble(s1.split(",")[0]),
                  h2 = Double.parseDouble(s1.split(",")[1]),
                  h3 = Double.parseDouble(s1.split(",")[2]);
              if (h1 + h3 == 0) return 50;
              else {
                return (int) (h1 / (h1 + h3) * 100);
              }
            }
          });

  // Product detail records, keyed by PlatformItemId.
  JavaRDD<String> mongocontentdata =
      ctx.textFile("hdfs://hadoop119:9000/shopdata/ProductContent.json");
  JavaRDD<Map<String, Object>> mongocontentall =
      mongocontentdata.map(
          new Function<String, Map<String, Object>>() {
            @Override
            public Map<String, Object> call(String line) throws Exception {
              return new ShopParse().ParseLine(line);
            }
          });
  JavaPairRDD<String, Map<String, Object>> content =
      mongocontentall.mapToPair(
          new PairFunction<Map<String, Object>, String, Map<String, Object>>() {
            @Override
            public Tuple2<String, Map<String, Object>> call(Map<String, Object> map)
                throws Exception {
              return new Tuple2<String, Map<String, Object>>(
                  map.get("PlatformItemId").toString(), map);
            }
          });

  // Product list records, keyed the same way and restricted to entries whose
  // "isdownloads" flag is "true".
  JavaRDD<String> mongoproListdata =
      ctx.textFile("hdfs://hadoop119:9000/shopdata/productList.json");
  JavaRDD<Map<String, Object>> mongoproListall =
      mongoproListdata.map(
          new Function<String, Map<String, Object>>() {
            @Override
            public Map<String, Object> call(String line) throws Exception {
              return new ShopParse().ParseproList(line);
            }
          });
  System.out.println("mongoproListall counts :" + mongoproListall.count());
  JavaPairRDD<String, Map<String, Object>> proList =
      mongoproListall
          .mapToPair(
              new PairFunction<Map<String, Object>, String, Map<String, Object>>() {
                @Override
                public Tuple2<String, Map<String, Object>> call(Map<String, Object> map)
                    throws Exception {
                  return new Tuple2<String, Map<String, Object>>(
                      map.get("PlatformItemId").toString(), map);
                }
              })
          .filter(
              new Function<Tuple2<String, Map<String, Object>>, Boolean>() {
                @Override
                public Boolean call(Tuple2<String, Map<String, Object>> line) throws Exception {
                  if (line._2.get("isdownloads").toString().equals("true")) return true;
                  else return false;
                }
              });
  System.out.println("proList counts :" + proList.count());

  // Inner-join detail and list records, folding the list fields into the
  // detail map.
  JavaRDD<Map<String, Object>> ContJoinPro =
      content
          .join(proList)
          .map(
              new Function<
                  Tuple2<String, Tuple2<Map<String, Object>, Map<String, Object>>>,
                  Map<String, Object>>() {
                @Override
                public Map<String, Object> call(
                    Tuple2<String, Tuple2<Map<String, Object>, Map<String, Object>>> line)
                    throws Exception {
                  Map<String, Object> mapprod = line._2._1;
                  mapprod.put("Name", line._2._2.get("Name"));
                  mapprod.put("Photo", line._2._2.get("Photo"));
                  mapprod.put("SellerId", line._2._2.get("SellerId"));
                  mapprod.put("StoreName", line._2._2.get("StoreName"));
                  mapprod.put("Url", line._2._2.get("Url"));
                  mapprod.put("TaokeUrl", line._2._2.get("TaokeUrl"));
                  return mapprod;
                }
              });

  // Serialize every product into one "@=@=@"-delimited record string keyed by
  // item id. Field meanings below are inferred from the map keys — verify.
  JavaPairRDD<String, String> Messages =
      ContJoinPro.mapToPair(
          new PairFunction<Map<String, Object>, String, String>() {
            @Override
            public Tuple2<String, String> call(Map<String, Object> map) throws Exception {
              String itemid = (String) map.get("PlatformItemId");
              String itempro = "";
              // Source flag: "2" when isTmall is "true", otherwise "1".
              String From = (String) map.get("isTmall");
              if (From.equals("true")) From = "2";
              else From = "1";
              String Quantity = (String) map.get("Quantity");
              String CmtCount = (String) map.get("ratecount");
              String ImgPath = (String) map.get("detailmessage");
              String[] ImgPaths = ImgPath.split("@=@=@"); // 1-5
              String mobprice = (String) map.get("mobmessage");
              String pcprice = (String) map.get("pcpricemessage");
              String minmaxPrice = (String) map.get("MaxMinPrice");
              String OriginalPrice = (String) map.get("OriginalPrice");
              double p1 = Double.parseDouble(mobprice);
              double p2 = Double.parseDouble(pcprice.split("@=@=@")[0]);
              double min = Double.parseDouble(minmaxPrice.split(",")[0]);
              double max = Double.parseDouble(minmaxPrice.split(",")[1]);
              double origin = Double.parseDouble(OriginalPrice);
              // Final price: the cheaper of mobile and PC; 100000.00 appears to
              // act as a "no price" sentinel replaced by the minimum price.
              double Price = p1;
              if (Price > p2) Price = p2;
              if (Price == 100000.00) Price = min;
              if (origin < max) OriginalPrice = max + "";
              // Free-shipping flag: second PC price field starting with "0.00".
              String IsPost = "0";
              if (!pcprice.endsWith("@=@=@") && pcprice.split("@=@=@")[1].startsWith("0.00"))
                IsPost = "1";
              String Name = (String) map.get("Name");
              String SellerId = (String) map.get("SellerId");
              String StoreName = (String) map.get("StoreName");
              String Photo = (String) map.get("Photo");
              String Url = (String) map.get("Url");
              String TaokeUrl = (String) map.get("TaokeUrl");
              DecimalFormat ddf = new DecimalFormat("#0.00");
              String Discount = ddf.format(Price / Double.parseDouble(OriginalPrice)) + "";
              SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
              String AddTime = df.format(new Date());
              String IsSell = "1";
              String Type = "2";
              String IsChangeImgPath = "0";
              String HotKeyId = "0";
              String OpenIid = "0";
              itempro =
                  From + "@=@=@" + Quantity + "@=@=@" + CmtCount + "@=@=@" + ImgPaths[0]
                      + "@=@=@" + ImgPaths[1] + "@=@=@" + ImgPaths[2] + "@=@=@" + ImgPaths[3]
                      + "@=@=@" + ImgPaths[4] + "@=@=@" + Price + "@=@=@" + IsPost + "@=@=@"
                      + Name + "@=@=@" + SellerId + "@=@=@" + StoreName + "@=@=@"
                      + OriginalPrice + "@=@=@" + Photo + "@=@=@" + Url + "@=@=@" + Discount
                      + "@=@=@" + AddTime + "@=@=@" + IsSell + "@=@=@" + Type + "@=@=@"
                      + IsChangeImgPath + "@=@=@" + HotKeyId + "@=@=@" + TaokeUrl + "@=@=@"
                      + OpenIid;
              return new Tuple2<String, String>(itemid, itempro);
            }
          });

  // Left-join the sentiment score onto each record (default 50 when absent)
  // and append it as the final field.
  JavaRDD<String> MessagesAll =
      Messages.leftOuterJoin(rateresult)
          .map(
              new Function<Tuple2<String, Tuple2<String, Optional<Integer>>>, String>() {
                @Override
                public String call(Tuple2<String, Tuple2<String, Optional<Integer>>> line)
                    throws Exception {
                  Optional<Integer> possible = line._2._2;
                  int fenshu = 50;
                  if (possible.isPresent()) fenshu = line._2._2.get();
                  return line._1 + "@=@=@" + line._2._1 + "@=@=@" + fenshu;
                }
              });

  // Persist all records to MySQL on the driver, then shut the context down.
  List<String> messages = MessagesAll.collect();
  new MessageToMysql().insert(messages);
  ctx.stop();
}