Example #1
  public static void main(String[] args) {
    if (args.length == 0) {
      System.err.println("Usage: JavaTC <host> [<slices>]");
      System.exit(1);
    }

    JavaSparkContext sc =
        new JavaSparkContext(
            args[0],
            "JavaTC",
            System.getenv("SPARK_HOME"),
            JavaSparkContext.jarOfClass(JavaTC.class));
    Integer slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2;
    JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges =
        tc.mapToPair(
            new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
              @Override
              public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<Integer, Integer>(e._2(), e._1());
              }
            });

    long oldCount = 0;
    long nextCount = tc.count();
    do {
      oldCount = nextCount;
      // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
      // then project the result to obtain the new (x, z) paths.
      tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
      nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    System.exit(0);
  }
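
For comparison, the same join-and-project loop collapses considerably with Java 8 lambdas. This is a minimal sketch, assuming the same generateGraph() helper, the slices value, and a JavaSparkContext named sc; it is not part of the original example.

    // Reversed copy of the edges, so join() matches a path's tail (key y) against edge heads.
    JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();
    JavaPairRDD<Integer, Integer> edges = tc.mapToPair(e -> new Tuple2<>(e._2(), e._1()));

    long oldCount;
    long nextCount = tc.count();
    do {
      oldCount = nextCount;
      // join yields (y, (z, x)); swap the pair to project out the new path (x, z)
      tc = tc.union(tc.join(edges).mapToPair(t -> new Tuple2<>(t._2()._2(), t._2()._1())))
          .distinct()
          .cache();
      nextCount = tc.count();
    } while (nextCount != oldCount);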
Example #2
  @Override
  public void publishAdditionalModelData(
      JavaSparkContext sparkContext,
      PMML pmml,
      JavaRDD<String> newData,
      JavaRDD<String> pastData,
      Path modelParentPath,
      QueueProducer<String, String> modelUpdateQueue) {

    JavaRDD<String> allData = pastData == null ? newData : newData.union(pastData);

    log.info("Sending user / X data as model updates");
    String xPathString = PMMLUtils.getExtensionValue(pmml, "X");
    JavaPairRDD<Integer, double[]> userRDD =
        fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString)));

    if (noKnownItems) {
      userRDD.foreach(new EnqueueFeatureVecsFn("X", modelUpdateQueue));
    } else {
      log.info("Sending known item data with model updates");
      JavaPairRDD<Integer, Collection<Integer>> knownItems = knownsRDD(allData, true);
      userRDD
          .join(knownItems)
          .foreach(new EnqueueFeatureVecsAndKnownItemsFn("X", modelUpdateQueue));
    }

    log.info("Sending item / Y data as model updates");
    String yPathString = PMMLUtils.getExtensionValue(pmml, "Y");
    JavaPairRDD<Integer, double[]> productRDD =
        fromRDD(readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString)));

    // For now, there is no use in sending known users for each item
    // if (noKnownItems) {
    productRDD.foreach(new EnqueueFeatureVecsFn("Y", modelUpdateQueue));
    // } else {
    //  log.info("Sending known user data with model updates");
    //  JavaPairRDD<Integer,Collection<Integer>> knownUsers = knownsRDD(allData, false);
    //  productRDD.join(knownUsers).foreach(
    //      new EnqueueFeatureVecsAndKnownItemsFn("Y", modelUpdateQueue));
    // }
  }
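
The EnqueueFeatureVecsFn used above is defined elsewhere in the project. Purely to illustrate the pattern, a hypothetical version might look like the sketch below; the QueueProducer.send(key, message) signature and the message format are assumptions, not the project's actual API.

    // Hypothetical sketch only -- not the project's real implementation.
    static final class EnqueueFeatureVecsFn implements VoidFunction<Tuple2<Integer, double[]>> {
      private final String whichMatrix;
      private final QueueProducer<String, String> modelUpdateQueue;

      EnqueueFeatureVecsFn(String whichMatrix, QueueProducer<String, String> modelUpdateQueue) {
        this.whichMatrix = whichMatrix;
        this.modelUpdateQueue = modelUpdateQueue;
      }

      @Override
      public void call(Tuple2<Integer, double[]> idAndVector) {
        // Assumed message format: "<id> <vector>", keyed by the matrix name ("X" or "Y").
        modelUpdateQueue.send(
            whichMatrix, idAndVector._1() + " " + Arrays.toString(idAndVector._2()));
      }
    }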
  public static void main(String[] args) throws IOException {
    Parameters param = new Parameters();
    long initTime = System.currentTimeMillis();

    SparkConf conf = new SparkConf().setAppName("StarJoin");

    // Serializer settings must be applied before the context is created,
    // or Spark silently ignores them.
    if (param.useKryo) {
      conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
      conf.set("spark.kryo.registrator", MyBloomFilter.BloomFilterRegistrator.class.getName());
      conf.set("spark.kryoserializer.buffer.mb", param.buffer);
    }

    JavaSparkContext sc = new JavaSparkContext(conf);

    MyBloomFilter.BloomFilter<String> BFS =
        new MyBloomFilter.BloomFilter(1.0, param.bitsS, param.hashes);
    MyBloomFilter.BloomFilter<String> BFD =
        new MyBloomFilter.BloomFilter(1.0, param.bitsD, param.hashes);
    MyBloomFilter.BloomFilter<String> BFC =
        new MyBloomFilter.BloomFilter(1.0, param.bitsC, param.hashes);

    JavaPairRDD<String, String> supps =
        sc.textFile(param.suppPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") | s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> s = supps.collect();
    for (int i = 0; i < s.size(); i++) {
      BFS.add(s.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varS = sc.broadcast(BFS);

    JavaPairRDD<String, String> custs =
        sc.textFile(param.custPath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[3].equals("UNITED KI1") | s[3].equals("UNITED KI5");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[3]);
                  }
                });

    List<Tuple2<String, String>> c = custs.collect();
    for (int i = 0; i < c.size(); i++) {
      BFC.add(c.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varC = sc.broadcast(BFC);

    JavaPairRDD<String, String> dates =
        sc.textFile(param.datePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return s[6].equals("Dec1997");
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String>() {
                  public Tuple2<String, String> call(String[] s) {
                    return new Tuple2<String, String>(s[0], s[4]);
                  }
                });

    List<Tuple2<String, String>> d = dates.collect();
    for (int i = 0; i < d.size(); i++) {
      BFD.add(d.get(i)._1);
    }

    final Broadcast<MyBloomFilter.BloomFilter<String>> varD = sc.broadcast(BFD);

    JavaPairRDD<String, String[]> lines =
        sc.textFile(param.linePath)
            .map(
                new Function<String, String[]>() {
                  public String[] call(String line) {
                    return line.split("\\|");
                  }
                })
            .filter(
                new Function<String[], Boolean>() {
                  public Boolean call(String[] s) {
                    return varC.value().contains(s[2].getBytes())
                        && varS.value().contains(s[4].getBytes())
                        && varD.value().contains(s[5].getBytes());
                  }
                })
            .mapToPair(
                new PairFunction<String[], String, String[]>() {
                  public Tuple2<String, String[]> call(String[] s) {
                    String[] v = {s[2], s[5], s[12]};
                    return new Tuple2<String, String[]>(s[4], v);
                  }
                });

    JavaPairRDD<String, String[]> result =
        lines
            .join(supps)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    result =
        result
            .join(custs)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, String[]>() {
                  public Tuple2<String, String[]> call(Tuple2<String, Tuple2<String[], String>> s) {
                    String[] v = {s._2._1[1], s._2._1[2], s._2._2};
                    return new Tuple2<String, String[]>(s._2._1[0], v);
                  }
                });

    JavaPairRDD<String, Long> final_result =
        result
            .join(dates)
            .mapToPair(
                new PairFunction<Tuple2<String, Tuple2<String[], String>>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, Tuple2<String[], String>> s) {
                    return new Tuple2<String, Long>(
                        s._2._1[2] + "," + s._2._1[1] + "," + s._2._2, Long.parseLong(s._2._1[0]));
                  }
                })
            .reduceByKey(
                new Function2<Long, Long, Long>() {
                  public Long call(Long i1, Long i2) {
                    return i1 + i2;
                  }
                });

    JavaPairRDD<String, String> sub_result =
        final_result.mapToPair(
            new PairFunction<Tuple2<String, Long>, String, String>() {
              public Tuple2<String, String> call(Tuple2<String, Long> line) {
                return new Tuple2<String, String>(line._1 + "," + line._2.toString(), null);
              }
            });

    final_result =
        sub_result
            .sortByKey(new Q3Comparator())
            .mapToPair(
                new PairFunction<Tuple2<String, String>, String, Long>() {
                  public Tuple2<String, Long> call(Tuple2<String, String> line) {
                    String[] s = line._1.split(",");
                    return new Tuple2<String, Long>(
                        s[0] + "," + s[1] + "," + s[2], Long.parseLong(s[3]));
                  }
                });

    Configuration HDFSconf = new Configuration();
    FileSystem fs = FileSystem.get(HDFSconf);
    fs.delete(new Path(param.output), true);

    final_result.saveAsTextFile(param.output);

    long finalTime = System.currentTimeMillis();
    System.out.print("Tempo total(ms): ");
    System.out.println(finalTime - initTime);

    sc.close();
  }
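
The Q3Comparator passed to sortByKey() is defined elsewhere in this job. As a hedged sketch only: the keys built above have the form "c_city,s_city,d_year,revenue", and SSB query 3 orders by year ascending and revenue descending, so a plausible implementation is the following (the field positions are inferred from the key layout above, not confirmed by the source).

    // Hypothetical sketch of Q3Comparator; it must be Serializable for sortByKey().
    public static class Q3Comparator implements Comparator<String>, Serializable {
      @Override
      public int compare(String a, String b) {
        String[] x = a.split(",");
        String[] y = b.split(",");
        int byYear = x[2].compareTo(y[2]); // d_year ascending
        if (byYear != 0) {
          return byYear;
        }
        return Long.compare(Long.parseLong(y[3]), Long.parseLong(x[3])); // revenue descending
      }
    }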
  /**
   * Builds, trains, and decodes the HMM. Must be called after adding the themes and before
   * calculating any strengths.
   *
   * @param sc the spark context
   * @param piThreshold threshold on pi
   * @param aaThreshold threshold on the transition matrix a
   * @param maxIterations maximum number of iterations
   */
  public void analyse(
      JavaSparkContext sc, double piThreshold, double aaThreshold, int maxIterations) {
    int numberHiddenStates = (int) (numberOfThemes + 1);

    int numberObservableOutputSymbols = (int) numberOfWords;

    // setting up a uniform initial state probability distribution
    double[] pi = new double[numberHiddenStates];
    double initialStateDistribution = 1.0 / numberHiddenStates;
    Arrays.fill(pi, initialStateDistribution);

    // setting up the state transition probability distribution
    // (numberHiddenStates x numberHiddenStates; each row must sum to 1)
    double[][] stateTransitionProbabilityDistribution =
        new double[numberHiddenStates][numberHiddenStates];
    double backgroundToThemeProb = 0.5 / (numberHiddenStates - 1);
    for (int i = 0; i < numberHiddenStates; i++) {
      for (int j = 0; j < numberHiddenStates; j++) {
        if (i == 0 && j == 0) {
          // .5 chance to stay in the background model
          stateTransitionProbabilityDistribution[i][j] = 0.5;
        } else if (i == 0) {
          // the remaining .5 is spread equally from the background model to the theme states
          stateTransitionProbabilityDistribution[i][j] = backgroundToThemeProb;
        } else if (j == 0) {
          // .5 chance to return to the background model
          stateTransitionProbabilityDistribution[i][j] = 0.5;
        } else if (i == j) {
          // .5 chance to stay in the same state
          stateTransitionProbabilityDistribution[i][j] = 0.5;
        } else {
          // no direct transition between two different theme states
          stateTransitionProbabilityDistribution[i][j] = 0.0;
        }
      }
    }
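
    // Worked example (assuming numberOfThemes = 2, hence 3 hidden states):
    //   background: [ 0.50  0.25  0.25 ]
    //   theme 1:    [ 0.50  0.50  0.00 ]
    //   theme 2:    [ 0.50  0.00  0.50 ]
    // Every row sums to 1, as a stochastic transition matrix requires.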

    // setting the output probability distribution; check before using it,
    // since the themes must already have been loaded
    if (outputProbabilityDistribution == null) {
      System.out.println(
          "error : you need to specify the themes via "
              + "addAllThemesFromRDD before analyzing the sequence!");
      return;
    }
    outputProbabilityDistribution[0] = bgAsArray;

    // setting up and training the hmm
    hmm =
        new Hmm2(
            numberHiddenStates,
            numberObservableOutputSymbols,
            pi,
            stateTransitionProbabilityDistribution,
            outputProbabilityDistribution);

    JavaPairRDD<Tuple2<Long, Long>, Long> wordStreamZippedWithIndex = wordStream.zipWithIndex();

    JavaRDD<Tuple2<Integer, Integer>> observedSequenceRdd =
        wordStreamZippedWithIndex.map(
            new Function<Tuple2<Tuple2<Long, Long>, Long>, Tuple2<Integer, Integer>>() {

              private static final long serialVersionUID = 1L;

              @Override
              public Tuple2<Integer, Integer> call(Tuple2<Tuple2<Long, Long>, Long> wordEntry)
                  throws Exception {
                return new Tuple2<Integer, Integer>(
                    wordEntry._2.intValue(), wordEntry._1._1.intValue());
              }
            });

    System.out.println("observedSequenceRdd length : " + observedSequenceRdd.count());
    System.out.println(
        "observedSequenceRdd : "
            + Arrays.toString(Arrays.copyOf(observedSequenceRdd.collect().toArray(), 50)));
    hmm.rawTrain(sc, observedSequenceRdd, piThreshold, aaThreshold, maxIterations);

    mostLikelySequenceThemeShifts = hmm.decode(sc, observedSequenceRdd);

    JavaPairRDD<Integer, Tuple2<Long, Long>> wordStreamZippedWithIndexReversed =
        wordStreamZippedWithIndex.mapToPair(
            new PairFunction<Tuple2<Tuple2<Long, Long>, Long>, Integer, Tuple2<Long, Long>>() {

              private static final long serialVersionUID = 1L;

              @Override
              public Tuple2<Integer, Tuple2<Long, Long>> call(Tuple2<Tuple2<Long, Long>, Long> arg0)
                  throws Exception {
                return new Tuple2<Integer, Tuple2<Long, Long>>(arg0._2.intValue(), arg0._1);
              }
            });

    JavaPairRDD<Integer, Tuple2<Integer, Tuple2<Long, Long>>> zippedDecodedSequence =
        mostLikelySequenceThemeShifts.join(wordStreamZippedWithIndexReversed);

    JavaRDD<Tuple2<Long, Integer>> nonZeroMostLikelyByTimestamp =
        zippedDecodedSequence.flatMap(
            new FlatMapFunction<
                Tuple2<Integer, Tuple2<Integer, Tuple2<Long, Long>>>, Tuple2<Long, Integer>>() {

              @Override
              public Iterable<Tuple2<Long, Integer>> call(
                  Tuple2<Integer, Tuple2<Integer, Tuple2<Long, Long>>> arg0) throws Exception {
                ArrayList<Tuple2<Long, Integer>> list = new ArrayList<Tuple2<Long, Integer>>(1);
                if (arg0._2._1 != 0) {
                  list.add(new Tuple2<Long, Integer>(arg0._2._2._2, arg0._2._1));
                }
                return list;
              }
            });

    JavaPairRDD<Long, Iterable<Tuple2<Long, Integer>>> groupedRdd =
        nonZeroMostLikelyByTimestamp.groupBy(
            new Function<Tuple2<Long, Integer>, Long>() {

              @Override
              public Long call(Tuple2<Long, Integer> arg0) throws Exception {
                return arg0._1;
              }
            });
    List<Long> timestampsList = groupedRdd.keys().collect();

    Collections.sort(timestampsList);

    JavaPairRDD<Long, Map<Integer, Integer>> resultByTimestamp =
        groupedRdd.mapValues(
            new Function<Iterable<Tuple2<Long, Integer>>, Map<Integer, Integer>>() {

              @Override
              public Map<Integer, Integer> call(Iterable<Tuple2<Long, Integer>> arg0)
                  throws Exception {
                Map<Integer, Integer> countMap = new HashMap<Integer, Integer>();
                for (Tuple2<Long, Integer> tuple : arg0) {
                  if (countMap.containsKey(tuple._2)) {
                    countMap.put(tuple._2, countMap.get(tuple._2) + 1);
                  } else {
                    countMap.put(tuple._2, 1);
                  }
                }
                return countMap;
              }
            });

    List<Tuple2<Long, Map<Integer, Integer>>> collectedResults = resultByTimestamp.collect();

    Collections.sort(
        collectedResults,
        new Comparator<Tuple2<Long, Map<Integer, Integer>>>() {
          @Override
          public int compare(
              Tuple2<Long, Map<Integer, Integer>> index1,
              Tuple2<Long, Map<Integer, Integer>> index2) {
            return index1._1.compareTo(index2._1);
          }
        });

    // Printing in the appropriate format for the csv file
    for (int themeIndex = 1; themeIndex <= numberOfThemes; themeIndex++) {
      System.out.println("Theme " + (themeIndex - 1));
      Iterator<Tuple2<Long, Map<Integer, Integer>>> resultsIterator = collectedResults.iterator();
      Iterator<Long> timestampsIterator = timestampsList.iterator();

      Tuple2<Long, Map<Integer, Integer>> currentTuple = resultsIterator.next();

      while (timestampsIterator.hasNext()) {
        long timestamp = timestampsIterator.next();
        if (currentTuple != null && timestamp == currentTuple._1) {
          int strength =
              currentTuple._2().get(themeIndex) == null ? 0 : currentTuple._2().get(themeIndex);
          System.out.println(timestamp + "," + strength);
          if (resultsIterator.hasNext()) {
            currentTuple = resultsIterator.next();
          } else {
            currentTuple = null;
          }

        } else {
          System.out.println(timestamp + ",0");
        }
      }
    }

    // Older output format: one MATLAB-style array per theme
    int timeDuration = collectedResults.size();
    int[][] themesStrengths = new int[(int) numberOfThemes][timeDuration];

    for (int timeIndex = 0; timeIndex < timeDuration; timeIndex++) {
      Tuple2<Long, Map<Integer, Integer>> tuple = collectedResults.get(timeIndex);
      for (Entry<Integer, Integer> entry : tuple._2.entrySet()) {
        themesStrengths[entry.getKey() - 1][timeIndex] = entry.getValue();
      }
    }

    System.out.println("");
    for (int i = 0; i < numberOfThemes; i++) {
      System.out.print("themeStrength_" + i + " = [");
      for (int j = 0; j < timeDuration; j++) {
        System.out.print(" " + themesStrengths[i][j]);
      }
      System.out.println("];");
    }
  }
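
A minimal, hypothetical driver for this method, illustrating the contract stated in the javadoc. The host class name ThemeLifecycleAnalyzer, the themesRdd input, and the threshold values are all assumptions for illustration; only analyse() and addAllThemesFromRDD appear in the source.

    SparkConf conf = new SparkConf().setAppName("ThemeStrengths");
    JavaSparkContext sc = new JavaSparkContext(conf);

    ThemeLifecycleAnalyzer analyzer = new ThemeLifecycleAnalyzer();
    analyzer.addAllThemesFromRDD(themesRdd); // must run before analyse(), per the javadoc
    analyzer.analyse(sc, 1e-4, 1e-4, 100);   // example thresholds and iteration cap
    sc.close();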