Example #1
 /** Writes an HPACK string literal, Huffman-coding it only when that saves space. */
 private void writeString(final ByteBuf buf, final AsciiString s) {
   final int encodedLength = Huffman.encodedLength(s);
   if (encodedLength < s.length()) {
     // The Huffman form is strictly shorter than the raw octets: emit the compressed literal
     Hpack.writeHuffmanString(buf, s, encodedLength);
   } else {
     // No savings from Huffman coding: emit the raw octets
     Hpack.writeRawString(buf, s);
   }
 }
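
RFC 7541 leaves the raw-vs-Huffman choice to the encoder, and this method simply picks whichever form is shorter. A minimal usage sketch (a hypothetical call site inside the same class, since writeString is private; Unpooled and AsciiString are Netty's):

 ByteBuf buf = Unpooled.buffer();
 // "www.example.com" Huffman-codes to 12 octets vs. 15 raw, so the compressed
 // branch is taken; "*" needs a full 8-bit Huffman code and so stays raw.
 writeString(buf, AsciiString.of("www.example.com"));
 writeString(buf, AsciiString.of("*"));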
Example #2
  @Test
  public void testHuffmanDecode() throws Exception {
    final String expected = "https://www.example.com";

    // Huffman coding of "https://www.example.com", as given in the
    // examples of RFC 7541 (HPACK), Appendix C
    final byte[] encoded =
        TestUtil.bytes(
            0x9d, 0x29, 0xad, 0x17, 0x18, 0x63, 0xc7, 0x8f, 0x0b, 0x97, 0xc8, 0xe9, 0xae, 0x82,
            0xae, 0x43, 0xd3);

    final ByteBuf in = Unpooled.wrappedBuffer(encoded);
    final ByteBuf out = Unpooled.buffer();

    Huffman.decode(in, out);

    final String decoded = out.toString(US_ASCII);
    assertThat(decoded, is(expected));
  }
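
The bytes above are a standard HPACK test vector, and the same pattern extends to the others. A sketch of one more decode case, assuming the same TestUtil and Huffman helpers as above; the bytes are the RFC 7541 Appendix C.4.1 Huffman coding of "www.example.com":

  @Test
  public void testHuffmanDecodeHost() throws Exception {
    final String expected = "www.example.com";

    final byte[] encoded =
        TestUtil.bytes(0xf1, 0xe3, 0xc2, 0xe5, 0xf2, 0x3a, 0x6b, 0xa0, 0xab, 0x90, 0xf4, 0xff);

    final ByteBuf in = Unpooled.wrappedBuffer(encoded);
    final ByteBuf out = Unpooled.buffer();

    Huffman.decode(in, out);

    assertThat(out.toString(US_ASCII), is(expected));
  }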
Example #3
  // Trains word2vec on the given corpus of sentences
  public void train(JavaRDD<String> corpusRDD) throws Exception {
    log.info("Start training ...");

    // Obtain a JavaSparkContext from the corpus RDD's underlying context
    final JavaSparkContext sc = new JavaSparkContext(corpusRDD.context());

    // Pre-defined configuration maps for the tokenizer and for word2vec
    Map<String, Object> tokenizerVarMap = getTokenizerVarMap();
    Map<String, Object> word2vecVarMap = getWord2vecVarMap();

    // Variables to be filled in during training
    final JavaRDD<List<VocabWord>> vocabWordListRDD;
    final VocabCache vocabCache;

    // Start Training //
    //////////////////////////////////////
    log.info("Tokenization and building VocabCache ...");
    // Process every sentence and build a VocabCache, which gets fed into a LookupCache
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(tokenizerVarMap);
    TextPipeline pipeline =
        new TextPipeline(corpusRDD.repartition(numPartitions), broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    // Put the per-partition word count into the word2vec variable map
    word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount() / numPartitions);

    // The vocab word list RDD is already cached by the pipeline
    vocabWordListRDD = pipeline.getVocabWordListRDD();

    // Get the broadcast VocabCache and materialize its local value
    Broadcast<VocabCache> vocabCacheBroadcast = pipeline.getBroadCastVocabCache();
    vocabCache = vocabCacheBroadcast.getValue();

    //////////////////////////////////////
    log.info("Building Huffman Tree ...");
    // Building the Huffman tree updates the code and point of every VocabWord in the vocabCache
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();

    /////////////////////////////////////
    log.info("Training word2vec sentences ...");

    word2vecVarMap.put("vecNum", vocabCache.numWords());

    // syn0 vectors keyed by (row index, slot k): rows 0..numWords-1 hold the K
    // per-word vectors; rows numWords..2*numWords-2 hold the Huffman inner-node vectors
    Map<Pair<Integer, Integer>, INDArray> s0 = new HashMap<>();
    for (int k = 0; k < K; k++) {
      for (int i = 0; i < vocabCache.numWords(); i++) {
        s0.put(new Pair<>(i, k), getRandomSyn0Vec(vectorLength));
      }
    }
    for (int i = vocabCache.numWords(); i < vocabCache.numWords() * 2 - 1; i++) {
      s0.put(new Pair<>(i, 0), Nd4j.zeros(1, vectorLength));
    }

    for (int i = 0; i < iterations; i++) {
      System.out.println("iteration: " + i);

      // Linearly decay the learning rate from alpha down to minAlpha over all iterations
      word2vecVarMap.put("alpha", alpha - (alpha - minAlpha) / iterations * i);
      word2vecVarMap.put("minAlpha", alpha - (alpha - minAlpha) / iterations * (i + 1));

      FlatMapFunction firstIterationFunction =
          new FirstIterationFunction(word2vecVarMap, expTable, sc.broadcast(s0));

      // Converts each Map.Entry produced by FirstIterationFunction into a Spark Tuple2,
      // typed to match the JavaPairRDD declared below
      class MapPairFunction
          implements PairFunction<
              Map.Entry<Pair<Integer, Integer>, INDArray>, Pair<Integer, Integer>, INDArray> {
        public Tuple2<Pair<Integer, Integer>, INDArray> call(
            Map.Entry<Pair<Integer, Integer>, INDArray> pair) {
          return new Tuple2<>(pair.getKey(), pair.getValue());
        }
      }

      // Element-wise sum used to combine updates that share a key
      class Sum implements Function2<INDArray, INDArray, INDArray> {
        public INDArray call(INDArray a, INDArray b) {
          return a.add(b);
        }
      }

      // One training pass: run FirstIterationFunction over the partitions, then
      // count and sum the emitted syn0 updates per key
      JavaPairRDD<Pair<Integer, Integer>, INDArray> indexSyn0UpdateEntryRDD =
          vocabWordListRDD
              .mapPartitions(firstIterationFunction)
              .mapToPair(new MapPairFunction())
              .cache();
      Map<Pair<Integer, Integer>, Object> count = indexSyn0UpdateEntryRDD.countByKey();
      indexSyn0UpdateEntryRDD = indexSyn0UpdateEntryRDD.reduceByKey(new Sum());

      // Collect all the syn0 updates into a list on the driver
      List<Tuple2<Pair<Integer, Integer>, INDArray>> syn0UpdateEntries =
          indexSyn0UpdateEntryRDD.collect();

      // Update syn0: average each key's summed update over its occurrence count
      s0 = new HashMap<>();
      for (Tuple2<Pair<Integer, Integer>, INDArray> syn0UpdateEntry : syn0UpdateEntries) {
        long cc = ((Number) count.get(syn0UpdateEntry._1)).longValue();
        if (cc > 0) {
          INDArray tmp = Nd4j.zeros(1, vectorLength).addi(syn0UpdateEntry._2).divi(cc);
          s0.put(syn0UpdateEntry._1, tmp);
        }
      }
    }

    // Assemble the final syn0 matrix: K stacked blocks of numWords rows each
    syn0 = Nd4j.zeros(vocabCache.numWords() * K, vectorLength);
    for (Map.Entry<Pair<Integer, Integer>, INDArray> ss : s0.entrySet()) {
      if (ss.getKey().getFirst() < vocabCache.numWords()) {
        syn0.getRow(ss.getKey().getSecond() * vocabCache.numWords() + ss.getKey().getFirst())
            .addi(ss.getValue());
      }
    }
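    // For illustration (hypothetical sizes): with numWords = 10000 and K = 3,
    // slot k = 2 of word i = 7 lands in row 2 * 10000 + 7 = 20007.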

    vocab = vocabCache;
    // Normalize each vector by its L2 norm
    syn0.diviRowVector(syn0.norm2(1));

    // Persist the vectors in word2vec text format: "word(k) v1 v2 ... vN" per line
    BufferedWriter writer = new BufferedWriter(new FileWriter(new File(path), false));
    for (int i = 0; i < syn0.rows(); i++) {
      String word = vocab.wordAtIndex(i % vocab.numWords());
      if (word == null) {
        continue;
      }
      // Suffix each word with its slot index k, e.g. "bank(0)", "bank(1)"
      word = word + "(" + i / vocab.numWords() + ")";
      StringBuilder sb = new StringBuilder();
      sb.append(word.replaceAll(" ", "_"));
      sb.append(" ");
      INDArray wordVector = syn0.getRow(i);
      for (int j = 0; j < wordVector.length(); j++) {
        sb.append(wordVector.getDouble(j));
        if (j < wordVector.length() - 1) {
          sb.append(" ");
        }
      }
      sb.append("\n");
      writer.write(sb.toString());
    }
    writer.flush();
    writer.close();
  }
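
A hedged invocation sketch for the method above. The corpus is an RDD with one sentence per line, matching how TextPipeline consumes it; the Word2Vec class name, its no-arg construction, and the input path are assumptions for illustration:

  SparkConf conf = new SparkConf().setAppName("word2vec-train").setMaster("local[*]");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // Fields such as numPartitions, vectorLength, K, iterations, alpha, minAlpha,
  // and the output path are configured on the enclosing (hypothetical) instance
  JavaRDD<String> corpus = sc.textFile("data/sentences.txt");

  Word2Vec w2v = new Word2Vec();
  w2v.train(corpus);
  sc.stop();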