private void writeString(final ByteBuf buf, final AsciiString s) {
    final int encodedLength = Huffman.encodedLength(s);
    if (encodedLength < s.length()) {
        Hpack.writeHuffmanString(buf, s, encodedLength);
    } else {
        Hpack.writeRawString(buf, s);
    }
}
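// The method above follows the HPACK rule for string literals: emit the Huffman form only when
// it is strictly shorter than the raw octet form. Below is a minimal sketch of that decision,
// reusing only Huffman.encodedLength from the method above. The helper name is hypothetical,
// AsciiString is assumed to be Netty's io.netty.util.AsciiString, and the 17-octet figure for
// "https://www.example.com" matches the encoded bytes in the decode test below.
private static boolean wouldUseHuffman(final AsciiString s) {
    // e.g. for AsciiString.of("https://www.example.com"): encodedLength == 17, s.length() == 23
    return Huffman.encodedLength(s) < s.length();
}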
@Test
public void testHuffmanDecode() throws Exception {
    final String expected = "https://www.example.com";
    final byte[] encoded = TestUtil.bytes(
        0x9d, 0x29, 0xad, 0x17, 0x18, 0x63, 0xc7, 0x8f,
        0x0b, 0x97, 0xc8, 0xe9, 0xae, 0x82, 0xae, 0x43, 0xd3);
    final ByteBuf in = Unpooled.wrappedBuffer(encoded);
    final ByteBuf out = Unpooled.buffer();
    Huffman.decode(in, out);
    final String decoded = out.toString(US_ASCII);
    assertThat(decoded, is(expected));
}
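// Companion sketch, not part of the original suite: the input above is the Huffman-encoded form
// of "https://www.example.com" from RFC 7541 Appendix C (17 octets), so encodedLength should
// report 17 for the same string. The test name is hypothetical; it assumes only
// Huffman.encodedLength, which writeString above already relies on.
@Test
public void testHuffmanEncodedLengthMatchesRfcExample() throws Exception {
    assertThat(Huffman.encodedLength(AsciiString.of("https://www.example.com")), is(17));
}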
// Train word2vec on the given corpus.
public void train(JavaRDD<String> corpusRDD) throws Exception {
    log.info("Start training ...");

    // SparkContext
    final JavaSparkContext sc = new JavaSparkContext(corpusRDD.context());

    // Pre-defined variables
    Map<String, Object> tokenizerVarMap = getTokenizerVarMap();
    Map<String, Object> word2vecVarMap = getWord2vecVarMap();

    // Variables to be filled in during train
    // final JavaRDD<AtomicLong> sentenceWordsCountRDD;
    final JavaRDD<List<VocabWord>> vocabWordListRDD;
    // final JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD;
    final VocabCache vocabCache;
    final JavaRDD<Long> sentenceCumSumCountRDD;

    // Start Training //
    //////////////////////////////////////
    log.info("Tokenization and building VocabCache ...");
    // Process every sentence and build a VocabCache, which gets fed into a LookupCache
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(tokenizerVarMap);
    TextPipeline pipeline = new TextPipeline(corpusRDD.repartition(numPartitions), broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    // Get the total word count and put it into the word2vec variable map
    word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount() / numPartitions);

    // Two RDDs: (vocab word list) and (sentence count). Already cached.
    // sentenceWordsCountRDD = pipeline.getSentenceCountRDD();
    vocabWordListRDD = pipeline.getVocabWordListRDD();

    // Get the vocabCache and the broadcast vocabCache
    Broadcast<VocabCache> vocabCacheBroadcast = pipeline.getBroadCastVocabCache();
    vocabCache = vocabCacheBroadcast.getValue();

    //////////////////////////////////////
    log.info("Building Huffman Tree ...");
    // Building the Huffman tree updates the code and point of each VocabWord in the vocabCache
    Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();

    /////////////////////////////////////
    log.info("Training word2vec sentences ...");

    word2vecVarMap.put("vecNum", vocabCache.numWords());

    // s0 holds the syn0 vectors keyed by (word index, k), plus zero-initialized rows
    // for the Huffman inner nodes (indices numWords .. 2 * numWords - 2)
    // Map<Tuple2<Integer,Integer>, INDArray> s0 = new HashMap();
    Map<Pair<Integer, Integer>, INDArray> s0 = new HashMap<>();
    for (int k = 0; k < K; k++) {
        for (int i = 0; i < vocabCache.numWords(); i++) {
            s0.put(new Pair<>(i, k), getRandomSyn0Vec(vectorLength));
        }
    }
    for (int i = vocabCache.numWords(); i < vocabCache.numWords() * 2 - 1; i++) {
        s0.put(new Pair<>(i, 0), Nd4j.zeros(1, vectorLength));
    }

    for (int i = 0; i < iterations; i++) {
        System.out.println("iteration: " + i);

        // Linearly decay the learning rate across iterations
        word2vecVarMap.put("alpha", alpha - (alpha - minAlpha) / iterations * i);
        word2vecVarMap.put("minAlpha", alpha - (alpha - minAlpha) / iterations * (i + 1));

        FlatMapFunction firstIterationFunction =
                new FirstIterationFunction(word2vecVarMap, expTable, sc.broadcast(s0));

        class MapPairFunction implements PairFunction<Map.Entry<Integer, INDArray>, Integer, INDArray> {
            public Tuple2<Integer, INDArray> call(Map.Entry<Integer, INDArray> pair) {
                return new Tuple2<>(pair.getKey(), pair.getValue());
            }
        }

        class Sum implements Function2<INDArray, INDArray, INDArray> {
            public INDArray call(INDArray a, INDArray b) {
                return a.add(b);
            }
        }

        // @SuppressWarnings("unchecked")
        JavaPairRDD<Pair<Integer, Integer>, INDArray> indexSyn0UpdateEntryRDD = vocabWordListRDD
                .mapPartitions(firstIterationFunction)
                .mapToPair(new MapPairFunction())
                .cache();
        Map<Pair<Integer, Integer>, Object> count = indexSyn0UpdateEntryRDD.countByKey();
        indexSyn0UpdateEntryRDD = indexSyn0UpdateEntryRDD.reduceByKey(new Sum());

        // Collect all the syn0 updates into a list on the driver
        List<Tuple2<Pair<Integer, Integer>, INDArray>> syn0UpdateEntries = indexSyn0UpdateEntryRDD.collect();

        // Update syn0: average each summed update by the number of contributions
        s0 = new HashMap<>();
        for (Tuple2<Pair<Integer, Integer>, INDArray> syn0UpdateEntry : syn0UpdateEntries) {
            int cc = Integer.parseInt(count.get(syn0UpdateEntry._1).toString());
            // int cc = 1;
            if (cc > 0) {
                INDArray tmp = Nd4j.zeros(1, vectorLength).addi(syn0UpdateEntry._2).divi(cc);
                s0.put(syn0UpdateEntry._1, tmp);
            }
        }
    }

    // Assemble the final syn0 matrix: K blocks of numWords rows each
    syn0 = Nd4j.zeros(vocabCache.numWords() * K, vectorLength);
    for (Map.Entry<Pair<Integer, Integer>, INDArray> ss : s0.entrySet()) {
        if (ss.getKey().getFirst() < vocabCache.numWords()) {
            syn0.getRow(ss.getKey().getSecond() * vocabCache.numWords() + ss.getKey().getFirst())
                    .addi(ss.getValue());
        }
    }

    vocab = vocabCache;
    syn0.diviRowVector(syn0.norm2(1));

    // Write the vectors out as "word(k) v1 v2 ... vn", one row per (word, k)
    BufferedWriter write = new BufferedWriter(new FileWriter(new File(path), false));
    for (int i = 0; i < syn0.rows(); i++) {
        String word = vocab.wordAtIndex(i % vocab.numWords());
        if (word == null) {
            continue;
        }
        word = word + "(" + i / vocab.numWords() + ")";
        StringBuilder sb = new StringBuilder();
        sb.append(word.replaceAll(" ", "_"));
        sb.append(" ");
        INDArray wordVector = syn0.getRow(i);
        for (int j = 0; j < wordVector.length(); j++) {
            sb.append(wordVector.getDouble(j));
            if (j < wordVector.length() - 1) {
                sb.append(" ");
            }
        }
        sb.append("\n");
        write.write(sb.toString());
    }
    write.flush();
    write.close();
}
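// Driver-side usage sketch, not from the original source: shows how train(...) above might be
// invoked. The enclosing class name (SparkWord2Vec), its no-arg constructor, and the corpus path
// are assumptions for illustration; the configuration fields train() relies on (numPartitions,
// iterations, vectorLength, K, alpha, minAlpha, path) are expected to be set by that class.
// Only the Spark calls themselves (SparkConf, JavaSparkContext, textFile) are standard API.
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("spark-word2vec").setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // One sentence per line; train() repartitions and tokenizes it via TextPipeline
    JavaRDD<String> corpus = sc.textFile("data/sentences.txt");

    SparkWord2Vec w2v = new SparkWord2Vec(); // hypothetical enclosing class
    w2v.train(corpus);                       // runs the pipeline above and writes vectors to `path`

    sc.stop();
}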