public void draw(Configuration conf) throws IOException {
    List<Pair<Integer, Integer>> list = new ArrayList<Pair<Integer, Integer>>();
    SequenceFileDirIterator<IntWritable, IntWritable> iterator =
        new SequenceFileDirIterator<IntWritable, IntWritable>(
            inputPath, PathType.LIST, MultipleSequenceOutputFormat.FILTER, null, true, conf);
    while (iterator.hasNext()) {
      Pair<IntWritable, IntWritable> writablePair = iterator.next();
      Pair<Integer, Integer> pair =
          new Pair<Integer, Integer>(writablePair.getFirst().get(), writablePair.getSecond().get());
      list.add(pair);
    }
    iterator.close();
    Collections.sort(
        list,
        new Comparator<Pair<Integer, Integer>>() {
          @Override
          public int compare(Pair<Integer, Integer> o1, Pair<Integer, Integer> o2) {
            // Compare by the first element; returning 0 for equal keys honors the Comparator contract.
            return o1.getFirst().compareTo(o2.getFirst());
          }
        });
    XYDataset dataSet = createDataSet(list);
    JFreeChart chart =
        ChartFactory.createXYLineChart(
            title, "", "count", dataSet, PlotOrientation.VERTICAL, true, true, false);
    XYPlot plot = (XYPlot) chart.getPlot();
    NumberAxis axis = (NumberAxis) plot.getRangeAxis();
    axis.setNumberFormatOverride(numberFormat);

    BufferedImage image = chart.createBufferedImage(WIDTH, HEIGHT);
    ImageIO.write(image, FORMAT, new File(imgFile));
  }
 @Test
 public void testRun() throws Exception {
   Path input = getTestTempDirPath("input");
   Path output = getTestTempDirPath("output");
   Path seedsPath = getTestTempDirPath("seeds");
   List<VectorWritable> points = getPointsWritable(REFERENCE);
   List<VectorWritable> seeds = getPointsWritable(SEEDS);
   Configuration conf = new Configuration();
   ClusteringTestUtils.writePointsToFile(points, true, new Path(input, "file1"), fs, conf);
   ClusteringTestUtils.writePointsToFile(seeds, true, new Path(seedsPath, "part-seeds"), fs, conf);
   String[] args = {
     optKey(DefaultOptionCreator.INPUT_OPTION),
     input.toString(),
     optKey(VectorDistanceSimilarityJob.SEEDS),
     seedsPath.toString(),
     optKey(DefaultOptionCreator.OUTPUT_OPTION),
     output.toString(),
     optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
     EuclideanDistanceMeasure.class.getName()
   };
   ToolRunner.run(new Configuration(), new VectorDistanceSimilarityJob(), args);
   int expect = SEEDS.length * REFERENCE.length;
   DummyOutputCollector<StringTuple, DoubleWritable> collector =
       new DummyOutputCollector<StringTuple, DoubleWritable>();
   for (Pair<StringTuple, DoubleWritable> record :
       new SequenceFileIterable<StringTuple, DoubleWritable>(
           new Path(output, "part-m-00000"), conf)) {
     collector.collect(record.getFirst(), record.getSecond());
   }
   assertEquals(expect, collector.getData().size());
 }
 public Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
   Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
   for (Pair<IntWritable, LongWritable> pair :
       new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) {
     documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
   }
   return documentFrequency;
 }
 public Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
   Map<String, Integer> dictionnary = new HashMap<String, Integer>();
   for (Pair<Text, IntWritable> pair :
       new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) {
     dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
   }
   return dictionnary;
 }
Example #5
 protected static OpenObjectIntHashMap<String> readIndexFromCache(Configuration conf)
     throws IOException {
   OpenObjectIntHashMap<String> index = new OpenObjectIntHashMap<String>();
   for (Pair<Writable, IntWritable> entry :
       new SequenceFileIterable<Writable, IntWritable>(cachedFile(conf), conf)) {
     index.put(entry.getFirst().toString(), entry.getSecond().get());
   }
   return index;
 }
Example #6
 protected static Map<String, Vector> readScoresFromCache(Configuration conf) throws IOException {
   Map<String, Vector> sumVectors = Maps.newHashMap();
   for (Pair<Text, VectorWritable> entry :
       new SequenceFileDirIterable<Text, VectorWritable>(
           cachedFile(conf), PathType.LIST, PathFilters.partFilter(), conf)) {
     sumVectors.put(entry.getFirst().toString(), entry.getSecond().get());
   }
   return sumVectors;
 }
Example #7
  /**
   * Read the document frequency list which is built at the end of the DF Count Job. This will use
   * constant memory and will run at the speed of your disk read.
   */
  private static Pair<Long[], List<Path>> createDictionaryChunks(
      Path featureCountPath,
      Path dictionaryPathBase,
      Configuration baseConf,
      int chunkSizeInMegabytes)
      throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter =
        new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);

    try {
      long currentChunkSize = 0;
      long featureCount = 0;
      long vectorCount = Long.MAX_VALUE;
      Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
      for (Pair<IntWritable, LongWritable> record :
          new SequenceFileDirIterable<IntWritable, LongWritable>(
              filesPattern, PathType.GLOB, null, null, true, conf)) {

        if (currentChunkSize > chunkSizeLimit) {
          Closeables.close(freqWriter, false);
          chunkIndex++;

          chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
          chunkPaths.add(chunkPath);

          freqWriter =
              new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
          currentChunkSize = 0;
        }

        int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
        currentChunkSize += fieldSize;
        IntWritable key = record.getFirst();
        LongWritable value = record.getSecond();
        if (key.get() >= 0) {
          freqWriter.append(key, value);
        } else if (key.get() == -1) {
          vectorCount = value.get();
        }
        featureCount = Math.max(key.get(), featureCount);
      }
      featureCount++;
      Long[] counts = {featureCount, vectorCount};
      return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
      Closeables.close(freqWriter, false);
    }
  }
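A minimal usage sketch for the method above (the paths and the 64 MB chunk size are hypothetical; it assumes access to createDictionaryChunks as defined here):
  private static void summarizeChunks(Configuration conf) throws IOException {
    // Hypothetical input/output locations; createDictionaryChunks is the method above.
    Pair<Long[], List<Path>> result =
        createDictionaryChunks(new Path("/tmp/df-count"), new Path("/tmp/frequency"), conf, 64);
    long featureCount = result.getFirst()[0]; // max feature id + 1
    long vectorCount = result.getFirst()[1];  // value stored under the special key -1
    System.out.println(featureCount + " features across " + vectorCount
        + " vectors, written to " + result.getSecond().size() + " frequency chunks");
  }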
  /** Test Parallel FPGrowth on retail data using top-level runPFPGrowth() method */
  @Test
  public void testParallelRetailVs() throws Exception {

    PFPGrowth.runPFPGrowth(paramsImpl1);
    List<Pair<String, TopKStringPatterns>> frequentPatterns1 =
        PFPGrowth.readFrequentPattern(paramsImpl1);

    Map<Set<String>, Long> results1 = Maps.newHashMap();
    for (Pair<String, TopKStringPatterns> topK : frequentPatterns1) {
      Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
      while (topKIt.hasNext()) {
        Pair<List<String>, Long> entry = topKIt.next();
        results1.put(new HashSet<String>(entry.getFirst()), entry.getSecond());
      }
    }

    PFPGrowth.runPFPGrowth(paramsImpl2);
    List<Pair<String, TopKStringPatterns>> frequentPatterns2 =
        PFPGrowth.readFrequentPattern(paramsImpl2);

    Map<Set<String>, Long> results2 = Maps.newHashMap();
    for (Pair<String, TopKStringPatterns> topK : frequentPatterns2) {
      Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
      while (topKIt.hasNext()) {
        Pair<List<String>, Long> entry = topKIt.next();
        results2.put(new HashSet<String>(entry.getFirst()), entry.getSecond());
      }
    }

    for (Entry<Set<String>, Long> entry : results1.entrySet()) {
      Set<String> key = entry.getKey();
      if (results2.get(key) == null) {
        System.out.println("spurious (1): " + key + " with " + entry.getValue());
      } else {
        if (!results2.get(key).equals(results1.get(entry.getKey()))) {
          System.out.println(
              "invalid (1): "
                  + key
                  + ", expected: "
                  + results2.get(key)
                  + ", got: "
                  + results1.get(entry.getKey()));
        } else {
          System.out.println("matched (1): " + key + ", with: " + results2.get(key));
        }
      }
    }

    for (Entry<Set<String>, Long> entry : results2.entrySet()) {
      Set<String> key = entry.getKey();
      if (results1.get(key) == null) {
        System.out.println("missing (1): " + key + " with " + entry.getValue());
      }
    }
    assertEquals(results2.size(), results1.size());
  }
Example #9
 /** Reads a binary mapping file */
 public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
   OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
   Path itemIDIndexPath = new Path(idIndexPathStr);
   for (Pair<VarIntWritable, VarLongWritable> record :
       new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
           itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
     indexIDMap.put(record.getFirst().get(), record.getSecond().get());
   }
   return indexIDMap;
 }
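A minimal usage sketch for readIDIndexMap (the index path and the looked-up index are hypothetical):
  private static long lookupItemID(Configuration conf) {
    OpenIntLongHashMap indexToItemID = readIDIndexMap("/tmp/itemIDIndex", conf); // hypothetical path
    int index = 42; // hypothetical hashed item index
    return indexToItemID.containsKey(index) ? indexToItemID.get(index) : -1L;
  }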
Example #10
  static NaiveBayesModel readModelFromTempDir(Path base, Configuration conf) {

    float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f);

    // read feature sums and label sums
    Vector scoresPerLabel = null;
    Vector scoresPerFeature = null;
    for (Pair<Text, VectorWritable> record :
        new SequenceFileDirIterable<Text, VectorWritable>(
            new Path(base, TrainNaiveBayesJob.WEIGHTS),
            PathType.LIST,
            PathFilters.partFilter(),
            conf)) {
      String key = record.getFirst().toString();
      VectorWritable value = record.getSecond();
      if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) {
        scoresPerFeature = value.get();
      } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) {
        scoresPerLabel = value.get();
      }
    }

    Preconditions.checkNotNull(scoresPerFeature);
    Preconditions.checkNotNull(scoresPerLabel);

    Matrix scoresPerLabelAndFeature =
        new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size());
    for (Pair<IntWritable, VectorWritable> entry :
        new SequenceFileDirIterable<IntWritable, VectorWritable>(
            new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS),
            PathType.LIST,
            PathFilters.partFilter(),
            conf)) {
      scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get());
    }

    Vector perlabelThetaNormalizer = null;
    for (Pair<Text, VectorWritable> entry :
        new SequenceFileDirIterable<Text, VectorWritable>(
            new Path(base, TrainNaiveBayesJob.THETAS),
            PathType.LIST,
            PathFilters.partFilter(),
            conf)) {
      if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) {
        perlabelThetaNormalizer = entry.getSecond().get();
      }
    }

    Preconditions.checkNotNull(perlabelThetaNormalizer);

    return new NaiveBayesModel(
        scoresPerLabelAndFeature,
        scoresPerFeature,
        scoresPerLabel,
        perlabelThetaNormalizer,
        alphaI);
  }
Example #11
 public void loadResults(Path outDirPath, Configuration conf) throws IOException {
   Path finalNumberFile = new Path(outDirPath, "part-r-00000");
   SequenceFileIterator<IntWritable, DoubleWritable> iterator =
       new SequenceFileIterator<IntWritable, DoubleWritable>(finalNumberFile, true, conf);
   try {
     while (iterator.hasNext()) {
       Pair<IntWritable, DoubleWritable> next = iterator.next();
       readIndividualResult(next.getFirst().get(), next.getSecond().get());
     }
   } finally {
     Closeables.close(iterator, false);
   }
 }
Example #12
 /** read the rows of a SequenceFile<IntWritable,VectorWritable> into a map from row index to {@link Vector} */
 public static OpenIntObjectHashMap<Vector> readMatrixRows(Configuration conf, Path path) {
   boolean readOneRow = false;
   OpenIntObjectHashMap<Vector> rows = new OpenIntObjectHashMap<>();
   for (Pair<IntWritable, VectorWritable> record :
       new SequenceFileIterable<IntWritable, VectorWritable>(path, true, conf)) {
     IntWritable key = record.getFirst();
     readOneRow = true;
     rows.put(key.get(), record.getSecond().get());
   }
   if (!readOneRow) {
     throw new IllegalStateException("Not a single row read!");
   }
   return rows;
 }
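A minimal usage sketch for readMatrixRows (the input path is hypothetical):
  private static void printRowNorms(Configuration conf) {
    OpenIntObjectHashMap<Vector> rows = readMatrixRows(conf, new Path("/tmp/matrix-seqfile"));
    for (int rowIndex : rows.keys().toList()) {
      System.out.println(rowIndex + " -> " + rows.get(rowIndex).norm(2));
    }
  }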
Example #13
 private static void printTopWords(List<Queue<Pair<String, Double>>> topWords, File outputDir)
     throws IOException {
   for (int i = 0; i < topWords.size(); ++i) {
     Collection<Pair<String, Double>> topK = topWords.get(i);
     Writer out = null;
     boolean printingToSystemOut = false;
     try {
       if (outputDir != null) {
         out =
             new OutputStreamWriter(
                 new FileOutputStream(new File(outputDir, "topic_" + i)), Charsets.UTF_8);
       } else {
         out = new OutputStreamWriter(System.out, Charsets.UTF_8);
         printingToSystemOut = true;
         out.write("Topic " + i);
         out.write('\n');
         out.write("===========");
         out.write('\n');
       }
       List<Pair<String, Double>> topKasList = Lists.newArrayListWithCapacity(topK.size());
       for (Pair<String, Double> wordWithScore : topK) {
         topKasList.add(wordWithScore);
       }
       Collections.sort(
           topKasList,
           new Comparator<Pair<String, Double>>() {
             @Override
             public int compare(Pair<String, Double> pair1, Pair<String, Double> pair2) {
               return pair2.getSecond().compareTo(pair1.getSecond());
             }
           });
       for (Pair<String, Double> wordWithScore : topKasList) {
         out.write(
             wordWithScore.getFirst()
                 + " [p("
                 + wordWithScore.getFirst()
                 + "|topic_"
                 + i
                 + ") = "
                 + wordWithScore.getSecond());
         out.write('\n');
       }
     } finally {
       if (!printingToSystemOut) {
         Closeables.closeQuietly(out);
       }
     }
   }
 }
  @Override
  protected void reduce(IntWritable key, Iterable<TransactionTree> values, Context context)
      throws IOException {
    TransactionTree cTree = new TransactionTree();
    for (TransactionTree tr : values) {
      for (Pair<IntArrayList, Long> p : tr) {
        cTree.addPattern(p.getFirst(), p.getSecond());
      }
    }

    List<Pair<Integer, Long>> localFList = Lists.newArrayList();
    for (Entry<Integer, MutableLong> fItem : cTree.generateFList().entrySet()) {
      localFList.add(new Pair<Integer, Long>(fItem.getKey(), fItem.getValue().toLong()));
    }

    Collections.sort(localFList, new CountDescendingPairComparator<Integer, Long>());

    if (useFP2) {
      org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthIds fpGrowth =
          new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthIds();
      fpGrowth.generateTopKFrequentPatterns(
          cTree.iterator(),
          freqList,
          minSupport,
          maxHeapSize,
          PFPGrowth.getGroupMembers(key.get(), maxPerGroup, numFeatures),
          new IntegerStringOutputConverter(
              new ContextWriteOutputCollector<
                  IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
              featureReverseMap),
          new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(
              context));
    } else {
      FPGrowth<Integer> fpGrowth = new FPGrowth<Integer>();
      fpGrowth.generateTopKFrequentPatterns(
          new IteratorAdapter(cTree.iterator()),
          localFList,
          minSupport,
          maxHeapSize,
          new HashSet<Integer>(
              PFPGrowth.getGroupMembers(key.get(), maxPerGroup, numFeatures).toList()),
          new IntegerStringOutputConverter(
              new ContextWriteOutputCollector<
                  IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
              featureReverseMap),
          new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(
              context));
    }
  }
Example #15
  /**
   * Parses the resulting recommendations from the output of the reducer.
   *
   * @param outputDir Directory containing the output of the Hadoop job.
   * @return map from item index to predicted rating
   * @throws IOException if the output cannot be read
   */
  public static Map<Integer, Double> parseResults(String outputDir, Configuration conf)
      throws IOException {
    Path path = new Path(outputDir);

    Pair<IntWritable, VectorWritable> result =
        Iterables.getOnlyElement(LabUtils.readSequence(path, conf));
    Vector ratingsVector = result.getSecond().get();

    Map<Integer, Double> ratings = new HashMap<Integer, Double>();
    for (Element el : ratingsVector) {
      ratings.put(el.index(), el.get());
    }

    return ratings;
  }
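A minimal usage sketch for parseResults (the output directory is hypothetical):
  public static void printRecommendations(Configuration conf) throws IOException {
    Map<Integer, Double> ratings = parseResults("/tmp/recommender-output", conf); // hypothetical dir
    for (Map.Entry<Integer, Double> rating : ratings.entrySet()) {
      System.out.println("item " + rating.getKey() + " => " + rating.getValue());
    }
  }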
 private Map<Integer, Long> buildOffsets(Path input, long startIndex) throws IOException {
   Map<Integer, Long> offsets = new HashMap<Integer, Long>();
   SequenceFileDirIterator<IntWritable, LongWritable> iter =
       new SequenceFileDirIterator<IntWritable, LongWritable>(
           new Path(input + "/part*"), PathType.GLOB, null, null, true, new Configuration());
   long cusum = startIndex;
   while (iter.hasNext()) {
     Pair<IntWritable, LongWritable> e = iter.next();
     int partitionId = e.getFirst().get();
     long currentLineNum = e.getSecond().get();
     offsets.put(partitionId, cusum);
     cusum += currentLineNum;
   }
   iter.close();
   return offsets;
 }
 public static Map<Integer, List<VectorWritable>> getRepresentativePoints(
     Configuration conf, Path statePath) {
   Map<Integer, List<VectorWritable>> representativePoints = Maps.newHashMap();
   for (Pair<IntWritable, VectorWritable> record :
       new SequenceFileDirIterable<IntWritable, VectorWritable>(
           statePath, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
     int keyValue = record.getFirst().get();
     List<VectorWritable> repPoints = representativePoints.get(keyValue);
     if (repPoints == null) {
       repPoints = Lists.newArrayList();
       representativePoints.put(keyValue, repPoints);
     }
     repPoints.add(record.getSecond());
   }
   return representativePoints;
 }
Example #18
 /** read a {@link Matrix} from a SequenceFile<IntWritable,VectorWritable> */
 public static Matrix readMatrix(Configuration conf, Path path, int rows, int columns) {
   boolean readOneRow = false;
   Matrix matrix = new DenseMatrix(rows, columns);
   for (Pair<IntWritable, VectorWritable> record :
       new SequenceFileIterable<IntWritable, VectorWritable>(path, true, conf)) {
     IntWritable key = record.getFirst();
     VectorWritable value = record.getSecond();
     readOneRow = true;
     int row = key.get();
     for (Element element : value.get().nonZeroes()) {
       matrix.set(row, element.index(), element.get());
     }
   }
   if (!readOneRow) {
     throw new IllegalStateException("Not a single row read!");
   }
   return matrix;
 }
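A minimal usage sketch for readMatrix (the path and the 100x50 shape are hypothetical):
  private static Vector multiplyByOnes(Configuration conf) {
    Matrix matrix = readMatrix(conf, new Path("/tmp/matrix-seqfile"), 100, 50); // hypothetical
    Vector ones = new DenseVector(50).assign(1.0); // column vector of ones
    return matrix.times(ones);
  }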
Example #19
  private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {

    List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();

    Iterator<Vector.Element> iter = vector.iterateNonZero();
    while (iter.hasNext()) {
      Vector.Element elt = iter.next();
      vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
    }

    // Sort results in reverse order (i.e. weight in descending order)
    Collections.sort(
        vectorTerms,
        new Comparator<TermIndexWeight>() {
          @Override
          public int compare(TermIndexWeight one, TermIndexWeight two) {
            return Double.compare(two.weight, one.weight);
          }
        });

    Collection<Pair<String, Double>> topTerms = new LinkedList<Pair<String, Double>>();

    for (int i = 0; (i < vectorTerms.size()) && (i < numTerms); i++) {
      int index = vectorTerms.get(i).index;
      String dictTerm = dictionary[index];
      if (dictTerm == null) {
        log.error("Dictionary entry missing for {}", index);
        continue;
      }
      topTerms.add(new Pair<String, Double>(dictTerm, vectorTerms.get(i).weight));
    }

    StringBuilder sb = new StringBuilder(100);

    for (Pair<String, Double> item : topTerms) {
      String term = item.getFirst();
      sb.append("\n\t\t");
      sb.append(StringUtils.rightPad(term, 40));
      sb.append("=>");
      sb.append(StringUtils.leftPad(item.getSecond().toString(), 20));
    }
    return sb.toString();
  }
  private XYDataset createDataSet(List<Pair<Integer, Integer>> list) {
    DefaultXYDataset dataSet = new DefaultXYDataset();

    int i = 0;
    int j = 0;
    double[][] values = new double[2][2 * list.size()];
    for (Pair<Integer, Integer> pair : list) {
      int count = pair.getFirst();
      int num = pair.getSecond();
      values[0][j] = Double.valueOf(i);
      values[1][j++] = Double.valueOf(count);
      i += num;
      values[0][j] = Double.valueOf(i - 1);
      values[1][j++] = Double.valueOf(count);
    }
    dataSet.addSeries("", values);

    return dataSet;
  }
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {

    super.setup(context);
    Parameters params =
        new Parameters(context.getConfiguration().get(PFPGrowth.PFP_PARAMETERS, ""));

    for (Pair<String, Long> e : PFPGrowth.readFList(context.getConfiguration())) {
      featureReverseMap.add(e.getFirst());
      freqList.add(e.getSecond());
    }

    maxHeapSize = Integer.valueOf(params.get(PFPGrowth.MAX_HEAPSIZE, "50"));
    minSupport = Integer.valueOf(params.get(PFPGrowth.MIN_SUPPORT, "3"));

    maxPerGroup = params.getInt(PFPGrowth.MAX_PER_GROUP, 0);
    numFeatures = featureReverseMap.size();
    useFP2 = "true".equals(params.get(PFPGrowth.USE_FPG2));
  }
  /**
   * Processes the output from the output path.
   *
   * @param outputPath directory that contains the output of the job
   * @param keys can be null
   * @param trees can be null
   * @throws java.io.IOException if an output file cannot be read
   */
  protected static void processOutput(
      JobContext job, Path outputPath, TreeID[] keys, Node[] trees, int[] nneg, int[] npos)
      throws IOException {
    Preconditions.checkArgument(
        keys == null && trees == null || keys != null && trees != null,
        "if keys is null, trees should also be null");
    Preconditions.checkArgument(
        keys == null || keys.length == trees.length, "keys.length != trees.length");

    Configuration conf = job.getConfiguration();

    FileSystem fs = outputPath.getFileSystem(conf);

    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);

    // read all the outputs
    int index = 0;
    int index_ = 0;
    for (Path path : outfiles) {
      for (Pair<TreeID, MapredOutput> record :
          new SequenceFileIterable<TreeID, MapredOutput>(path, conf)) {
        TreeID key = record.getFirst();
        MapredOutput value = record.getSecond();
        if (keys != null) {
          keys[index] = key;
        }
        if (trees != null) {
          trees[index] = value.getTree();
          nneg[index_] = value.getNneg();
          npos[index_] = value.getNpos();
        }
        index++;
      }
      index_++;
    }

    // make sure we got all the keys/values
    if (keys != null && index != keys.length) {
      throw new IllegalStateException("Some key/values are missing from the output");
    }
  }
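A minimal usage sketch for processOutput (nbTrees is a hypothetical, externally known tree count; the per-partition arrays are simply over-allocated here):
  protected static Node[] readTrees(JobContext job, Path outputPath, int nbTrees) throws IOException {
    TreeID[] keys = new TreeID[nbTrees];
    Node[] trees = new Node[nbTrees];
    int[] nneg = new int[nbTrees]; // sized generously; indexed per output file
    int[] npos = new int[nbTrees];
    processOutput(job, outputPath, keys, trees, nneg, npos);
    return trees;
  }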
Example #23
 private static List<Queue<Pair<String, Double>>> topWordsForTopics(
     String dir, Configuration job, List<String> wordList, int numWordsToPrint) {
   List<Queue<Pair<String, Double>>> queues = Lists.newArrayList();
   Map<Integer, Double> expSums = Maps.newHashMap();
   for (Pair<IntPairWritable, DoubleWritable> record :
       new SequenceFileDirIterable<IntPairWritable, DoubleWritable>(
           new Path(dir, "part-*"), PathType.GLOB, null, null, true, job)) {
     IntPairWritable key = record.getFirst();
     int topic = key.getFirst();
     int word = key.getSecond();
     ensureQueueSize(queues, topic);
     if (word >= 0 && topic >= 0) {
       double score = record.getSecond().get();
       if (expSums.get(topic) == null) {
         expSums.put(topic, 0.0);
       }
       expSums.put(topic, expSums.get(topic) + Math.exp(score));
       String realWord = wordList.get(word);
       maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
     }
   }
   for (int i = 0; i < queues.size(); i++) {
     Queue<Pair<String, Double>> queue = queues.get(i);
     Queue<Pair<String, Double>> newQueue = new PriorityQueue<Pair<String, Double>>(queue.size());
     double norm = expSums.get(i);
     for (Pair<String, Double> pair : queue) {
       newQueue.add(new Pair<String, Double>(pair.getFirst(), Math.exp(pair.getSecond()) / norm));
     }
     queues.set(i, newQueue);
   }
   return queues;
 }
 private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
   if (dictionaryPath == null) {
     return null;
   }
   Path dictionaryFile = new Path(dictionaryPath);
   List<Pair<Integer, String>> termList = Lists.newArrayList();
   int maxTermId = 0;
    // dictionary entries: key is the word, value is its integer id
   for (Pair<Writable, IntWritable> record :
       new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
     termList.add(
         new Pair<Integer, String>(record.getSecond().get(), record.getFirst().toString()));
     maxTermId = Math.max(maxTermId, record.getSecond().get());
   }
   String[] terms = new String[maxTermId + 1];
   for (Pair<Integer, String> pair : termList) {
     terms[pair.getFirst()] = pair.getSecond();
   }
   return terms;
 }
 private static Matrix loadVectors(String vectorPathString, Configuration conf)
     throws IOException {
   Path vectorPath = new Path(vectorPathString);
   FileSystem fs = vectorPath.getFileSystem(conf);
   List<Path> subPaths = Lists.newArrayList();
   if (fs.isFile(vectorPath)) {
     subPaths.add(vectorPath);
   } else {
     for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
       subPaths.add(fileStatus.getPath());
     }
   }
   List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
   int numRows = Integer.MIN_VALUE;
   int numCols = -1;
   boolean sequentialAccess = false;
   for (Path subPath : subPaths) {
     for (Pair<IntWritable, VectorWritable> record :
         new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
       int id = record.getFirst().get();
       Vector vector = record.getSecond().get();
       if (vector instanceof NamedVector) {
         vector = ((NamedVector) vector).getDelegate();
       }
       if (numCols < 0) {
         numCols = vector.size();
         sequentialAccess = vector.isSequentialAccess();
       }
       rowList.add(Pair.of(id, vector));
       numRows = Math.max(numRows, id);
     }
   }
   numRows++;
   Vector[] rowVectors = new Vector[numRows];
   for (Pair<Integer, Vector> pair : rowList) {
     rowVectors[pair.getFirst()] = pair.getSecond();
   }
   return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
 }
 @Override
 public Pair<List<Integer>, Long> next() {
   Pair<IntArrayList, Long> innerNext = innerIter.next();
    return new Pair<List<Integer>, Long>(innerNext.getFirst().toList(), innerNext.getSecond());
 }