public void draw(Configuration conf) throws IOException {
  List<Pair<Integer, Integer>> list = new ArrayList<Pair<Integer, Integer>>();
  SequenceFileDirIterator<IntWritable, IntWritable> iterator =
      new SequenceFileDirIterator<IntWritable, IntWritable>(
          inputPath, PathType.LIST, MultipleSequenceOutputFormat.FILTER, null, true, conf);
  while (iterator.hasNext()) {
    Pair<IntWritable, IntWritable> writablePair = iterator.next();
    Pair<Integer, Integer> pair =
        new Pair<Integer, Integer>(writablePair.getFirst().get(), writablePair.getSecond().get());
    list.add(pair);
  }
  iterator.close();
  // sort ascending by the first element of each pair
  Collections.sort(list, new Comparator<Pair<Integer, Integer>>() {
    @Override
    public int compare(Pair<Integer, Integer> o1, Pair<Integer, Integer> o2) {
      return o1.getFirst().compareTo(o2.getFirst());
    }
  });
  XYDataset dataSet = createDataSet(list);
  JFreeChart chart = ChartFactory.createXYLineChart(
      title, "", "count", dataSet, PlotOrientation.VERTICAL, true, true, false);
  XYPlot plot = (XYPlot) chart.getPlot();
  NumberAxis axis = (NumberAxis) plot.getRangeAxis();
  axis.setNumberFormatOverride(numberFormat);
  BufferedImage image = chart.createBufferedImage(WIDTH, HEIGHT);
  ImageIO.write(image, FORMAT, new File(imgFile));
}
@Test
public void testRun() throws Exception {
  Path input = getTestTempDirPath("input");
  Path output = getTestTempDirPath("output");
  Path seedsPath = getTestTempDirPath("seeds");
  List<VectorWritable> points = getPointsWritable(REFERENCE);
  List<VectorWritable> seeds = getPointsWritable(SEEDS);
  Configuration conf = new Configuration();
  ClusteringTestUtils.writePointsToFile(points, true, new Path(input, "file1"), fs, conf);
  ClusteringTestUtils.writePointsToFile(seeds, true, new Path(seedsPath, "part-seeds"), fs, conf);
  String[] args = {
    optKey(DefaultOptionCreator.INPUT_OPTION), input.toString(),
    optKey(VectorDistanceSimilarityJob.SEEDS), seedsPath.toString(),
    optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
    optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName()
  };
  ToolRunner.run(new Configuration(), new VectorDistanceSimilarityJob(), args);
  int expect = SEEDS.length * REFERENCE.length;
  DummyOutputCollector<StringTuple, DoubleWritable> collector =
      new DummyOutputCollector<StringTuple, DoubleWritable>();
  for (Pair<StringTuple, DoubleWritable> record :
      new SequenceFileIterable<StringTuple, DoubleWritable>(new Path(output, "part-m-00000"), conf)) {
    collector.collect(record.getFirst(), record.getSecond());
  }
  assertEquals(expect, collector.getData().size());
}
public Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
  Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
  for (Pair<IntWritable, LongWritable> pair :
      new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) {
    documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
  }
  return documentFrequency;
}
public Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
  Map<String, Integer> dictionnary = new HashMap<String, Integer>();
  for (Pair<Text, IntWritable> pair :
      new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) {
    dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
  }
  return dictionnary;
}
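// A minimal usage sketch for the two readers above. It assumes both methods live on the
// same class and that the SequenceFiles were written by Mahout's seq2sparse job; the
// concrete paths and the term "hadoop" are illustrative assumptions, not prescribed values.
public void printTermDf(Configuration conf) {
  Map<String, Integer> dictionary = readDictionnary(conf, new Path("vectors/dictionary.file-0"));
  Map<Integer, Long> df = readDocumentFrequency(conf, new Path("vectors/df-count/part-r-00000"));
  Integer termId = dictionary.get("hadoop");
  if (termId != null) {
    // the document frequency tells us in how many documents the term occurs
    System.out.println("hadoop -> id " + termId + ", df " + df.get(termId));
  }
}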
protected static OpenObjectIntHashMap<String> readIndexFromCache(Configuration conf) throws IOException {
  OpenObjectIntHashMap<String> index = new OpenObjectIntHashMap<String>();
  for (Pair<Writable, IntWritable> entry :
      new SequenceFileIterable<Writable, IntWritable>(cachedFile(conf), conf)) {
    index.put(entry.getFirst().toString(), entry.getSecond().get());
  }
  return index;
}
protected static Map<String, Vector> readScoresFromCache(Configuration conf) throws IOException {
  Map<String, Vector> sumVectors = Maps.newHashMap();
  for (Pair<Text, VectorWritable> entry :
      new SequenceFileDirIterable<Text, VectorWritable>(
          cachedFile(conf), PathType.LIST, PathFilters.partFilter(), conf)) {
    sumVectors.put(entry.getFirst().toString(), entry.getSecond().get());
  }
  return sumVectors;
}
/**
 * Reads the document-frequency list built at the end of the DF count job. This uses constant
 * memory and runs at the speed of your disk read.
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(
    Path featureCountPath, Path dictionaryPathBase, Configuration baseConf, int chunkSizeInMegabytes)
    throws IOException {
  List<Path> chunkPaths = Lists.newArrayList();
  Configuration conf = new Configuration(baseConf);
  FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);
  long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
  int chunkIndex = 0;
  Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
  chunkPaths.add(chunkPath);
  SequenceFile.Writer freqWriter =
      new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
  try {
    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
    for (Pair<IntWritable, LongWritable> record :
        new SequenceFileDirIterable<IntWritable, LongWritable>(
            filesPattern, PathType.GLOB, null, null, true, conf)) {
      if (currentChunkSize > chunkSizeLimit) {
        Closeables.close(freqWriter, false);
        chunkIndex++;
        chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);
        freqWriter =
            new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
        currentChunkSize = 0;
      }
      int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
      currentChunkSize += fieldSize;
      IntWritable key = record.getFirst();
      LongWritable value = record.getSecond();
      if (key.get() >= 0) {
        freqWriter.append(key, value);
      } else if (key.get() == -1) {
        vectorCount = value.get();
      }
      featureCount = Math.max(key.get(), featureCount);
    }
    featureCount++;
    Long[] counts = {featureCount, vectorCount};
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
  } finally {
    Closeables.close(freqWriter, false);
  }
}
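// The chunking pattern above is: append records to the current chunk and, once the accumulated
// byte estimate crosses the limit, close that chunk and start a new one. A standalone sketch of
// the same rollover logic, using plain lists instead of SequenceFiles (illustrative only):
public static List<List<Integer>> chunk(List<Integer> records, long bytesPerRecord, long chunkSizeLimit) {
  List<List<Integer>> chunks = new ArrayList<List<Integer>>();
  List<Integer> current = new ArrayList<Integer>();
  chunks.add(current);
  long currentChunkSize = 0;
  for (Integer record : records) {
    if (currentChunkSize > chunkSizeLimit) {
      current = new ArrayList<Integer>();  // "close" the full chunk and open a new one
      chunks.add(current);
      currentChunkSize = 0;
    }
    currentChunkSize += bytesPerRecord;  // estimated size of this record
    current.add(record);
  }
  return chunks;
}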
/** Test Parallel FPGrowth on retail data using top-level runPFPGrowth() method */
@Test
public void testParallelRetailVs() throws Exception {
  PFPGrowth.runPFPGrowth(paramsImpl1);
  List<Pair<String, TopKStringPatterns>> frequentPatterns1 = PFPGrowth.readFrequentPattern(paramsImpl1);

  Map<Set<String>, Long> results1 = Maps.newHashMap();
  for (Pair<String, TopKStringPatterns> topK : frequentPatterns1) {
    Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
    while (topKIt.hasNext()) {
      Pair<List<String>, Long> entry = topKIt.next();
      results1.put(new HashSet<String>(entry.getFirst()), entry.getSecond());
    }
  }

  PFPGrowth.runPFPGrowth(paramsImpl2);
  List<Pair<String, TopKStringPatterns>> frequentPatterns2 = PFPGrowth.readFrequentPattern(paramsImpl2);

  Map<Set<String>, Long> results2 = Maps.newHashMap();
  for (Pair<String, TopKStringPatterns> topK : frequentPatterns2) {
    Iterator<Pair<List<String>, Long>> topKIt = topK.getSecond().iterator();
    while (topKIt.hasNext()) {
      Pair<List<String>, Long> entry = topKIt.next();
      results2.put(new HashSet<String>(entry.getFirst()), entry.getSecond());
    }
  }

  for (Entry<Set<String>, Long> entry : results1.entrySet()) {
    Set<String> key = entry.getKey();
    if (results2.get(key) == null) {
      System.out.println("spurious (1): " + key + " with " + entry.getValue());
    } else {
      if (!results2.get(key).equals(results1.get(entry.getKey()))) {
        System.out.println("invalid (1): " + key + ", expected: " + results2.get(key)
            + ", got: " + results1.get(entry.getKey()));
      } else {
        System.out.println("matched (1): " + key + ", with: " + results2.get(key));
      }
    }
  }

  for (Entry<Set<String>, Long> entry : results2.entrySet()) {
    Set<String> key = entry.getKey();
    if (results1.get(key) == null) {
      System.out.println("missing (1): " + key + " with " + entry.getValue());
    }
  }

  assertEquals(results2.size(), results1.size());
}
/** Reads a binary mapping file */
public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
  OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
  Path itemIDIndexPath = new Path(idIndexPathStr);
  for (Pair<VarIntWritable, VarLongWritable> record :
      new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
          itemIDIndexPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
    indexIDMap.put(record.getFirst().get(), record.getSecond().get());
  }
  return indexIDMap;
}
static NaiveBayesModel readModelFromTempDir(Path base, Configuration conf) {
  float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f);

  // read feature sums and label sums
  Vector scoresPerLabel = null;
  Vector scoresPerFeature = null;
  for (Pair<Text, VectorWritable> record :
      new SequenceFileDirIterable<Text, VectorWritable>(
          new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) {
    String key = record.getFirst().toString();
    VectorWritable value = record.getSecond();
    if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) {
      scoresPerFeature = value.get();
    } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) {
      scoresPerLabel = value.get();
    }
  }

  Preconditions.checkNotNull(scoresPerFeature);
  Preconditions.checkNotNull(scoresPerLabel);

  Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size());
  for (Pair<IntWritable, VectorWritable> entry :
      new SequenceFileDirIterable<IntWritable, VectorWritable>(
          new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST,
          PathFilters.partFilter(), conf)) {
    scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get());
  }

  Vector perlabelThetaNormalizer = null;
  for (Pair<Text, VectorWritable> entry :
      new SequenceFileDirIterable<Text, VectorWritable>(
          new Path(base, TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), conf)) {
    if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) {
      perlabelThetaNormalizer = entry.getSecond().get();
    }
  }

  Preconditions.checkNotNull(perlabelThetaNormalizer);

  return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel,
      perlabelThetaNormalizer, alphaI);
}
public void loadResults(Path outDirPath, Configuration conf) throws IOException {
  Path finalNumberFile = new Path(outDirPath, "part-r-00000");
  SequenceFileIterator<IntWritable, DoubleWritable> iterator =
      new SequenceFileIterator<IntWritable, DoubleWritable>(finalNumberFile, true, conf);
  try {
    while (iterator.hasNext()) {
      Pair<IntWritable, DoubleWritable> next = iterator.next();
      readIndividualResult(next.getFirst().get(), next.getSecond().get());
    }
  } finally {
    Closeables.close(iterator, false);
  }
}
/** read the rows of a SequenceFile<IntWritable,VectorWritable> into a map from row index to {@link Vector} */
public static OpenIntObjectHashMap<Vector> readMatrixRows(Configuration conf, Path path) {
  boolean readOneRow = false;
  OpenIntObjectHashMap<Vector> rows = new OpenIntObjectHashMap<>();
  for (Pair<IntWritable, VectorWritable> record :
      new SequenceFileIterable<IntWritable, VectorWritable>(path, true, conf)) {
    IntWritable key = record.getFirst();
    readOneRow = true;
    rows.put(key.get(), record.getSecond().get());
  }
  if (!readOneRow) {
    throw new IllegalStateException("Not a single row read!");
  }
  return rows;
}
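// A minimal usage sketch (illustrative only): the SequenceFile path below is an assumption,
// and rows.get(0) simply pulls out the vector stored under key 0, if any.
public static void printRowZero(Configuration conf) {
  OpenIntObjectHashMap<Vector> rows = readMatrixRows(conf, new Path("tmp/matrix/part-r-00000"));
  Vector rowZero = rows.get(0);
  if (rowZero != null) {
    System.out.println("row 0 has " + rowZero.getNumNondefaultElements() + " non-zero entries");
  }
}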
private static void printTopWords(List<Queue<Pair<String, Double>>> topWords, File outputDir)
    throws IOException {
  for (int i = 0; i < topWords.size(); ++i) {
    Collection<Pair<String, Double>> topK = topWords.get(i);
    Writer out = null;
    boolean printingToSystemOut = false;
    try {
      if (outputDir != null) {
        out = new OutputStreamWriter(new FileOutputStream(new File(outputDir, "topic_" + i)),
            Charsets.UTF_8);
      } else {
        out = new OutputStreamWriter(System.out, Charsets.UTF_8);
        printingToSystemOut = true;
        out.write("Topic " + i);
        out.write('\n');
        out.write("===========");
        out.write('\n');
      }
      List<Pair<String, Double>> topKasList = Lists.newArrayListWithCapacity(topK.size());
      for (Pair<String, Double> wordWithScore : topK) {
        topKasList.add(wordWithScore);
      }
      Collections.sort(topKasList, new Comparator<Pair<String, Double>>() {
        @Override
        public int compare(Pair<String, Double> pair1, Pair<String, Double> pair2) {
          return pair2.getSecond().compareTo(pair1.getSecond());
        }
      });
      for (Pair<String, Double> wordWithScore : topKasList) {
        out.write(wordWithScore.getFirst() + " [p(" + wordWithScore.getFirst() + "|topic_" + i
            + ") = " + wordWithScore.getSecond());
        out.write('\n');
      }
    } finally {
      if (!printingToSystemOut) {
        Closeables.closeQuietly(out);
      }
    }
  }
}
@Override
protected void reduce(IntWritable key, Iterable<TransactionTree> values, Context context)
    throws IOException {
  TransactionTree cTree = new TransactionTree();
  for (TransactionTree tr : values) {
    for (Pair<IntArrayList, Long> p : tr) {
      cTree.addPattern(p.getFirst(), p.getSecond());
    }
  }

  List<Pair<Integer, Long>> localFList = Lists.newArrayList();
  for (Entry<Integer, MutableLong> fItem : cTree.generateFList().entrySet()) {
    localFList.add(new Pair<Integer, Long>(fItem.getKey(), fItem.getValue().toLong()));
  }
  Collections.sort(localFList, new CountDescendingPairComparator<Integer, Long>());

  if (useFP2) {
    org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthIds fpGrowth =
        new org.apache.mahout.fpm.pfpgrowth.fpgrowth2.FPGrowthIds();
    fpGrowth.generateTopKFrequentPatterns(
        cTree.iterator(),
        freqList,
        minSupport,
        maxHeapSize,
        PFPGrowth.getGroupMembers(key.get(), maxPerGroup, numFeatures),
        new IntegerStringOutputConverter(
            new ContextWriteOutputCollector<IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
            featureReverseMap),
        new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(context));
  } else {
    FPGrowth<Integer> fpGrowth = new FPGrowth<Integer>();
    fpGrowth.generateTopKFrequentPatterns(
        new IteratorAdapter(cTree.iterator()),
        localFList,
        minSupport,
        maxHeapSize,
        new HashSet<Integer>(PFPGrowth.getGroupMembers(key.get(), maxPerGroup, numFeatures).toList()),
        new IntegerStringOutputConverter(
            new ContextWriteOutputCollector<IntWritable, TransactionTree, Text, TopKStringPatterns>(context),
            featureReverseMap),
        new ContextStatusUpdater<IntWritable, TransactionTree, Text, TopKStringPatterns>(context));
  }
}
/**
 * Parses the resulting recommendations from the output of the reducer.
 *
 * @param outputDir directory containing the output of the Hadoop job
 * @return a map from vector index to its rating value
 * @throws IOException
 */
public static Map<Integer, Double> parseResults(String outputDir, Configuration conf)
    throws IOException {
  Path path = new Path(outputDir);
  Pair<IntWritable, VectorWritable> result =
      Iterables.getOnlyElement(LabUtils.readSequence(path, conf));
  Vector ratingsVector = result.getSecond().get();
  Map<Integer, Double> ratings = new HashMap<Integer, Double>();
  for (Element el : ratingsVector) {
    ratings.put(el.index(), el.get());
  }
  return ratings;
}
private Map<Integer, Long> buildOffsets(Path input, long startIndex) throws IOException {
  Map<Integer, Long> offsets = new HashMap<Integer, Long>();
  SequenceFileDirIterator<IntWritable, LongWritable> iter =
      new SequenceFileDirIterator<IntWritable, LongWritable>(
          new Path(input + "/part*"), PathType.GLOB, null, null, true, new Configuration());
  long cusum = startIndex;
  while (iter.hasNext()) {
    Pair<IntWritable, LongWritable> e = iter.next();
    int partitionId = e.getFirst().get();
    long currentLineNum = e.getSecond().get();
    offsets.put(partitionId, cusum);
    cusum += currentLineNum;
  }
  return offsets;
}
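// The offset logic above is just a running sum over per-partition line counts. A standalone
// sketch of the same computation, with no Hadoop involved (the counts are made up):
private static Map<Integer, Long> buildOffsetsSketch(Map<Integer, Long> lineCounts, long startIndex) {
  long cusum = startIndex;
  Map<Integer, Long> offsets = new LinkedHashMap<Integer, Long>();
  for (Map.Entry<Integer, Long> e : lineCounts.entrySet()) {
    offsets.put(e.getKey(), cusum);  // global index of this partition's first line
    cusum += e.getValue();           // the next partition starts after these lines
  }
  return offsets;
  // e.g. counts {0=100, 1=250, 2=75} and startIndex 0 give offsets {0=0, 1=100, 2=350}
}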
public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf, Path statePath) {
  Map<Integer, List<VectorWritable>> representativePoints = Maps.newHashMap();
  for (Pair<IntWritable, VectorWritable> record :
      new SequenceFileDirIterable<IntWritable, VectorWritable>(
          statePath, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
    int keyValue = record.getFirst().get();
    List<VectorWritable> repPoints = representativePoints.get(keyValue);
    if (repPoints == null) {
      repPoints = Lists.newArrayList();
      representativePoints.put(keyValue, repPoints);
    }
    repPoints.add(record.getSecond());
  }
  return representativePoints;
}
/** read a {@link Matrix} from a SequenceFile<IntWritable,VectorWritable> */
public static Matrix readMatrix(Configuration conf, Path path, int rows, int columns) {
  boolean readOneRow = false;
  Matrix matrix = new DenseMatrix(rows, columns);
  for (Pair<IntWritable, VectorWritable> record :
      new SequenceFileIterable<IntWritable, VectorWritable>(path, true, conf)) {
    IntWritable key = record.getFirst();
    VectorWritable value = record.getSecond();
    readOneRow = true;
    int row = key.get();
    for (Element element : value.get().nonZeroes()) {
      matrix.set(row, element.index(), element.get());
    }
  }
  if (!readOneRow) {
    throw new IllegalStateException("Not a single row read!");
  }
  return matrix;
}
private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
  List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();
  Iterator<Vector.Element> iter = vector.iterateNonZero();
  while (iter.hasNext()) {
    Vector.Element elt = iter.next();
    vectorTerms.add(new TermIndexWeight(elt.index(), elt.get()));
  }

  // Sort results in reverse order (i.e. weight in descending order)
  Collections.sort(vectorTerms, new Comparator<TermIndexWeight>() {
    @Override
    public int compare(TermIndexWeight one, TermIndexWeight two) {
      return Double.compare(two.weight, one.weight);
    }
  });

  Collection<Pair<String, Double>> topTerms = new LinkedList<Pair<String, Double>>();
  for (int i = 0; (i < vectorTerms.size()) && (i < numTerms); i++) {
    int index = vectorTerms.get(i).index;
    String dictTerm = dictionary[index];
    if (dictTerm == null) {
      log.error("Dictionary entry missing for {}", index);
      continue;
    }
    topTerms.add(new Pair<String, Double>(dictTerm, vectorTerms.get(i).weight));
  }

  StringBuilder sb = new StringBuilder(100);
  for (Pair<String, Double> item : topTerms) {
    String term = item.getFirst();
    sb.append("\n\t\t");
    sb.append(StringUtils.rightPad(term, 40));
    sb.append("=>");
    sb.append(StringUtils.leftPad(item.getSecond().toString(), 20));
  }
  return sb.toString();
}
private XYDataset createDataSet(List<Pair<Integer, Integer>> list) {
  DefaultXYDataset dataSet = new DefaultXYDataset();
  int i = 0;
  int j = 0;
  double[][] values = new double[2][2 * list.size()];
  for (Pair<Integer, Integer> pair : list) {
    int count = pair.getFirst();
    int num = pair.getSecond();
    values[0][j] = Double.valueOf(i);
    values[1][j++] = Double.valueOf(count);
    i += num;
    values[0][j] = Double.valueOf(i - 1);
    values[1][j++] = Double.valueOf(count);
  }
  dataSet.addSeries("", values);
  return dataSet;
}
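// Each (count, num) pair above becomes two points with the same y value, so the series renders
// as horizontal steps. A tiny standalone sketch of that expansion (no JFreeChart, made-up values):
public static void main(String[] args) {
  int[][] pairs = {{5, 3}, {2, 4}};  // (count, num): 3 items with count 5, then 4 items with count 2
  int x = 0;
  for (int[] p : pairs) {
    int count = p[0];
    int num = p[1];
    // the segment starts at the current x and ends num - 1 positions later, at the same height
    System.out.println("(" + x + ", " + count + ") -> (" + (x + num - 1) + ", " + count + ")");
    x += num;
  }
  // prints: (0, 5) -> (2, 5) then (3, 2) -> (6, 2)
}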
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Parameters params = new Parameters(context.getConfiguration().get(PFPGrowth.PFP_PARAMETERS, ""));
  for (Pair<String, Long> e : PFPGrowth.readFList(context.getConfiguration())) {
    featureReverseMap.add(e.getFirst());
    freqList.add(e.getSecond());
  }
  maxHeapSize = Integer.valueOf(params.get(PFPGrowth.MAX_HEAPSIZE, "50"));
  minSupport = Integer.valueOf(params.get(PFPGrowth.MIN_SUPPORT, "3"));
  maxPerGroup = params.getInt(PFPGrowth.MAX_PER_GROUP, 0);
  numFeatures = featureReverseMap.size();
  useFP2 = "true".equals(params.get(PFPGrowth.USE_FPG2));
}
/**
 * Processes the output from the output path.<br>
 *
 * @param outputPath directory that contains the output of the job
 * @param keys can be null
 * @param trees can be null
 * @throws java.io.IOException
 */
protected static void processOutput(JobContext job, Path outputPath, TreeID[] keys, Node[] trees,
    int[] nneg, int[] npos) throws IOException {
  Preconditions.checkArgument(keys == null && trees == null || keys != null && trees != null,
      "if keys is null, trees should also be null");
  Preconditions.checkArgument(keys == null || keys.length == trees.length,
      "keys.length != trees.length");

  Configuration conf = job.getConfiguration();
  FileSystem fs = outputPath.getFileSystem(conf);
  Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);

  // read all the outputs
  int index = 0;
  int index_ = 0;
  for (Path path : outfiles) {
    for (Pair<TreeID, MapredOutput> record :
        new SequenceFileIterable<TreeID, MapredOutput>(path, conf)) {
      TreeID key = record.getFirst();
      MapredOutput value = record.getSecond();
      if (keys != null) {
        keys[index] = key;
      }
      if (trees != null) {
        trees[index] = value.getTree();
        nneg[index_] = value.getNneg();
        npos[index_] = value.getNpos();
      }
      index++;
    }
    index_++;
  }

  // make sure we got all the keys/values
  if (keys != null && index != keys.length) {
    throw new IllegalStateException("Some key/values are missing from the output");
  }
}
private static List<Queue<Pair<String, Double>>> topWordsForTopics(
    String dir, Configuration job, List<String> wordList, int numWordsToPrint) {
  List<Queue<Pair<String, Double>>> queues = Lists.newArrayList();
  Map<Integer, Double> expSums = Maps.newHashMap();
  for (Pair<IntPairWritable, DoubleWritable> record :
      new SequenceFileDirIterable<IntPairWritable, DoubleWritable>(
          new Path(dir, "part-*"), PathType.GLOB, null, null, true, job)) {
    IntPairWritable key = record.getFirst();
    int topic = key.getFirst();
    int word = key.getSecond();
    ensureQueueSize(queues, topic);
    if (word >= 0 && topic >= 0) {
      double score = record.getSecond().get();
      if (expSums.get(topic) == null) {
        expSums.put(topic, 0.0);
      }
      expSums.put(topic, expSums.get(topic) + Math.exp(score));
      String realWord = wordList.get(word);
      maybeEnqueue(queues.get(topic), realWord, score, numWordsToPrint);
    }
  }
  for (int i = 0; i < queues.size(); i++) {
    Queue<Pair<String, Double>> queue = queues.get(i);
    Queue<Pair<String, Double>> newQueue = new PriorityQueue<Pair<String, Double>>(queue.size());
    double norm = expSums.get(i);
    for (Pair<String, Double> pair : queue) {
      newQueue.add(new Pair<String, Double>(pair.getFirst(), Math.exp(pair.getSecond()) / norm));
    }
    queues.set(i, newQueue);
  }
  return queues;
}
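// The second loop above turns each topic's per-word log scores into probabilities by
// exponentiating and dividing by the topic's sum of exponentials (a softmax). A standalone
// sketch of just that normalization step:
public static double[] normalizeLogScores(double[] logScores) {
  double norm = 0.0;
  for (double s : logScores) {
    norm += Math.exp(s);  // same role as expSums above
  }
  double[] p = new double[logScores.length];
  for (int i = 0; i < logScores.length; i++) {
    p[i] = Math.exp(logScores[i]) / norm;  // resulting probabilities sum to 1
  }
  return p;
}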
private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
  if (dictionaryPath == null) {
    return null;
  }
  Path dictionaryFile = new Path(dictionaryPath);
  List<Pair<Integer, String>> termList = Lists.newArrayList();
  int maxTermId = 0;
  // key is the word, value is its id
  for (Pair<Writable, IntWritable> record :
      new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
    termList.add(new Pair<Integer, String>(record.getSecond().get(), record.getFirst().toString()));
    maxTermId = Math.max(maxTermId, record.getSecond().get());
  }
  String[] terms = new String[maxTermId + 1];
  for (Pair<Integer, String> pair : termList) {
    terms[pair.getFirst()] = pair.getSecond();
  }
  return terms;
}
private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
  Path vectorPath = new Path(vectorPathString);
  FileSystem fs = vectorPath.getFileSystem(conf);
  List<Path> subPaths = Lists.newArrayList();
  if (fs.isFile(vectorPath)) {
    subPaths.add(vectorPath);
  } else {
    for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
      subPaths.add(fileStatus.getPath());
    }
  }
  List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
  int numRows = Integer.MIN_VALUE;
  int numCols = -1;
  boolean sequentialAccess = false;
  for (Path subPath : subPaths) {
    for (Pair<IntWritable, VectorWritable> record :
        new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
      int id = record.getFirst().get();
      Vector vector = record.getSecond().get();
      if (vector instanceof NamedVector) {
        vector = ((NamedVector) vector).getDelegate();
      }
      if (numCols < 0) {
        numCols = vector.size();
        sequentialAccess = vector.isSequentialAccess();
      }
      rowList.add(Pair.of(id, vector));
      numRows = Math.max(numRows, id);
    }
  }
  numRows++;
  Vector[] rowVectors = new Vector[numRows];
  for (Pair<Integer, Vector> pair : rowList) {
    rowVectors[pair.getFirst()] = pair.getSecond();
  }
  return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
}
@Override
public Pair<List<Integer>, Long> next() {
  Pair<IntArrayList, Long> innerNext = innerIter.next();
  // convert the primitive IntArrayList to a boxed List<Integer>
  return new Pair<List<Integer>, Long>(innerNext.getFirst().toList(), innerNext.getSecond());
}