static NaiveBayesModel readModelFromTempDir(Path base, Configuration conf) { float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f); // read feature sums and label sums Vector scoresPerLabel = null; Vector scoresPerFeature = null; for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>( new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) { String key = record.getFirst().toString(); VectorWritable value = record.getSecond(); if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) { scoresPerFeature = value.get(); } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) { scoresPerLabel = value.get(); } } Preconditions.checkNotNull(scoresPerFeature); Preconditions.checkNotNull(scoresPerLabel); Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size()); for (Pair<IntWritable, VectorWritable> entry : new SequenceFileDirIterable<IntWritable, VectorWritable>( new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(), conf)) { scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get()); } Vector perlabelThetaNormalizer = null; for (Pair<Text, VectorWritable> entry : new SequenceFileDirIterable<Text, VectorWritable>( new Path(base, TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), conf)) { if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) { perlabelThetaNormalizer = entry.getSecond().get(); } } Preconditions.checkNotNull(perlabelThetaNormalizer); return new NaiveBayesModel( scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel, perlabelThetaNormalizer, alphaI); }
/**
 * Collects the plain files located directly under an output {@code Path}.
 *
 * <p>The directory is listed through {@code PathFilters.logsCRCFilter()}; directories and
 * entries whose name starts with an underscore are then left out.
 *
 * @param fs {@code FileSystem} used for the listing
 * @param outpath output {@code Path} to list
 * @return the remaining paths as an array, in listing order
 * @throws IOException if the listing fails
 */
public static Path[] listOutputFiles(FileSystem fs, Path outpath) throws IOException {
  Collection<Path> files = Lists.newArrayList();
  FileStatus[] listing = fs.listStatus(outpath, PathFilters.logsCRCFilter());
  for (FileStatus status : listing) {
    // Skip sub-directories and underscore-prefixed entries.
    if (status.isDir() || status.getPath().getName().startsWith("_")) {
      continue;
    }
    files.add(status.getPath());
  }
  Path[] result = new Path[files.size()];
  return files.toArray(result);
}
/**
 * Loads every key/vector pair from the location returned by {@code cachedFile(conf)} into a map.
 *
 * <p>Only entries accepted by {@code PathFilters.partFilter()} are read. A key that occurs
 * more than once keeps the vector of the last record read.
 *
 * @param conf configuration used to locate and read the cached file(s)
 * @return map from the record key (as a string) to its vector
 * @throws IOException declared for callers; presumably raised while resolving the cached file
 */
protected static Map<String, Vector> readScoresFromCache(Configuration conf) throws IOException {
  Map<String, Vector> scoresByKey = Maps.newHashMap();
  Iterable<Pair<Text, VectorWritable>> cachedRecords =
      new SequenceFileDirIterable<Text, VectorWritable>(
          cachedFile(conf), PathType.LIST, PathFilters.partFilter(), conf);
  for (Pair<Text, VectorWritable> cached : cachedRecords) {
    String name = cached.getFirst().toString();
    Vector scores = cached.getSecond().get();
    scoresByKey.put(name, scores);
  }
  return scoresByKey;
}
/**
 * Reads a binary mapping file into an int-to-long hash map.
 *
 * <p>Each record's {@code VarIntWritable} key is mapped to its {@code VarLongWritable} value.
 * Only entries accepted by {@code PathFilters.partFilter()} are read.
 *
 * @param idIndexPathStr location of the mapping, as a path string
 * @param conf configuration used to read the sequence file(s)
 * @return the populated map; empty when no records were found
 */
public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
  Path mappingPath = new Path(idIndexPathStr);
  // NOTE(review): the null/true arguments look like "no ordering" and "reuse key/value
  // instances" - confirm against the SequenceFileDirIterable constructor.
  Iterable<Pair<VarIntWritable, VarLongWritable>> mappingRecords =
      new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
          mappingPath, PathType.LIST, PathFilters.partFilter(), null, true, conf);

  OpenIntLongHashMap idsByIndex = new OpenIntLongHashMap();
  for (Pair<VarIntWritable, VarLongWritable> mapping : mappingRecords) {
    int index = mapping.getFirst().get();
    long id = mapping.getSecond().get();
    idsByIndex.put(index, id);
  }
  return idsByIndex;
}
/**
 * Writes a label index to {@code labPath} from the training records and reports the value
 * returned by {@code BayesUtils.writeLabelIndex}.
 *
 * <p>The training records are read from {@code <user.dir>/../out/training}, i.e. a location
 * fixed relative to the JVM's working directory, listed through
 * {@code PathFilters.logsCRCFilter()}.
 *
 * @param labPath destination of the label index
 * @return the result of {@code BayesUtils.writeLabelIndex} (presumably the number of labels)
 * @throws IOException if writing the label index fails
 */
private long createLabelIndex(Path labPath) throws IOException {
  String workingDir = System.getProperty("user.dir");
  Path trainingPath = new Path(workingDir + "/../out/training");
  Iterable<Pair<Text, IntWritable>> trainingRecords =
      new SequenceFileDirIterable<Text, IntWritable>(
          trainingPath, PathType.LIST, PathFilters.logsCRCFilter(), getConf());
  return BayesUtils.writeLabelIndex(getConf(), labPath, trainingRecords);
}
/**
 * Groups the vectors stored under {@code statePath} by their integer record key.
 *
 * <p>Entries are listed through {@code PathFilters.logsCRCFilter()}. Within each group the
 * vectors keep the order in which they were read.
 *
 * @param conf configuration used to read the sequence file(s)
 * @param statePath location of the stored points
 * @return map from record key to the list of {@code VectorWritable}s carrying that key
 */
public static Map<Integer, List<VectorWritable>> getRepresentativePoints(
    Configuration conf, Path statePath) {
  Iterable<Pair<IntWritable, VectorWritable>> storedPoints =
      new SequenceFileDirIterable<IntWritable, VectorWritable>(
          statePath, PathType.LIST, PathFilters.logsCRCFilter(), conf);

  Map<Integer, List<VectorWritable>> pointsByKey = Maps.newHashMap();
  for (Pair<IntWritable, VectorWritable> stored : storedPoints) {
    int groupKey = stored.getFirst().get();
    List<VectorWritable> group = pointsByKey.get(groupKey);
    // First point seen for this key: start a new group.
    if (group == null) {
      group = Lists.newArrayList();
      pointsByKey.put(groupKey, group);
    }
    group.add(stored.getSecond());
  }
  return pointsByKey;
}
/**
 * Iterates over every stored row as a {@code MatrixSlice}.
 *
 * <p>Rows are read from everything matching {@code rowPath/*} (glob), filtered through
 * {@code PathFilters.logsCRCFilter()}. Each record's key becomes the slice index and its
 * vector the slice contents.
 *
 * @return an iterator over the slices
 * @throws IllegalStateException wrapping any {@code IOException} raised while opening the rows
 */
@Override
public Iterator<MatrixSlice> iterateAll() {
  // Converts one (row index, row vector) record into a MatrixSlice.
  Function<Pair<IntWritable, VectorWritable>, MatrixSlice> toSlice =
      new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
        @Override
        public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
          return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
        }
      };
  try {
    // NOTE(review): the null/true arguments look like "no ordering" and "reuse key/value
    // instances" - confirm against the SequenceFileDirIterator constructor.
    Iterator<Pair<IntWritable, VectorWritable>> rowRecords =
        new SequenceFileDirIterator<IntWritable, VectorWritable>(
            new Path(rowPath, "*"), PathType.GLOB, PathFilters.logsCRCFilter(), null, true, conf);
    return Iterators.transform(rowRecords, toSlice);
  } catch (IOException ioe) {
    throw new IllegalStateException(ioe);
  }
}
/**
 * Loads row vectors from a sequence file, or from every file in a directory, into a
 * {@code SparseRowMatrix}.
 *
 * <p>If {@code vectorPathString} names a file, only that file is read; otherwise the path is
 * listed through {@code PathFilters.logsCRCFilter()} and every listed entry is read. Each
 * record's integer key is used as the row index. {@code NamedVector}s are unwrapped to their
 * delegate. The column count and the sequential-access flag are taken from the first vector
 * read. The row count is the largest row index plus one; rows with no record stay {@code null}.
 *
 * @param vectorPathString file or directory containing {@code IntWritable}/{@code VectorWritable}
 *        records
 * @param conf configuration used to resolve the file system and read the records
 * @return matrix holding the vectors that were read
 * @throws IOException if the path cannot be listed or read
 * @throws IllegalStateException if no vectors are found or a record has a negative row index
 */
private static Matrix loadVectors(String vectorPathString, Configuration conf)
  throws IOException {
  Path vectorPath = new Path(vectorPathString);
  FileSystem fs = vectorPath.getFileSystem(conf);

  // A single file is read as-is; a directory is expanded to its listed entries.
  List<Path> subPaths = Lists.newArrayList();
  if (fs.isFile(vectorPath)) {
    subPaths.add(vectorPath);
  } else {
    for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
      subPaths.add(fileStatus.getPath());
    }
  }

  List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
  int maxRowId = -1;
  int numCols = -1;
  boolean sequentialAccess = false;
  for (Path subPath : subPaths) {
    for (Pair<IntWritable, VectorWritable> record
        : new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
      int id = record.getFirst().get();
      // A negative key cannot index the row array; fail with a readable message instead of
      // an ArrayIndexOutOfBoundsException / NegativeArraySizeException further down.
      if (id < 0) {
        throw new IllegalStateException(
            "Negative row index " + id + " found in " + subPath);
      }
      Vector vector = record.getSecond().get();
      if (vector instanceof NamedVector) {
        vector = ((NamedVector) vector).getDelegate();
      }
      // The first vector determines the matrix width and access pattern.
      if (numCols < 0) {
        numCols = vector.size();
        sequentialAccess = vector.isSequentialAccess();
      }
      rowList.add(Pair.of(id, vector));
      maxRowId = Math.max(maxRowId, id);
    }
  }

  // Without any record there is neither a row count nor a column count to build a matrix from.
  if (rowList.isEmpty()) {
    throw new IllegalStateException("No vectors found in " + vectorPath);
  }

  int numRows = maxRowId + 1;
  Vector[] rowVectors = new Vector[numRows];
  for (Pair<Integer, Vector> pair : rowList) {
    rowVectors[pair.getFirst()] = pair.getSecond();
  }
  // NOTE(review): the trailing flags look like "shallow copy" and "random access" - confirm
  // against the SparseRowMatrix constructor.
  return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
}