Ejemplo n.º 1
0
  /**
   * Rebuilds a {@code NaiveBayesModel} from the sequence files found under {@code base}.
   *
   * <p>Three sub-directories of {@code base} are read, each restricted by
   * {@code PathFilters.partFilter()}:
   * <ul>
   *   <li>{@code TrainNaiveBayesJob.WEIGHTS} supplies the per-feature and per-label vectors,</li>
   *   <li>{@code TrainNaiveBayesJob.SUMMED_OBSERVATIONS} supplies one matrix row per record,
   *       placed at the row index given by the record key,</li>
   *   <li>{@code TrainNaiveBayesJob.THETAS} supplies the per-label theta normalizer.</li>
   * </ul>
   * When a key of interest occurs more than once, the vector read last is the one used.
   *
   * @param base directory containing the three sub-directories listed above
   * @param conf configuration used to open the sequence files; also provides
   *     {@code ThetaMapper.ALPHA_I} (1.0 when unset)
   * @return the model assembled from the vectors and matrix that were read
   * @throws NullPointerException if the per-feature weights, the per-label weights or the theta
   *     normalizer were not found; the message names the missing key and the path searched
   */
  static NaiveBayesModel readModelFromTempDir(Path base, Configuration conf) {

    float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f);

    // read feature sums and label sums
    Path weightsPath = new Path(base, TrainNaiveBayesJob.WEIGHTS);
    Vector scoresPerLabel = null;
    Vector scoresPerFeature = null;
    for (Pair<Text, VectorWritable> record :
        new SequenceFileDirIterable<Text, VectorWritable>(
            weightsPath, PathType.LIST, PathFilters.partFilter(), conf)) {
      String key = record.getFirst().toString();
      VectorWritable value = record.getSecond();
      if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) {
        scoresPerFeature = value.get();
      } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) {
        scoresPerLabel = value.get();
      }
    }

    // Fail with a message that says which artifact is missing and where it was looked for,
    // instead of a bare NullPointerException.
    Preconditions.checkNotNull(
        scoresPerFeature,
        "No '" + TrainNaiveBayesJob.WEIGHTS_PER_FEATURE + "' vector found under " + weightsPath);
    Preconditions.checkNotNull(
        scoresPerLabel,
        "No '" + TrainNaiveBayesJob.WEIGHTS_PER_LABEL + "' vector found under " + weightsPath);

    // One row per record; the matrix is sized from the two weight vectors read above.
    Matrix scoresPerLabelAndFeature =
        new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size());
    for (Pair<IntWritable, VectorWritable> entry :
        new SequenceFileDirIterable<IntWritable, VectorWritable>(
            new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS),
            PathType.LIST,
            PathFilters.partFilter(),
            conf)) {
      scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get());
    }

    Path thetasPath = new Path(base, TrainNaiveBayesJob.THETAS);
    Vector perlabelThetaNormalizer = null;
    for (Pair<Text, VectorWritable> entry :
        new SequenceFileDirIterable<Text, VectorWritable>(
            thetasPath, PathType.LIST, PathFilters.partFilter(), conf)) {
      if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) {
        perlabelThetaNormalizer = entry.getSecond().get();
      }
    }

    Preconditions.checkNotNull(
        perlabelThetaNormalizer,
        "No '" + TrainNaiveBayesJob.LABEL_THETA_NORMALIZER + "' vector found under " + thetasPath);

    return new NaiveBayesModel(
        scoresPerLabelAndFeature,
        scoresPerFeature,
        scoresPerLabel,
        perlabelThetaNormalizer,
        alphaI);
  }
Ejemplo n.º 2
0
 /**
  * Collects the non-directory entries found directly under an output path.
  *
  * <p>The listing is first restricted by {@code PathFilters.logsCRCFilter()}; directories and
  * entries whose name starts with an underscore are then dropped.
  *
  * @param fs {@code FileSystem} used to list {@code outpath}
  * @param outpath output {@code Path} to list
  * @return the surviving paths, in the order the file system returned them
  * @throws IOException if {@code fs.listStatus} fails
  */
 public static Path[] listOutputFiles(FileSystem fs, Path outpath) throws IOException {
   Collection<Path> kept = Lists.newArrayList();
   FileStatus[] listing = fs.listStatus(outpath, PathFilters.logsCRCFilter());
   for (FileStatus status : listing) {
     if (status.isDir()) {
       continue;
     }
     Path candidate = status.getPath();
     if (candidate.getName().startsWith("_")) {
       continue;
     }
     kept.add(candidate);
   }
   Path[] result = new Path[kept.size()];
   return kept.toArray(result);
 }
Ejemplo n.º 3
0
 /**
  * Loads every (key, vector) record from the location returned by {@code cachedFile(conf)}
  * into a map.
  *
  * <p>Only entries accepted by {@code PathFilters.partFilter()} are read. If the same key
  * appears more than once, the vector read last replaces the earlier one.
  *
  * @param conf configuration used both to locate the cached file and to open it
  * @return a new map from each record key's string form to its vector
  * @throws IOException declared for callers; presumably raised by {@code cachedFile}
  */
 protected static Map<String, Vector> readScoresFromCache(Configuration conf) throws IOException {
   Iterable<Pair<Text, VectorWritable>> cachedRecords =
       new SequenceFileDirIterable<Text, VectorWritable>(
           cachedFile(conf), PathType.LIST, PathFilters.partFilter(), conf);

   Map<String, Vector> scoresByKey = Maps.newHashMap();
   for (Pair<Text, VectorWritable> cached : cachedRecords) {
     String name = cached.getFirst().toString();
     Vector scores = cached.getSecond().get();
     scoresByKey.put(name, scores);
   }
   return scoresByKey;
 }
Ejemplo n.º 4
0
 /**
  * Reads a binary mapping file into an int-to-long hash map.
  *
  * <p>Each record's {@code VarIntWritable} key becomes the map key and its
  * {@code VarLongWritable} value the mapped value. Only entries accepted by
  * {@code PathFilters.partFilter()} are read.
  *
  * @param idIndexPathStr location of the mapping, as a path string
  * @param conf configuration used to open the sequence files
  * @return a new map holding every (index, id) pair that was read
  */
 public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
   Path mappingPath = new Path(idIndexPathStr);
   Iterable<Pair<VarIntWritable, VarLongWritable>> mappings =
       new SequenceFileDirIterable<VarIntWritable, VarLongWritable>(
           mappingPath, PathType.LIST, PathFilters.partFilter(), null, true, conf);

   OpenIntLongHashMap idsByIndex = new OpenIntLongHashMap();
   for (Pair<VarIntWritable, VarLongWritable> mapping : mappings) {
     int index = mapping.getFirst().get();
     long id = mapping.getSecond().get();
     idsByIndex.put(index, id);
   }
   return idsByIndex;
 }
Ejemplo n.º 5
0
  /**
   * Writes a label index to {@code labPath} from the sequence files under
   * {@code <user.dir>/../out/training}.
   *
   * <p>The training directory is resolved relative to the JVM's {@code user.dir} system
   * property, and its entries are restricted by {@code PathFilters.logsCRCFilter()}.
   *
   * @param labPath destination handed to {@code BayesUtils.writeLabelIndex}
   * @return the value returned by {@code BayesUtils.writeLabelIndex}
   * @throws IOException if writing the label index fails
   */
  private long createLabelIndex(Path labPath) throws IOException {
    Path trainingDir = new Path(System.getProperty("user.dir") + "/../out/training");
    Iterable<Pair<Text, IntWritable>> trainingRecords =
        new SequenceFileDirIterable<Text, IntWritable>(
            trainingDir, PathType.LIST, PathFilters.logsCRCFilter(), getConf());
    return BayesUtils.writeLabelIndex(getConf(), labPath, trainingRecords);
  }
 /**
  * Groups the vectors stored under {@code statePath} by their integer record key.
  *
  * <p>Entries are restricted by {@code PathFilters.logsCRCFilter()}. Within each group the
  * vectors keep the order in which they were read.
  *
  * @param conf configuration used to open the sequence files
  * @param statePath location of the records to read
  * @return a new map from each key to the list of {@code VectorWritable}s read for it
  */
 public static Map<Integer, List<VectorWritable>> getRepresentativePoints(
     Configuration conf, Path statePath) {
   Iterable<Pair<IntWritable, VectorWritable>> records =
       new SequenceFileDirIterable<IntWritable, VectorWritable>(
           statePath, PathType.LIST, PathFilters.logsCRCFilter(), conf);

   Map<Integer, List<VectorWritable>> pointsByKey = Maps.newHashMap();
   for (Pair<IntWritable, VectorWritable> rec : records) {
     Integer groupKey = rec.getFirst().get();
     // Start a bucket the first time a key is seen, then append to it.
     if (!pointsByKey.containsKey(groupKey)) {
       pointsByKey.put(groupKey, Lists.<VectorWritable>newArrayList());
     }
     pointsByKey.get(groupKey).add(rec.getSecond());
   }
   return pointsByKey;
 }
 /**
  * Iterates over every row stored under {@code rowPath} as a {@code MatrixSlice}.
  *
  * <p>Files are matched with the glob {@code rowPath/*} and restricted by
  * {@code PathFilters.logsCRCFilter()}. Each record is converted on demand: its vector
  * becomes the slice's vector and its integer key the slice's index.
  *
  * @return an iterator over the converted records
  * @throws IllegalStateException wrapping any {@code IOException} raised while opening the
  *     underlying iterator
  */
 @Override
 public Iterator<MatrixSlice> iterateAll() {
   Function<Pair<IntWritable, VectorWritable>, MatrixSlice> toSlice =
       new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
         @Override
         public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
           return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
         }
       };
   try {
     Iterator<Pair<IntWritable, VectorWritable>> rows =
         new SequenceFileDirIterator<IntWritable, VectorWritable>(
             new Path(rowPath, "*"), PathType.GLOB, PathFilters.logsCRCFilter(), null, true, conf);
     return Iterators.transform(rows, toSlice);
   } catch (IOException ioe) {
     throw new IllegalStateException(ioe);
   }
 }
 /**
  * Loads (row id, vector) records from a file or directory into a {@code SparseRowMatrix}.
  *
  * <p>If {@code vectorPathString} names a file, only that file is read; otherwise every entry
  * of the directory accepted by {@code PathFilters.logsCRCFilter()} is read. The row count is
  * the largest row id seen plus one, and the column count is the size of the first vector
  * read. {@code NamedVector}s are unwrapped to their delegates. Row ids that never occur are
  * left as {@code null} slots in the row array handed to the matrix.
  *
  * @param vectorPathString file or directory holding the sequence file(s)
  * @param conf configuration used to resolve the file system and open the files
  * @return a matrix whose row {@code i} is the vector stored under key {@code i}
  * @throws IOException if the file system cannot be accessed
  * @throws IllegalArgumentException if no vectors are found, or a record has a negative row id
  */
 private static Matrix loadVectors(String vectorPathString, Configuration conf)
     throws IOException {
   Path vectorPath = new Path(vectorPathString);
   FileSystem fs = vectorPath.getFileSystem(conf);
   List<Path> subPaths = Lists.newArrayList();
   if (fs.isFile(vectorPath)) {
     subPaths.add(vectorPath);
   } else {
     for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
       subPaths.add(fileStatus.getPath());
     }
   }
   List<Pair<Integer, Vector>> rowList = Lists.newArrayList();
   int maxRowId = -1;
   int numCols = -1;
   boolean sequentialAccess = false;
   for (Path subPath : subPaths) {
     for (Pair<IntWritable, VectorWritable> record :
         new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
       int id = record.getFirst().get();
       // A negative id cannot index the row array; reject it here with a clear message
       // rather than failing later with an ArrayIndexOutOfBoundsException.
       if (id < 0) {
         throw new IllegalArgumentException("Negative row id " + id + " found in " + subPath);
       }
       Vector vector = record.getSecond().get();
       if (vector instanceof NamedVector) {
         vector = ((NamedVector) vector).getDelegate();
       }
       // The first vector read fixes the column count and the access pattern.
       if (numCols < 0) {
         numCols = vector.size();
         sequentialAccess = vector.isSequentialAccess();
       }
       rowList.add(Pair.of(id, vector));
       maxRowId = Math.max(maxRowId, id);
     }
   }
   // Without this guard an empty input would reach the array allocation with a negative size.
   if (rowList.isEmpty()) {
     throw new IllegalArgumentException("No vectors found under " + vectorPath);
   }
   int numRows = maxRowId + 1;
   Vector[] rowVectors = new Vector[numRows];
   for (Pair<Integer, Vector> pair : rowList) {
     rowVectors[pair.getFirst()] = pair.getSecond();
   }
   // NOTE(review): the last two flags are presumably shallowCopy and randomAccess; confirm
   // against the SparseRowMatrix constructor.
   return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);
 }