コード例 #1
0
 /**
  * Returns the cached tridiagonal matrix, creating it and attempting to fill it from the
  * "norms" and "projections" vectors under {@code baseDir} when it looks unpopulated.
  *
  * <p>A non-positive entry at (0, 1) is taken to mean the matrix has not been loaded yet. An
  * {@link IOException} during loading is logged and the matrix is returned as it currently is.
  *
  * @return the (possibly still empty) {@code desiredRank} x {@code desiredRank} matrix
  */
 @Override
 public Matrix getDiagonalMatrix() {
   if (diagonalMatrix == null) {
     diagonalMatrix = new DenseMatrix(desiredRank, desiredRank);
   }
   // Written as a negated "<=" so the outcome matches the original check for every value.
   if (!(diagonalMatrix.get(0, 1) <= 0)) {
     return diagonalMatrix;
   }
   try {
     Vector norms = fetchVector(new Path(baseDir, "norms"), 0);
     Vector projections = fetchVector(new Path(baseDir, "projections"), 0);
     if (norms == null || projections == null) {
       return diagonalMatrix;
     }
     // The index outlives the loop because the last diagonal entry is written after it.
     int row = 0;
     for (; row < projections.size() - 1; row++) {
       diagonalMatrix.set(row, row, projections.get(row));
       // The same norm fills the super- and sub-diagonal, keeping the matrix symmetric.
       double offDiagonal = norms.get(row);
       diagonalMatrix.set(row, row + 1, offDiagonal);
       diagonalMatrix.set(row + 1, row, offDiagonal);
     }
     diagonalMatrix.set(row, row, projections.get(row));
   } catch (IOException e) {
     log.error("Could not load diagonal matrix of norms and projections: ", e);
   }
   return diagonalMatrix;
 }
 /**
  * Evaluates the density of the wrapped vector as a product of per-component terms.
  *
  * <p>The term for index 0 is always evaluated; one further factor {@code pdf(x, stdDev.get(i))}
  * is multiplied in for every remaining index below {@code x.size()}.
  *
  * @param v writable wrapping the point to evaluate
  * @return the product of the component pdfs
  */
 @Override
 public double pdf(VectorWritable v) {
   Vector x = v.get();
   // TODO (carried over from the original author): is a plain product reasonable/correct?
   double product = pdf(x, stdDev.get(0));
   int component = 1;
   while (component < x.size()) {
     product *= pdf(x, stdDev.get(component));
     component++;
   }
   return product;
 }
コード例 #3
0
  /**
   * Trains the model by repeatedly sweeping over all data points and calling {@code update}
   * whenever the model's classification disagrees with the label.
   *
   * <p>Sweeps continue until one full pass produces no update. A label greater than zero expects
   * a {@code true} classification; a label less than or equal to zero expects {@code false}.
   *
   * @param labelset the labels, one for each data point
   * @param dataset the dataset to train on; each column is treated as a point
   * @throws CardinalityException if the number of labels differs from the number of columns
   * @throws TrainingException if more than 1000 sweeps have completed without convergence
   */
  public void train(Vector labelset, Matrix dataset) throws TrainingException {
    if (labelset.size() != dataset.columnSize()) {
      throw new CardinalityException(labelset.size(), dataset.columnSize());
    }

    for (int iteration = 0; ; iteration++) {
      if (iteration > 1000) {
        throw new TrainingException("Too many iterations needed to find hyperplane.");
      }

      boolean sawMistake = false;
      int columnCount = dataset.columnSize();
      for (int column = 0; column < columnCount; column++) {
        Vector dataPoint = dataset.viewColumn(column);
        log.debug("Training point: {}", dataPoint);

        // Classification and the resulting update happen under the model's monitor.
        synchronized (this.model) {
          boolean prediction = model.classify(dataPoint);
          double label = labelset.get(column);
          boolean misclassified = prediction ? label <= 0 : label > 0;
          if (misclassified) {
            log.debug("updating");
            sawMistake = true;
            update(label, dataPoint, this.model);
          }
        }
      }

      if (!sawMistake) {
        // A clean pass over every point means training has converged.
        return;
      }
    }
  }
コード例 #4
0
  /**
   * Scores every record of {@code inputFile} with the best learner of a stored adaptive logistic
   * regression model and writes {@code id,target,score} rows to {@code outputFile}.
   *
   * <p>Returns without doing anything when {@code parseArgs} rejects the arguments, and prints a
   * message to {@code output} and returns when the loaded model has no best state. The input
   * reader and the output writer are closed on every exit path, including failures during
   * scoring.
   *
   * @param args command line arguments, interpreted by {@code parseArgs}
   * @param output sink for progress and diagnostic messages
   * @throws Exception if the model, the input or the output cannot be read or written
   */
  static void mainToOutput(String[] args, PrintWriter output) throws Exception {
    if (!parseArgs(args)) {
      return;
    }
    AdaptiveLogisticModelParameters lmp =
        AdaptiveLogisticModelParameters.loadFromFile(new File(modelFile));

    CsvRecordFactory csv = lmp.getCsvRecordFactory();
    csv.setIdName(idColumn);

    AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();

    State<Wrapper, CrossFoldLearner> best = lr.getBest();
    if (best == null) {
      output.println("AdaptiveLogisticRegression has not be trained probably.");
      return;
    }
    CrossFoldLearner learner = best.getPayload().getLearner();

    int k = 0;
    // try-with-resources guarantees both streams are released even if scoring throws.
    try (BufferedReader in = TrainAdaptiveLogistic.open(inputFile);
        BufferedWriter out =
            new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), Charsets.UTF_8))) {

      out.write(idColumn + ",target,score");
      out.newLine();

      // The first input line is handed to the record factory; data starts on the next line.
      String line = in.readLine();
      csv.firstLine(line);
      line = in.readLine();
      Map<String, Double> results = new HashMap<String, Double>();
      while (line != null) {
        Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
        csv.processLine(line, v, false);
        Vector scores = learner.classifyFull(v);
        results.clear();
        if (maxScoreOnly) {
          results.put(csv.getTargetLabel(scores.maxValueIndex()), scores.maxValue());
        } else {
          for (int i = 0; i < scores.size(); i++) {
            results.put(csv.getTargetLabel(i), scores.get(i));
          }
        }

        for (Map.Entry<String, Double> entry : results.entrySet()) {
          out.write(csv.getIdString(line) + ',' + entry.getKey() + ',' + entry.getValue());
          out.newLine();
        }
        k++;
        if (k % 100 == 0) {
          output.println(k + " records processed");
        }
        line = in.readLine();
      }
      out.flush();
    }
    output.println(k + " records processed totally.");
  }
コード例 #5
0
  /**
   * Classifies each vector with {@code learningAlgorithm}, optionally trains on it, and
   * optionally appends the prediction to a sequence file.
   *
   * <p>In training mode the list is shuffled in place first, and the running averages
   * {@code averageLL} and {@code averageCorrect} are updated per vector. In both modes the
   * per-label {@code counts} and the record counter {@code k} are updated, and a progress line is
   * printed to standard output whenever {@code k} is a multiple of the current reporting
   * interval.
   *
   * @param vectors pairs of label and named vector to process
   * @param train whether to train on each vector in addition to classifying it
   * @param writer destination for prediction records, or {@code null} to write nothing
   * @throws IOException if appending to {@code writer} fails
   */
  public void processVectors(
      List<Pair<Integer, NamedVector>> vectors, boolean train, SequenceFile.Writer writer)
      throws IOException {

    BytesWritable recordKey = new BytesWritable("".getBytes());
    Text record = new Text();

    if (train) {
      Collections.shuffle(vectors);
    }

    // Placeholders; these are only assigned, and only read, when training.
    double window = -1.0;
    double logLikelihood = -1.0;
    int actualLabel = -1;
    for (Pair<Integer, NamedVector> labelled : vectors) {
      NamedVector vector = labelled.getValue();
      if (train) {
        actualLabel = labelled.getKey();
        // The averaging window grows with the record count but is capped at 200.
        window = Math.min(this.k + 1, 200);
        logLikelihood = learningAlgorithm.logLikelihood(actualLabel, vector);
        this.averageLL = this.averageLL + (logLikelihood - this.averageLL) / window;
      }

      Vector scores = new DenseVector(LABELS);
      learningAlgorithm.classifyFull(scores, vector);
      int predictedLabel = scores.maxValueIndex();
      this.counts[predictedLabel]++;

      if (writer != null) {
        String formatted =
            String.format(
                "%s%c%d%c01%f",
                vector.getName(),
                SEQUENCE_FILE_FIELD_SEPARATOR,
                predictedLabel,
                SEQUENCE_FILE_FIELD_SEPARATOR,
                scores.get(predictedLabel));
        record.set(formatted);
        writer.append(recordKey, record);
      }

      if (train) {
        int hit = (predictedLabel == actualLabel ? 1 : 0);
        this.averageCorrect = this.averageCorrect + (hit - this.averageCorrect) / window;

        learningAlgorithm.train(actualLabel, vector);
        learningAlgorithm.close();
      }
      this.k++;

      // The reporting interval is a bump value times a power of ten, capped at MAX_STEP.
      int bump = this.BUMPS[(int) Math.floor(this.step) % this.BUMPS.length];
      int scale = (int) Math.pow(10, Math.floor(this.step / this.BUMPS.length));
      if (this.k % Math.min(MAX_STEP, bump * scale) == 0) {
        this.step += 0.25;
        if (train) {
          System.out.printf(
              "%10d %10.3f %10.3f %10.2f %d\n",
              this.k, logLikelihood, this.averageLL, this.averageCorrect * 100, predictedLabel);
        } else {
          System.out.printf(
              "%c%10d, per label: %s\n", CR, this.k, Arrays.toString(this.counts));
        }
      }
    }
  }
コード例 #6
0
 /**
  * Renders every element of the wrapped vector as a fixed-width decimal and concatenates the
  * results without separators.
  *
  * <p>Each element is formatted with width 8 and 4 fraction digits. An empty vector yields an
  * empty string.
  *
  * @param vw writable wrapping the vector to format
  * @return the concatenated, formatted elements
  */
 private static String getFormatedOutput(VectorWritable vw) {
   int formatWidth = 8;
   // Build the format once instead of on every iteration.
   String elementFormat = "%" + Integer.toString(formatWidth) + ".4f";
   Vector vector = vw.get();
   // StringBuilder avoids the quadratic copying of repeated String concatenation.
   StringBuilder formatted = new StringBuilder();
   for (int i = 0; i < vector.size(); ++i) {
     formatted.append(String.format(elementFormat, vector.get(i)));
   }
   return formatted.toString();
 }
コード例 #7
0
    /**
     * Looks up the non-zero entry counts and maximum values recorded for the indices of the two
     * occurrences and passes them, together with {@code threshold}, to
     * {@code similarity.consider}.
     *
     * @param occurrenceA first element; only its index is used
     * @param occurrenceB second element; only its index is used
     * @return whatever {@code similarity.consider} returns for the looked-up statistics
     */
    private boolean consider(Vector.Element occurrenceA, Vector.Element occurrenceB) {
      int indexA = occurrenceA.index();
      int indexB = occurrenceB.index();

      int nonZeroCountA = numNonZeroEntries.get(indexA);
      int nonZeroCountB = numNonZeroEntries.get(indexB);

      double largestA = maxValues.get(indexA);
      double largestB = maxValues.get(indexB);

      return similarity.consider(nonZeroCountA, nonZeroCountB, largestA, largestB, threshold);
    }
コード例 #8
0
ファイル: MathHelper.java プロジェクト: ChineseDr/mahout
 /**
  * Checks whether the {@link Vector} is equivalent to the given set of {@link Vector.Element}s.
  *
  * <p>The number of supplied elements must equal {@code numberOfNoNZeroNonNaNElements(vector)},
  * and no element's value may differ from the vector's value at the same index by more than
  * {@code MahoutTestCase.EPSILON}.
  *
  * @param vector the vector under inspection
  * @param elements the expected elements
  * @return {@code true} if both conditions hold, {@code false} otherwise
  */
 public static boolean consistsOf(Vector vector, Vector.Element... elements) {
   boolean sameCount = elements.length == numberOfNoNZeroNonNaNElements(vector);
   if (!sameCount) {
     return false;
   }
   for (int n = 0; n < elements.length; n++) {
     Vector.Element expected = elements[n];
     double deviation = Math.abs(expected.get() - vector.get(expected.index()));
     if (deviation > MahoutTestCase.EPSILON) {
       return false;
     }
   }
   return true;
 }
コード例 #9
0
ファイル: AbstractCluster.java プロジェクト: Earne/HiBench
 /**
  * Return a human-readable formatted string representation of the vector, not intended to be
  * complete nor usable as an input/output representation.
  *
  * <p>A {@link NamedVector} is prefixed with {@code "name = "}. Sparse notation
  * ({@code label:value} or {@code index:value}, zero elements skipped) is used when the vector
  * has fewer non-zero elements than its size or when bindings are supplied; otherwise every
  * element is printed as a bare value. Values are formatted with three fraction digits in the
  * English locale and separated by {@code ", "}.
  *
  * @param v the vector to format
  * @param bindings optional labels indexed by element position; may be {@code null}, may contain
  *     {@code null} entries, and may be shorter than the vector (missing labels fall back to the
  *     numeric index)
  * @return the formatted vector, always terminated by {@code ']'}
  */
 public static String formatVector(Vector v, String[] bindings) {
   StringBuilder buf = new StringBuilder();
   if (v instanceof NamedVector) {
     buf.append(((NamedVector) v).getName()).append(" = ");
   }
   int nzero = 0;
   Iterator<Vector.Element> iterateNonZero = v.iterateNonZero();
   while (iterateNonZero.hasNext()) {
     iterateNonZero.next();
     nzero++;
   }
   // if vector is sparse or if we have bindings, use sparse notation
   boolean sparseNotation = nzero < v.size() || bindings != null;
   buf.append('[');
   boolean wroteElement = false;
   for (int i = 0; i < v.size(); i++) {
     double elem = v.get(i);
     if (sparseNotation) {
       if (elem == 0.0) {
         continue;
       }
       // Fall back to the index when no label exists for this position.
       String label = bindings != null && i < bindings.length ? bindings[i] : null;
       if (label != null) {
         buf.append(label).append(':');
       } else {
         buf.append(i).append(':');
       }
     }
     buf.append(String.format(Locale.ENGLISH, "%.3f", elem)).append(", ");
     wroteElement = true;
   }
   // Only strip the trailing ", " if one was written; checking the buffer length instead would
   // eat part of the "name = [" prefix of a named vector that printed no elements.
   if (wroteElement) {
     buf.setLength(buf.length() - 2);
   }
   buf.append(']');
   return buf.toString();
 }
コード例 #10
0
 /**
  * Returns the cached scale factor, first trying to load it from the "scaleFactor" vector under
  * {@code baseDir} while the cached value is non-positive.
  *
  * <p>An {@link IOException} during loading is logged and the cached value is returned
  * unchanged.
  *
  * @return the current scale factor
  */
 @Override
 public double getScaleFactor() {
   // Written as a negated "<=" so the outcome matches the original check for every value.
   if (!(scaleFactor <= 0)) {
     return scaleFactor;
   }
   try {
     Vector stored = fetchVector(new Path(baseDir, "scaleFactor"), 0);
     boolean hasValue = stored != null && stored.size() > 0;
     if (hasValue) {
       scaleFactor = stored.get(0);
     }
   } catch (IOException e) {
     log.error("could not load scaleFactor:", e);
   }
   return scaleFactor;
 }
コード例 #11
0
 /**
  * Builds a corpus matrix of sampled counts with one row per document.
  *
  * <p>A single topic-count vector is drawn up front by picking {@code numTopicsPerDoc} random
  * topic indices (repeats accumulate). That same vector is then passed to the sampler for every
  * document, and each returned sample increments the matching cell of the document's row.
  *
  * @param matrix model matrix; its row count sizes the topic vector and its column count sizes
  *     the corpus rows
  * @param random source of randomness shared by the sampler and the topic draw
  * @param numDocs number of rows (documents) to generate
  * @param numSamples number of samples requested from the sampler per document
  * @param numTopicsPerDoc number of random topic draws used to build the topic vector
  * @return a {@code numDocs} x {@code matrix.numCols()} matrix of sample counts
  */
 public static Matrix sampledCorpus(
     Matrix matrix, Random random, int numDocs, int numSamples, int numTopicsPerDoc) {
   Matrix corpus = new SparseRowMatrix(numDocs, matrix.numCols());
   LDASampler modelSampler = new LDASampler(matrix, random);

   Vector topicCounts = new DenseVector(matrix.numRows());
   for (int draw = 0; draw < numTopicsPerDoc; draw++) {
     int chosen = random.nextInt(topicCounts.size());
     double previous = topicCounts.get(chosen);
     topicCounts.set(chosen, previous + 1);
   }

   for (int row = 0; row < numDocs; row++) {
     for (int column : modelSampler.sample(topicCounts, numSamples)) {
       double previous = corpus.get(row, column);
       corpus.set(row, column, previous + 1);
     }
   }
   return corpus;
 }
コード例 #12
0
  /**
   * Exercises {@code ContinuousValueEncoder.addToVector} with negative and positive values, with
   * one and two probes, with accumulation into an existing vector, and with non-numeric input.
   */
  @Test
  public void testAddToVector() {
    FeatureVectorEncoder enc = new ContinuousValueEncoder("foo");

    // Negative value with the encoder's initial probe setting.
    Vector negative = new DenseVector(20);
    enc.addToVector("-123", negative);
    Assert.assertEquals(-123, negative.minValue(), 0);
    Assert.assertEquals(0, negative.maxValue(), 0);
    Assert.assertEquals(123, negative.norm(1), 0);

    // Positive value with the encoder's initial probe setting.
    Vector singleProbe = new DenseVector(20);
    enc.addToVector("123", singleProbe);
    Assert.assertEquals(123, singleProbe.maxValue(), 0);
    Assert.assertEquals(0, singleProbe.minValue(), 0);
    Assert.assertEquals(123, singleProbe.norm(1), 0);

    // With two probes the L1 norm is expected to double.
    Vector doubleProbe = new DenseVector(20);
    enc.setProbes(2);
    enc.addToVector("123", doubleProbe);
    Assert.assertEquals(123, doubleProbe.maxValue(), 0);
    Assert.assertEquals(2 * 123, doubleProbe.norm(1), 0);

    // Subtracting the single-probe encoding should leave exactly one value behind.
    Vector probeDifference = doubleProbe.minus(singleProbe);
    Assert.assertEquals(123, probeDifference.maxValue(), 0);
    Assert.assertEquals(123, probeDifference.norm(1), 0);

    // Two encodings made with two probes are expected to differ by 23 in each of two slots.
    Vector hundred = new DenseVector(20);
    enc.setProbes(2);
    enc.addToVector("100", hundred);
    Vector accumulated = doubleProbe.minus(hundred);
    Assert.assertEquals(23, accumulated.maxValue(), 0);
    Assert.assertEquals(2 * 23, accumulated.norm(1), 0);

    // Adding to a non-empty vector accumulates on top of what is already there.
    enc.addToVector("7", accumulated);
    Assert.assertEquals(30, accumulated.maxValue(), 0);
    Assert.assertEquals(2 * 30, accumulated.norm(1), 0);
    Assert.assertEquals(30, accumulated.get(10), 0);
    Assert.assertEquals(30, accumulated.get(18), 0);

    // Non-numeric input must be rejected with the standard parse error message.
    try {
      enc.addToVector("foobar", accumulated);
      Assert.fail("Should have noticed bad numeric format");
    } catch (NumberFormatException e) {
      Assert.assertEquals("For input string: \"foobar\"", e.getMessage());
    }
  }
コード例 #13
0
  /**
   * Feeds every row of {@code RAW} through {@code MatrixDiagonalizeMapper}, reduces the mapper
   * output with {@code MatrixDiagonalizeReducer}, and checks that the single resulting vector
   * holds the row sums.
   */
  @Test
  public void testMatrixDiagonalizeReducer() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(Keys.AFFINITY_DIMENSIONS, RAW_DIMENSIONS);

    // Map phase: a dummy writer captures everything the mapper emits.
    MatrixDiagonalizeMapper mapper = new MatrixDiagonalizeMapper();
    DummyRecordWriter<NullWritable, IntDoublePairWritable> mapOutput = new DummyRecordWriter<>();
    Mapper<IntWritable, VectorWritable, NullWritable, IntDoublePairWritable>.Context mapContext =
        DummyRecordWriter.build(mapper, conf, mapOutput);

    for (int row = 0; row < RAW_DIMENSIONS; row++) {
      RandomAccessSparseVector rowVector = new RandomAccessSparseVector(RAW_DIMENSIONS);
      rowVector.assign(RAW[row]);
      mapper.map(new IntWritable(row), new VectorWritable(rowVector), mapContext);
    }

    // Reduce phase: a single reduce call over everything written under the NullWritable key.
    MatrixDiagonalizeReducer reducer = new MatrixDiagonalizeReducer();
    DummyRecordWriter<NullWritable, VectorWritable> reduceOutput = new DummyRecordWriter<>();
    Reducer<NullWritable, IntDoublePairWritable, NullWritable, VectorWritable>.Context
        reduceContext =
            DummyRecordWriter.build(
                reducer, conf, reduceOutput, NullWritable.class, IntDoublePairWritable.class);

    reducer.reduce(NullWritable.get(), mapOutput.getValue(NullWritable.get()), reduceContext);

    // Exactly one vector is expected, with one row sum per element.
    List<VectorWritable> reduced = reduceOutput.getValue(NullWritable.get());
    assertEquals("Only a single resulting vector", 1, reduced.size());
    Vector sums = reduced.get(0).get();
    for (int row = 0; row < sums.size(); row++) {
      assertEquals("Element sum is correct", rowSum(RAW[row]), sums.get(row), 0.01);
    }
  }
コード例 #14
0
 /**
  * Looks up the theta normalizer stored for a label.
  *
  * @param label index of the label to look up in {@code perlabelThetaNormalizer}
  * @return the value stored for {@code label}
  */
 public double thetaNormalizer(int label) {
   final double normalizer = perlabelThetaNormalizer.get(label);
   return normalizer;
 }