public Instance pipe(Instance carrier) {
    if (!(carrier.getTarget() instanceof String)) {
      throw new IllegalArgumentException("Target must be of type String");
    }

    String featuresLine = (String) carrier.getTarget();

    String[] features = featuresLine.split(",?\\s+");

    double[] values = new double[features.length];
    Arrays.fill(values, 1.0);

    for (int i = 0; i < features.length; i++) {

      // Support the syntax "FEATURE=0.000342 OTHER_FEATURE=-2.32423"
      //        \

      if (features[i].indexOf("=") != -1) {
        String[] keyValuePair = features[i].split("=");
        features[i] = keyValuePair[0];
        values[i] = Double.parseDouble(keyValuePair[1]);
      }

      // ensure that the feature has a spot in the alphabet
      //        \

      getTargetAlphabet().lookupIndex(features[i], true);
    }

    FeatureVector target = new FeatureVector(getTargetAlphabet(), features, values);

    carrier.setTarget(target);

    return carrier;
  }
Пример #2
0
  public static InstanceList copy(InstanceList instances) {
    InstanceList ret = (InstanceList) instances.clone();
    // LabelAlphabet labelDict = (LabelAlphabet) ret.getTargetAlphabet();
    Alphabet featDict = ret.getDataAlphabet();

    for (int i = 0; i < ret.size(); i++) {
      Instance instance = ret.get(i);
      Instance clone = (Instance) instance.clone();
      FeatureVector fv = (FeatureVector) clone.getData();

      int[] indices = fv.getIndices();
      double[] values = fv.getValues();

      int[] newIndices = new int[indices.length];
      System.arraycopy(indices, 0, newIndices, 0, indices.length);

      double[] newValues = new double[indices.length];
      System.arraycopy(values, 0, newValues, 0, indices.length);

      FeatureVector newFv = new FeatureVector(featDict, newIndices, newValues);
      Instance newInstance =
          new Instance(newFv, instance.getTarget(), instance.getName(), instance.getSource());
      ret.set(i, newInstance);
    }

    return ret;
  }
Пример #3
0
  public boolean train(
      InstanceList ilist, InstanceList validation, InstanceList testing, TransducerEvaluator eval) {
    assert (ilist.size() > 0);
    if (emissionEstimator == null) {
      emissionEstimator = new Multinomial.LaplaceEstimator[numStates()];
      transitionEstimator = new Multinomial.LaplaceEstimator[numStates()];
      emissionMultinomial = new Multinomial[numStates()];
      transitionMultinomial = new Multinomial[numStates()];
      Alphabet transitionAlphabet = new Alphabet();
      for (int i = 0; i < numStates(); i++)
        transitionAlphabet.lookupIndex(((State) states.get(i)).getName(), true);
      for (int i = 0; i < numStates(); i++) {
        emissionEstimator[i] = new Multinomial.LaplaceEstimator(inputAlphabet);
        transitionEstimator[i] = new Multinomial.LaplaceEstimator(transitionAlphabet);
        emissionMultinomial[i] =
            new Multinomial(getUniformArray(inputAlphabet.size()), inputAlphabet);
        transitionMultinomial[i] =
            new Multinomial(getUniformArray(transitionAlphabet.size()), transitionAlphabet);
      }
      initialEstimator = new Multinomial.LaplaceEstimator(transitionAlphabet);
    }
    for (Instance instance : ilist) {
      FeatureSequence input = (FeatureSequence) instance.getData();
      FeatureSequence output = (FeatureSequence) instance.getTarget();
      new SumLatticeDefault(this, input, output, new Incrementor());
    }
    initialMultinomial = initialEstimator.estimate();
    for (int i = 0; i < numStates(); i++) {
      emissionMultinomial[i] = emissionEstimator[i].estimate();
      transitionMultinomial[i] = transitionEstimator[i].estimate();
      getState(i).setInitialWeight(initialMultinomial.logProbability(getState(i).getName()));
    }

    return true;
  }
Пример #4
0
  /**
   * converts the sentence based instance list into a token based one This is needed for the
   * ME-version of JET (JetMeClassifier)
   *
   * @param METrainerDummyPipe
   * @param inst just the features for one sentence to be transformed
   * @return
   */
  public static InstanceList convertFeatsforClassifier(
      final Pipe METrainerDummyPipe, final Instance inst) {

    final InstanceList iList = new InstanceList(METrainerDummyPipe);

    final FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData();
    final LabelSequence ls = (LabelSequence) inst.getTarget();
    final LabelAlphabet ldict = (LabelAlphabet) ls.getAlphabet();
    final Object source = inst.getSource();
    final Object name = inst.getName();

    if (ls.size() != fvs.size()) {
      System.err.println(
          "failed making token instances: size of labelsequence != size of featue vector sequence: "
              + ls.size()
              + " - "
              + fvs.size());
      System.exit(-1);
    }

    for (int j = 0; j < fvs.size(); j++) {
      final Instance I =
          new Instance(fvs.getFeatureVector(j), ldict.lookupLabel(ls.get(j)), name, source);
      iList.add(I);
    }

    return iList;
  }
Пример #5
0
  public SVM train(InstanceList trainingList) {
    svm_problem problem = new svm_problem();
    problem.l = trainingList.size();
    problem.x = new svm_node[problem.l][];
    problem.y = new double[problem.l];

    for (int i = 0; i < trainingList.size(); i++) {
      Instance instance = trainingList.get(i);
      svm_node[] input = SVM.getSvmNodes(instance);
      if (input == null) {
        continue;
      }
      int labelIndex = ((Label) instance.getTarget()).getIndex();
      problem.x[i] = input;
      problem.y[i] = labelIndex;
    }

    int max_index = trainingList.getDataAlphabet().size();

    if (param.gamma == 0 && max_index > 0) {
      param.gamma = 1.0 / max_index;
    }

    // int numLabels = trainingList.getTargetAlphabet().size();
    // int[] weight_label = new int[numLabels];
    // double[] weight = trainingList.targetLabelDistribution().getValues();
    // double minValue = Double.MAX_VALUE;
    //
    // for (int i = 0; i < weight.length; i++) {
    // if (minValue > weight[i]) {
    // minValue = weight[i];
    // }
    // }
    //
    // for (int i = 0; i < weight.length; i++) {
    // weight_label[i] = i;
    // weight[i] = weight[i] / minValue;
    // }
    //
    // param.weight_label = weight_label;
    // param.weight = weight;

    String error_msg = svm.svm_check_parameter(problem, param);

    if (error_msg != null) {
      System.err.print("Error: " + error_msg + "\n");
      System.exit(1);
    }

    svm_model model = svm.svm_train(problem, param);

    classifier = new SVM(model, trainingList.getPipe());

    return classifier;
  }
Пример #6
0
 private boolean[][] labelConnectionsIn(InstanceList trainingSet) {
   int numLabels = outputAlphabet.size();
   boolean[][] connections = new boolean[numLabels][numLabels];
   for (Instance instance : trainingSet) {
     FeatureSequence output = (FeatureSequence) instance.getTarget();
     for (int j = 1; j < output.size(); j++) {
       int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
       int destIndex = outputAlphabet.lookupIndex(output.get(j));
       assert (sourceIndex >= 0 && destIndex >= 0);
       connections[sourceIndex][destIndex] = true;
     }
   }
   return connections;
 }
  public Instance pipe(Instance carrier) {
    Sequence data = (Sequence) carrier.getData();
    Sequence target = (Sequence) carrier.getTarget();

    if (data.size() != target.size())
      throw new IllegalArgumentException(
          "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
              + "data.length = "
              + data.size()
              + ", target.length = "
              + target.size());

    int N = data.size();

    if (data instanceof TokenSequence) {
      throw new UnsupportedOperationException("Not yet implemented.");
    } else if (data instanceof FeatureVectorSequence) {

      FeatureVectorSequence fvs = (FeatureVectorSequence) data;
      Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;

      for (int i = 0; i < N; i++) {
        Object label = target.get(i);
        writer.print(label);

        FeatureVector fv = fvs.getFeatureVector(i);
        for (int loc = 0; loc < fv.numLocations(); loc++) {
          writer.print(' ');
          String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
          double value = fv.valueAtLocation(loc);
          // if (!Maths.almostEquals(value, 1.0)) {
          //    throw new IllegalArgumentException ("Printing to SimpleTagger format: FeatureVector
          // not binary at time slice "+i+" fv:"+fv);
          // }
          writer.print(fname + String.valueOf(value));
        }
        writer.println();
      }
    } else {
      throw new IllegalArgumentException("Don't know how to print data of type " + data);
    }

    writer.println();
    // writer.print(getDataAlphabet());

    return carrier;
  }
Пример #8
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   TokenSequence targets =
       carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
   TokenSequence source =
       carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
   StringBuffer sb = new StringBuffer();
   if (prefix != null) sb.append(prefix);
   sb.append("name: " + carrier.getName() + "\n");
   for (int i = 0; i < ts.size(); i++) {
     if (source != null) {
       sb.append(source.get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof TokenSequence) {
       sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof FeatureSequence) {
       sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
       sb.append(' ');
     }
     PropertyList pl = ts.get(i).getFeatures();
     if (pl != null) {
       PropertyList.Iterator iter = pl.iterator();
       while (iter.hasNext()) {
         iter.next();
         double v = iter.getNumericValue();
         if (v == 1.0) sb.append(iter.getKey());
         else sb.append(iter.getKey() + '=' + v);
         sb.append(' ');
       }
     }
     sb.append('\n');
   }
   System.out.print(sb.toString());
   return carrier;
 }