public Instance pipe(Instance carrier) {
    if (!(carrier.getTarget() instanceof String)) {
      throw new IllegalArgumentException("Target must be of type String");
    }

    String featuresLine = (String) carrier.getTarget();

    String[] features = featuresLine.split(",?\\s+");

    double[] values = new double[features.length];
    Arrays.fill(values, 1.0);

    for (int i = 0; i < features.length; i++) {

      // Support the syntax "FEATURE=0.000342 OTHER_FEATURE=-2.32423"
      //        \

      if (features[i].indexOf("=") != -1) {
        String[] keyValuePair = features[i].split("=");
        features[i] = keyValuePair[0];
        values[i] = Double.parseDouble(keyValuePair[1]);
      }

      // ensure that the feature has a spot in the alphabet
      //        \

      getTargetAlphabet().lookupIndex(features[i], true);
    }

    FeatureVector target = new FeatureVector(getTargetAlphabet(), features, values);

    carrier.setTarget(target);

    return carrier;
  }
Пример #2
0
  /** This is (mostly) copied from CRF4.java */
  public boolean[][] labelConnectionsIn(
      Alphabet outputAlphabet, InstanceList trainingSet, String start) {
    int numLabels = outputAlphabet.size();
    boolean[][] connections = new boolean[numLabels][numLabels];
    for (int i = 0; i < trainingSet.size(); i++) {
      Instance instance = trainingSet.getInstance(i);
      FeatureSequence output = (FeatureSequence) instance.getTarget();
      for (int j = 1; j < output.size(); j++) {
        int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
        int destIndex = outputAlphabet.lookupIndex(output.get(j));
        assert (sourceIndex >= 0 && destIndex >= 0);
        connections[sourceIndex][destIndex] = true;
      }
    }

    // Handle start state
    if (start != null) {
      int startIndex = outputAlphabet.lookupIndex(start);
      for (int j = 0; j < outputAlphabet.size(); j++) {
        connections[startIndex][j] = true;
      }
    }

    return connections;
  }
  public Instance pipe(Instance carrier) {
    Sequence data = (Sequence) carrier.getData();
    Sequence target = (Sequence) carrier.getTarget();

    if (data.size() != target.size())
      throw new IllegalArgumentException(
          "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
              + "data.length = "
              + data.size()
              + ", target.length = "
              + target.size());

    int N = data.size();

    if (data instanceof TokenSequence) {
      throw new UnsupportedOperationException("Not yet implemented.");
    } else if (data instanceof FeatureVectorSequence) {

      FeatureVectorSequence fvs = (FeatureVectorSequence) data;
      Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;

      for (int i = 0; i < N; i++) {
        Object label = target.get(i);
        writer.print(label);

        FeatureVector fv = fvs.getFeatureVector(i);
        for (int loc = 0; loc < fv.numLocations(); loc++) {
          writer.print(' ');
          String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
          double value = fv.valueAtLocation(loc);
          // if (!Maths.almostEquals(value, 1.0)) {
          //    throw new IllegalArgumentException ("Printing to SimpleTagger format: FeatureVector
          // not binary at time slice "+i+" fv:"+fv);
          // }
          writer.print(fname + String.valueOf(value));
        }
        writer.println();
      }
    } else {
      throw new IllegalArgumentException("Don't know how to print data of type " + data);
    }

    writer.println();
    // writer.print(getDataAlphabet());

    return carrier;
  }
Пример #4
0
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   TokenSequence targets =
       carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
   TokenSequence source =
       carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
   StringBuffer sb = new StringBuffer();
   if (prefix != null) sb.append(prefix);
   sb.append("name: " + carrier.getName() + "\n");
   for (int i = 0; i < ts.size(); i++) {
     if (source != null) {
       sb.append(source.get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof TokenSequence) {
       sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof FeatureSequence) {
       sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
       sb.append(' ');
     }
     PropertyList pl = ts.get(i).getFeatures();
     if (pl != null) {
       PropertyList.Iterator iter = pl.iterator();
       while (iter.hasNext()) {
         iter.next();
         double v = iter.getNumericValue();
         if (v == 1.0) sb.append(iter.getKey());
         else sb.append(iter.getKey() + '=' + v);
         sb.append(' ');
       }
     }
     sb.append('\n');
   }
   System.out.print(sb.toString());
   return carrier;
 }
  public void test(
      Transducer transducer,
      InstanceList data,
      String description,
      PrintStream viterbiOutputStream) {
    int[] ntrue = new int[segmentTags.length];
    int[] npred = new int[segmentTags.length];
    int[] ncorr = new int[segmentTags.length];

    LabelAlphabet dict = (LabelAlphabet) transducer.getInputPipe().getTargetAlphabet();

    for (int i = 0; i < data.size(); i++) {
      Instance instance = data.getInstance(i);
      Sequence input = (Sequence) instance.getData();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = transducer.viterbiPath(input).output();
      assert (predOutput.size() == trueOutput.size());

      List trueSegs = new ArrayList();
      List predSegs = new ArrayList();

      addSegs(trueSegs, trueOutput);
      addSegs(predSegs, predOutput);

      //      System.out.println("FieldF1Evaluator instance "+instance.getName ());
      //      printSegs(dict, trueSegs, "True");
      //      printSegs(dict, predSegs, "Pred");

      for (Iterator it = predSegs.iterator(); it.hasNext(); ) {
        Segment seg = (Segment) it.next();
        npred[seg.tag]++;
        if (trueSegs.contains(seg)) {
          ncorr[seg.tag]++;
        }
      }

      for (Iterator it = trueSegs.iterator(); it.hasNext(); ) {
        Segment seg = (Segment) it.next();
        ntrue[seg.tag]++;
      }
    }

    DecimalFormat f = new DecimalFormat("0.####");
    logger.info(description + " per-field F1");
    for (int tag = 0; tag < segmentTags.length; tag++) {
      double precision = ((double) ncorr[tag]) / npred[tag];
      double recall = ((double) ncorr[tag]) / ntrue[tag];
      double f1 = (2 * precision * recall) / (precision + recall);
      Label name = dict.lookupLabel(segmentTags[tag]);
      logger.info(
          " segments "
              + name
              + "  true = "
              + ntrue[tag]
              + "  pred = "
              + npred[tag]
              + "  correct = "
              + ncorr[tag]);
      logger.info(
          " precision="
              + f.format(precision)
              + " recall="
              + f.format(recall)
              + " f1="
              + f.format(f1));
    }
  }