Example #1
  public boolean train(
      InstanceList ilist, InstanceList validation, InstanceList testing, TransducerEvaluator eval) {
    assert (ilist.size() > 0);
    if (emissionEstimator == null) {
      emissionEstimator = new Multinomial.LaplaceEstimator[numStates()];
      transitionEstimator = new Multinomial.LaplaceEstimator[numStates()];
      emissionMultinomial = new Multinomial[numStates()];
      transitionMultinomial = new Multinomial[numStates()];
      Alphabet transitionAlphabet = new Alphabet();
      for (int i = 0; i < numStates(); i++)
        transitionAlphabet.lookupIndex(((State) states.get(i)).getName(), true);
      for (int i = 0; i < numStates(); i++) {
        emissionEstimator[i] = new Multinomial.LaplaceEstimator(inputAlphabet);
        transitionEstimator[i] = new Multinomial.LaplaceEstimator(transitionAlphabet);
        emissionMultinomial[i] =
            new Multinomial(getUniformArray(inputAlphabet.size()), inputAlphabet);
        transitionMultinomial[i] =
            new Multinomial(getUniformArray(transitionAlphabet.size()), transitionAlphabet);
      }
      initialEstimator = new Multinomial.LaplaceEstimator(transitionAlphabet);
    }
    for (Instance instance : ilist) {
      FeatureSequence input = (FeatureSequence) instance.getData();
      FeatureSequence output = (FeatureSequence) instance.getTarget();
      new SumLatticeDefault(this, input, output, new Incrementor());
    }
    initialMultinomial = initialEstimator.estimate();
    for (int i = 0; i < numStates(); i++) {
      emissionMultinomial[i] = emissionEstimator[i].estimate();
      transitionMultinomial[i] = transitionEstimator[i].estimate();
      getState(i).setInitialWeight(initialMultinomial.logProbability(getState(i).getName()));
    }

    return true;
  }
  public Instance pipe(Instance carrier) {
    if (!(carrier.getTarget() instanceof String)) {
      throw new IllegalArgumentException("Target must be of type String");
    }

    String featuresLine = (String) carrier.getTarget();

    String[] features = featuresLine.split(",?\\s+");

    double[] values = new double[features.length];
    Arrays.fill(values, 1.0);

    for (int i = 0; i < features.length; i++) {

      // Support the syntax "FEATURE=0.000342 OTHER_FEATURE=-2.32423"

      if (features[i].indexOf("=") != -1) {
        String[] keyValuePair = features[i].split("=");
        features[i] = keyValuePair[0];
        values[i] = Double.parseDouble(keyValuePair[1]);
      }

      // ensure that the feature has a spot in the alphabet

      getTargetAlphabet().lookupIndex(features[i], true);
    }

    FeatureVector target = new FeatureVector(getTargetAlphabet(), features, values);

    carrier.setTarget(target);

    return carrier;
  }
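
The target-parsing logic above accepts plain feature names as well as FEATURE=value pairs. A minimal, self-contained sketch of what the split and the "=" handling produce, using a made-up input line:

  String featuresLine = "COLOR_RED SIZE=0.5, SHAPE_ROUND";   // hypothetical target string
  String[] features = featuresLine.split(",?\\s+");          // {"COLOR_RED", "SIZE=0.5", "SHAPE_ROUND"}
  double[] values = new double[features.length];
  java.util.Arrays.fill(values, 1.0);                        // default weight is 1.0
  for (int i = 0; i < features.length; i++) {
    if (features[i].indexOf("=") != -1) {                    // explicit weight, e.g. "SIZE=0.5"
      String[] keyValuePair = features[i].split("=");
      features[i] = keyValuePair[0];
      values[i] = Double.parseDouble(keyValuePair[1]);
    }
  }
  // features -> {"COLOR_RED", "SIZE", "SHAPE_ROUND"}, values -> {1.0, 0.5, 1.0}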
Example #3
  /**
   * Converts the sentence-based instance list into a token-based one. This is needed for the
   * ME-version of JET (JetMeClassifier).
   *
   * @param METrainerDummyPipe the pipe used to build the new token-level InstanceList
   * @param inst just the features for one sentence to be transformed
   * @return an InstanceList containing one instance per token of the input sentence
   */
  public static InstanceList convertFeatsforClassifier(
      final Pipe METrainerDummyPipe, final Instance inst) {

    final InstanceList iList = new InstanceList(METrainerDummyPipe);

    final FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData();
    final LabelSequence ls = (LabelSequence) inst.getTarget();
    final LabelAlphabet ldict = (LabelAlphabet) ls.getAlphabet();
    final Object source = inst.getSource();
    final Object name = inst.getName();

    if (ls.size() != fvs.size()) {
      System.err.println(
          "failed making token instances: size of label sequence != size of feature vector sequence: "
              + ls.size()
              + " - "
              + fvs.size());
      System.exit(-1);
    }

    for (int j = 0; j < fvs.size(); j++) {
      final Instance I =
          new Instance(fvs.getFeatureVector(j), ldict.lookupLabel(ls.get(j)), name, source);
      iList.add(I);
    }

    return iList;
  }
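
The per-token InstanceList returned here can be fed directly into a standard MALLET classifier trainer. A minimal sketch, assuming cc.mallet.classify.MaxEntTrainer; the variable names are illustrative:

  InstanceList tokenInstances = convertFeatsforClassifier(dummyPipe, sentenceInstance);
  Classifier meClassifier = new MaxEntTrainer().train(tokenInstances);
  Classification result = meClassifier.classify(tokenInstances.get(0));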
Example #4
 public void split() {
   if (m_ilist == null) throw new IllegalStateException("Frozen.  Cannot split.");
   int numLeftChildren = 0;
   boolean[] toLeftChild = new boolean[m_instIndices.length];
   for (int i = 0; i < m_instIndices.length; i++) {
     Instance instance = m_ilist.get(m_instIndices[i]);
     FeatureVector fv = (FeatureVector) instance.getData();
     if (fv.value(m_gainRatio.getMaxValuedIndex()) <= m_gainRatio.getMaxValuedThreshold()) {
       toLeftChild[i] = true;
       numLeftChildren++;
     } else toLeftChild[i] = false;
   }
   logger.info(
       "leftChild.size="
           + numLeftChildren
           + " rightChild.size="
           + (m_instIndices.length - numLeftChildren));
   int[] leftIndices = new int[numLeftChildren];
   int[] rightIndices = new int[m_instIndices.length - numLeftChildren];
   int li = 0, ri = 0;
   for (int i = 0; i < m_instIndices.length; i++) {
     if (toLeftChild[i]) leftIndices[li++] = m_instIndices[i];
     else rightIndices[ri++] = m_instIndices[i];
   }
   m_leftChild = new Node(m_ilist, this, m_minNumInsts, leftIndices);
   m_rightChild = new Node(m_ilist, this, m_minNumInsts, rightIndices);
 }
  @Override
  public Instance pipe(Instance carrier) {
    Arg1RankInstance instance = (Arg1RankInstance) carrier;

    Document document = (Document) instance.getData();
    List<Pair<Integer, Integer>> candidates = instance.getCandidates();
    int connStart = instance.getConnStart();
    int connEnd = instance.getConnEnd();
    int arg2Line = instance.getArg2Line();
    int arg2HeadPos = instance.getArg2HeadPos();

    FeatureVector fvs[] = new FeatureVector[candidates.size()];

    for (int i = 0; i < candidates.size(); i++) {
      Pair<Integer, Integer> candidate = candidates.get(i);
      PropertyList pl = null;
      pl = addBaselineFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd);
      pl =
          addConstituentFeatures(
              pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd);
      pl =
          addDependencyFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd);
      // pl = addLexicoSyntacticFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart,
      // connEnd);

      fvs[i] = new FeatureVector(getDataAlphabet(), pl, true, true);
    }

    // set target label
    LabelAlphabet ldict = (LabelAlphabet) getTargetAlphabet();
    carrier.setTarget(ldict.lookupLabel(String.valueOf(instance.getTrueArg1Candidate())));
    carrier.setData(new FeatureVectorSequence(fvs));

    return carrier;
  }
Example #6
  public static InstanceList scale(InstanceList trainingList, double lower, double upper) {
    InstanceList ret = copy(trainingList);
    Alphabet featDict = ret.getDataAlphabet();

    double[] feat_max = new double[featDict.size()];
    double[] feat_min = new double[featDict.size()];

    for (int i = 0; i < feat_max.length; i++) {
      feat_max[i] = -Double.MAX_VALUE;
      feat_min[i] = Double.MAX_VALUE;
    }

    for (int i = 0; i < ret.size(); i++) {
      Instance inst = ret.get(i);
      FeatureVector fv = (FeatureVector) inst.getData();

      for (int loc = 0; loc < fv.numLocations(); loc++) {
        int featId = fv.indexAtLocation(loc);
        double value = fv.valueAtLocation(loc);
        double maxValue = feat_max[featId];
        double minValue = feat_min[featId];

        double newMaxValue = Math.max(value, maxValue);
        double newMinValue = Math.min(value, minValue);

        feat_max[featId] = newMaxValue;
        feat_min[featId] = newMinValue;
      }
    }

    // double lower = -1;
    // double upper = 1;

    for (int i = 0; i < ret.size(); i++) {
      Instance inst = ret.get(i);
      FeatureVector fv = (FeatureVector) inst.getData();

      for (int loc = 0; loc < fv.numLocations(); loc++) {
        int featId = fv.indexAtLocation(loc);
        double value = fv.valueAtLocation(loc);
        double maxValue = feat_max[featId];
        double minValue = feat_min[featId];
        double newValue = Double.NaN;
        if (maxValue == minValue) {
          newValue = value;
        } else if (value == minValue) {
          newValue = lower;
        } else if (value == maxValue) {
          newValue = upper;
        } else {
          newValue = lower + (upper - lower) * (value - minValue) / (maxValue - minValue);
        }

        fv.setValueAtLocation(loc, newValue);
      }
    }

    return ret;
  }
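
The rescaling above is plain per-feature min-max normalization. A standalone sketch of the same formula with made-up numbers:

  double lower = 0.0, upper = 1.0;          // target range (illustrative)
  double minValue = 2.0, maxValue = 10.0;   // observed extremes for one feature (illustrative)
  double value = 6.0;
  double newValue = lower + (upper - lower) * (value - minValue) / (maxValue - minValue);
  // newValue == 0.5; a value equal to minValue maps to lower, one equal to maxValue maps to upper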
Example #7
  public void count() {

    TIntIntHashMap docCounts = new TIntIntHashMap();

    int index = 0;

    if (instances.size() == 0) {
      logger.info("Instance list is empty");
      return;
    }

    if (instances.get(0).getData() instanceof FeatureSequence) {

      for (Instance instance : instances) {
        FeatureSequence features = (FeatureSequence) instance.getData();

        for (int i = 0; i < features.getLength(); i++) {
          docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
        }

        int[] keys = docCounts.keys();
        for (int i = 0; i < keys.length; i++) {
          int feature = keys[i];
          featureCounts[feature] += docCounts.get(feature);
          documentFrequencies[feature]++;
        }

        docCounts = new TIntIntHashMap();

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else if (instances.get(0).getData() instanceof FeatureVector) {

      for (Instance instance : instances) {
        FeatureVector features = (FeatureVector) instance.getData();

        for (int location = 0; location < features.numLocations(); location++) {
          int feature = features.indexAtLocation(location);
          double value = features.valueAtLocation(location);

          documentFrequencies[feature]++;
          featureCounts[feature] += value;
        }

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else {
      logger.info("Unsupported data class: " + instances.get(0).getData().getClass().getName());
    }
  }
Example #8
  public Instance pipe(Instance carrier) {

    if (carrier.getData() instanceof File) {
      try {
        // get file text
        File file = (File) carrier.getData();
        @SuppressWarnings("resource")
        String txt = new LineReader(new FileInputStream(file)).getText("\n");
        // update instance values
        carrier.setData(new TokenSequence(addRegexes(txt)));
        carrier.setSource(txt + " [file:" + file.getName() + "]");
      } catch (java.io.IOException e) {
        throw new IllegalArgumentException("IOException " + e);
      }

    } else if (carrier.getData() instanceof String) {
      String txt = (String) carrier.getData();
      // update instance values
      carrier.setData(new TokenSequence(addRegexes(txt)));
      carrier.setSource(txt);

    } else {
      throw new IllegalArgumentException("must be file or string " + carrier.getData());
    }
    return carrier;
  }
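
Pipes like this one are normally chained and driven from an InstanceList. A minimal sketch, assuming the pipe above is exposed as a class named, say, Text2TokenSequence (the name is hypothetical) and that the standard cc.mallet.pipe classes are available:

  Pipe pipe = new SerialPipes(new Pipe[] {
      new Text2TokenSequence(),             // the pipe shown above (hypothetical class name)
      new TokenSequenceLowercase(),
      new TokenSequence2FeatureSequence()
  });
  InstanceList instances = new InstanceList(pipe);
  instances.addThruPipe(new Instance("Some raw text", null, "doc1", null));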
Example #9
  public SVM train(InstanceList trainingList) {
    svm_problem problem = new svm_problem();
    problem.l = trainingList.size();
    problem.x = new svm_node[problem.l][];
    problem.y = new double[problem.l];

    for (int i = 0; i < trainingList.size(); i++) {
      Instance instance = trainingList.get(i);
      svm_node[] input = SVM.getSvmNodes(instance);
      if (input == null) {
        continue;
      }
      int labelIndex = ((Label) instance.getTarget()).getIndex();
      problem.x[i] = input;
      problem.y[i] = labelIndex;
    }

    int max_index = trainingList.getDataAlphabet().size();

    if (param.gamma == 0 && max_index > 0) {
      param.gamma = 1.0 / max_index;
    }

    // int numLabels = trainingList.getTargetAlphabet().size();
    // int[] weight_label = new int[numLabels];
    // double[] weight = trainingList.targetLabelDistribution().getValues();
    // double minValue = Double.MAX_VALUE;
    //
    // for (int i = 0; i < weight.length; i++) {
    // if (minValue > weight[i]) {
    // minValue = weight[i];
    // }
    // }
    //
    // for (int i = 0; i < weight.length; i++) {
    // weight_label[i] = i;
    // weight[i] = weight[i] / minValue;
    // }
    //
    // param.weight_label = weight_label;
    // param.weight = weight;

    String error_msg = svm.svm_check_parameter(problem, param);

    if (error_msg != null) {
      System.err.print("Error: " + error_msg + "\n");
      System.exit(1);
    }

    svm_model model = svm.svm_train(problem, param);

    classifier = new SVM(model, trainingList.getPipe());

    return classifier;
  }
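
The param referenced above is a standard libsvm svm_parameter. A sketch of how such an object is typically filled in before calling this train method; the values are illustrative defaults, not taken from the original code:

  svm_parameter param = new svm_parameter();
  param.svm_type = svm_parameter.C_SVC;     // classification
  param.kernel_type = svm_parameter.RBF;    // radial basis function kernel
  param.gamma = 0;                          // 0 is replaced by 1/numFeatures in train() above
  param.C = 1.0;                            // soft-margin cost
  param.cache_size = 100;                   // kernel cache size in MB
  param.eps = 1e-3;                         // stopping tolerance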
  public Instance pipe(Instance carrier) {

    if (carrier.getData() instanceof String) {
      String data = (String) carrier.getData();
      String cleanedText = Jsoup.parse(data).text();
      carrier.setData(cleanedText);
    } else {
      throw new IllegalArgumentException(
          "CharSequenceLowercase expects a String, found a " + carrier.getData().getClass());
    }

    return carrier;
  }
Example #11
 private boolean[][] labelConnectionsIn(InstanceList trainingSet) {
   int numLabels = outputAlphabet.size();
   boolean[][] connections = new boolean[numLabels][numLabels];
   for (Instance instance : trainingSet) {
     FeatureSequence output = (FeatureSequence) instance.getTarget();
     for (int j = 1; j < output.size(); j++) {
       int sourceIndex = outputAlphabet.lookupIndex(output.get(j - 1));
       int destIndex = outputAlphabet.lookupIndex(output.get(j));
       assert (sourceIndex >= 0 && destIndex >= 0);
       connections[sourceIndex][destIndex] = true;
     }
   }
   return connections;
 }
 public InstanceList readArray(String[] cleanTexts) {
   StringArrayIterator iterator = new StringArrayIterator(cleanTexts);
   // Construct a new instance list, passing it the pipe we want to use to
   // process instances.
   InstanceList instances = new InstanceList(pipe);
   // Now process each instance provided by the iterator.
   instances.addThruPipe(iterator);
   // The list is only populated after addThruPipe, so set names and targets afterwards.
   int index = 0;
   for (Instance inst : instances) {
     inst.setName(name_id.get(index));
     inst.setTarget("english");
     index++;
   }
   return instances;
 }
  /**
   * Process each sentence to add the feature if necessary.
   *
   * @param carrier Instance to be processed.
   * @return Instance with new features.
   */
  public Instance pipe(Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData();

    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      char[] text = t.getText().toCharArray();

      int numDigit = 0;
      for (int k = 0; k < text.length; k++) {
        if (Character.isDigit(text[k])) {
          numDigit++;
        }
      }

      if (numDigit == 1) {
        t.setFeatureValue("SingleDigit", 1.0);
      } else if (numDigit == 2) {
        t.setFeatureValue("TwoDigit", 1.0);
      } else if (numDigit == 3) {
        t.setFeatureValue("ThreeDigit", 1.0);
      } else if (numDigit >= 4) {
        t.setFeatureValue("MoreDigit", 1.0);
      }
    }
    return carrier;
  }
Example #14
  public Classification classify(Instance instance) {
    FeatureVector fv = (FeatureVector) instance.getData();
    assert (instancePipe == null || fv.getAlphabet() == this.instancePipe.getDataAlphabet());

    Node leaf = getLeaf(m_root, fv);
    return new Classification(instance, this, leaf.getGainRatio().getBaseLabelDistribution());
  }
  public Instance pipe(Instance carrier) {
    Sequence data = (Sequence) carrier.getData();
    Sequence target = (Sequence) carrier.getTarget();

    if (data.size() != target.size())
      throw new IllegalArgumentException(
          "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
              + "data.length = "
              + data.size()
              + ", target.length = "
              + target.size());

    int N = data.size();

    if (data instanceof TokenSequence) {
      throw new UnsupportedOperationException("Not yet implemented.");
    } else if (data instanceof FeatureVectorSequence) {

      FeatureVectorSequence fvs = (FeatureVectorSequence) data;
      Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;

      for (int i = 0; i < N; i++) {
        Object label = target.get(i);
        writer.print(label);

        FeatureVector fv = fvs.getFeatureVector(i);
        for (int loc = 0; loc < fv.numLocations(); loc++) {
          writer.print(' ');
          String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
          double value = fv.valueAtLocation(loc);
          // if (!Maths.almostEquals(value, 1.0)) {
          //    throw new IllegalArgumentException ("Printing to SimpleTagger format: FeatureVector
          // not binary at time slice "+i+" fv:"+fv);
          // }
          writer.print(fname + String.valueOf(value));
        }
        writer.println();
      }
    } else {
      throw new IllegalArgumentException("Don't know how to print data of type " + data);
    }

    writer.println();
    // writer.print(getDataAlphabet());

    return carrier;
  }
Example #16
 public void addInstance(Instance instance) {
   Sequence input = (Sequence) instance.getData();
   for (int j = 0; j < input.size(); j++) {
     FeatureVector fv = (FeatureVector) input.get(j);
     addToken(fv);
     // log.info( "token:" + j + " " + fv.toString( true ) );
   }
 }
Example #17
  public Instance pipe(Instance carrier) {
    try {
      if (carrier.getData() instanceof URI) carrier.setData(pipe((URI) carrier.getData()));
      else if (carrier.getData() instanceof File) carrier.setData(pipe((File) carrier.getData()));
      else if (carrier.getData() instanceof Reader)
        carrier.setData(pipe((Reader) carrier.getData()));
      else if (carrier.getData() instanceof CharSequence) ; // No conversion necessary
      else
        throw new IllegalArgumentException("Does not handle class " + carrier.getData().getClass());

    } catch (java.io.IOException e) {
      throw new IllegalArgumentException("IOException " + e);
    }

    //		System.out.println(carrier.getData().toString());
    return carrier;
  }
Example #18
  public static InstanceList copy(InstanceList instances) {
    InstanceList ret = (InstanceList) instances.clone();
    // LabelAlphabet labelDict = (LabelAlphabet) ret.getTargetAlphabet();
    Alphabet featDict = ret.getDataAlphabet();

    for (int i = 0; i < ret.size(); i++) {
      Instance instance = ret.get(i);
      Instance clone = (Instance) instance.clone();
      FeatureVector fv = (FeatureVector) clone.getData();

      int[] indices = fv.getIndices();
      double[] values = fv.getValues();

      int[] newIndices = new int[indices.length];
      System.arraycopy(indices, 0, newIndices, 0, indices.length);

      double[] newValues = new double[indices.length];
      System.arraycopy(values, 0, newValues, 0, indices.length);

      FeatureVector newFv = new FeatureVector(featDict, newIndices, newValues);
      Instance newInstance =
          new Instance(newFv, instance.getTarget(), instance.getName(), instance.getSource());
      ret.set(i, newInstance);
    }

    return ret;
  }
Example #19
  private double dataLogProbability(Instance instance, int labelIndex) {
    FeatureVector fv = (FeatureVector) instance.getData();
    int fvisize = fv.numLocations();
    double logProb = 0;

    for (int fvi = 0; fvi < fvisize; fvi++)
      logProb += fv.valueAtLocation(fvi) * p[labelIndex].logProbability(fv.indexAtLocation(fvi));
    return logProb;
  }
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   // xxx This doesn't seem so efficient.  Perhaps have TokenSequence
   // use a LinkedList, and remove Tokens from it? -?
   // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
   TokenSequence ret = new TokenSequence();
   Token prevToken = null;
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     if (!stoplist.contains(caseSensitive ? t.getText() : t.getText().toLowerCase())) {
       // xxx Should we instead make and add a copy of the Token?
       ret.add(t);
       prevToken = t;
     } else if (markDeletions && prevToken != null)
       prevToken.setProperty(FeatureSequenceWithBigrams.deletionMark, t.getText());
   }
   carrier.setData(ret);
   return carrier;
 }
Example #21
 @Override
 public Instance pipe(Instance carrier) {
   if (synonymMap == null) {
     readSynonymFile();
   }
   TokenSequence in = (TokenSequence) carrier.getData();
   for (Token token : in) {
     if (synonymMap.containsKey(token.getText())) {
       token.setText(synonymMap.get(token.getText()));
     }
   }
   return carrier;
 }
Example #22
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   TokenSequence targets =
       carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
   TokenSequence source =
       carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
   StringBuffer sb = new StringBuffer();
   if (prefix != null) sb.append(prefix);
   sb.append("name: " + carrier.getName() + "\n");
   for (int i = 0; i < ts.size(); i++) {
     if (source != null) {
       sb.append(source.get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof TokenSequence) {
       sb.append(((TokenSequence) carrier.getTarget()).get(i).getText());
       sb.append(' ');
     }
     if (carrier.getTarget() instanceof FeatureSequence) {
       sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
       sb.append(' ');
     }
     PropertyList pl = ts.get(i).getFeatures();
     if (pl != null) {
       PropertyList.Iterator iter = pl.iterator();
       while (iter.hasNext()) {
         iter.next();
         double v = iter.getNumericValue();
         if (v == 1.0) sb.append(iter.getKey());
         else sb.append(iter.getKey() + '=' + v);
         sb.append(' ');
       }
     }
     sb.append('\n');
   }
   System.out.print(sb.toString());
   return carrier;
 }
Example #23
 public double dataLogLikelihood(InstanceList ilist) {
   double logLikelihood = 0;
   for (int ii = 0; ii < ilist.size(); ii++) {
     double instanceWeight = ilist.getInstanceWeight(ii);
     Instance inst = ilist.get(ii);
     Labeling labeling = inst.getLabeling();
     if (labeling != null)
       logLikelihood += instanceWeight * dataLogProbability(inst, labeling.getBestIndex());
     else {
       Labeling predicted = this.classify(inst).getLabeling();
       // System.err.println ("label = \n"+labeling);
       // System.err.println ("predicted = \n"+predicted);
       for (int lpos = 0; lpos < predicted.numLocations(); lpos++) {
         int li = predicted.indexAtLocation(lpos);
         double labelWeight = predicted.valueAtLocation(lpos);
         // System.err.print (", "+labelWeight);
         if (labelWeight == 0) continue;
         logLikelihood += instanceWeight * labelWeight * dataLogProbability(inst, li);
       }
     }
   }
   return logLikelihood;
 }
  @Test
  public void testLoadRareWords() throws UnsupportedEncodingException, FileNotFoundException {
    String dataset_fn = "src/main/resources/datasets/SmallTexts.txt";
    InstanceList nonPrunedInstances = LDAUtils.loadInstances(dataset_fn, "stoplist.txt", 0);
    System.out.println(LDAUtils.instancesToString(nonPrunedInstances));
    System.out.println("Non pruned Alphabet size: " + nonPrunedInstances.getDataAlphabet().size());
    System.out.println("No. instances: " + nonPrunedInstances.size());

    InstanceList originalInstances = LDAUtils.loadInstances(dataset_fn, "stoplist.txt", 2);
    System.out.println("Alphabet size: " + originalInstances.getDataAlphabet().size());
    System.out.println(LDAUtils.instancesToString(originalInstances));
    System.out.println("No. instances: " + originalInstances.size());

    int[] wordCounts = {0, 3, 3, 0, 0};
    int idx = 0;
    for (Instance instance : originalInstances) {
      FeatureSequence fs = (FeatureSequence) instance.getData();
      // This assertion would fail: even though the feature sequence is
      // "empty", the underlying array is 2 long.
      // assertEquals(wordCounts[idx++], fs.getFeatures().length);
      assertEquals(wordCounts[idx++], fs.size());
    }
  }
Example #25
 public double labelLogLikelihood(InstanceList ilist) {
   double logLikelihood = 0;
   for (int ii = 0; ii < ilist.size(); ii++) {
     double instanceWeight = ilist.getInstanceWeight(ii);
     Instance inst = ilist.get(ii);
     Labeling labeling = inst.getLabeling();
     if (labeling == null) continue;
     Labeling predicted = this.classify(inst).getLabeling();
     // System.err.println ("label = \n"+labeling);
     // System.err.println ("predicted = \n"+predicted);
     if (labeling.numLocations() == 1) {
       logLikelihood += instanceWeight * Math.log(predicted.value(labeling.getBestIndex()));
     } else {
       for (int lpos = 0; lpos < labeling.numLocations(); lpos++) {
         int li = labeling.indexAtLocation(lpos);
         double labelWeight = labeling.valueAtLocation(lpos);
         // System.err.print (", "+labelWeight);
         if (labelWeight == 0) continue;
         logLikelihood += instanceWeight * labelWeight * Math.log(predicted.value(li));
       }
     }
   }
   return logLikelihood;
 }
Example #26
 public Instance pipe(Instance carrier) {
   Object inputData = carrier.getData();
   Alphabet features = getDataAlphabet();
   LabelAlphabet labels;
   LabelSequence target = null;
   String[][] tokens;
   if (inputData instanceof String) tokens = parseSentence((String) inputData);
   else if (inputData instanceof String[][]) tokens = (String[][]) inputData;
   else throw new IllegalArgumentException("Not a String or String[][]; got " + inputData);
   FeatureVector[] fvs = new FeatureVector[tokens.length];
   if (isTargetProcessing()) {
     labels = (LabelAlphabet) getTargetAlphabet();
     target = new LabelSequence(labels, tokens.length);
   }
   for (int l = 0; l < tokens.length; l++) {
     int nFeatures;
     if (isTargetProcessing()) {
       if (tokens[l].length < 1)
         throw new IllegalStateException(
             "Missing label at line " + l + " instance " + carrier.getName());
       nFeatures = tokens[l].length - 1;
       target.add(tokens[l][nFeatures]);
     } else nFeatures = tokens[l].length;
     int featureIndices[] = new int[nFeatures];
     for (int f = 0; f < nFeatures; f++) featureIndices[f] = features.lookupIndex(tokens[l][f]);
     fvs[l] =
         featureInductionOption.value
             ? new AugmentableFeatureVector(
                 features, featureIndices, null, featureIndices.length)
             : new FeatureVector(features, featureIndices);
   }
   carrier.setData(new FeatureVectorSequence(fvs));
   if (isTargetProcessing()) carrier.setTarget(target);
   else carrier.setTarget(new LabelSequence(getTargetAlphabet()));
   return carrier;
 }
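
Each row of tokens holds one token's feature strings followed, when target processing is on, by its label in the last position. An illustration with made-up data:

  String[][] tokens = {
      {"WORD=Bilbo", "CAPITALIZED", "B-PER"},   // features WORD=Bilbo, CAPITALIZED; label B-PER
      {"WORD=left",  "LOWERCASE",   "O"}        // features WORD=left, LOWERCASE;   label O
  };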
Example #27
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     String s = t.getText();
     if (distinguishBorders) s = startBorderChar + s + endBorderChar;
     int slen = s.length();
     for (int j = 0; j < gramSizes.length; j++) {
       int size = gramSizes[j];
       for (int k = 0; k < (slen - size) + 1; k++)
         t.setFeatureValue((prefix + s.substring(k, k + size)).intern(), 1.0);
     }
   }
   return carrier;
 }
Example #28
 public Instance pipe(Instance carrier) {
   TokenSequence ts = (TokenSequence) carrier.getData();
   for (int i = 0; i < ts.size(); i++) {
     Token t = ts.get(i);
     String s = t.getText();
     if (distinguishBorders) s = startBorderChar + s + endBorderChar;
     int slen = s.length();
     for (int j = 0; j < gramSizes.length; j++) {
       int size = gramSizes[j];
       for (int k = 0; k < slen - size; k++)
         t.setFeatureValue(
             s.substring(k, k + size), 1.0); // original was substring(k, size), changed by Fuchun
     }
   }
   return carrier;
 }
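
Note the loop bound: k < slen - size stops one position short, so the final character n-gram of each token is never emitted, whereas the previous example's k < (slen - size) + 1 covers them all. A quick check with a made-up token:

  // For s = "cats" and size = 2 (slen = 4):
  //   k < (slen - size) + 1  ->  k in {0, 1, 2}  ->  "ca", "at", "ts"
  //   k <  slen - size       ->  k in {0, 1}     ->  "ca", "at"   (the final bigram "ts" is skipped)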
Example #29
  /**
   * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
   * featureVector of the instance must match the alphabet of the pipe used to train the classifier.
   *
   * @param instance to be classified. Data field must be a FeatureVector
   * @return Classification containing the labeling of the instance
   */
  public Classification classify(Instance instance) {
    // Note that the current size of the label alphabet can be larger
    // than it was at the time of training.  We are careful to handle
    // such labels correctly here: their log prior probability is taken
    // to be minus infinity.
    int numClasses = getLabelAlphabet().size();
    double[] scores = new double[numClasses];
    FeatureVector fv = (FeatureVector) instance.getData();
    // Make sure the feature vector's feature dictionary matches
    // what we are expecting from our data pipe (and thus our notion
    // of feature probabilities).
    assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
    int fvisize = fv.numLocations();

    prior.addLogProbabilities(scores);

    // Set the scores according to the feature weights and per-class probabilities
    for (int fvi = 0; fvi < fvisize; fvi++) {
      int fi = fv.indexAtLocation(fvi);
      for (int ci = 0; ci < numClasses; ci++) {
        // guard against dataAlphabet or target alphabet growing; can happen if classifying
        // a never before seen feature.  Ignore these.
        if (ci >= p.length || fi >= p[ci].size()) continue;

        scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
      }
    }

    // Get the scores in the range near zero, where exp() is more accurate
    double maxScore = Double.NEGATIVE_INFINITY;
    for (int ci = 0; ci < numClasses; ci++) if (scores[ci] > maxScore) maxScore = scores[ci];
    for (int ci = 0; ci < numClasses; ci++) scores[ci] -= maxScore;

    // Exponentiate and normalize
    double sum = 0;
    for (int ci = 0; ci < numClasses; ci++) sum += (scores[ci] = Math.exp(scores[ci]));
    for (int ci = 0; ci < numClasses; ci++) scores[ci] /= sum;

    // Create and return a Classification object
    return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
  }
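
A typical way to consume the returned Classification, shown as a hedged sketch with illustrative variable names:

  Classification c = classifier.classify(instance);
  Labeling labeling = c.getLabeling();
  Label best = labeling.getBestLabel();        // most probable class
  double bestProb = labeling.getBestValue();   // its normalized probability
  System.out.println(best + " : " + bestProb);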
Example #30
  public Instance pipe(Instance carrier) {
    try {
      if (carrier.getData() instanceof URI) {
        carrier.setData(pipe((URI) carrier.getData()));
      } else if (carrier.getData() instanceof File) {
        carrier.setData(pipe((File) carrier.getData()));
      } else if (carrier.getData() instanceof Reader) {
        carrier.setData(pipe((Reader) carrier.getData()));
      } else if (carrier.getData() instanceof CharSequence) ; // No conversion necessary
      else {
        throw new IllegalArgumentException("Does not handle class " + carrier.getData().getClass());
      }

      if (this.labelsInText) {
        String str = ((CharSequence) carrier.getData()).toString();

        if (str.startsWith("[")) {
          // Remove the initial '[' and split labels from the remaining text at the first ']'.
          String[] labelsBoundary = str.substring(1).split("]", 2);

          // String[] labelStrs = labelsBoundary[0].trim().split("[ \\t]");

          carrier.setData(labelsBoundary[1].trim());

          carrier.setTarget(labelsBoundary[0].trim()); // homer to do
        }
      }

    } catch (java.io.IOException e) {
      throw new IllegalArgumentException("IOException " + e);
    }

    //		System.out.println(carrier.getData().toString());
    return carrier;
  }