Example #1
  /**
   * Converts a sentence-based instance into a token-based instance list. This is needed for the
   * ME version of JET (JetMeClassifier).
   *
   * @param METrainerDummyPipe the pipe used to build the resulting token-level instance list
   * @param inst the features for one sentence, to be transformed into per-token instances
   * @return an InstanceList containing one Instance per token of the sentence
   */
  public static InstanceList convertFeatsforClassifier(
      final Pipe METrainerDummyPipe, final Instance inst) {

    final InstanceList iList = new InstanceList(METrainerDummyPipe);

    final FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData();
    final LabelSequence ls = (LabelSequence) inst.getTarget();
    final LabelAlphabet ldict = (LabelAlphabet) ls.getAlphabet();
    final Object source = inst.getSource();
    final Object name = inst.getName();

    if (ls.size() != fvs.size()) {
      System.err.println(
          "failed making token instances: size of labelsequence != size of featue vector sequence: "
              + ls.size()
              + " - "
              + fvs.size());
      System.exit(-1);
    }

    for (int j = 0; j < fvs.size(); j++) {
      final Instance tokenInstance =
          new Instance(fvs.getFeatureVector(j), ldict.lookupLabel(ls.get(j)), name, source);
      iList.add(tokenInstance);
    }

    return iList;
  }
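A minimal usage sketch for the method above, assuming MALLET's standard classes (MaxEnt, MaxEntTrainer, InstanceList). The names trainTokenClassifier and sentenceInstances are illustrative, not from the original code, and in real use the dummy pipe must share the data and target alphabets of the pipe that produced the sentence-level feature vectors:

  // Requires cc.mallet.classify.MaxEnt, cc.mallet.classify.MaxEntTrainer,
  // cc.mallet.pipe.Pipe, cc.mallet.types.Instance, cc.mallet.types.InstanceList
  public static MaxEnt trainTokenClassifier(
      final Pipe dummyPipe, final InstanceList sentenceInstances) {
    final InstanceList tokenInstances = new InstanceList(dummyPipe);
    for (final Instance sentenceInstance : sentenceInstances) {
      // flatten each sentence into one Instance per token
      tokenInstances.addAll(convertFeatsforClassifier(dummyPipe, sentenceInstance));
    }
    // train a maximum-entropy (ME) classifier on the pooled token instances
    return new MaxEntTrainer().train(tokenInstances);
  }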
Example #2
 /**
  * Converts one sentence into a FeatureVectorSequence (and, when target processing is on, a
  * LabelSequence). The input is either a raw String to be parsed by parseSentence() or a
  * String[][] of per-token feature strings, with the label as the last element of each row
  * when target processing is on.
  */
 public Instance pipe(Instance carrier) {
   Object inputData = carrier.getData();
   Alphabet features = getDataAlphabet();
   LabelAlphabet labels;
   LabelSequence target = null;
   String[][] tokens;
   if (inputData instanceof String) tokens = parseSentence((String) inputData);
   else if (inputData instanceof String[][]) tokens = (String[][]) inputData;
   else throw new IllegalArgumentException("Not a String or String[][]; got " + inputData);
   FeatureVector[] fvs = new FeatureVector[tokens.length];
   if (isTargetProcessing()) {
     labels = (LabelAlphabet) getTargetAlphabet();
     target = new LabelSequence(labels, tokens.length);
   }
   for (int l = 0; l < tokens.length; l++) {
     int nFeatures;
     if (isTargetProcessing()) {
       if (tokens[l].length < 1)
         throw new IllegalStateException(
             "Missing label at line " + l + " instance " + carrier.getName());
       nFeatures = tokens[l].length - 1;
       target.add(tokens[l][nFeatures]);
     } else nFeatures = tokens[l].length;
     int[] featureIndices = new int[nFeatures];
     for (int f = 0; f < nFeatures; f++) featureIndices[f] = features.lookupIndex(tokens[l][f]);
     fvs[l] =
         featureInductionOption.value
             ? new AugmentableFeatureVector(
                 features, featureIndices, null, featureIndices.length)
             : new FeatureVector(features, featureIndices);
   }
   carrier.setData(new FeatureVectorSequence(fvs));
   if (isTargetProcessing()) carrier.setTarget(target);
   else carrier.setTarget(new LabelSequence(getTargetAlphabet()));
   return carrier;
 }
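A minimal sketch of how a token-sequence pipe like the one above is typically wired into a MALLET CRF training run. Here TokenSequencePipe is a hypothetical Pipe subclass containing the pipe() method shown above, the training-file path is a placeholder, and the input is assumed to be one token per line with whitespace-separated features, the label last, and blank lines between sentences (the format parseSentence() expects):

 // Requires cc.mallet.fst.CRF, cc.mallet.fst.CRFTrainerByLabelLikelihood, cc.mallet.pipe.Pipe,
 // cc.mallet.pipe.iterator.LineGroupIterator, cc.mallet.types.InstanceList,
 // java.io.BufferedReader, java.io.FileReader, java.io.IOException, java.util.regex.Pattern
 public static CRF trainCrf(String trainingFile) throws IOException {
   Pipe pipe = new TokenSequencePipe(); // hypothetical subclass whose pipe() is shown above
   pipe.setTargetProcessing(true); // the training data carries labels
   InstanceList training = new InstanceList(pipe);
   training.addThruPipe(
       new LineGroupIterator(
           new BufferedReader(new FileReader(trainingFile)),
           Pattern.compile("^\\s*$"), // blank lines separate sentences
           true));
   CRF crf = new CRF(pipe, null);
   crf.addStatesForLabelsConnectedAsIn(training);
   new CRFTrainerByLabelLikelihood(crf).train(training);
   return crf;
 }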
Example #3
  /**
   * Resamples the topic assignment of every token and every label of one document, using a
   * bucketed (smoothing-only / document-topic / topic-term) sampler; when ignoreLabels is false,
   * token and label counts are mixed through lblWeight.
   */
  protected void sampleTopicsForOneDoc(TopicAssignment doc // int[][] typeTopicCounts
      // ,
      // double[] cachedCoefficients,
      // int[] tokensPerTopic,
      // double betaSum,
      // double beta,
      // double smoothingOnlyMass,
      // int[][] lblTypeTopicCounts,
      // double[] cachedLabelCoefficients,
      // int[] labelsPerTopic,
      // double gammaSum,
      // double gamma,
      // double smoothingOnlyLblMass
      ) {

    FeatureSequence tokenSequence = (FeatureSequence) doc.instance.getData();

    LabelSequence topicSequence = (LabelSequence) doc.topicSequence;

    MassValue massValue = new MassValue();
    massValue.topicBetaMass = 0.0;
    massValue.topicTermMass = 0.0;
    massValue.smoothingOnlyMass = smoothingOnlyMass;

    int nonZeroTopics = 0;

    int[] oneDocTopics = topicSequence.getFeatures();
    int[] localTopicCounts = new int[numTopics];
    int[] localTopicIndex = new int[numTopics];

    // Label Init
    LabelSequence lblTopicSequence = (LabelSequence) doc.lblTopicSequence;
    FeatureSequence labelSequence = (FeatureSequence) doc.instance.getTarget();

    MassValue massLblValue = new MassValue();
    massLblValue.topicBetaMass = 0.0;
    massLblValue.topicTermMass = 0.0;
    massLblValue.smoothingOnlyMass = smoothingOnlyLabelMass;

    int[] oneDocLblTopics = lblTopicSequence.getFeatures();
    int[] localLblTopicCounts = new int[numTopics];

    // initSampling

    int docLength = tokenSequence.getLength();
    //		populate topic counts
    for (int position = 0; position < docLength; position++) {
      if (oneDocTopics[position] == ParallelTopicModel.UNASSIGNED_TOPIC) {
        continue;
      }
      localTopicCounts[oneDocTopics[position]]++;
    }

    docLength = labelSequence.getLength();
    //		populate topic counts
    for (int position = 0; position < docLength; position++) {
      if (oneDocLblTopics[position] == ParallelTopicModel.UNASSIGNED_TOPIC) {
        continue;
      }
      localLblTopicCounts[oneDocLblTopics[position]]++;
    }

    // Build an array that densely lists the topics that
    //  have non-zero counts.
    int denseIndex = 0;
    for (int topic = 0; topic < numTopics; topic++) {
      if (localTopicCounts[topic] != 0 || localLblTopicCounts[topic] != 0) {
        localTopicIndex[denseIndex] = topic;
        denseIndex++;
      }
    }

    // Record the total number of non-zero topics
    nonZeroTopics = denseIndex;
    // (leftover debugging hook; has no effect)
    if (nonZeroTopics < 20) {
      int a = 1;
    }

    // Initialize the topic count/beta sampling bucket
    // Initialize cached coefficients and the topic/beta
    //  normalizing constant.
    for (denseIndex = 0; denseIndex < nonZeroTopics; denseIndex++) {
      int topic = localTopicIndex[denseIndex];
      int n = localTopicCounts[topic];
      int nl = localLblTopicCounts[topic];

      if (ignoreLabels) {
        //	initialize the normalization constant for the (B * n_{t|d}) term
        massValue.topicBetaMass += beta * n / (tokensPerTopic[topic] + betaSum);
        // massLblValue.topicBetaMass += gamma * nl / (labelsPerTopic[topic] + gammaSum);
        //	update the coefficients for the non-zero topics
        cachedCoefficients[topic] = (alpha[topic] + n) / (tokensPerTopic[topic] + betaSum);
        // cachedLabelCoefficients[topic] = (alpha[topic] + nl) / (labelsPerTopic[topic] +
        // gammaSum);
      } else {
        massValue.topicBetaMass += beta * (n + lblWeight * nl) / (tokensPerTopic[topic] + betaSum);
        // massLblValue.topicBetaMass += gamma * (nl + (1 / lblWeight) * n) / (labelsPerTopic[topic]
        // + gammaSum);
        cachedCoefficients[topic] =
            ((1 + lblWeight) * alpha[topic] + n + lblWeight * nl)
                / (tokensPerTopic[topic] + betaSum);
        // cachedLabelCoefficients[topic] = ((1 + (1 / lblWeight)) * alpha[topic] + nl + (1 /
        // lblWeight) * n) / (labelsPerTopic[topic] + gammaSum);
      }
    }

    // end of Init Sampling

    double[] topicTermScores = new double[numTopics];
    int[] currentTypeTopicCounts;
    //	Iterate over the positions (words) in the document
    docLength = tokenSequence.getLength();
    for (int position = 0; position < docLength; position++) {

      int type = tokenSequence.getIndexAtPosition(position);
      currentTypeTopicCounts = typeTopicCounts[type];

      nonZeroTopics =
          removeOldTopicContribution(
              position,
              oneDocTopics,
              massValue,
              localTopicCounts,
              localLblTopicCounts,
              localTopicIndex,
              cachedCoefficients,
              tokensPerTopic,
              betaSum,
              beta,
              lblWeight,
              nonZeroTopics);

      // calcSamplingValuesPerType
      calcSamplingValuesPerType(
          // tokenSequence,
          position,
          oneDocTopics,
          massValue,
          topicTermScores,
          currentTypeTopicCounts,
          localTopicCounts,
          // localTopicIndex,
          cachedCoefficients,
          tokensPerTopic,
          betaSum,
          beta,
          typeTotals,
          type,
          typeSkewIndexes,
          skewWeight);

      double sample =
          random.nextUniform()
              * (massValue.smoothingOnlyMass + massValue.topicBetaMass + massValue.topicTermMass);
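      // Note: this is the SparseLDA-style three-bucket decomposition used by MALLET's
      // WorkerRunnable, extended here with label counts when ignoreLabels is false.
      // In the plain (ignoreLabels) case the buckets are, roughly,
      //   smoothingOnlyMass = sum_t alpha_t * beta                / (tokensPerTopic[t] + betaSum)
      //   topicBetaMass     = sum_t beta * n_{t|d}                / (tokensPerTopic[t] + betaSum)
      //   topicTermMass     = sum_t n_{w|t} * (alpha_t + n_{t|d}) / (tokensPerTopic[t] + betaSum)
      // so drawing a uniform point in [0, total mass) and walking the buckets inside
      // findNewTopic() samples the full conditional P(z_i = t | everything else).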

      double origSample = sample;

      //	Make sure it actually gets set
      int newTopic = -1;

      newTopic =
          findNewTopic(
              sample,
              massValue,
              topicTermScores,
              currentTypeTopicCounts,
              localTopicCounts,
              localLblTopicCounts,
              localTopicIndex,
              tokensPerTopic,
              betaSum,
              beta,
              nonZeroTopics,
              lblWeight);

      if (newTopic == -1) {
        System.err.println(
            "WorkerRunnable sampling error: "
                + origSample
                + " "
                + sample
                + " "
                + massValue.smoothingOnlyMass
                + " "
                + massValue.topicBetaMass
                + " "
                + massValue.topicTermMass);
        newTopic = numTopics - 1; // TODO is this appropriate
        // throw new IllegalStateException ("WorkerRunnable: New topic not sampled.");
      }
      // assert(newTopic != -1);

      //			Put that new topic into the counts
      oneDocTopics[position] = newTopic;

      if (ignoreLabels) {
        massValue.smoothingOnlyMass -=
            alpha[newTopic] * beta / (tokensPerTopic[newTopic] + betaSum);
        massValue.topicBetaMass -=
            beta * localTopicCounts[newTopic] / (tokensPerTopic[newTopic] + betaSum);
      } else {
        massValue.smoothingOnlyMass -=
            (1 + lblWeight) * alpha[newTopic] * beta / (tokensPerTopic[newTopic] + betaSum);
        massValue.topicBetaMass -=
            beta
                * (localTopicCounts[newTopic] + lblWeight * localLblTopicCounts[newTopic])
                / (tokensPerTopic[newTopic] + betaSum);
      }
      localTopicCounts[newTopic]++;

      // If this is a new topic for this document,
      //  add the topic to the dense index.
      if (localTopicCounts[newTopic] == 1 && localLblTopicCounts[newTopic] == 0) {

        // First find the point where we
        //  should insert the new topic by going to
        //  the end (which is the only reason we're keeping
        //  track of the number of non-zero
        //  topics) and working backwards

        denseIndex = nonZeroTopics;

        while (denseIndex > 0 && localTopicIndex[denseIndex - 1] > newTopic) {

          localTopicIndex[denseIndex] = localTopicIndex[denseIndex - 1];
          denseIndex--;
        }

        localTopicIndex[denseIndex] = newTopic;
        nonZeroTopics++;
      }

      tokensPerTopic[newTopic]++;

      if (ignoreLabels) {
        //	update the coefficients for the non-zero topics
        cachedCoefficients[newTopic] =
            (alpha[newTopic] + localTopicCounts[newTopic]) / (tokensPerTopic[newTopic] + betaSum);

        massValue.smoothingOnlyMass +=
            alpha[newTopic] * beta / (tokensPerTopic[newTopic] + betaSum);
        massValue.topicBetaMass +=
            beta * localTopicCounts[newTopic] / (tokensPerTopic[newTopic] + betaSum);
      } else {
        massValue.smoothingOnlyMass +=
            (1 + lblWeight) * alpha[newTopic] * beta / (tokensPerTopic[newTopic] + betaSum);
        massValue.topicBetaMass +=
            beta
                * (localTopicCounts[newTopic] + lblWeight * localLblTopicCounts[newTopic])
                / (tokensPerTopic[newTopic] + betaSum);

        cachedCoefficients[newTopic] =
            ((1 + lblWeight) * alpha[newTopic]
                    + localTopicCounts[newTopic]
                    + lblWeight * localLblTopicCounts[newTopic])
                / (tokensPerTopic[newTopic] + betaSum);
      }
    }

    // sample labels
    // init labels
    for (denseIndex = 0; denseIndex < nonZeroTopics; denseIndex++) {
      int topic = localTopicIndex[denseIndex];
      int n = localTopicCounts[topic];
      int nl = localLblTopicCounts[topic];

      if (ignoreLabels) {
        //	initialize the normalization constant for the (B * n_{t|d}) term
        // massValue.topicBetaMass += beta * n / (tokensPerTopic[topic] + betaSum);
        massLblValue.topicBetaMass += gamma * nl / (labelsPerTopic[topic] + gammaSum);
        //	update the coefficients for the non-zero topics
        // cachedCoefficients[topic] = (alpha[topic] + n) / (tokensPerTopic[topic] + betaSum);
        cachedLabelCoefficients[topic] = (alpha[topic] + nl) / (labelsPerTopic[topic] + gammaSum);
      } else {
        // massValue.topicBetaMass += beta * (n + lblWeight * nl) / (tokensPerTopic[topic] +
        // betaSum);
        massLblValue.topicBetaMass +=
            gamma * (nl + (1 / lblWeight) * n) / (labelsPerTopic[topic] + gammaSum);
        // cachedCoefficients[topic] = ((1 + lblWeight) * alpha[topic] + n + lblWeight * nl) /
        // (tokensPerTopic[topic] + betaSum);
        cachedLabelCoefficients[topic] =
            ((1 + (1 / lblWeight)) * alpha[topic] + nl + (1 / lblWeight) * n)
                / (labelsPerTopic[topic] + gammaSum);
      }
    }

    double[] topicLblTermScores = new double[numTopics];
    int[] currentLblTypeTopicCounts;
    int docLblLength = labelSequence.getLength();

    //	Iterate over the positions (words) in the document
    for (int position = 0; position < docLblLength; position++) {

      int type = labelSequence.getIndexAtPosition(position);
      currentLblTypeTopicCounts = lbltypeTopicCounts[type];

      nonZeroTopics =
          removeOldTopicContribution(
              position,
              oneDocLblTopics,
              massLblValue,
              localLblTopicCounts,
              localTopicCounts,
              localTopicIndex,
              cachedLabelCoefficients,
              labelsPerTopic,
              gammaSum,
              gamma,
              1 / lblWeight,
              nonZeroTopics);

      // calcSamplingValuesPerType
      calcSamplingValuesPerType(
          // labelSequence,
          position,
          oneDocLblTopics,
          massLblValue,
          topicLblTermScores,
          currentLblTypeTopicCounts,
          localLblTopicCounts,
          // localTopicIndex,
          cachedLabelCoefficients,
          labelsPerTopic,
          gammaSum,
          gamma,
          lblTypeTotals,
          type,
          lblTypeSkewIndexes,
          lblSkewWeight);
      // massLblValue.smoothingOnlyMass = 0; //ignore smoothing mass

      double sample =
          random.nextUniform()
              * (massLblValue.smoothingOnlyMass
                  + massLblValue.topicBetaMass
                  + massLblValue.topicTermMass);

      // double sample = random.nextUniform() * (massValue.smoothingOnlyMass +
      // massValue.topicBetaMass + massLblValue.smoothingOnlyMass + massLblValue.topicBetaMass +
      // massLblValue.topicTermMass);

      double origSample = sample;

      //	Make sure it actually gets set
      int newTopic = -1;

      newTopic =
          findNewTopic(
              sample,
              massLblValue,
              topicLblTermScores,
              currentLblTypeTopicCounts,
              localLblTopicCounts,
              localTopicCounts,
              localTopicIndex,
              labelsPerTopic,
              gammaSum,
              gamma,
              nonZeroTopics,
              1 / lblWeight);

      if (newTopic == -1) {
        System.err.println(
            "WorkerRunnable sampling labels error: "
                + origSample
                + " "
                + sample
                + " "
                + massLblValue.smoothingOnlyMass
                + " "
                + massLblValue.topicBetaMass
                + " "
                + massLblValue.topicTermMass);
        // newTopic = numTopics - 1; // TODO is this appropriate
        // throw new IllegalStateException ("WorkerRunnable: New topic not sampled.");
      }
      assert (newTopic != -1);

      //			Put that new topic into the counts
      oneDocLblTopics[position] = newTopic;

      if (ignoreLabels) {
        massLblValue.smoothingOnlyMass -=
            alpha[newTopic] * gamma / (labelsPerTopic[newTopic] + gammaSum);
        massLblValue.topicBetaMass -=
            gamma * localLblTopicCounts[newTopic] / (labelsPerTopic[newTopic] + gammaSum);
      } else {

        massLblValue.smoothingOnlyMass -=
            (1 + 1 / lblWeight) * alpha[newTopic] * gamma / (labelsPerTopic[newTopic] + gammaSum);
        massLblValue.topicBetaMass -=
            gamma
                * (localLblTopicCounts[newTopic] + (1 / lblWeight) * localTopicCounts[newTopic])
                / (labelsPerTopic[newTopic] + gammaSum);
      }

      localLblTopicCounts[newTopic]++;

      // If this is a new topic for this document,
      //  add the topic to the dense index.
      if (localLblTopicCounts[newTopic] == 1 && localTopicCounts[newTopic] == 0) {

        // First find the point where we
        //  should insert the new topic by going to
        //  the end (which is the only reason we're keeping
        //  track of the number of non-zero
        //  topics) and working backwards

        denseIndex = nonZeroTopics;

        while (denseIndex > 0 && localTopicIndex[denseIndex - 1] > newTopic) {

          localTopicIndex[denseIndex] = localTopicIndex[denseIndex - 1];
          denseIndex--;
        }

        localTopicIndex[denseIndex] = newTopic;
        nonZeroTopics++;
      }

      labelsPerTopic[newTopic]++;

      //	update the coefficients for the non-zero topics
      if (ignoreLabels) {
        cachedLabelCoefficients[newTopic] =
            (alpha[newTopic] + localLblTopicCounts[newTopic])
                / (labelsPerTopic[newTopic] + gammaSum);

        massLblValue.smoothingOnlyMass +=
            alpha[newTopic] * gamma / (labelsPerTopic[newTopic] + gammaSum);
        massLblValue.topicBetaMass +=
            gamma * localLblTopicCounts[newTopic] / (labelsPerTopic[newTopic] + gammaSum);

      } else {

        cachedLabelCoefficients[newTopic] =
            ((1 + 1 / lblWeight) * alpha[newTopic]
                    + localLblTopicCounts[newTopic]
                    + 1 / lblWeight * localTopicCounts[newTopic])
                / (labelsPerTopic[newTopic] + gammaSum);

        massLblValue.smoothingOnlyMass +=
            (1 + 1 / lblWeight) * alpha[newTopic] * gamma / (labelsPerTopic[newTopic] + gammaSum);
        massLblValue.topicBetaMass +=
            gamma
                * (localLblTopicCounts[newTopic] + (1 / lblWeight) * localTopicCounts[newTopic])
                / (labelsPerTopic[newTopic] + gammaSum);
      }
    }
    if (shouldSaveState) {
      // Update the document-topic count histogram,
      //  for dirichlet estimation
      docLengthCounts[docLength]++;

      for (denseIndex = 0; denseIndex < nonZeroTopics; denseIndex++) {
        int topic = localTopicIndex[denseIndex];

        topicDocCounts[topic][localTopicCounts[topic]]++;
      }

      docLblLengthCounts[docLblLength]++;

      for (denseIndex = 0; denseIndex < nonZeroTopics; denseIndex++) {
        int topic = localTopicIndex[denseIndex];

        topicLblDocCounts[topic][localLblTopicCounts[topic]]++;
      }
    }

    //	Clean up our mess: reset the coefficients to values with only
    //	smoothing. The next doc will update its own non-zero topics...
    for (denseIndex = 0; denseIndex < nonZeroTopics; denseIndex++) {

      int topic = localTopicIndex[denseIndex];

      if (ignoreLabels) {
        cachedCoefficients[topic] = alpha[topic] / (tokensPerTopic[topic] + betaSum);
        cachedLabelCoefficients[topic] = alpha[topic] / (labelsPerTopic[topic] + gammaSum);
      } else {
        cachedCoefficients[topic] =
            (1 + lblWeight) * alpha[topic] / (tokensPerTopic[topic] + betaSum);
        cachedLabelCoefficients[topic] =
            (1 + 1 / lblWeight) * alpha[topic] / (labelsPerTopic[topic] + gammaSum);
      }
    }

    smoothingOnlyMass = massValue.smoothingOnlyMass;
    smoothingOnlyLabelMass = massLblValue.smoothingOnlyMass;
  }