コード例 #1
0
  /**
   * Dumps parent/child topic similarities to {@code ./data/results/0108_9/topicSimilarity.txt}.
   *
   * <p>Every {@code _ParentDoc2} found in {@code m_trainSet} produces one output line: the parent's
   * name followed by a tab, then one tab-terminated field per child document of the form
   * {@code childName:docSim:stnSim0:stnSim1:...}. {@code docSim} is
   * {@code computeSimilarity(parent.m_topics, child.m_topics)} and each {@code stnSim} is
   * {@code computeSimilarity(sentence.m_topics, child.m_topics)} for the parent's sentences in index
   * order. Sentences whose {@code m_stnLength} is zero are written as a literal {@code 0}. Documents
   * that are not {@code _ParentDoc2} instances are skipped.
   *
   * <p>If the output file cannot be opened, the stack trace is printed and nothing is written.
   */
  public void discoverSpecificComments() {
    System.out.println("topic similarity");
    String fileName = "./data/results/0108_9/topicSimilarity.txt";

    // try-with-resources guarantees the writer is flushed and closed even when a runtime
    // exception escapes from the loops below (the previous code leaked the handle in that case).
    try (PrintWriter pw = new PrintWriter(new File(fileName))) {
      for (_Doc doc : m_trainSet) {
        if (!(doc instanceof _ParentDoc2)) {
          continue;
        }
        _ParentDoc2 pDoc = (_ParentDoc2) doc;

        pw.print(doc.getName() + "\t");
        for (_ChildDoc2 cDoc : pDoc.m_childDocs) {
          pw.print(cDoc.getName() + ":");

          double docTopicSimilarity = computeSimilarity(pDoc.m_topics, cDoc.m_topics);
          pw.print(docTopicSimilarity);

          for (int i = 0; i < pDoc.m_sentenceMap.size(); i++) {
            _Stn stnObj = pDoc.m_sentenceMap.get(i);

            if (stnObj.m_stnLength == 0) {
              // Some sentences are normalized into zero-length sentences; emit a placeholder so
              // the column positions still line up with the sentence indices.
              pw.print(":0");
              continue;
            }

            double stnTopicSimilarity = computeSimilarity(stnObj.m_topics, cDoc.m_topics);
            pw.print(":" + stnTopicSimilarity);
          }
          pw.print("\t");
        }
        pw.println();
      }
      pw.flush();
    } catch (FileNotFoundException e) {
      // Kept as best-effort reporting: the output file (or its directory) could not be opened.
      e.printStackTrace();
    }
  }
コード例 #2
0
  /**
   * Accumulates a log-likelihood score for one child document.
   *
   * <p>For every word position {@code n} and every topic {@code k} the method adds
   * {@code log(term1) + log(term2)} to the result, where
   *
   * <ul>
   *   <li>{@code term1 = (m_parentWordTopicSstat[k][w] + m_childWordTopicSstat[k][w])
   *       / (m_parentSstat[k] + m_childSstat[k])} with {@code w = cDoc.m_words[n]}, and
   *   <li>{@code term2 = (cDoc.m_xTopicSstat[0][k] + cDoc.m_xTopicSstat[1][k])
   *       / cDoc.getTotalDocLength()}.
   * </ul>
   *
   * A term that is exactly zero is replaced by {@code 1e-9} so the logarithm stays finite.
   *
   * <p>NOTE(review): the logs are summed over all topics rather than taking the log of a sum over
   * topics; this is preserved from the original implementation — confirm it is the intended
   * quantity.
   *
   * @param cDoc the child document to score
   * @return the accumulated value; {@code 0} when the document length is zero
   */
  public double calChildLogLikelihood2(_ChildDoc2 cDoc) {
    // Read once and widen: used both as the loop bound and as the floating-point divisor.
    final double docLength = cDoc.getTotalDocLength();

    // Previously this guard lived inside the loops, where it could never fire.
    if (docLength == 0) {
      return 0;
    }

    final double eps = 1e-9;
    double likelihood = 0.0;

    for (int n = 0; n < docLength; n++) {
      // The word id depends only on the position, not on the topic.
      final int wid = cDoc.m_words[n];

      for (int k = 0; k < number_of_topics; k++) {
        double term1 =
            (m_parentWordTopicSstat[k][wid] + m_childWordTopicSstat[k][wid])
                / (double) (m_parentSstat[k] + m_childSstat[k]);
        double term2 = (cDoc.m_xTopicSstat[0][k] + cDoc.m_xTopicSstat[1][k]) / docLength;

        // Avoid log(0) = -Infinity.
        if (term1 == 0) {
          term1 += eps;
        }
        if (term2 == 0) {
          term2 += eps;
        }

        likelihood += Math.log(term1) + Math.log(term2);
      }
    }
    return likelihood;
  }
コード例 #3
0
  /**
   * Writes the topic assignment of a child document's words to
   * {@code ./data/results/0108_9/childTopicAssignment/topicAssignment_<name>_.txt}.
   *
   * <p>For each entry {@code i} of {@code d.m_index}, the feature name is looked up through
   * {@code m_corpus.getFeature}, and the topic is read from {@code d.m_topicAssignment} at
   * {@code d.m_positionInDoc[i]}. A tab-terminated {@code featureName:topic} pair is written only
   * when the feature index equals the word stored at that position in {@code d.m_words}.
   *
   * <p>If the output file cannot be opened, the stack trace is printed and nothing is written.
   *
   * @param d the child document whose assignments are dumped
   */
  public void printChildTopicAssignment(_ChildDoc2 d) {
    String topicAssignmentfile =
        "./data/results/0108_9/childTopicAssignment/topicAssignment_" + d.getName() + "_.txt";

    // try-with-resources closes the writer even if an index lookup below throws; the previous
    // code only closed it on the normal path.
    try (PrintWriter pw = new PrintWriter(new File(topicAssignmentfile))) {
      for (int i = 0; i < d.m_index.length; i++) {
        int index = d.m_index[i];
        String featureName = m_corpus.getFeature(index);

        int positionInDoc = d.m_positionInDoc[i];
        int topic = d.m_topicAssignment[positionInDoc];

        // Only report entries whose feature index agrees with the word at that position.
        if (index == d.m_words[positionInDoc]) {
          pw.print(featureName + ":" + topic + "\t");
        }
      }
      pw.flush();
    } catch (FileNotFoundException e) {
      // Kept as best-effort reporting: the output file (or its directory) could not be opened.
      e.printStackTrace();
    }
  }
コード例 #4
0
  /**
   * Resamples the (x indicator, topic) pair of every word position in a child document.
   *
   * <p>For each position the current assignment is removed from the document counters
   * ({@code d.m_xTopicSstat}, {@code d.m_xSstat}) and, when {@code m_collectCorpusStats} is set,
   * from the corpus-level child counters. Unnormalized weights are then computed for every topic
   * under {@code x = 1} ("specific", using the child's own counts) and under {@code x = 0}
   * ("background", additionally using the parent's {@code m_sstat} scaled by {@code m_mu} and the
   * parent's length). One (x, topic) pair is drawn proportionally to those weights using
   * {@code m_rand}, stored back into {@code d.m_xIndicator} / {@code d.m_topicAssignment}, and the
   * counters are incremented for the new assignment.
   *
   * @param d the child document to resample; must have a parent document when it contains words
   * @throws NullPointerException if {@code d} has at least one word but {@code d.m_parentDoc2} is
   *     {@code null}; raised before any counter is modified
   */
  void sampleChildDocTopic(_ChildDoc2 d) {
    final int wordCount = d.m_words.length;
    if (wordCount == 0) {
      return;
    }

    // Fail fast: the previous code printed this message and then dereferenced the null parent
    // anyway, after it had already decremented the counters for the first word.
    if (d.m_parentDoc2 == null) {
      String message = "null parent in child doc" + d.getName();
      System.out.println(message);
      throw new NullPointerException(message);
    }

    // Nothing in the loop below touches the parent, so its length is read once.
    final double parentDocLen = d.m_parentDoc2.getTotalDocLength();

    // Row 0 holds the background (x = 0) weights, row 1 the specific (x = 1) weights.
    double[][] xTopicProb = new double[2][number_of_topics];
    // Word-given-topic factor; identical for both x values, so it is computed once per topic.
    double[] wordGivenTopic = new double[number_of_topics];

    for (int i = 0; i < wordCount; i++) {
      final int wid = d.m_words[i];
      final int oldTopic = d.m_topicAssignment[i];
      final int oldX = d.m_xIndicator[i];

      // Take the current assignment out of the counts before computing the weights.
      d.m_xTopicSstat[oldX][oldTopic]--;
      d.m_xSstat[oldX]--;
      if (m_collectCorpusStats) {
        m_childWordTopicSstat[oldTopic][wid]--;
        m_childSstat[oldTopic]--;
      }

      double normalizedProb = 0;

      // p(z = k, x = 1) from specific
      final double specificWeight = (m_gamma[1] + d.m_xSstat[1]);
      for (int k = 0; k < number_of_topics; k++) {
        wordGivenTopic[k] =
            (d_beta + m_parentWordTopicSstat[k][wid] + m_childWordTopicSstat[k][wid])
                / (d_beta * vocabulary_size + m_parentSstat[k] + m_childSstat[k]);
        double topicGivenDoc =
            (d.m_xTopicSstat[1][k] + d_alpha) / (number_of_topics * d_alpha + d.m_xSstat[1]);
        xTopicProb[1][k] = wordGivenTopic[k] * topicGivenDoc * specificWeight;
        normalizedProb += xTopicProb[1][k];
      }

      // p(z = k, x = 0) from background
      final double backgroundWeight = (m_gamma[0] + d.m_xSstat[0]);
      for (int k = 0; k < number_of_topics; k++) {
        double topicGivenDoc =
            (d_alpha + m_mu * d.m_parentDoc2.m_sstat[k] / parentDocLen + d.m_xTopicSstat[0][k])
                / (number_of_topics * d_alpha + m_mu + d.m_xSstat[0]);
        xTopicProb[0][k] = wordGivenTopic[k] * topicGivenDoc * backgroundWeight;
        normalizedProb += xTopicProb[0][k];
      }

      // Draw a point in [0, total) and walk the table (x = 0 row first) until it is used up.
      double prob = normalizedProb * m_rand.nextDouble();
      int samplingX = 0;
      int samplingTopic = 0;
      search:
      for (samplingX = 0; samplingX < m_gamma.length; samplingX++) {
        for (samplingTopic = 0; samplingTopic < number_of_topics; samplingTopic++) {
          prob -= xTopicProb[samplingX][samplingTopic];
          if (prob <= 0) {
            break search;
          }
        }
      }

      // If rounding left a positive remainder after the whole table, fall back to the last cell.
      if (samplingX == xTopicProb.length) {
        samplingX--;
      }
      if (samplingTopic == number_of_topics) {
        samplingTopic--;
      }

      d.m_topicAssignment[i] = samplingTopic;
      d.m_xIndicator[i] = samplingX;

      // Add the new assignment back into the counts.
      d.m_xTopicSstat[samplingX][samplingTopic]++;
      d.m_xSstat[samplingX]++;
      if (m_collectCorpusStats) {
        m_childWordTopicSstat[samplingTopic][wid]++;
        m_childSstat[samplingTopic]++;
      }
    }
  }