/**
 * Writes the topic-distribution similarity between each parent document,
 * its child documents, and the parent's individual sentences to a file.
 * Output per parent: childName:docSimilarity:stnSimilarity_0:stnSimilarity_1:...
 */
public void discoverSpecificComments() {
	System.out.println("topic similarity");
	String fileName = "./data/results/0108_9/topicSimilarity.txt";

	try {
		PrintWriter pw = new PrintWriter(new File(fileName));

		for (_Doc doc : m_trainSet) {
			if (!(doc instanceof _ParentDoc2))
				continue;

			_ParentDoc2 pDoc = (_ParentDoc2) doc;
			pw.print(pDoc.getName() + "\t");

			for (_ChildDoc2 cDoc : pDoc.m_childDocs) {
				pw.print(cDoc.getName() + ":");

				// similarity between the whole parent document and the child
				double docTopicSimilarity = computeSimilarity(pDoc.m_topics, cDoc.m_topics);
				pw.print(docTopicSimilarity);

				// similarity between each parent sentence and the child
				for (int i = 0; i < pDoc.m_sentenceMap.size(); i++) {
					_Stn stnObj = pDoc.m_sentenceMap.get(i);
					if (stnObj.m_stnLength == 0) {
						// some sentences are normalized into zero-length sentences
						pw.print(":0");
						continue;
					}
					double stnTopicSimilarity = computeSimilarity(stnObj.m_topics, cDoc.m_topics);
					pw.print(":" + stnTopicSimilarity);
				}
				pw.print("\t");
			}
			pw.println();
		}

		pw.flush();
		pw.close();
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	}
}
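/*
 * computeSimilarity() is called above but not defined in this section. A
 * minimal sketch of one plausible implementation follows, assuming it returns
 * the cosine similarity between two topic-proportion vectors; the actual
 * project may use a different measure (e.g. KL divergence), so treat this as
 * an illustrative stand-in, not the original code.
 */
protected double computeSimilarity(double[] topic1, double[] topic2) {
	double dot = 0, norm1 = 0, norm2 = 0;
	for (int k = 0; k < topic1.length; k++) {
		dot += topic1[k] * topic2[k];
		norm1 += topic1[k] * topic1[k];
		norm2 += topic2[k] * topic2[k];
	}
	if (norm1 == 0 || norm2 == 0)
		return 0; // guard against empty (all-zero) distributions
	return dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}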
/**
 * Computes the log-likelihood of a child document under the current sampling
 * state. For each word and topic, term1 is the (unsmoothed) topic-word
 * probability pooled over parent and child counts, and term2 is the child's
 * topic proportion pooled over both x-indicator groups.
 */
public double calChildLogLikelihood2(_ChildDoc2 cDoc) {
	double likelihood = 0.0;

	// an empty document contributes nothing (also avoids division by zero)
	if (cDoc.getTotalDocLength() == 0)
		return 0;

	double docLength = cDoc.getTotalDocLength();
	double eps = 1e-9; // smoothing to avoid log(0)

	for (int n = 0; n < cDoc.getTotalDocLength(); n++) {
		int wid = cDoc.m_words[n];
		for (int k = 0; k < number_of_topics; k++) {
			double term1 = (m_parentWordTopicSstat[k][wid] + m_childWordTopicSstat[k][wid])
					/ (double) (m_parentSstat[k] + m_childSstat[k]);
			double term2 = (cDoc.m_xTopicSstat[0][k] + cDoc.m_xTopicSstat[1][k]) / docLength;

			if (term1 == 0)
				term1 = eps;
			if (term2 == 0)
				term2 = eps;

			likelihood += Math.log(term1) + Math.log(term2);
		}
	}
	return likelihood;
}
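/*
 * Note: the method above adds log(term1 * term2) for every topic of every
 * token. A more conventional per-token measure is the mixture likelihood
 * log( sum_k p(w|k) * p(k|d) ). A minimal sketch of that variant follows,
 * reusing the same sufficient statistics; it is offered as an alternative
 * formulation, not necessarily what the original model intends.
 */
public double calChildMixtureLogLikelihood(_ChildDoc2 cDoc) {
	if (cDoc.getTotalDocLength() == 0)
		return 0;
	double likelihood = 0.0;
	double docLength = cDoc.getTotalDocLength();
	for (int n = 0; n < cDoc.getTotalDocLength(); n++) {
		int wid = cDoc.m_words[n];
		double wordProb = 0.0;
		for (int k = 0; k < number_of_topics; k++) {
			double pWordGivenTopic = (m_parentWordTopicSstat[k][wid] + m_childWordTopicSstat[k][wid])
					/ (double) (m_parentSstat[k] + m_childSstat[k]);
			double pTopicGivenDoc = (cDoc.m_xTopicSstat[0][k] + cDoc.m_xTopicSstat[1][k]) / docLength;
			wordProb += pWordGivenTopic * pTopicGivenDoc;
		}
		likelihood += Math.log(wordProb + 1e-9); // smoothed to avoid log(0)
	}
	return likelihood;
}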
/**
 * Writes the per-word topic assignments of a child document to a file,
 * one "featureName:topic" pair per token.
 */
public void printChildTopicAssignment(_ChildDoc2 d) {
	String topicAssignmentfile = "./data/results/0108_9/childTopicAssignment/topicAssignment_"
			+ d.getName() + "_.txt";

	try {
		PrintWriter pw = new PrintWriter(new File(topicAssignmentfile));

		for (int i = 0; i < d.m_index.length; i++) {
			int index = d.m_index[i];
			String featureName = m_corpus.getFeature(index);
			int positionInDoc = d.m_positionInDoc[i];
			int topic = d.m_topicAssignment[positionInDoc];

			// sanity check: the feature index must match the word at this position
			if (index == d.m_words[positionInDoc])
				pw.print(featureName + ":" + topic + "\t");
		}

		pw.flush();
		pw.close();
	} catch (FileNotFoundException e) {
		e.printStackTrace();
	}
}
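/*
 * PrintWriter(File) throws FileNotFoundException when the target directory
 * does not exist. If ./data/results/0108_9/childTopicAssignment/ is not
 * guaranteed to be created elsewhere, a guard like the following sketch
 * (a hypothetical helper, not in the original code) avoids silently losing
 * output to the printed stack trace. Call it with the file path before
 * constructing the PrintWriter.
 */
private static void ensureParentDirExists(String filePath) {
	File parent = new File(filePath).getParentFile();
	if (parent != null && !parent.exists())
		parent.mkdirs(); // creates intermediate directories as needed
}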
/**
 * Gibbs-samples a topic (tid) and an x-indicator (xid) jointly for every word
 * in a child document. x = 1 draws the topic from the child-specific
 * proportions; x = 0 draws it from the background proportions smoothed toward
 * the parent document's topic counts (weighted by m_mu).
 */
void sampleChildDocTopic(_ChildDoc2 d) {
	int wid, tid, xid;
	double[][] xTopicProb = new double[2][number_of_topics];
	double prob;
	double normalizedProb;

	for (int i = 0; i < d.m_words.length; i++) {
		normalizedProb = 0;
		wid = d.m_words[i];
		tid = d.m_topicAssignment[i];
		xid = d.m_xIndicator[i];

		// remove the current assignment from the sufficient statistics
		d.m_xTopicSstat[xid][tid]--;
		d.m_xSstat[xid]--;
		if (m_collectCorpusStats) {
			m_childWordTopicSstat[tid][wid]--;
			m_childSstat[tid]--;
		}

		// p(z = tid, x = 1): topic drawn from the child-specific distribution
		for (tid = 0; tid < number_of_topics; tid++) {
			double term1 = (d_beta + m_parentWordTopicSstat[tid][wid] + m_childWordTopicSstat[tid][wid])
					/ (d_beta * vocabulary_size + m_parentSstat[tid] + m_childSstat[tid]);
			double term2 = (d.m_xTopicSstat[1][tid] + d_alpha)
					/ (number_of_topics * d_alpha + d.m_xSstat[1]);
			// the normalizer of the x-indicator prior is shared by both x
			// values, so it cancels and only the numerator is needed
			double term3 = m_gamma[1] + d.m_xSstat[1];

			xTopicProb[1][tid] = term1 * term2 * term3;
			normalizedProb += xTopicProb[1][tid];
		}

		if (d.m_parentDoc2 == null)
			System.out.println("null parent in child doc " + d.getName());
		double parentDocLen = d.m_parentDoc2.getTotalDocLength();

		// p(z = tid, x = 0): topic drawn from the background distribution,
		// smoothed toward the parent document's topic proportions
		for (tid = 0; tid < number_of_topics; tid++) {
			double term1 = (d_beta + m_parentWordTopicSstat[tid][wid] + m_childWordTopicSstat[tid][wid])
					/ (d_beta * vocabulary_size + m_parentSstat[tid] + m_childSstat[tid]);
			double term2 = (d_alpha + m_mu * d.m_parentDoc2.m_sstat[tid] / parentDocLen + d.m_xTopicSstat[0][tid])
					/ (number_of_topics * d_alpha + m_mu + d.m_xSstat[0]);
			double term3 = m_gamma[0] + d.m_xSstat[0];

			xTopicProb[0][tid] = term1 * term2 * term3;
			normalizedProb += xTopicProb[0][tid];
		}

		// inverse-CDF draw from the joint (x, topic) distribution
		boolean finishLoop = false;
		prob = normalizedProb * m_rand.nextDouble();
		for (xid = 0; xid < m_gamma.length; xid++) {
			for (tid = 0; tid < number_of_topics; tid++) {
				prob -= xTopicProb[xid][tid];
				if (prob <= 0) {
					finishLoop = true;
					break;
				}
			}
			if (finishLoop)
				break;
		}
		// guard against falling off the end due to floating-point round-off
		if (xid == 2)
			xid--;
		if (tid == number_of_topics)
			tid--;

		// record the new assignment and restore the sufficient statistics
		d.m_topicAssignment[i] = tid;
		d.m_xIndicator[i] = xid;
		d.m_xTopicSstat[xid][tid]++;
		d.m_xSstat[xid]++;
		if (m_collectCorpusStats) {
			m_childWordTopicSstat[tid][wid]++;
			m_childSstat[tid]++;
		}
	}
}
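/*
 * The nested draw above is an inverse-CDF sample from unnormalized weights.
 * A minimal standalone sketch of the same technique (a hypothetical helper,
 * not part of the original class; requires java.util.Random) for a flat
 * weight array:
 */
private int sampleFromUnnormalized(double[] weights, java.util.Random rand) {
	double total = 0;
	for (double w : weights)
		total += w;
	// scale a uniform draw by the total so the weights need not be normalized
	double threshold = total * rand.nextDouble();
	for (int i = 0; i < weights.length; i++) {
		threshold -= weights[i];
		if (threshold <= 0)
			return i;
	}
	return weights.length - 1; // floating-point round-off fallback
}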