// Compute per-sentence emission probabilities (in log-space) for each topic.
void ComputeEmissionProbsForDoc(_Doc d) {
    for (int i = 0; i < d.getSenetenceSize(); i++) {
        _Stn stn = d.getSentence(i);
        Arrays.fill(emission[i], 0);

        int start = 0, end = this.number_of_topics;
        if (i == 0 && d.getSourceType() == 2) {
            // The first sentence is specially handled for NewEgg reviews:
            // restrict it to the half of the topic space that matches its
            // sentiment label by masking the other half with -infinity.
            int sentimentLabel = stn.getSentenceSenitmentLabel();
            if (sentimentLabel == 0) {
                // positive sentiment: only the first half of the topics
                end = this.number_of_topics / 2;
                for (int k = end; k < this.number_of_topics; k++)
                    emission[i][k] = Double.NEGATIVE_INFINITY;
            } else if (sentimentLabel == 1) {
                // negative sentiment: only the second half of the topics
                start = this.number_of_topics / 2;
                for (int k = 0; k < start; k++)
                    emission[i][k] = Double.NEGATIVE_INFINITY;
            }
        }

        for (int k = start; k < end; k++) {
            for (_SparseFeature w : stn.getFv()) {
                emission[i][k] += w.getValue() * topic_term_probabilty[k][w.getIndex()]; // all in log-space
            }
        }
    }
}
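// For reference, a standalone sketch of the quantity accumulated above:
// log p(sentence | topic k) = sum over words w of count(w) * log p(w | k),
// assuming (as in this class) the topic-word matrix stores log-probabilities.
static double logEmission(_SparseFeature[] sentence, double[] logTopicWordProbs) {
    double score = 0;
    for (_SparseFeature w : sentence)
        score += w.getValue() * logTopicWordProbs[w.getIndex()];
    return score; // equals emission[i][k] before any sentiment masking
}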
public static _SparseFeature[] createSpVct(ArrayList<HashMap<Integer, Double>> vcts) {
    HashMap<Integer, _SparseFeature> spVcts = new HashMap<Integer, _SparseFeature>();
    HashMap<Integer, Double> vPtr;
    _SparseFeature spV;

    int dim = vcts.size();
    for (int i = 0; i < dim; i++) {
        vPtr = vcts.get(i);
        if (vPtr == null || vPtr.isEmpty())
            continue; // it is possible that we are missing this dimension

        // iterate through all the features in this section
        for (Map.Entry<Integer, Double> pairs : vPtr.entrySet()) {
            int index = pairs.getKey();
            double value = pairs.getValue();
            if (spVcts.containsKey(index)) {
                spV = spVcts.get(index);
                spV.addValue(value); // increase the total value
            } else {
                spV = new _SparseFeature(index, value, dim);
                spVcts.put(index, spV);
            }
            spV.setValue4Dim(value, i);
        }
    }

    int size = spVcts.size();
    _SparseFeature[] resultVct = spVcts.values().toArray(new _SparseFeature[size]);
    Arrays.sort(resultVct);
    return resultVct;
}
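// A minimal usage sketch for createSpVct (feature ids and counts hypothetical):
// one term-frequency map per document section, collapsed into a single sparse
// vector that keeps both the total and the per-section values.
static _SparseFeature[] createSpVctExample() {
    HashMap<Integer, Double> title = new HashMap<Integer, Double>();
    title.put(3, 1.0); // feature 3 occurs once in the title
    HashMap<Integer, Double> body = new HashMap<Integer, Double>();
    body.put(3, 2.0);  // feature 3 occurs twice in the body
    body.put(7, 1.0);
    ArrayList<HashMap<Integer, Double>> sections = new ArrayList<HashMap<Integer, Double>>();
    sections.add(title);
    sections.add(body);
    // result is sorted by index, holds {3: 3.0, 7: 1.0}, and preserves the
    // per-section breakdown through setValue4Dim
    return createSpVct(sections);
}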
// Dot product of a dense (e.g., random) vector and a document's sparse vector.
public static double dotProduct(double[] vct, _SparseFeature[] sf) {
    if (sf[sf.length - 1].getIndex() >= vct.length) // the largest index must fall inside the dense vector
        System.err.println("Error in computing dot product between a sparse vector and a full vector");

    double value = 0;
    for (_SparseFeature fv : sf)
        value += vct[fv.getIndex()] * fv.getValue();
    return value;
}
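// Hypothetical usage (uses java.util.Random): project a document onto a random
// Gaussian direction, e.g., one component of a random-projection hash; vctSize
// is assumed to exceed the largest feature index in docVct.
static double randomProjection(_SparseFeature[] docVct, int vctSize, long seed) {
    Random rng = new Random(seed);
    double[] randVct = new double[vctSize];
    for (int i = 0; i < randVct.length; i++)
        randVct[i] = rng.nextGaussian();
    return dotProduct(randVct, docVct);
}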
public static Feature[] createLibLinearFV(HashMap<Integer, Double> spVct) {
    Feature[] node = new Feature[spVct.size()];
    int fid = 0;
    for (_SparseFeature fv : createSpVct(spVct))
        node[fid++] = new FeatureNode(1 + fv.getIndex(), fv.getValue()); // svm's feature index starts from 1
    return node;
}
// L2 norm: sqrt(sum of fsValue*fsValue); the denominator used by
// L2Normalization below (fsValue / sqrt(sum of fsValue*fsValue)).
public static double sumOfFeaturesL2(_SparseFeature[] fs) {
    if (fs == null)
        return 0;

    double sum = 0;
    for (_SparseFeature feature : fs) {
        double value = feature.getValue();
        sum += value * value;
    }
    return Math.sqrt(sum);
}
// Project a sparse vector onto the feature subset kept in the filter; added by Lin.
public static _SparseFeature[] projectSpVct(_SparseFeature[] fv, HashMap<Integer, String> filter) {
    ArrayList<_SparseFeature> pFv = new ArrayList<_SparseFeature>();
    for (_SparseFeature f : fv) {
        if (filter.containsKey(f.getIndex()))
            pFv.add(new _SparseFeature(f.getIndex(), f.getValue()));
    }

    if (pFv.isEmpty())
        return null;
    else
        return pFv.toArray(new _SparseFeature[pFv.size()]);
}
public static void L2Normalization(_SparseFeature[] fs) {
    double norm = sumOfFeaturesL2(fs);
    if (norm > 0) {
        for (_SparseFeature f : fs)
            f.setValue(f.getValue() / norm);
    } else {
        for (_SparseFeature f : fs)
            f.setValue(0.0);
    }
}
public static Feature[] createLibLinearFV(_SparseFeature[] spVct, int fSize) {
    Feature[] node;
    if (fSize > 0) // include the bias term at the end
        node = new Feature[1 + spVct.length];
    else // ignore the bias term
        node = new Feature[spVct.length];

    int fid = 0;
    for (_SparseFeature fv : spVct)
        node[fid++] = new FeatureNode(1 + fv.getIndex(), fv.getValue()); // svm's feature index starts from 1
    if (fSize > 0)
        node[fid] = new FeatureNode(1 + fSize, 1.0);
    return node;
}
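// Usage sketch: convert a document vector for liblinear. With fSize equal to
// the vocabulary size, a constant bias feature is appended at index fSize + 1
// so the classifier can learn an intercept; fSize = 0 skips the bias term.
// (Training itself goes through liblinear's usual API, not shown here.)
static void libLinearConversionSketch(_SparseFeature[] docVct, int vocabSize) {
    Feature[] withBias = createLibLinearFV(docVct, vocabSize); // bias at index vocabSize + 1
    Feature[] noBias = createLibLinearFV(docVct, 0);           // features only
    // withBias.length == noBias.length + 1
}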
public static _SparseFeature[] MergeSpVcts(ArrayList<_SparseFeature[]> vcts) {
    HashMap<Integer, Double> vct = new HashMap<Integer, Double>();
    for (_SparseFeature[] fv : vcts) {
        for (_SparseFeature f : fv) {
            int x = f.getIndex();
            if (vct.containsKey(x))
                vct.put(x, vct.get(x) + f.getValue());
            else
                vct.put(x, f.getValue());
        }
    }
    return Utils.createSpVct(vct);
}
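// Usage sketch: build a document-level vector by summing the per-sentence
// vectors over shared feature indices, reusing the sentence accessors seen
// in ComputeEmissionProbsForDoc and accPhiStat above.
static _SparseFeature[] docVctFromSentences(_Doc d) {
    ArrayList<_SparseFeature[]> stnVcts = new ArrayList<_SparseFeature[]>();
    for (int t = 0; t < d.getSenetenceSize(); t++)
        stnVcts.add(d.getSentence(t).getFv());
    return MergeSpVcts(stnVcts);
}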
// Accumulate the word-topic sufficient statistics: for every word in every
// sentence, add its frequency weighted by the marginal probability of the
// topic assignment, summed over the `constant` copies of each topic in p_dwzpsi.
void accPhiStat(_Doc d) {
    double prob;
    for (int t = 0; t < d.getSenetenceSize(); t++) {
        _Stn s = d.getSentence(t);
        for (_SparseFeature f : s.getFv()) {
            int wid = f.getIndex();
            double v = f.getValue(); // frequency
            for (int i = 0; i < number_of_topics; i++) {
                prob = this.p_dwzpsi[t][i];
                for (int j = 1; j < constant; j++)
                    prob += this.p_dwzpsi[t][i + j * number_of_topics];
                this.sstat[i][wid] += v * prob;
            }
        }
    }
}
// Calculate the similarity (dot product) between two sparse vectors by a
// two-pointer merge over their sorted index lists; null input yields the
// minimal similarity of 0.
public static double calculateSimilarity(_SparseFeature[] spVct1, _SparseFeature[] spVct2) {
    if (spVct1 == null || spVct2 == null)
        return 0;

    double similarity = 0;
    int pointer1 = 0, pointer2 = 0;
    while (pointer1 < spVct1.length && pointer2 < spVct2.length) {
        _SparseFeature temp1 = spVct1[pointer1];
        _SparseFeature temp2 = spVct2[pointer2];
        if (temp1.getIndex() == temp2.getIndex()) {
            similarity += temp1.getValue() * temp2.getValue();
            pointer1++;
            pointer2++;
        } else if (temp1.getIndex() > temp2.getIndex())
            pointer2++;
        else
            pointer1++;
    }
    return similarity;
}
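// Note: calculateSimilarity returns a raw dot product, so on inputs that have
// first been passed through L2Normalization above, the result is exactly the
// cosine similarity. A minimal sketch (normalizes its arguments in place):
static double cosineSketch(_SparseFeature[] spVct1, _SparseFeature[] spVct2) {
    L2Normalization(spVct1);
    L2Normalization(spVct2);
    return calculateSimilarity(spVct1, spVct2);
}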
// Jaccard similarity over the two index sets: |A ∩ B| / |A ∪ B|, where
// |A ∪ B| = |A| + |B| - |A ∩ B|; null or empty input yields the minimal value 0.
public static double jaccard(_SparseFeature[] spVct1, _SparseFeature[] spVct2) {
    if (spVct1 == null || spVct2 == null)
        return 0;

    double overlap = 0;
    int pointer1 = 0, pointer2 = 0;
    while (pointer1 < spVct1.length && pointer2 < spVct2.length) {
        _SparseFeature temp1 = spVct1[pointer1];
        _SparseFeature temp2 = spVct2[pointer2];
        if (temp1.getIndex() == temp2.getIndex()) {
            overlap++;
            pointer1++;
            pointer2++;
        } else if (temp1.getIndex() > temp2.getIndex())
            pointer2++;
        else
            pointer1++;
    }

    double union = spVct1.length + spVct2.length - overlap;
    return union > 0 ? overlap / union : 0;
}
// x_i - x_j
public static _SparseFeature[] diffVector(_SparseFeature[] spVcti, _SparseFeature[] spVctj) {
    // first deal with the special cases
    if (spVcti == null && spVctj == null)
        return null;
    else if (spVctj == null)
        return spVcti;
    else if (spVcti == null)
        return negSpVct(spVctj);

    ArrayList<_SparseFeature> vectorList = new ArrayList<_SparseFeature>();
    int i = 0, j = 0;
    double fv;
    while (i < spVcti.length && j < spVctj.length) {
        _SparseFeature fi = spVcti[i], fj = spVctj[j];
        if (fi.getIndex() == fj.getIndex()) {
            fv = fi.getValue() - fj.getValue();
            if (Math.abs(fv) > Double.MIN_VALUE) // otherwise it is too small
                vectorList.add(new _SparseFeature(fi.getIndex(), fv));
            i++;
            j++;
        } else if (fi.getIndex() > fj.getIndex()) {
            vectorList.add(new _SparseFeature(fj.getIndex(), -fj.getValue()));
            j++;
        } else {
            vectorList.add(new _SparseFeature(fi.getIndex(), fi.getValue()));
            i++;
        }
    }

    while (i < spVcti.length) {
        _SparseFeature fi = spVcti[i];
        vectorList.add(new _SparseFeature(fi.getIndex(), fi.getValue()));
        i++;
    }

    while (j < spVctj.length) {
        _SparseFeature fj = spVctj[j];
        vectorList.add(new _SparseFeature(fj.getIndex(), -fj.getValue()));
        j++;
    }

    return vectorList.toArray(new _SparseFeature[vectorList.size()]);
}
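// Hypothetical usage: pairwise-difference features for a ranking setup where
// document i should be scored above document j; weightVct is a dense weight
// vector (all names here hypothetical).
static double pairwiseMarginSketch(double[] weightVct, _SparseFeature[] xi, _SparseFeature[] xj) {
    _SparseFeature[] xDiff = diffVector(xi, xj); // x_i - x_j, sorted by index
    if (xDiff == null)
        return 0;
    return dotProduct(weightVct, xDiff); // > 0 when i outranks j under weightVct
}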
// L1 norm: sum(abs(fsValue)); the denominator for L1 normalization
// (fsValue / sum(abs(fsValue))).
public static double sumOfFeaturesL1(_SparseFeature[] fs) {
    if (fs == null)
        return 0;

    double sum = 0;
    for (_SparseFeature feature : fs)
        sum += Math.abs(feature.getValue());
    return sum;
}
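// A sketch of the L1 counterpart to L2Normalization above (hedged: this helper
// is not part of the original class, shown only to illustrate how
// sumOfFeaturesL1 serves as the normalization denominator).
static void L1NormalizationSketch(_SparseFeature[] fs) {
    double norm = sumOfFeaturesL1(fs);
    if (norm > 0) {
        for (_SparseFeature f : fs)
            f.setValue(f.getValue() / norm);
    }
}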