/**
 * Copies the corpus in {@code file1} to {@code file2}, keeping only a random subset of tweets so
 * that the positive and negative classes have the same size and the neutral class is as large
 * as the two of them together.
 *
 * <p>Class sizes are taken from {@code TwitterCorpusStat.sentimentFreq}; a polarity that is
 * missing from that map is treated as having zero tweets. Tweets whose polarity is none of
 * positive, negative or neutral are not copied.
 *
 * @param file1 input corpus file
 * @param fieldDelim1 field delimiter handed to the input reader
 * @param fields1 field layout handed to the input reader
 * @param file2 output corpus file
 * @param fieldDelim2 field delimiter handed to the output writer
 * @param fields2 field layout handed to the output writer
 * @param append passed through to the output writer
 * @throws IOException propagated from the corpus statistics, reader or writer
 * @throws IllegalStateException if the computed class sizes still leave more subjective than
 *     neutral tweets (internal consistency check)
 */
private void balanceSentiment(
    String file1,
    String fieldDelim1,
    int[] fields1,
    String file2,
    String fieldDelim2,
    int[] fields2,
    boolean append)
    throws IOException {
  Map<Integer, Integer> sentiFreq = TwitterCorpusStat.sentimentFreq(file1, fieldDelim1, fields1);

  // Class slots used throughout: 0 = positive, 1 = negative, 2 = neutral.
  Integer posCount = sentiFreq.get(OMTweet.POLARITY_POSITIVE);
  Integer negCount = sentiFreq.get(OMTweet.POLARITY_NEGATIVE);
  Integer neuCount = sentiFreq.get(OMTweet.POLARITY_NEUTRAL);
  int[] freq = new int[3];
  freq[0] = (posCount != null) ? posCount.intValue() : 0;
  freq[1] = (negCount != null) ? negCount.intValue() : 0;
  freq[2] = (neuCount != null) ? neuCount.intValue() : 0;

  // indices[c] == null means "keep every tweet of class c"; otherwise it lists the positions
  // (within class c, in file order) of the tweets to keep.
  int[][] indices = new int[3][];

  // If twice the smaller subjective class still exceeds the neutral class, both subjective
  // classes must shrink further by sbjDiff each. The surplus is rounded up so that an odd
  // surplus cannot leave positive + negative one larger than neutral.
  int balancedSbj = Math.min(freq[0], freq[1]) * 2;
  int sbjDiff = 0;
  if (balancedSbj > freq[2]) {
    sbjDiff = (balancedSbj - freq[2] + 1) / 2;
  }

  if (freq[0] != freq[1] || sbjDiff > 0) {
    // senti1 is the larger subjective class (positive on a tie), senti2 the other one.
    int senti1 = 0;
    int senti2 = 1;
    if (freq[0] < freq[1]) {
      senti1 = 1;
      senti2 = 0;
    }
    int target = freq[senti2] - sbjDiff;
    indices[senti1] = randomIndices(freq[senti1], target, System.currentTimeMillis());
    freq[senti1] = target;
    if (sbjDiff > 0) {
      indices[senti2] = randomIndices(freq[senti2], target, System.currentTimeMillis());
      freq[senti2] = target;
    }
  }

  int sbj = freq[0] + freq[1];
  if (sbj < freq[2]) {
    indices[2] = randomIndices(freq[2], sbj, System.currentTimeMillis());
    freq[2] = sbj;
  } else if (sbj > freq[2]) {
    throw new IllegalStateException(
        "subjective tweets (" + sbj + ") still outnumber neutral tweets (" + freq[2] + ")");
  }

  int[] idx = new int[3]; // position of the current tweet within its class
  int[] cursor = new int[3]; // next unread entry of indices[c]

  OMTwitterCorpusFileReader reader = new OMTwitterCorpusFileReader(file1, fieldDelim1, fields1);
  try {
    OMTwitterCorpusFileWriter writer =
        new OMTwitterCorpusFileWriter(file2, fieldDelim2, fields2, append);
    try {
      while (reader.hasNext()) {
        OMTweet tweet = reader.next();
        int senti;
        switch (tweet.getPolarity()) {
          case OMTweet.POLARITY_POSITIVE:
            senti = 0;
            break;
          case OMTweet.POLARITY_NEGATIVE:
            senti = 1;
            break;
          case OMTweet.POLARITY_NEUTRAL:
            senti = 2;
            break;
          default:
            senti = -1;
            break;
        }
        if (senti < 0) {
          // Not one of the three balanced classes: skip it without touching any counter, so
          // the per-class positions stay aligned with the sampled positions.
          continue;
        }

        int[] keep = indices[senti];
        if (keep == null) {
          writer.write(tweet);
        } else if (cursor[senti] < keep.length && keep[cursor[senti]] == idx[senti]) {
          // NOTE(review): this single forward pass assumes randomIndices returns the kept
          // positions in ascending order - confirm against its implementation.
          writer.write(tweet);
          cursor[senti]++;
        }
        idx[senti]++;
      }
    } finally {
      writer.close();
    }
  } finally {
    reader.close();
  }
}
@Override public void destroy() { try { Set<Entry<String, Integer>> set = map.entrySet(); ArrayList<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(); list.addAll(set); Collections.sort( list, new Comparator<Entry<String, Integer>>() { public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { return o1.getKey().compareTo(o2.getKey()); } }); ///////////////////////////////////////////// System.out.println("# NER"); for (Entry<String, Integer> e : list) { int idx = e.getValue(); double prec = (stat[idx][0] != 0) ? (double) stat[idx][2] / (double) stat[idx][0] : -1; double recall = (stat[idx][1] != 0) ? (double) stat[idx][2] / (double) stat[idx][1] : -1; double f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; // System.out.format("%02d %15s %3d/%3d=%7.4f %3d/%3d=%7.4f %7.4f\n", idx, e.getKey(), // stat[idx][2], stat[idx][0], prec, stat[idx][2], stat[idx][1], recall, f); System.out.format( "%02d\t%s\t%d\t%.4f\t%.4f\t%.4f\n", idx, e.getKey(), stat[idx][1], prec, recall, f); } System.out.println(); ///////////////////////////////////////////// System.out.format("%10s\t%12s\t%12s\t%12s\n", "Index", "Type", "Answer", "Classified"); for (int i = 0; i < list.size(); i += 3) { Entry<String, Integer> e = list.get(i); String s = e.getKey(); int idx = e.getValue(); if (idx != labelNoneIdx) { s = s.substring(0, s.lastIndexOf('_')); System.out.format( "%010d\t%12s\t%12d\t%12d\n", idx / 3, s, answerEntityCnt[idx / 3], classifiedEntityCnt[idx / 3]); } } System.out.println(); ///////////////////////////////////////////// System.out.println("# senti"); for (int i = 0; i < 3; i++) { int idx = i; double prec = (senti[idx][0] != 0) ? (double) senti[idx][2] / (double) senti[idx][0] : -1; double recall = (senti[idx][1] != 0) ? (double) senti[idx][2] / (double) senti[idx][1] : -1; double f = (prec + recall > 0) ? 
(2 * prec * recall / (prec + recall)) : -1; // System.out.format("%s\t%d\t%.4f\t%.4f\t%.4f\n", sentiStr(idx), senti[idx][1], prec, // recall, f); System.out.format( "%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", idx, sentiStr(idx), senti[idx][2], senti[idx][0], prec, senti[idx][2], senti[idx][1], recall, f); } ///////////// System.out.println("# senti: sbj & neu"); double prec = (senti[0][0] + senti[1][0] != 0) ? (double) (senti[0][2] + senti[1][2]) / (double) (senti[0][0] + senti[1][0]) : -1; double recall = (senti[0][1] + senti[1][1] != 0) ? (double) (senti[0][2] + senti[1][2]) / (double) (senti[0][1] + senti[1][1]) : -1; double f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; System.out.format( "%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", 0, sentiStr(0) + "/" + sentiStr(1), senti[0][2] + senti[1][2], senti[0][0] + senti[1][0], prec, senti[0][2] + senti[1][2], senti[0][1] + senti[1][1], recall, f); int idx = 2; prec = (senti[idx][0] != 0) ? (double) senti[idx][2] / (double) senti[idx][0] : -1; recall = (senti[idx][1] != 0) ? (double) senti[idx][2] / (double) senti[idx][1] : -1; f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; System.out.format( "%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", idx, sentiStr(idx), senti[idx][2], senti[idx][0], prec, senti[idx][2], senti[idx][1], recall, f); ////////////// // System.out.println("# senti : pos & neg"); // prec = (senti[0][0]+senti[2][0] != 0) ? (double)(senti[0][2]+senti[2][2]) / // (double)(senti[0][0]+senti[2][0]) : -1; // recall = (senti[0][1]+senti[2][1] != 0) ? (double)(senti[0][2]+senti[2][2]) / // (double)(senti[0][1]+senti[2][1]) : -1; // f = (prec + recall > 0) ? 
(2 * prec * recall / (prec + recall)) : -1; // System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", 0, // sentiStr(0)+"/"+sentiStr(2), senti[0][2]+senti[2][2], senti[0][0]+senti[2][0], prec, // senti[0][2]+senti[2][2], senti[0][1]+senti[2][1], recall, f); // // idx = 1; // prec = (senti[idx][0] != 0) ? (double)senti[idx][2] / (double)senti[idx][0] : -1; // recall = (senti[idx][1] != 0) ? (double)senti[idx][2] / (double)senti[idx][1] : -1; // f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; // System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", idx, sentiStr(idx), // senti[idx][2], senti[idx][0], prec, senti[idx][2], senti[idx][1], recall, f); evalCorpusReader.close(); } catch (Exception e) { e.printStackTrace(); } super.destroy(); }