/**
 * Logs the per-polarity tweet counts of a corpus file.
 *
 * <p>The counts come from {@code TwitterCorpusStat.sentimentFreq}; each one is looked up by its
 * {@code OMTweet.POLARITY_*} key and logged at info level next to the matching
 * {@code OMTweet.POLARITY_STR_*} label. A key that is absent from the map is logged as
 * {@code null}.
 *
 * @param corpusFile corpus file handed to {@code TwitterCorpusStat.sentimentFreq}
 * @param fieldDelim field delimiter handed to {@code TwitterCorpusStat.sentimentFreq}
 * @param fields field layout handed to {@code TwitterCorpusStat.sentimentFreq}
 * @throws UnsupportedEncodingException propagated from the frequency computation
 * @throws FileNotFoundException propagated from the frequency computation
 */
private void printCorpusStat(String corpusFile, String fieldDelim, int[] fields)
    throws UnsupportedEncodingException, FileNotFoundException {
  final String rule = "+-------------------------------------------------------------+";
  final String suffix = " tweets: ";
  final Map<Integer, Integer> counts =
      TwitterCorpusStat.sentimentFreq(corpusFile, fieldDelim, fields);

  // Group 1: positive / negative / neutral.
  logger.info(rule);
  logger.info(OMTweet.POLARITY_STR_POSITIVE + suffix + counts.get(OMTweet.POLARITY_POSITIVE));
  logger.info(OMTweet.POLARITY_STR_NEGATIVE + suffix + counts.get(OMTweet.POLARITY_NEGATIVE));
  logger.info(OMTweet.POLARITY_STR_NEUTRAL + suffix + counts.get(OMTweet.POLARITY_NEUTRAL));

  // Group 2: subjective / objective.
  logger.info(rule);
  logger.info(OMTweet.POLARITY_STR_SUBJECTIVE + suffix + counts.get(OMTweet.POLARITY_SUBJECTIVE));
  logger.info(OMTweet.POLARITY_STR_OBJECTIVE + suffix + counts.get(OMTweet.POLARITY_OBJECTIVE));

  // Group 3: tweets without a specified polarity.
  logger.info(rule);
  logger.info(
      OMTweet.POLARITY_STR_NOT_SPECIFIED + suffix + counts.get(OMTweet.POLARITY_NOT_SPECIFIED));
  logger.info(rule);
}
/**
 * Copies a sentiment-balanced subsample of the corpus in {@code file1} to {@code file2}.
 *
 * <p>The positive, negative and neutral counts are taken from
 * {@code TwitterCorpusStat.sentimentFreq}; a count missing from that map is treated as 0.
 * Positive and negative are reduced to the same size, and the neutral class is reduced to the
 * size of positive plus negative. If positive plus negative would exceed the neutral count, both
 * are reduced further (rounding up) so that their sum does not exceed it. Which tweets of a
 * reduced class survive is decided by {@code randomIndices}, seeded with the current time, so
 * the output differs between runs.
 *
 * <p>Tweets whose polarity is none of positive, negative or neutral are skipped: they are not
 * part of the counts above and must not advance any per-class position.
 *
 * @param file1 corpus file to read
 * @param fieldDelim1 field delimiter of {@code file1}
 * @param fields1 field layout of {@code file1}
 * @param file2 corpus file to write
 * @param fieldDelim2 field delimiter of {@code file2}
 * @param fields2 field layout of {@code file2}
 * @param append passed through to the corpus writer
 * @throws IOException propagated from reading or writing the corpus files
 * @throws IllegalStateException if positive plus negative still exceeds neutral after balancing
 */
private void balanceSentiment(
    String file1,
    String fieldDelim1,
    int[] fields1,
    String file2,
    String fieldDelim2,
    int[] fields2,
    boolean append)
    throws IOException {
  Map<Integer, Integer> sentiFreq = TwitterCorpusStat.sentimentFreq(file1, fieldDelim1, fields1);

  // Slot 0 = positive, 1 = negative, 2 = neutral (the same slots are used in the switch below).
  int[] polarities = {
    OMTweet.POLARITY_POSITIVE, OMTweet.POLARITY_NEGATIVE, OMTweet.POLARITY_NEUTRAL
  };
  int[] freq = new int[3];
  for (int i = 0; i < 3; i++) {
    Integer count = sentiFreq.get(polarities[i]);
    // A missing key would otherwise fail on unboxing.
    freq[i] = (count == null) ? 0 : count;
  }

  // indices[c] == null means "keep every tweet of class c"; otherwise it holds the positions
  // (within class c) of the tweets to keep.
  int[][] indices = new int[3][];

  // How many tweets each of positive/negative must give up so that, once both are cut to the
  // smaller class, their sum does not exceed the neutral count. Rounded up: rounding down would
  // leave the sum one above the neutral count for an odd surplus.
  int sbjDiff = 0;
  int doubledMin = Math.min(freq[0], freq[1]) * 2;
  if (doubledMin > freq[2]) {
    sbjDiff = (doubledMin - freq[2] + 1) / 2;
  }

  if (freq[0] != freq[1] || sbjDiff > 0) {
    // senti1 is the larger (or equal) class, senti2 the smaller one.
    int senti1 = 0;
    int senti2 = 1;
    if (freq[0] < freq[1]) {
      senti1 = 1;
      senti2 = 0;
    }
    indices[senti1] =
        randomIndices(freq[senti1], freq[senti2] - sbjDiff, System.currentTimeMillis());
    freq[senti1] = freq[senti2] - sbjDiff;
    if (sbjDiff > 0) {
      indices[senti2] =
          randomIndices(freq[senti2], freq[senti2] - sbjDiff, System.currentTimeMillis());
      freq[senti2] = freq[senti2] - sbjDiff;
    }
  }

  int sbj = freq[0] + freq[1];
  if (sbj < freq[2]) {
    indices[2] = randomIndices(freq[2], sbj, System.currentTimeMillis());
    freq[2] = sbj;
  } else if (sbj > freq[2]) {
    throw new IllegalStateException(
        "Balancing failed: positive + negative = " + sbj + " exceeds neutral = " + freq[2]);
  }

  // idx[c]: position of the current tweet within class c.
  // cursor[c]: next unread entry of indices[c].
  int[] idx = new int[3];
  int[] cursor = new int[3];

  OMTwitterCorpusFileReader reader = new OMTwitterCorpusFileReader(file1, fieldDelim1, fields1);
  try {
    OMTwitterCorpusFileWriter writer =
        new OMTwitterCorpusFileWriter(file2, fieldDelim2, fields2, append);
    try {
      while (reader.hasNext()) {
        OMTweet tweet = reader.next();
        int senti;
        switch (tweet.getPolarity()) {
          case OMTweet.POLARITY_POSITIVE:
            senti = 0;
            break;
          case OMTweet.POLARITY_NEGATIVE:
            senti = 1;
            break;
          case OMTweet.POLARITY_NEUTRAL:
            senti = 2;
            break;
          default:
            // Not one of the balanced classes: skip it so it is not counted under whichever
            // class the previous tweet had.
            continue;
        }
        if (indices[senti] == null) {
          writer.write(tweet);
        } else if (cursor[senti] < indices[senti].length
            && indices[senti][cursor[senti]] == idx[senti]) {
          // NOTE(review): the single forward cursor assumes randomIndices returns its indices
          // in ascending order - confirm against its implementation.
          writer.write(tweet);
          cursor[senti]++;
        }
        idx[senti]++;
      }
    } finally {
      writer.close();
    }
  } finally {
    reader.close();
  }
}