private void balanceSentiment( String file1, String fieldDelim1, int[] fields1, String file2, String fieldDelim2, int[] fields2, boolean append) throws IOException { Map<Integer, Integer> sentiFreq = TwitterCorpusStat.sentimentFreq(file1, fieldDelim1, fields1); int[] freq = new int[3]; freq[0] = sentiFreq.get(OMTweet.POLARITY_POSITIVE); freq[1] = sentiFreq.get(OMTweet.POLARITY_NEGATIVE); freq[2] = sentiFreq.get(OMTweet.POLARITY_NEUTRAL); int[][] indices = new int[3][]; for (int i = 0; i < 3; i++) { indices[i] = null; } int sbjDiff; if ((sbjDiff = Math.min(freq[0], freq[1]) * 2) > freq[2]) { sbjDiff = (sbjDiff - freq[3]) / 2; } else { sbjDiff = 0; } if (freq[0] != freq[1] || sbjDiff > 0) { int senti1 = 0; int senti2 = 1; if (freq[0] < freq[1]) { senti1 = 1; senti2 = 0; } indices[senti1] = randomIndices(freq[senti1], freq[senti2] - sbjDiff, System.currentTimeMillis()); freq[senti1] = freq[senti2] - sbjDiff; if (sbjDiff > 0) { indices[senti2] = randomIndices(freq[senti2], freq[senti2] - sbjDiff, System.currentTimeMillis()); freq[senti2] = freq[senti2] - sbjDiff; } } int sbj = freq[0] + freq[1]; if (sbj < freq[2]) { indices[2] = randomIndices(freq[2], sbj, System.currentTimeMillis()); freq[2] = sbj; } else if (sbj > freq[2]) { throw new IllegalStateException(); } int[] idx = new int[3]; int[] cursor = new int[3]; for (int i = 0; i < 3; i++) { idx[i] = 0; cursor[i] = 0; } OMTwitterCorpusFileReader reader = new OMTwitterCorpusFileReader(file1, fieldDelim1, fields1); OMTwitterCorpusFileWriter writer = new OMTwitterCorpusFileWriter(file2, fieldDelim2, fields2, append); int senti = 0; while (reader.hasNext()) { OMTweet tweet = reader.next(); switch (tweet.getPolarity()) { case OMTweet.POLARITY_POSITIVE: senti = 0; break; case OMTweet.POLARITY_NEGATIVE: senti = 1; break; case OMTweet.POLARITY_NEUTRAL: senti = 2; break; } if (indices[senti] == null) { writer.write(tweet); } else if (cursor[senti] < indices[senti].length && indices[senti][cursor[senti]] == idx[senti]) { writer.write(tweet); cursor[senti]++; } idx[senti]++; } writer.close(); reader.close(); }
/* (non-Javadoc) * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS) */ public void processCas(CAS aCAS) throws ResourceProcessException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { logger.log(Level.SEVERE, e.getMessage()); throw new ResourceProcessException(e); } TweetAnnotation tweetAnn = (TweetAnnotation) jcas.getAnnotationIndex(TweetAnnotation.type).iterator().next(); OMTweet answerTweet = evalCorpusReader.next(); if (!answerTweet.getId().equals(tweetAnn.getId())) { logger.log( Level.SEVERE, "target corpus and evaluation corpus don't match to each other - " + answerTweet.getId() + ", " + tweetAnn.getId()); throw new ResourceProcessException(); } String[] entity = extractEntityTags(answerTweet.getText()); String classified = null; String prevClassified = null; StringBuffer sb = new StringBuffer(); try { sb.append("\n["); sb.append(answerTweet.getPolarityString()); sb.append("=>"); sb.append(tweetAnn.getPolarity()); sb.append("] "); sb.append(tweetAnn.getCoveredText()); sb.append('\n'); FSIterator<Annotation> tokenAnnIter = jcas.getAnnotationIndex(TokenAnnotation.type).iterator(); TokenAnnotation tokenAnn = null; int i = 0; int prevClassifiedIdx = labelNoneIdx; int prevAnswerIdx = labelNoneIdx; String classifiedEntityStr = ""; String answerEntityStr = ""; while (tokenAnnIter.hasNext()) { tokenAnn = (TokenAnnotation) tokenAnnIter.next(); classified = tokenAnn.getEntityLabel(); String answer = entity[i]; boolean correct = false; if (classified.equals(answer)) { correct = true; } int classifiedIdx = 0; int answerIdx = 0; try { answerIdx = map.get(answer); } catch (Exception e) { logger.log( Level.SEVERE, "wrong annotation on the evaluation corpus - tweet id: " + answerTweet.getId() + ", answerTag=" + answer); logger.log(Level.SEVERE, e.getMessage()); answerIdx = map.get(labelNone); } try { classifiedIdx = map.get(classified); } catch (Exception e) { logger.log( Level.SEVERE, "wrong annotation from the NER - tweet id: " + answerTweet.getId() + ", classifiedTag=" + classified); logger.log(Level.SEVERE, e.getMessage()); classifiedIdx = map.get(labelNone); } stat[classifiedIdx][0]++; stat[answerIdx][1]++; if (correct) { stat[classifiedIdx][2]++; } if (classifiedIdx != labelNoneIdx) { if (classifiedIdx / 3 != prevClassifiedIdx / 3) { classifiedEntityCnt[classifiedIdx / 3]++; if (prevClassifiedIdx != labelNoneIdx) { sb.append('\t'); sb.append(classifiedEntityStr); sb.append(" -> "); sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_'))); sb.append('\n'); } classifiedEntityStr = tokenAnn.getCoveredText(); } else { classifiedEntityStr += " " + tokenAnn.getCoveredText(); } } else if (prevClassifiedIdx != labelNoneIdx) { sb.append('\t'); sb.append(classifiedEntityStr); sb.append(" -> "); sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_'))); sb.append('\n'); classifiedEntityStr = ""; } prevClassifiedIdx = classifiedIdx; if (answerIdx != labelNoneIdx) { if (answerIdx / 3 != prevAnswerIdx / 3) { answerEntityCnt[answerIdx / 3]++; answerEntityStr = tokenAnn.getCoveredText(); } else { answerEntityStr += " " + tokenAnn.getCoveredText(); } } else if (prevAnswerIdx != labelNoneIdx) { answerEntityStr = ""; } prevAnswerIdx = answerIdx; prevClassified = classified; i++; } if (prevClassifiedIdx != labelNoneIdx) { sb.append('\t'); sb.append(classifiedEntityStr); sb.append(" -> "); sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_'))); sb.append('\n'); } // senti String answerSenti = answerTweet.getPolarityString(); boolean correct = false; String classifiedSenti = tweetAnn.getPolarity(); if (classifiedSenti.equals(senti)) { correct = true; } int classifiedIdx = sentiIdx(classifiedSenti); int answerIdx = sentiIdx(answerSenti); senti[classifiedIdx][0]++; senti[answerIdx][1]++; if (classifiedIdx == answerIdx) { correct = true; } if (correct) { senti[classifiedIdx][2]++; } cnt++; logger.log(Level.INFO, sb.toString()); } catch (CASRuntimeException e) { throw new ResourceProcessException(e); } }