/**
 * Condenses the derivations stored in {@code in} and appends the surviving entries to
 * {@code out}.
 *
 * <p>The input is processed in runs of consecutive lines that hold an equal word. For each run,
 * every derivation is passed through {@code normalize} and collected in a sorted set, so
 * normalized derivations that compare as equal are kept only once. One line per remaining
 * derivation is then appended to {@code out}. Per-word and overall statistics are printed to
 * standard output.
 *
 * <p>Note: when {@code in} has size 0, the overall percentage is the result of a floating-point
 * division by zero and is printed as such; no exception is raised for it.
 *
 * @param in list whose lines are read
 * @param out list that receives the reduced lines
 * @throws IOException propagated from reading {@code in} or appending to {@code out}
 */
public void reduce(BigList in, BigList out) throws IOException {
  final long totalLines = in.size();
  long wordCount = 0;
  long cursor = 0;
  while (cursor < totalLines) {
    Word current = getWord(in, cursor);

    // Gather every derivation of the current word and move the cursor to the next distinct word.
    List<Derivation> collected = new ArrayList<Derivation>();
    cursor = findEquals(cursor, in, collected);

    // The sorted set keeps only one of several normalized derivations that compare as equal.
    SortedSet<Derivation> reduced = new TreeSet<Derivation>();
    for (Derivation derivation : collected) {
      reduced.add(normalize(derivation));
    }

    int before = collected.size();
    int after = reduced.size();
    float keptPercent = (after * 100.0f) / before;
    System.out.println(
        "Word " + wordCount + ": keeping " + after + " of " + before + " = " + keptPercent + "%");
    wordCount++;

    for (Derivation derivation : reduced) {
      writeOut(out, current, derivation);
    }
  }

  System.out.println(
      "Total: keeping "
          + out.size()
          + " of "
          + in.size()
          + " = "
          + ((out.size() * 100.0f) / in.size())
          + "%");
}
/**
 * Collects the derivations of the word at {@code currentWordIndex} and of every directly
 * following line in {@code in} that holds an equal word, appending them to {@code derivations}.
 *
 * @param currentWordIndex index of the line holding the word to start from
 * @param in list the lines are read from
 * @param derivations receives the derivation of the starting word followed by the derivations of
 *     the equal words after it
 * @return the index of the first following line whose word differs, or {@code in.size()} when
 *     every remaining line holds an equal word
 * @throws IOException propagated from reading {@code in}
 */
private final long findEquals(long currentWordIndex, BigList in, List<Derivation> derivations)
    throws IOException {
  Word reference = getWord(in, currentWordIndex);
  derivations.add(reference.getDerivation());

  long total = in.size();
  long next = currentWordIndex + 1;
  for (; next < total; next++) {
    Word candidate = getWord(in, next);
    if (!reference.equals(candidate)) {
      // First line with a different word: the run ends here.
      break;
    }
    derivations.add(candidate.getDerivation());
  }
  return next;
}
/**
 * Builds a new {@code Word} from {@code w} and {@code soa}, converts it to its string form via
 * {@code mapper}, and appends that string to {@code to}.
 *
 * @param to list the resulting line is appended to
 * @param w word passed as the first argument to the {@code Word} constructor
 * @param soa derivation passed as the second argument to the {@code Word} constructor
 * @throws IOException propagated from appending to {@code to}
 */
private final void writeOut(BigList to, Word w, Derivation soa) throws IOException {
  to.append(mapper.toString(new Word(w, soa)));
}
/**
 * Reads one line from {@code from} and converts it to a {@code Word} via {@code mapper}.
 *
 * @param from list the line is read from
 * @param line index of the line to read
 * @return the symbol sequence produced by {@code mapper} for that line, cast to {@code Word}
 * @throws IOException propagated from reading {@code from}
 */
private Word getWord(BigList from, long line) throws IOException {
  return (Word) mapper.toSymbolSequence(from.getLine(line));
}