/**
 * Starts a new pass over {@code file}, exposing its contents as an {@link IntIterator}.
 *
 * <p>Every call writes a single "." to standard error (presumably a progress marker, one per
 * pass) and opens the file again through {@code CLIOpts.openInputAsMaybeZipped}.
 *
 * @return a newly created iterator backed by a freshly opened stream
 * @throws RuntimeException wrapping the {@link IOException} raised if the file cannot be opened
 */
@Override
public IntIterator iterator() {
    // Printing to System.err never raises IOException, so it can sit outside the try block.
    System.err.print(".");
    try {
        return new DataInputStreamAsIntIterator(CLIOpts.openInputAsMaybeZipped(file), W);
    } catch (IOException openFailure) {
        // This method declares no checked exceptions, so rethrow unchecked with the cause kept.
        throw new RuntimeException(openFailure);
    }
}
public static void main(String[] args) throws Exception { final CLIOpts opts = new CLIOpts(args); final boolean useUNK = opts.flag("unk", "Output low-frequency tokens as <UNK>"); final boolean stopWords = opts.flag("stopWords", "Retain stop words"); final File corpusFile = opts.roFile("corpus[.gz|.b2]", "The integerized corpus"); // final int W = opts.intValue("W", "Total number of distinct tokens in corpus"); final File wordMapFile = opts.roFile("wordMap", "The word map"); final File freqFile = opts.roFile("freqs", "The file of token frequencies"); final int freqMin = opts.intValue("freqMin", "The minimum frequency to accept"); final int lenMin = opts.intValue("lenMin", "The minimum document length to accept"); final File outFile = opts.woFile("out[.gz|.bz2]", "The file to write to"); if (!opts.verify(CleanCorpus.class)) { return; } final int W = WordMap.calcW(wordMapFile); int[] freqArray = readFreqs(freqFile, W); final int freqMax = stopWords ? Integer.MAX_VALUE : freqArray[0]; final DataInputStream corpusIn; if (corpusFile.getName().endsWith(".gz")) { corpusIn = new DataInputStream(new GZIPInputStream(new FileInputStream(corpusFile))); } else if (corpusFile.getName().endsWith(".bz2")) { corpusIn = new DataInputStream(new BZip2CompressorInputStream(new FileInputStream(corpusFile))); } else { corpusIn = new DataInputStream(new FileInputStream(corpusFile)); } final DataOutputStream dataOut; if (outFile.getName().endsWith(".gz")) { dataOut = new DataOutputStream(new GZIPOutputStream(new FileOutputStream(outFile))); } else if (corpusFile.getName().endsWith(".bz2")) { dataOut = new DataOutputStream(new BZip2CompressorOutputStream(new FileOutputStream(outFile))); } else { dataOut = new DataOutputStream(new FileOutputStream(outFile)); } int remainingVocab = 0; for (int i = 1; i < freqArray.length; i++) { if (freqArray[i] >= freqMin && freqArray[i] < freqMax) { remainingVocab++; } } System.err.println("Remaining: " + remainingVocab); cleanCorpus(corpusIn, dataOut, 
freqArray, freqMin, freqMax, lenMin, useUNK ? W + 1 : -1); }