/**
 * Creates a new iterator over the ints stored in {@code file}.
 *
 * <p>Each call writes a single "." to standard error before the input is opened, then wraps
 * the stream returned by {@code CLIOpts.openInputAsMaybeZipped(file)} in a
 * {@code DataInputStreamAsIntIterator} configured with {@code W}.
 *
 * @return a fresh iterator backed by a newly opened stream
 * @throws RuntimeException wrapping any {@link IOException} raised while opening the input or
 *     constructing the iterator
 */
@Override
 public IntIterator iterator() {
   // Emitted unconditionally, before any I/O is attempted.
   System.err.print(".");
   try {
     return new DataInputStreamAsIntIterator(CLIOpts.openInputAsMaybeZipped(file), W);
   } catch (IOException openFailure) {
     // Iterable.iterator() cannot declare checked exceptions, so rethrow unchecked with cause.
     throw new RuntimeException(openFailure);
   }
 }
示例#2
0
  /**
   * Command-line entry point: reads an integerized corpus, and hands it to
   * {@code cleanCorpus} together with the frequency thresholds and an output stream.
   *
   * <p>Steps performed here:
   *
   * <ol>
   *   <li>Parse and verify the command-line options; return silently if verification fails.
   *   <li>Derive the vocabulary size {@code W} from the word map and load the frequency array.
   *   <li>Open the corpus for reading and the output for writing, transparently handling
   *       {@code .gz} and {@code .bz2} according to each file's <em>own</em> name.
   *   <li>Print to standard error how many entries (index 1 and up) have a frequency in
   *       {@code [freqMin, freqMax)}.
   *   <li>Delegate the actual filtering to {@code cleanCorpus}.
   * </ol>
   *
   * @param args command-line arguments, interpreted by {@code CLIOpts}
   * @throws Exception if any file cannot be opened or read, or if cleaning fails
   */
  public static void main(String[] args) throws Exception {
    final CLIOpts opts = new CLIOpts(args);

    final boolean useUNK = opts.flag("unk", "Output low-frequency tokens as <UNK>");

    final boolean stopWords = opts.flag("stopWords", "Retain stop words");

    // NOTE(review): ".b2" below looks like a typo for ".bz2" (the code checks for ".bz2").
    // Left unchanged because how CLIOpts interprets this string is not visible here.
    final File corpusFile = opts.roFile("corpus[.gz|.b2]", "The integerized corpus");

    final File wordMapFile = opts.roFile("wordMap", "The word map");

    final File freqFile = opts.roFile("freqs", "The file of token frequencies");

    final int freqMin = opts.intValue("freqMin", "The minimum frequency to accept");

    final int lenMin = opts.intValue("lenMin", "The minimum document length to accept");

    final File outFile = opts.woFile("out[.gz|.bz2]", "The file to write to");

    if (!opts.verify(CleanCorpus.class)) {
      return;
    }

    // The vocabulary size is computed from the word map rather than passed as an option.
    final int W = WordMap.calcW(wordMapFile);

    final int[] freqArray = readFreqs(freqFile, W);

    // With --stopWords there is no upper frequency bound; otherwise slot 0 of the frequency
    // array supplies the exclusive upper bound.
    // NOTE(review): presumably slot 0 holds a stop-word cutoff written by readFreqs — confirm.
    final int freqMax = stopWords ? Integer.MAX_VALUE : freqArray[0];

    // Choose the input decompressor from the corpus file's extension.
    final String corpusName = corpusFile.getName();
    final DataInputStream corpusIn;
    if (corpusName.endsWith(".gz")) {
      corpusIn = new DataInputStream(new GZIPInputStream(new FileInputStream(corpusFile)));
    } else if (corpusName.endsWith(".bz2")) {
      corpusIn =
          new DataInputStream(new BZip2CompressorInputStream(new FileInputStream(corpusFile)));
    } else {
      corpusIn = new DataInputStream(new FileInputStream(corpusFile));
    }

    // Choose the output compressor from the OUTPUT file's extension. (The bzip2 branch used to
    // test the corpus file's name, so the output format followed the input's extension.)
    final String outName = outFile.getName();
    final DataOutputStream dataOut;
    if (outName.endsWith(".gz")) {
      dataOut = new DataOutputStream(new GZIPOutputStream(new FileOutputStream(outFile)));
    } else if (outName.endsWith(".bz2")) {
      dataOut =
          new DataOutputStream(new BZip2CompressorOutputStream(new FileOutputStream(outFile)));
    } else {
      dataOut = new DataOutputStream(new FileOutputStream(outFile));
    }

    // Count the entries that survive the frequency filter; index 0 is skipped.
    int remainingVocab = 0;

    for (int i = 1; i < freqArray.length; i++) {
      if (freqArray[i] >= freqMin && freqArray[i] < freqMax) {
        remainingVocab++;
      }
    }

    System.err.println("Remaining: " + remainingVocab);

    // The last argument is W + 1 when --unk is set and -1 otherwise.
    // NOTE(review): neither stream is closed in this method; presumably cleanCorpus closes
    // both — confirm, since an unclosed gzip/bzip2 output would be truncated.
    cleanCorpus(corpusIn, dataOut, freqArray, freqMin, freqMax, lenMin, useUNK ? W + 1 : -1);
  }