Esempio n. 1
0
  /**
   * Splits, sorts and consolidates the misc file of a level a second time.
   *
   * <p>The default file for words which do not account for <code>splitThreshold</code> percent
   * may have grown large. In order to prevent a really large misc file, it is split again using
   * one tenth of the configured split threshold. Does nothing if the misc file does not exist.
   *
   * @param level the level whose output folder (see {@code getOutputFolder(level)}) is processed
   * @param comparator the comparator handed to the sorter and the consolidator
   * @param nextFileNumber the file number passed on to the splitter
   * @throws IOException if one of the split, sort or consolidate steps fails
   */
  private void processCreatedMiscFileAgain(
      int level, Comparator<String> comparator, int nextFileNumber) throws IOException {
    File folder = getOutputFolder(level);
    File misc = new File(folder, "99999999");

    if (!misc.exists()) {
      return;
    }

    FrequencyDistribution<String> letterFD = createFreqDistForMiscFile(misc);

    // Make sure that the misc file is split into little pieces. The reduced threshold is kept in
    // a local so that the splitThreshold field is never left modified if a later step throws.
    float miscSplitThreshold = splitThreshold / 10;

    // NOTE(review): the encoding is hard-coded here while processInputFileForLevel passes
    // outputEncoding to the splitter - confirm which one the misc file is written in.
    Web1TFileSplitter splitter =
        new Web1TFileSplitter(
            misc, folder, "UTF-8", letterFD, miscSplitThreshold, nextFileNumber);
    splitter.split();
    List<File> splittedFiles = splitter.getFiles();

    Web1TFileSorter sorter = new Web1TFileSorter(splittedFiles, comparator);
    sorter.sort();
    // Consolidate the sorter's output, not the splitter's files.
    List<File> sortedFiles = sorter.getSortedFiles();

    // The misc file has been split completely and is no longer needed.
    misc.delete();

    Web1TFileConsolidator consolidator =
        new Web1TFileConsolidator(sortedFiles, comparator, outputEncoding, minFrequency);
    consolidator.consolidate();

    LinkedList<File> consolidatedFiles = consolidator.getConsolidatedFiles();

    // rename consolidated files -> final index files
    for (File file : consolidatedFiles) {
      String name = Web1TUtil.cutOffUnderscoredSuffixFromFileName(file);
      file.renameTo(new File(name));
    }

    splitter.cleanUp();
    sorter.cleanUp();
    consolidator.cleanUp();
  }
Esempio n. 2
0
  /**
   * Runs the split, sort and consolidate steps on the unsorted input file of one level and
   * renames the consolidated results to their final index file names.
   *
   * <p>The input file is {@code <level>.txt} below {@code outputPath}; it is deleted once
   * processing has finished. The intermediate files of each step are cleaned up as soon as the
   * following step has completed.
   *
   * @param level the level to process
   * @param comparator the comparator handed to the sorter and the consolidator
   * @return the next unused file number as reported by the splitter
   * @throws IOException if one of the processing steps fails
   */
  private int processInputFileForLevel(int level, Comparator<String> comparator)
      throws IOException {

    File rawInput = new File(outputPath, level + ".txt");

    File levelFolder = getOutputFolder(level);
    levelFolder.mkdir();

    // Step 1: split the raw input into several files.
    Web1TFileSplitter splitter =
        new Web1TFileSplitter(
            rawInput, levelFolder, outputEncoding, letterFDs.get(level), splitThreshold, 0);
    splitter.split();

    // Step 2: sort every split file, then drop the split files.
    Web1TFileSorter sorter = new Web1TFileSorter(splitter.getFiles(), comparator);
    sorter.sort();
    splitter.cleanUp();

    // Step 3: consolidate the sorted files, then drop the sorted files.
    Web1TFileConsolidator consolidator =
        new Web1TFileConsolidator(
            sorter.getSortedFiles(), comparator, outputEncoding, minFrequency);
    consolidator.consolidate();
    sorter.cleanUp();

    // Step 4: consolidated files become the final index files once their suffix is cut off.
    for (File consolidated : consolidator.getConsolidatedFiles()) {
      File indexFile = new File(Web1TUtil.cutOffUnderscoredSuffixFromFileName(consolidated));
      consolidated.renameTo(indexFile);
    }

    consolidator.cleanUp();
    rawInput.delete();

    return splitter.getNextUnusedFileNumber();
  }