/** * The default file for words which do not account for <code>thresholdSplit</code> percent may * have grown large. In order to prevent an real large misc. file we split again. */ private void processCreatedMiscFileAgain( int level, Comparator<String> comparator, int nextFileNumber) throws IOException { File folder = getOutputFolder(level); File misc = new File(folder, "99999999"); if (!misc.exists()) { return; } FrequencyDistribution<String> letterFD = createFreqDistForMiscFile(misc); float oldThreshold = splitThreshold; // Make sure that the misc file is split into little pieces splitThreshold /= 10; Web1TFileSplitter splitter = new Web1TFileSplitter(misc, folder, "UTF-8", letterFD, splitThreshold, nextFileNumber); splitter.split(); List<File> splittedFiles = splitter.getFiles(); Web1TFileSorter sorter = new Web1TFileSorter(splittedFiles, comparator); sorter.sort(); List<File> sortedFiles = splitter.getFiles(); splitThreshold = oldThreshold; misc.delete(); Web1TFileConsolidator consolidator = new Web1TFileConsolidator(sortedFiles, comparator, outputEncoding, minFrequency); consolidator.consolidate(); LinkedList<File> consolidatedFiles = consolidator.getConsolidatedFiles(); // rename consolidated files -> final index files for (File file : consolidatedFiles) { String name = Web1TUtil.cutOffUnderscoredSuffixFromFileName(file); file.renameTo(new File(name)); } splitter.cleanUp(); sorter.cleanUp(); consolidator.cleanUp(); }
private int processInputFileForLevel(int level, Comparator<String> comparator) throws IOException { File unsortedInputFile = new File(outputPath, level + ".txt"); File outputFolder = getOutputFolder(level); outputFolder.mkdir(); FrequencyDistribution<String> letterFD = letterFDs.get(level); Web1TFileSplitter splitter = new Web1TFileSplitter( unsortedInputFile, outputFolder, outputEncoding, letterFD, splitThreshold, 0); splitter.split(); List<File> splitFiles = splitter.getFiles(); Web1TFileSorter sorter = new Web1TFileSorter(splitFiles, comparator); sorter.sort(); splitter.cleanUp(); // Remove files from previous step LinkedList<File> sortedFiles = sorter.getSortedFiles(); Web1TFileConsolidator consolidator = new Web1TFileConsolidator(sortedFiles, comparator, outputEncoding, minFrequency); consolidator.consolidate(); sorter.cleanUp(); // Remove files from previous step LinkedList<File> consolidatedFiles = consolidator.getConsolidatedFiles(); // rename consolidated files -> final index files for (File file : consolidatedFiles) { String name = Web1TUtil.cutOffUnderscoredSuffixFromFileName(file); file.renameTo(new File(name)); } consolidator.cleanUp(); unsortedInputFile.delete(); return splitter.getNextUnusedFileNumber(); }