/** * Extracts all words of tagged data file in a directory. * * @param directoryName * @return a set of words */ public static Set<String> extract(String directoryName) { Set<String> words = new TreeSet<String>(); // create a file filter TextFileFilter filter = new TextFileFilter(POS_FILE_EXTENSION); File directory = new File(directoryName); File[] files = FileIterator.listFiles(directory, filter); System.err.println("# of files = " + files.length); for (File file : files) { // get sentences String[] sentences = UTF8FileUtility.getLines(file.getAbsolutePath()); for (String s : sentences) { words.addAll(getWords(s)); } } return words; }
/** * Tokenizes all files in a directory. * * @param inputDir an input dir * @param outputDir an output dir */ public void tokenizeDirectory(String inputDir, String outputDir) { TextFileFilter fileFilter = new TextFileFilter(TokenizerOptions.TEXT_FILE_EXTENSION); File inputDirFile = new File(inputDir); // get the current dir String currentDir = new File(".").getAbsolutePath(); String inputDirPath = currentDir + File.separator + inputDir; String outputDirPath = currentDir + File.separator + outputDir; if (DEBUG) { System.out.println("currentDir = " + currentDir); System.out.println("inputDirPath = " + inputDirPath); System.out.println("outputDirPath = " + outputDirPath); } // get all input files File[] inputFiles = FileIterator.listFiles(inputDirFile, fileFilter); System.out.println("Tokenizing all files in the directory, please wait..."); long startTime = System.currentTimeMillis(); for (File aFile : inputFiles) { // get the simple name of the file String input = aFile.getName(); // the output file have the same name with the automatic file String output = outputDirPath + File.separator + input; // tokenize the file tokenize(aFile.getAbsolutePath(), output); } long endTime = System.currentTimeMillis(); float duration = (float) (endTime - startTime) / 1000; System.out.println( "Tokenized " + nTokens + " words of " + inputFiles.length + " files in " + duration + " (s).\n"); }