Exemplo n.º 1
0
  /**
   * Collects the words of every tagged data file found in a directory.
   *
   * <p>Files are selected with a {@code TextFileFilter} built from {@code POS_FILE_EXTENSION}.
   * Each entry returned by {@code UTF8FileUtility.getLines} for a file is handed to
   * {@code getWords}, and all results are merged into a single {@code TreeSet}. The number of
   * selected files is reported on standard error.
   *
   * @param directoryName path of the directory to scan
   * @return a sorted set containing every extracted word
   */
  public static Set<String> extract(String directoryName) {
    // Only files accepted by this filter are read.
    TextFileFilter posFilter = new TextFileFilter(POS_FILE_EXTENSION);
    File[] taggedFiles = FileIterator.listFiles(new File(directoryName), posFilter);
    System.err.println("# of files = " + taggedFiles.length);

    Set<String> result = new TreeSet<String>();
    for (int i = 0; i < taggedFiles.length; i++) {
      String path = taggedFiles[i].getAbsolutePath();
      // Merge the words of each sentence into the result set.
      for (String sentence : UTF8FileUtility.getLines(path)) {
        result.addAll(getWords(sentence));
      }
    }
    return result;
  }
  /**
   * Tokenizes all files in a directory.
   *
   * <p>Every file of {@code inputDir} accepted by a {@code TextFileFilter} for
   * {@code TokenizerOptions.TEXT_FILE_EXTENSION} is passed to {@code tokenize}, together with an
   * output path of the same simple name inside {@code outputDir}. When finished, a summary built
   * from {@code nTokens}, the number of files and the elapsed time in seconds is printed to
   * standard output.
   *
   * <p>Both directories may be relative (resolved against the current working directory) or
   * absolute.
   *
   * @param inputDir an input dir
   * @param outputDir an output dir
   */
  public void tokenizeDirectory(String inputDir, String outputDir) {
    TextFileFilter fileFilter = new TextFileFilter(TokenizerOptions.TEXT_FILE_EXTENSION);
    File inputDirFile = new File(inputDir);
    // Let File resolve the paths: a relative path is resolved against the working directory,
    // an absolute one is kept as is. Prepending the working directory by hand would turn an
    // absolute output dir into a bogus path below the working directory.
    String inputDirPath = inputDirFile.getAbsolutePath();
    String outputDirPath = new File(outputDir).getAbsolutePath();

    if (DEBUG) {
      System.out.println("currentDir = " + new File(".").getAbsolutePath());
      System.out.println("inputDirPath = " + inputDirPath);
      System.out.println("outputDirPath = " + outputDirPath);
    }

    // get all input files
    File[] inputFiles = FileIterator.listFiles(inputDirFile, fileFilter);
    System.out.println("Tokenizing all files in the directory, please wait...");
    long startTime = System.currentTimeMillis();
    for (File aFile : inputFiles) {
      // the output file has the same simple name as the input file
      String output = outputDirPath + File.separator + aFile.getName();
      // tokenize the file
      tokenize(aFile.getAbsolutePath(), output);
    }
    long endTime = System.currentTimeMillis();
    // elapsed wall-clock time in seconds
    float duration = (float) (endTime - startTime) / 1000;
    System.out.println(
        "Tokenized "
            + nTokens
            + " words of "
            + inputFiles.length
            + " files in "
            + duration
            + " (s).\n");
  }