public void run(String[] args) {
    Parameters params = validateAndParseParams(args, Parameters.class);

    File testData = new File(params.getCensusData());
    File dictOutFile = new File(params.getDict());

    CmdLineUtil.checkInputFile("Name data", testData);
    CmdLineUtil.checkOutputFile("Dictionary file", dictOutFile);

    FileInputStream sampleDataIn = CmdLineUtil.openInFile(testData);
    ObjectStream<StringList> sampleStream =
        new NameFinderCensus90NameStream(sampleDataIn, Charset.forName(params.getEncoding()));

    Dictionary mDictionary;
    try {
      System.out.println("Creating Dictionary...");
      mDictionary = createDictionary(sampleStream);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
    } finally {
      try {
        sampleStream.close();
      } catch (IOException e) {
        // sorry this can fail..
      }
    }

    System.out.println("Saving Dictionary...");

    OutputStream out = null;

    try {
      out = new FileOutputStream(dictOutFile);
      mDictionary.serialize(out);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while writing dictionary file: " + e.getMessage(), e);
    } finally {
      if (out != null)
        try {
          out.close();
        } catch (IOException e) {
          // file might be damaged
          throw new TerminateToolException(
              -1, "Attention: Failed to correctly write dictionary:" + e.getMessage(), e);
        }
    }
  }
  /**
   * Creates a dictionary.
   *
   * @param sampleStream stream of samples.
   * @return a {@code Dictionary} class containing the name dictionary built from the input file.
   * @throws IOException IOException
   */
  public static Dictionary createDictionary(ObjectStream<StringList> sampleStream)
      throws IOException {

    Dictionary mNameDictionary = new Dictionary(true);
    StringList entry;

    entry = sampleStream.read();
    while (entry != null) {
      if (!mNameDictionary.contains(entry)) {
        mNameDictionary.put(entry);
      }
      entry = sampleStream.read();
    }

    return mNameDictionary;
  }
예제 #3
0
  /**
   * Reads a dictionary which has one entry per line. The tokens inside an entry are whitespace
   * delimited.
   *
   * @param in
   * @return the parsed dictionary
   * @throws IOException
   */
  public static Dictionary parseOneEntryPerLine(Reader in) throws IOException {
    BufferedReader lineReader = new BufferedReader(in);

    Dictionary dictionary = new Dictionary();

    String line;

    while ((line = lineReader.readLine()) != null) {
      StringTokenizer whiteSpaceTokenizer = new StringTokenizer(line, " ");

      String tokens[] = new String[whiteSpaceTokenizer.countTokens()];

      if (tokens.length > 0) {
        int tokenIndex = 0;
        while (whiteSpaceTokenizer.hasMoreTokens()) {
          tokens[tokenIndex++] = whiteSpaceTokenizer.nextToken();
        }

        dictionary.put(new StringList(tokens));
      }
    }

    return dictionary;
  }