예제 #1
0
  /**
   * Loads the lexicon from the gzip-compressed file {@code data/lexicon.txt.gz} (resolved against
   * the working directory) into {@code lexiconMap}.
   *
   * <p>Each line is split on single whitespace characters. The first field is used as the key;
   * every following field is passed to {@code POSTag.fromString} and, when a non-null tag comes
   * back, stored under that key. Fields that yield {@code null} are logged as warnings and
   * skipped.
   *
   * <p>The underlying file is closed on every exit path, including failures while decompressing
   * or parsing.
   *
   * @throws IOException if the file cannot be opened or its gzip header cannot be read
   */
  public void load() throws IOException {
    log.info("Loading lexicon...");
    File dataFile = new File("data/lexicon.txt.gz");

    // Held separately so the file handle is released even when the GZIPInputStream constructor
    // rejects the header before the reader chain exists.
    FileInputStream fileStream = new FileInputStream(dataFile);
    try {
      // NOTE(review): no charset is given, so the platform default is used for decoding —
      // confirm the lexicon's actual encoding.
      Reader reader =
          new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));
      LineIterator iterator = IOUtils.lineIterator(reader);
      try {
        while (iterator.hasNext()) {
          String line = iterator.nextLine();
          String[] splits = line.split("\\s");
          // splits[0] is the key; every remaining field is a candidate tag for it.
          // NOTE(review): if lexiconMap is a plain Map, later tags replace earlier ones for the
          // same key — verify against the field's declaration.
          for (int x = 1; x < splits.length; ++x) {
            POSTag tag = POSTag.fromString(splits[x]);
            if (tag == null) {
              log.warn("Unknown tag: {0}", splits[x]);
            } else {
              lexiconMap.put(splits[0], tag);
            }
          }
        }
      } finally {
        // Runs on both success and failure so the reader chain is not left open.
        iterator.close();
      }
    } finally {
      // Closing an already-closed FileInputStream is a no-op, so this is safe after the
      // iterator has closed the chain.
      fileStream.close();
    }
    log.info("Lexicon loaded!");
  }
예제 #2
0
  /**
   * Command-line entry point: reads the file named by the first argument into a {@code Document},
   * runs a {@code Tokenizer} and a freshly loaded {@code POSTagger} over it, and logs every token
   * of the document.
   *
   * @param args command-line arguments; {@code args[0]} is the path of the file to process
   * @throws IllegalArgumentException if no file path is supplied
   * @throws Exception if reading the file, loading the tagger, or processing the document fails
   */
  public static void main(String[] args) throws Exception {
    // Fail with a descriptive message instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length == 0) {
      throw new IllegalArgumentException("Expected the path of the input file as first argument");
    }
    File testFile = new File(args[0]);

    // NOTE(review): FileReader decodes with the platform default charset — confirm the expected
    // encoding of the input file.
    Reader reader = new FileReader(testFile);
    Document doc;
    try {
      doc = new Document(IOUtils.toString(reader));
    } finally {
      // Release the file handle even when reading or Document construction fails.
      reader.close();
    }

    Tokenizer tokenizer = new Tokenizer();
    tokenizer.tokenize(doc);

    POSTagger tagger = new POSTagger();
    tagger.load();
    tagger.process(doc);

    for (Token token : doc) {
      log.info("Token: {0}", token);
    }
  }