/**
 * Loads the lexicon from the gzip-compressed file {@code data/lexicon.txt.gz}
 * into {@code lexiconMap}.
 *
 * <p>Each line is split on single whitespace characters. The first field is
 * used as the key; every following field is parsed with
 * {@link POSTag#fromString(String)} and, when recognized, stored under that
 * key via {@code lexiconMap.put}. Fields that do not parse to a tag are
 * reported with a warning and skipped.
 *
 * <p>NOTE(review): whether several tags for one key accumulate or overwrite
 * each other depends on the type of {@code lexiconMap}, which is not visible
 * here.
 *
 * @throws IOException if the lexicon file cannot be opened or decompressed
 */
public void load() throws IOException {
    log.info("Loading lexicon...");
    File dataFile = new File("data/lexicon.txt.gz");

    FileInputStream fileStream = new FileInputStream(dataFile);
    Reader reader = null;
    try {
        // NOTE(review): no charset is given, so the platform default is used
        // to decode the lexicon - confirm this matches the file's encoding.
        reader = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(fileStream)));
        LineIterator iterator = IOUtils.lineIterator(reader);
        while (iterator.hasNext()) {
            String line = iterator.nextLine();
            String[] splits = line.split("\\s");
            // Field 0 is the key; fields 1..n are its tag names.
            for (int x = 1; x < splits.length; ++x) {
                POSTag tag = POSTag.fromString(splits[x]);
                if (tag == null) {
                    log.warn("Unknown tag: {0}", splits[x]);
                } else {
                    lexiconMap.put(splits[0], tag);
                }
            }
        }
    } finally {
        // Release the file even when reading or parsing fails. Closing the
        // outermost reader closes the whole chain; if the chain was never
        // built (e.g. bad gzip header), close the raw file stream directly.
        if (reader != null) {
            reader.close();
        } else {
            fileStream.close();
        }
    }
    log.info("Lexicon loaded!");
}
/**
 * Command-line entry point: reads the text file named by the first argument,
 * tokenizes it, tags it with a freshly loaded {@link POSTagger}, and logs
 * every resulting token.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the
 *             text file to process
 * @throws IllegalArgumentException if no file path is supplied
 * @throws Exception if reading the file, loading the lexicon, or processing
 *                   the document fails
 */
public static void main(String[] args) throws Exception {
    if (args == null || args.length < 1) {
        // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException(
                "Expected the path of the text file to process as the first argument");
    }

    File testFile = new File(args[0]);
    Document doc;
    // NOTE(review): FileReader decodes with the platform default charset -
    // confirm this matches the input files.
    Reader reader = new FileReader(testFile);
    try {
        doc = new Document(IOUtils.toString(reader));
    } finally {
        // Close the file even if reading it or building the document fails.
        reader.close();
    }

    Tokenizer tokenizer = new Tokenizer();
    tokenizer.tokenize(doc);

    POSTagger tagger = new POSTagger();
    tagger.load();
    tagger.process(doc);

    for (Token token : doc) {
        log.info("Token: {0}", token);
    }
}