/**
 * Command-line entry point: tokenizes a UTF-8 text file line by line and writes the
 * tokens of each input line, separated by single spaces, as one line of the output file.
 *
 * <p>Recognized options (each takes one argument):
 * <ul>
 *   <li>{@code -input} (required): file to read</li>
 *   <li>{@code -output} (required): file to write</li>
 *   <li>{@code -lang} (required): language code passed to the tokenizer factory</li>
 *   <li>{@code -model}: model file or directory</li>
 *   <li>{@code -stopword}: stopword list</li>
 *   <li>{@code -stemmed_stopword}: stemmed stopword list</li>
 *   <li>{@code -stem}: {@code true|false}; stemming stays enabled when the option is absent</li>
 *   <li>{@code -libjars}: accepted by the parser but not read by this method</li>
 * </ul>
 *
 * <p>On a command-line parsing error the cause and the usage text are printed; on any
 * later failure the exception is printed to standard error. In both cases the JVM exits
 * with status {@code -1}.
 *
 * @param args command-line arguments
 */
@SuppressWarnings("static-access")
public static void main(String[] args) {
  Options options = new Options();
  options.addOption(
      OptionBuilder.withArgName("full path to model file or directory")
          .hasArg()
          .withDescription("model file")
          .create("model"));
  options.addOption(
      OptionBuilder.withArgName("full path to input file")
          .hasArg()
          .withDescription("input file")
          .isRequired()
          .create("input"));
  options.addOption(
      OptionBuilder.withArgName("full path to output file")
          .hasArg()
          .withDescription("output file")
          .isRequired()
          .create("output"));
  options.addOption(
      OptionBuilder.withArgName("en | zh | de | fr | ar | tr | es")
          .hasArg()
          .withDescription("2-character language code")
          .isRequired()
          .create("lang"));
  options.addOption(
      OptionBuilder.withArgName("path to stopwords list")
          .hasArg()
          .withDescription("one stopword per line")
          .create("stopword"));
  options.addOption(
      OptionBuilder.withArgName("path to stemmed stopwords list")
          .hasArg()
          .withDescription("one stemmed stopword per line")
          .create("stemmed_stopword"));
  options.addOption(
      OptionBuilder.withArgName("true|false")
          .hasArg()
          .withDescription("turn on/off stemming")
          .create("stem"));
  options.addOption(
      OptionBuilder.withDescription("Hadoop option to load external jars")
          .withArgName("jar packages")
          .hasArg()
          .create("libjars"));

  // Phase 1: parse the command line. Only failures here warrant the usage text.
  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (Exception exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("Tokenizer", options);
    System.exit(-1);
    return; // unreachable at runtime; lets the compiler see cmdline as assigned below
  }

  // getOptionValue yields null for an option that was not supplied, which is the
  // "not configured" value handed to the tokenizer factory.
  String stopwordList = cmdline.getOptionValue("stopword");
  String stemmedStopwordList = cmdline.getOptionValue("stemmed_stopword");
  String modelFile = cmdline.getOptionValue("model");
  boolean isStem = true;
  if (cmdline.hasOption("stem")) {
    isStem = Boolean.parseBoolean(cmdline.getOptionValue("stem"));
  }

  // Phase 2: build the tokenizer and process the file. Failures are reported as-is
  // instead of being hidden behind the usage text.
  try {
    ivory.core.tokenize.Tokenizer tokenizer =
        TokenizerFactory.createTokenizer(
            cmdline.getOptionValue("lang"),
            modelFile,
            isStem,
            stopwordList,
            stemmedStopwordList,
            null);

    // Open the input first so that an unreadable input path does not create or
    // truncate the output file.
    BufferedReader in =
        new BufferedReader(
            new InputStreamReader(
                new FileInputStream(cmdline.getOptionValue("input")), "UTF8"));
    try {
      BufferedWriter out =
          new BufferedWriter(
              new OutputStreamWriter(
                  new FileOutputStream(cmdline.getOptionValue("output")), "UTF8"));
      try {
        String line;
        while ((line = in.readLine()) != null) {
          String[] tokens = tokenizer.processContent(line);
          StringBuilder joined = new StringBuilder();
          for (String token : tokens) {
            joined.append(token).append(' ');
          }
          // trim() drops the trailing separator added after the last token.
          out.write(joined.toString().trim() + "\n");
        }
      } finally {
        // Always flush and release the writer, even when processing fails midway.
        out.close();
      }
    } finally {
      in.close();
    }
  } catch (Exception exp) {
    System.err.println("Tokenization failed: " + exp);
    System.exit(-1);
  }
}