/**
 * Runs the {@code "kuromoji_user_dict"} tokenizer over a fixed Japanese sentence and checks the
 * emitted token stream against the expected terms.
 *
 * <p>The expected terms contain "制限スピード" as a single token and do not contain the trailing
 * full stop of the input sentence.
 *
 * @throws IOException declared for the tokenizer and assertion helpers used below
 */
public void testKuromojiUserDict() throws IOException {
  final String sentence = "私は制限スピードを超える。";
  final String[] expectedTerms = {"私", "は", "制限スピード", "を", "超える"};

  // Look the factory up by its registered name, then build a fresh tokenizer from it.
  final AnalysisService service = createAnalysisService();
  final TokenizerFactory factory = service.tokenizer("kuromoji_user_dict");
  final Tokenizer underTest = factory.create();

  underTest.setReader(new StringReader(sentence));
  assertSimpleTSOutput(underTest, expectedTerms);
}
/**
 * Tokenizes every input string and compares the produced tokens, position by position, with the
 * expected tokens for that input.
 *
 * <p>The check fails when the two arrays differ in length, when a tokenizer runs out of tokens
 * before the expected list is exhausted, when it yields more tokens than expected, or when a
 * token's {@code value()} differs from the expected string at the same position.
 *
 * @param tokFactory factory used to create one tokenizer per input string
 * @param inputs the strings to tokenize
 * @param desired the expected token values; {@code desired[k]} belongs to {@code inputs[k]}
 * @param <T> the token type produced by the factory
 */
private static <T extends Label> void runOnTwoArrays(
    TokenizerFactory<T> tokFactory, String[] inputs, String[][] desired) {
  assertEquals("Test data arrays don't match in length", inputs.length, desired.length);

  for (int sentenceIdx = 0; sentenceIdx < inputs.length; sentenceIdx++) {
    final Tokenizer<T> tokens = tokFactory.getTokenizer(new StringReader(inputs[sentenceIdx]));
    final String[] wanted = desired[sentenceIdx];

    int pos = 0;
    // Keep going while either side still has something left, so that both a shortfall and a
    // surplus of tokens are detected.
    while (tokens.hasNext() || pos < wanted.length) {
      if (!tokens.hasNext()) {
        fail(
            "PTBTokenizer generated too few tokens for sentence "
                + sentenceIdx
                + "! Missing "
                + wanted[pos]);
      }

      final T produced = tokens.next();
      if (pos < wanted.length) {
        assertEquals("PTBTokenizer got wrong token", wanted[pos], produced.value());
      } else {
        fail(
            "PTBTokenizer generated too many tokens for sentence "
                + sentenceIdx
                + "! Added "
                + produced.value());
      }
      pos++;
    }
  }
}
/** * Main entry point for ToolRunner (see ToolRunner docs) * * @param argv The parameters passed to this program. * @return 0 on success, non zero on error. */ @Override public int run(String[] argv) throws Exception { int exitCode = 0; Options options = buildOptions(); if (argv.length == 0) { printHelp(); return -1; } CommandLineParser parser = new PosixParser(); CommandLine cmd; try { cmd = parser.parse(options, argv); } catch (ParseException e) { System.out.println("Error parsing command-line options: " + e.getMessage()); printHelp(); return -1; } if (cmd.hasOption("h")) { // print help and exit printHelp(); return -1; } boolean printToScreen = false; String inputFilenameArg = cmd.getOptionValue("i"); String outputFilenameArg = cmd.getOptionValue("o"); String processor = cmd.getOptionValue("p"); if (processor == null) { processor = defaultProcessor; } if (cmd.hasOption("v")) { // print output to screen too printToScreen = true; System.out.println("input [" + inputFilenameArg + "]"); System.out.println("output [" + outputFilenameArg + "]"); } try { go( EditsVisitorFactory.getEditsVisitor( outputFilenameArg, processor, TokenizerFactory.getTokenizer(inputFilenameArg), printToScreen)); } catch (EOFException e) { System.err.println("Input file ended unexpectedly. Exiting"); } catch (IOException e) { System.err.println("Encountered exception. Exiting: " + e.getMessage()); } return exitCode; }
/**
 * Command-line entry point: reads the {@code -input} file line by line as UTF-8, runs each line
 * through a tokenizer and writes the resulting tokens, joined by single spaces, as one line of
 * the UTF-8 {@code -output} file.
 *
 * <p>{@code -input}, {@code -output} and {@code -lang} are required. {@code -model},
 * {@code -stopword}, {@code -stemmed_stopword} and {@code -stem} are optional and are passed to
 * the tokenizer factory; stemming defaults to {@code true} when {@code -stem} is not given.
 * {@code -libjars} is accepted but not read here.
 *
 * <p>On any failure (bad arguments, tokenizer creation, I/O) the error and the usage text are
 * printed and the JVM exits with status {@code -1}.
 *
 * @param args command-line arguments
 */
@SuppressWarnings("static-access")
public static void main(String[] args) {
  Options options = new Options();
  options.addOption(
      OptionBuilder.withArgName("full path to model file or directory")
          .hasArg()
          .withDescription("model file")
          .create("model"));
  options.addOption(
      OptionBuilder.withArgName("full path to input file")
          .hasArg()
          .withDescription("input file")
          .isRequired()
          .create("input"));
  options.addOption(
      OptionBuilder.withArgName("full path to output file")
          .hasArg()
          .withDescription("output file")
          .isRequired()
          .create("output"));
  options.addOption(
      OptionBuilder.withArgName("en | zh | de | fr | ar | tr | es")
          .hasArg()
          .withDescription("2-character language code")
          .isRequired()
          .create("lang"));
  options.addOption(
      OptionBuilder.withArgName("path to stopwords list")
          .hasArg()
          .withDescription("one stopword per line")
          .create("stopword"));
  options.addOption(
      OptionBuilder.withArgName("path to stemmed stopwords list")
          .hasArg()
          .withDescription("one stemmed stopword per line")
          .create("stemmed_stopword"));
  options.addOption(
      OptionBuilder.withArgName("true|false")
          .hasArg()
          .withDescription("turn on/off stemming")
          .create("stem"));
  options.addOption(
      OptionBuilder.withDescription("Hadoop option to load external jars")
          .withArgName("jar packages")
          .hasArg()
          .create("libjars"));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    String stopwordList = null, stemmedStopwordList = null, modelFile = null;
    boolean isStem = true;
    cmdline = parser.parse(options, args);
    if (cmdline.hasOption("stopword")) {
      stopwordList = cmdline.getOptionValue("stopword");
    }
    if (cmdline.hasOption("stemmed_stopword")) {
      stemmedStopwordList = cmdline.getOptionValue("stemmed_stopword");
    }
    if (cmdline.hasOption("stem")) {
      isStem = Boolean.parseBoolean(cmdline.getOptionValue("stem"));
    }
    if (cmdline.hasOption("model")) {
      modelFile = cmdline.getOptionValue("model");
    }

    ivory.core.tokenize.Tokenizer tokenizer =
        TokenizerFactory.createTokenizer(
            cmdline.getOptionValue("lang"),
            modelFile,
            isStem,
            stopwordList,
            stemmedStopwordList,
            null);

    // Nested try/finally so both streams are closed even when tokenizing or I/O fails midway.
    BufferedWriter out =
        new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream(cmdline.getOptionValue("output")), "UTF8"));
    try {
      BufferedReader in =
          new BufferedReader(
              new InputStreamReader(
                  new FileInputStream(cmdline.getOptionValue("input")), "UTF8"));
      try {
        String line;
        while ((line = in.readLine()) != null) {
          // StringBuilder avoids re-copying the growing line for every token.
          StringBuilder joined = new StringBuilder();
          for (String token : tokenizer.processContent(line)) {
            joined.append(token).append(' ');
          }
          // trim() drops the trailing separator (and any outer whitespace, as before).
          out.write(joined.toString().trim() + "\n");
        }
      } finally {
        in.close();
      }
    } finally {
      out.close();
    }
  } catch (Exception exp) {
    // Report the actual cause; the usage text alone hides I/O and tokenizer failures.
    System.err.println("Error: " + exp);
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("Tokenizer", options);
    System.exit(-1);
  }
}