コード例 #1
0
  /**
   * Checks the output of the tokenizer registered under the name "kuromoji_user_dict".
   *
   * <p>The sample sentence must come out as five tokens, with "制限スピード" kept as a single
   * token and the trailing full stop absent from the output.
   *
   * @throws IOException declared for the analysis and tokenizer calls made here
   */
  public void testKuromojiUserDict() throws IOException {
    final String[] expectedTokens = {"私", "は", "制限スピード", "を", "超える"};
    final String sentence = "私は制限スピードを超える。";

    // Look the factory up by name, then feed it the sample sentence.
    TokenizerFactory factory = createAnalysisService().tokenizer("kuromoji_user_dict");
    Tokenizer stream = factory.create();
    stream.setReader(new StringReader(sentence));

    assertSimpleTSOutput(stream, expectedTokens);
  }
コード例 #2
0
ファイル: PTBTokenizerTest.java プロジェクト: wayzou/CoreNLP
 /**
  * Tokenizes each input string and compares the resulting tokens, in order, with the expected
  * tokens at the same index of {@code desired}.
  *
  * <p>Fails if the two arrays differ in length, if a tokenizer yields fewer or more tokens than
  * expected, or if any token's {@code value()} differs from the expected string.
  *
  * @param tokFactory factory used to build one tokenizer per input string
  * @param inputs the raw strings to tokenize
  * @param desired the expected token values; {@code desired[k]} belongs to {@code inputs[k]}
  */
 private static <T extends Label> void runOnTwoArrays(
     TokenizerFactory<T> tokFactory, String[] inputs, String[][] desired) {
   assertEquals("Test data arrays don't match in length", inputs.length, desired.length);

   int sentenceIndex = 0;
   for (String input : inputs) {
     Tokenizer<T> tokenizer = tokFactory.getTokenizer(new StringReader(input));
     String[] gold = desired[sentenceIndex];

     // First consume exactly as many tokens as are expected, checking each one.
     int position = 0;
     while (position < gold.length) {
       if (!tokenizer.hasNext()) {
         fail(
             "PTBTokenizer generated too few tokens for sentence "
                 + sentenceIndex
                 + "! Missing "
                 + gold[position]);
       }
       T token = tokenizer.next();
       assertEquals("PTBTokenizer got wrong token", gold[position], token.value());
       position++;
     }

     // Anything still left in the tokenizer is a surplus token.
     if (tokenizer.hasNext()) {
       T extra = tokenizer.next();
       fail(
           "PTBTokenizer generated too many tokens for sentence "
               + sentenceIndex
               + "! Added "
               + extra.value());
     }

     sentenceIndex++;
   }
 }
コード例 #3
0
  /**
   * Main entry point for ToolRunner (see ToolRunner docs).
   *
   * <p>Parses the command line, reads the {@code i} (input), {@code o} (output) and {@code p}
   * (processor) option values, and hands a visitor built from them to {@code go}. When {@code p}
   * is absent, {@code defaultProcessor} is used. The {@code v} option echoes the input and output
   * names and is passed on as the print-to-screen flag.
   *
   * @param argv The parameters passed to this program.
   * @return 0 on success; -1 if no arguments were given, the options could not be parsed, help
   *     was requested with {@code h}, or processing failed with an {@link IOException}
   *     (including {@link EOFException}).
   * @throws Exception any non-I/O failure raised while processing
   */
  @Override
  public int run(String[] argv) throws Exception {
    Options options = buildOptions();
    if (argv.length == 0) {
      printHelp();
      return -1;
    }

    CommandLineParser parser = new PosixParser();
    CommandLine cmd;

    try {
      cmd = parser.parse(options, argv);
    } catch (ParseException e) {
      System.out.println("Error parsing command-line options: " + e.getMessage());
      printHelp();
      return -1;
    }

    if (cmd.hasOption("h")) { // print help and exit
      printHelp();
      return -1;
    }

    String inputFilenameArg = cmd.getOptionValue("i");
    String outputFilenameArg = cmd.getOptionValue("o");
    String processor = cmd.getOptionValue("p");
    if (processor == null) {
      processor = defaultProcessor;
    }

    boolean printToScreen = cmd.hasOption("v"); // print output to screen too
    if (printToScreen) {
      System.out.println("input  [" + inputFilenameArg + "]");
      System.out.println("output [" + outputFilenameArg + "]");
    }

    try {
      go(
          EditsVisitorFactory.getEditsVisitor(
              outputFilenameArg,
              processor,
              TokenizerFactory.getTokenizer(inputFilenameArg),
              printToScreen));
    } catch (EOFException e) {
      System.err.println("Input file ended unexpectedly. Exiting");
      // Report the failure to the caller instead of falling through to the success code.
      return -1;
    } catch (IOException e) {
      System.err.println("Encountered exception. Exiting: " + e.getMessage());
      return -1;
    }

    return 0;
  }
コード例 #4
0
ファイル: Tokenizer.java プロジェクト: ferhanture/Ivory
  /**
   * Command-line entry point: tokenizes a text file line by line.
   *
   * <p>Builds a tokenizer through {@code TokenizerFactory.createTokenizer} from the parsed
   * options, reads the {@code -input} file as UTF-8 and, for every line, writes the tokens
   * returned by {@code processContent} to the {@code -output} file, separated by single spaces,
   * one output line per input line.
   *
   * <p>On any failure (bad options, I/O error, tokenizer error) the exception message and the
   * usage help are printed and the JVM exits with status -1.
   *
   * @param args command-line arguments; {@code -input}, {@code -output} and {@code -lang} are
   *     required, the remaining options are optional
   */
  @SuppressWarnings("static-access")
  public static void main(String[] args) {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("full path to model file or directory")
            .hasArg()
            .withDescription("model file")
            .create("model"));
    options.addOption(
        OptionBuilder.withArgName("full path to input file")
            .hasArg()
            .withDescription("input file")
            .isRequired()
            .create("input"));
    options.addOption(
        OptionBuilder.withArgName("full path to output file")
            .hasArg()
            .withDescription("output file")
            .isRequired()
            .create("output"));
    options.addOption(
        OptionBuilder.withArgName("en | zh | de | fr | ar | tr | es")
            .hasArg()
            .withDescription("2-character language code")
            .isRequired()
            .create("lang"));
    options.addOption(
        OptionBuilder.withArgName("path to stopwords list")
            .hasArg()
            .withDescription("one stopword per line")
            .create("stopword"));
    options.addOption(
        OptionBuilder.withArgName("path to stemmed stopwords list")
            .hasArg()
            .withDescription("one stemmed stopword per line")
            .create("stemmed_stopword"));
    options.addOption(
        OptionBuilder.withArgName("true|false")
            .hasArg()
            .withDescription("turn on/off stemming")
            .create("stem"));
    options.addOption(
        OptionBuilder.withDescription("Hadoop option to load external jars")
            .withArgName("jar packages")
            .hasArg()
            .create("libjars"));

    CommandLineParser parser = new GnuParser();
    try {
      CommandLine cmdline = parser.parse(options, args);

      // getOptionValue yields null for an option that was not supplied, which is the
      // "not configured" value handed to the factory below.
      String stopwordList = cmdline.getOptionValue("stopword");
      String stemmedStopwordList = cmdline.getOptionValue("stemmed_stopword");
      String modelFile = cmdline.getOptionValue("model");

      // Stemming stays on unless -stem is given explicitly.
      boolean isStem = true;
      if (cmdline.hasOption("stem")) {
        isStem = Boolean.parseBoolean(cmdline.getOptionValue("stem"));
      }

      ivory.core.tokenize.Tokenizer tokenizer =
          TokenizerFactory.createTokenizer(
              cmdline.getOptionValue("lang"),
              modelFile,
              isStem,
              stopwordList,
              stemmedStopwordList,
              null);

      // Open the input first so that an unreadable input file does not create or truncate
      // the output file.
      BufferedReader in =
          new BufferedReader(
              new InputStreamReader(new FileInputStream(cmdline.getOptionValue("input")), "UTF8"));
      try {
        BufferedWriter out =
            new BufferedWriter(
                new OutputStreamWriter(
                    new FileOutputStream(cmdline.getOptionValue("output")), "UTF8"));
        try {
          String line;
          while ((line = in.readLine()) != null) {
            StringBuilder joined = new StringBuilder();
            for (String token : tokenizer.processContent(line)) {
              joined.append(token).append(' ');
            }
            out.write(joined.toString().trim() + "\n");
          }
        } finally {
          // Close (and flush) the writer even when reading or tokenizing fails part-way.
          out.close();
        }
      } finally {
        in.close();
      }

    } catch (Exception exp) {
      // Report the cause so I/O and tokenizer failures can be told apart from a usage error.
      System.err.println("Error: " + exp.getMessage());
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("Tokenizer", options);
      System.exit(-1);
    }
  }