Esempio n. 1
0
  /**
   * Command-line entry point: tokenizes a UTF-8 text file line by line and writes the tokens of
   * each input line, separated by single spaces, as one line of the output file.
   *
   * <p>Recognized options:
   *
   * <ul>
   *   <li>{@code -input} (required): file to read
   *   <li>{@code -output} (required): file to write
   *   <li>{@code -lang} (required): language code handed to the tokenizer factory
   *   <li>{@code -model}: tokenizer model file or directory
   *   <li>{@code -stopword}, {@code -stemmed_stopword}: stopword list files
   *   <li>{@code -stem}: {@code true|false}; stemming stays enabled when the option is absent
   *   <li>{@code -libjars}: accepted by the parser but not read by this method
   * </ul>
   *
   * <p>On any failure (bad arguments, I/O error, tokenizer error) the cause is reported on stderr,
   * the usage text is printed, and the JVM exits with status {@code -1}.
   *
   * @param args raw command-line arguments
   */
  @SuppressWarnings("static-access")
  public static void main(String[] args) {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("full path to model file or directory")
            .hasArg()
            .withDescription("model file")
            .create("model"));
    options.addOption(
        OptionBuilder.withArgName("full path to input file")
            .hasArg()
            .withDescription("input file")
            .isRequired()
            .create("input"));
    options.addOption(
        OptionBuilder.withArgName("full path to output file")
            .hasArg()
            .withDescription("output file")
            .isRequired()
            .create("output"));
    options.addOption(
        OptionBuilder.withArgName("en | zh | de | fr | ar | tr | es")
            .hasArg()
            .withDescription("2-character language code")
            .isRequired()
            .create("lang"));
    options.addOption(
        OptionBuilder.withArgName("path to stopwords list")
            .hasArg()
            .withDescription("one stopword per line")
            .create("stopword"));
    options.addOption(
        OptionBuilder.withArgName("path to stemmed stopwords list")
            .hasArg()
            .withDescription("one stemmed stopword per line")
            .create("stemmed_stopword"));
    options.addOption(
        OptionBuilder.withArgName("true|false")
            .hasArg()
            .withDescription("turn on/off stemming")
            .create("stem"));
    options.addOption(
        OptionBuilder.withDescription("Hadoop option to load external jars")
            .withArgName("jar packages")
            .hasArg()
            .create("libjars"));

    CommandLineParser parser = new GnuParser();
    try {
      CommandLine cmdline = parser.parse(options, args);

      // Optional settings: getOptionValue is only consulted when the option was supplied,
      // so absent options leave the defaults below (null paths, stemming on).
      String stopwordList = null, stemmedStopwordList = null, modelFile = null;
      boolean isStem = true;
      if (cmdline.hasOption("stopword")) {
        stopwordList = cmdline.getOptionValue("stopword");
      }
      if (cmdline.hasOption("stemmed_stopword")) {
        stemmedStopwordList = cmdline.getOptionValue("stemmed_stopword");
      }
      if (cmdline.hasOption("stem")) {
        isStem = Boolean.parseBoolean(cmdline.getOptionValue("stem"));
      }
      if (cmdline.hasOption("model")) {
        modelFile = cmdline.getOptionValue("model");
      }

      ivory.core.tokenize.Tokenizer tokenizer =
          TokenizerFactory.createTokenizer(
              cmdline.getOptionValue("lang"),
              modelFile,
              isStem,
              stopwordList,
              stemmedStopwordList,
              null);

      // try-with-resources closes (and flushes) both streams even when tokenization or I/O
      // fails part-way. The reader is opened first so that a missing input file does not
      // create or truncate the output file.
      try (BufferedReader in =
              new BufferedReader(
                  new InputStreamReader(
                      new FileInputStream(cmdline.getOptionValue("input")), "UTF8"));
          BufferedWriter out =
              new BufferedWriter(
                  new OutputStreamWriter(
                      new FileOutputStream(cmdline.getOptionValue("output")), "UTF8"))) {
        String line;
        while ((line = in.readLine()) != null) {
          StringBuilder joined = new StringBuilder();
          for (String token : tokenizer.processContent(line)) {
            joined.append(token).append(' ');
          }
          // trim() drops the trailing separator; an input line with no tokens yields "\n".
          out.write(joined.toString().trim() + "\n");
        }
      }
    } catch (Exception exp) {
      // Report the real cause: the usage text alone is misleading for non-argument failures.
      System.err.println("Tokenizer failed: " + exp);
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("Tokenizer", options);
      System.exit(-1);
    }
  }