public void run(String format, String[] args) {
    super.run(format, args);

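    // load training parameters from the parameters file; if none were given, fall back to the
    // iterations/cutoff defaults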
    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
    if (mlParams == null) {
      mlParams = ModelUtil.createTrainingParameters(params.getIterations(), params.getCutoff());
    }

    File modelOutFile = params.getModel();
    CmdLineUtil.checkOutputFile("sentence detector model", modelOutFile);

    ChunkerModel model;
    try {
      ChunkerFactory chunkerFactory = ChunkerFactory.create(params.getFactory());
      model = ChunkerME.train(params.getLang(), sampleStream, mlParams, chunkerFactory);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
    } finally {
      try {
        sampleStream.close();
      } catch (IOException e) {
        // ignore; failing to close the sample stream does not affect the trained model
      }
    }

    CmdLineUtil.writeModel("chunker", modelOutFile, model);
  }
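  /**
   * Creates a name {@link Dictionary} from Census (1990) style name data and serializes it to the
   * requested dictionary file.
   */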
  public void run(String[] args) {
    Parameters params = validateAndParseParams(args, Parameters.class);

    File testData = new File(params.getCensusData());
    File dictOutFile = new File(params.getDict());

    CmdLineUtil.checkInputFile("Name data", testData);
    CmdLineUtil.checkOutputFile("Dictionary file", dictOutFile);

    FileInputStream sampleDataIn = CmdLineUtil.openInFile(testData);
    ObjectStream<StringList> sampleStream =
        new NameFinderCensus90NameStream(sampleDataIn, Charset.forName(params.getEncoding()));

    Dictionary mDictionary;
    try {
      System.out.println("Creating Dictionary...");
      mDictionary = createDictionary(sampleStream);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
    } finally {
      try {
        sampleStream.close();
      } catch (IOException e) {
        // ignore; failing to close the sample stream does not affect the created dictionary
      }
    }

    System.out.println("Saving Dictionary...");

    OutputStream out = null;

    try {
      out = new FileOutputStream(dictOutFile);
      mDictionary.serialize(out);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while writing dictionary file: " + e.getMessage(), e);
    } finally {
      if (out != null)
        try {
          out.close();
        } catch (IOException e) {
          // file might be damaged
          throw new TerminateToolException(
              -1, "Attention: Failed to correctly write dictionary:" + e.getMessage(), e);
        }
    }
  }
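  /**
   * Categorizes paragraphs read from standard input with the given doccat model and prints each
   * result as a {@link DocumentSample}.
   */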
  public void run(String[] args) {

    if (0 == args.length) {
      System.out.println(getHelp());
    } else {

      DoccatModel model = new DoccatModelLoader().load(new File(args[0]));

      DocumentCategorizerME doccat = new DocumentCategorizerME(model);

      ObjectStream<String> documentStream =
          new ParagraphStream(new PlainTextByLineStream(new InputStreamReader(System.in)));

      PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
      perfMon.start();

      try {
        String document;
        while ((document = documentStream.read()) != null) {
          double[] prob = doccat.categorize(WhitespaceTokenizer.INSTANCE.tokenize(document));
          String category = doccat.getBestCategory(prob);

          DocumentSample sample = new DocumentSample(category, document);
          System.out.println(sample.toString());

          perfMon.incrementCounter();
        }
      } catch (IOException e) {
        CmdLineUtil.handleStdinIoError(e);
      }

      perfMon.stopAndPrintFinalResult();
    }
  }
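  /**
   * Creates a {@link ChunkSample} stream for the AD corpus format, configured by the parsed
   * command line arguments.
   */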
  public ObjectStream<ChunkSample> create(String[] args) {

    Parameters params = ArgumentParser.parse(args, Parameters.class);

    language = params.getLang();

    FileInputStream sampleDataIn = CmdLineUtil.openInFile(params.getData());

    ObjectStream<String> lineStream =
        new PlainTextByLineStream(sampleDataIn.getChannel(), params.getEncoding());

    ADChunkBasedShallowParserSampleStream sampleStream =
        new ADChunkBasedShallowParserSampleStream(
            lineStream,
            params.getFunctTags(),
            params.getIsIncludePOSTags(),
            params.getUseCGTags(),
            params.getExpandME());

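    // optionally restrict the sample stream to the requested start/end range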
    if (params.getStart() != null && params.getStart() > -1) {
      sampleStream.setStart(params.getStart());
    }

    if (params.getEnd() != null && params.getEnd() > -1) {
      sampleStream.setEnd(params.getEnd());
    }

    return sampleStream;
  }
  /**
   * Read the file into an {@code ObjectStream}.
   *
   * @param infile the string pointing to the file
   * @return the object stream
   */
  public static ObjectStream<String> readFileIntoMarkableStreamFactory(final String infile) {

    ObjectStream<String> lineStream = null;
    try {
      InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(infile));
      lineStream = new PlainTextByLineStream(inputStreamFactory, "UTF-8");
    } catch (IOException e) {
      // covers a missing input file as well, since FileNotFoundException is an IOException
      CmdLineUtil.handleCreateObjectStreamError(e);
    }
    return lineStream;
  }
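  // Minimal usage sketch (printLines is not part of the original tools): iterate the returned
  // stream line by line and close it afterwards, the same pattern used by the tools above.
  public static void printLines(String infile) throws IOException {
    ObjectStream<String> lines = readFileIntoMarkableStreamFactory(infile);
    try {
      String line;
      while ((line = lines.read()) != null) {
        System.out.println(line);
      }
    } finally {
      lines.close();
    }
  }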
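  /**
   * Interactive console tool: reads sentences from standard input and prints the analysis produced
   * by the pipeline for the given language and country until 'q' is entered.
   */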
  public void run(String[] args) {
    Params params = validateAndParseParams(args, Params.class);

    String lang = params.getLang();
    CmdLineUtil.checkLanguageCode(lang);

    String country = params.getCountry();
    if (StringsUtil.isNullOrEmpty(country)) {
      throw new TerminateToolException(1, "Country cannot be empty. Example country: BR");
    }

    long start = System.nanoTime();

    ComponentFactory factory;
    try {
      factory = ComponentFactory.create(new Locale(lang, country));
    } catch (InitializationException e) {
      throw new TerminateToolException(
          1,
          "Could not find configuration for "
              + lang
              + ". Only "
              + new Locale("pt", "BR")
              + " might be supported for now.",
          e);
    }
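    // build the analysis pipeline for the configured locale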
    Analyzer cogroo = factory.createPipe();

    System.out.println("Loading time [" + ((System.nanoTime() - start) / 1000000) + "ms]");

    Scanner kb = new Scanner(System.in);
    System.out.print("Enter the sentence or 'q' to quit: ");
    String input = kb.nextLine();

    while (!input.equals("q")) {

      CheckDocument document = new CheckDocument();
      document.setText(input);
      cogroo.analyze(document);

      System.out.println(TextUtils.nicePrint(document));

      System.out.print("Enter the sentence or 'q' to quit: ");
      input = kb.nextLine();
    }
  }
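  /**
   * Creates a {@link NameSample} stream over BioNLP 2004 shared task data, generating only the
   * entity types requested on the command line.
   */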
  public ObjectStream<NameSample> create(String[] args) {

    Parameters params = ArgumentParser.parse(args, Parameters.class);
    int typesToGenerate = 0;

    if (params.getTypes().contains("DNA")) {
      typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
    } else if (params.getTypes().contains("protein")) {
      typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
    } else if (params.getTypes().contains("cell_type")) {
      typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
    } else if (params.getTypes().contains("cell_line")) {
      typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
    } else if (params.getTypes().contains("RNA")) {
      typesToGenerate = typesToGenerate | BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
    }

    return new BioNLP2004NameSampleStream(
        CmdLineUtil.openInFile(new File(params.getData())), typesToGenerate);
  }
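  /**
   * Trains a part-of-speech tagger model from the configured sample stream, optionally building an
   * n-gram dictionary and a tag dictionary first, and writes the model to disk.
   */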
  public void run(String format, String[] args) {
    super.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);
    if (mlParams != null && !TrainerFactory.isValid(mlParams.getSettings())) {
      throw new TerminateToolException(
          1, "Training parameters file '" + params.getParams() + "' is invalid!");
    }

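    // no parameters file given: use the defaults and select the algorithm from the model type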
    if (mlParams == null) {
      mlParams = ModelUtil.createDefaultTrainingParameters();
      mlParams.put(TrainingParameters.ALGORITHM_PARAM, getModelType(params.getType()).toString());
    }

    File modelOutFile = params.getModel();
    CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);

    Dictionary ngramDict = null;

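    // optionally build an n-gram dictionary over the training data when an n-gram cutoff is given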
    Integer ngramCutoff = params.getNgram();

    if (ngramCutoff != null) {
      System.err.print("Building ngram dictionary ... ");
      try {
        ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff);
        sampleStream.reset();
      } catch (IOException e) {
        throw new TerminateToolException(
            -1, "IO error while building NGram Dictionary: " + e.getMessage(), e);
      }
      System.err.println("done");
    }

    POSTaggerFactory postaggerFactory = null;
    try {
      postaggerFactory = POSTaggerFactory.create(params.getFactory(), ngramDict, null);
    } catch (InvalidFormatException e) {
      throw new TerminateToolException(-1, e.getMessage(), e);
    }

    if (params.getDict() != null) {
      try {
        postaggerFactory.setTagDictionary(postaggerFactory.createTagDictionary(params.getDict()));
      } catch (IOException e) {
        throw new TerminateToolException(
            -1, "IO error while loading POS Dictionary: " + e.getMessage(), e);
      }
    }

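    // optionally create or extend a mutable tag dictionary from the training data with the cutoff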
    if (params.getTagDictCutoff() != null) {
      try {
        TagDictionary dict = postaggerFactory.getTagDictionary();
        if (dict == null) {
          dict = postaggerFactory.createEmptyTagDictionary();
          postaggerFactory.setTagDictionary(dict);
        }
        if (dict instanceof MutableTagDictionary) {
          POSTaggerME.populatePOSDictionary(
              sampleStream, (MutableTagDictionary) dict, params.getTagDictCutoff());
        } else {
          throw new IllegalArgumentException(
              "Can't extend a POSDictionary that does not implement MutableTagDictionary.");
        }
        sampleStream.reset();
      } catch (IOException e) {
        throw new TerminateToolException(
            -1, "IO error while creating/extending POS Dictionary: " + e.getMessage(), e);
      }
    }

    POSModel model;
    try {
      model = POSTaggerME.train(params.getLang(), sampleStream, mlParams, postaggerFactory);
    } catch (IOException e) {
      throw new TerminateToolException(
          -1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
    } finally {
      try {
        sampleStream.close();
      } catch (IOException e) {
        // ignore; failing to close the sample stream does not affect the trained model
      }
    }

    CmdLineUtil.writeModel("pos tagger", modelOutFile, model);
  }