public void run(String format, String[] args) {
  super.run(format, args);

  mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
  if (mlParams == null) {
    mlParams = ModelUtil.createTrainingParameters(params.getIterations(), params.getCutoff());
  }

  File modelOutFile = params.getModel();
  CmdLineUtil.checkOutputFile("chunker model", modelOutFile);

  ChunkerModel model;
  try {
    ChunkerFactory chunkerFactory = ChunkerFactory.create(params.getFactory());
    model = ChunkerME.train(params.getLang(), sampleStream, mlParams, chunkerFactory);
  } catch (IOException e) {
    throw new TerminateToolException(-1,
        "IO error while reading training data or indexing data: " + e.getMessage(), e);
  } finally {
    try {
      sampleStream.close();
    } catch (IOException e) {
      // closing the sample stream is best effort; ignore failures here
    }
  }

  CmdLineUtil.writeModel("chunker", modelOutFile, model);
}
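// A minimal usage sketch (not part of the tool above): load the model that the
// trainer wrote and chunk one POS-tagged sentence. The model file name, tokens,
// and POS tags below are illustrative placeholders.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;

public class ChunkerUsageSketch {
  public static void main(String[] args) throws IOException {
    try (InputStream in = new FileInputStream("en-chunker.bin")) {
      ChunkerModel model = new ChunkerModel(in);
      ChunkerME chunker = new ChunkerME(model);
      String[] tokens = {"Rockwell", "said", "the", "agreement", "calls"};
      String[] posTags = {"NNP", "VBD", "DT", "NN", "VBZ"};
      // One chunk tag (e.g. B-NP, I-NP, O) per input token.
      String[] chunkTags = chunker.chunk(tokens, posTags);
      for (int i = 0; i < tokens.length; i++) {
        System.out.println(tokens[i] + "\t" + posTags[i] + "\t" + chunkTags[i]);
      }
    }
  }
}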
public void run(String[] args) {
  Parameters params = validateAndParseParams(args, Parameters.class);

  File testData = new File(params.getCensusData());
  File dictOutFile = new File(params.getDict());

  CmdLineUtil.checkInputFile("Name data", testData);
  CmdLineUtil.checkOutputFile("Dictionary file", dictOutFile);

  FileInputStream sampleDataIn = CmdLineUtil.openInFile(testData);
  ObjectStream<StringList> sampleStream =
      new NameFinderCensus90NameStream(sampleDataIn, Charset.forName(params.getEncoding()));

  Dictionary mDictionary;
  try {
    System.out.println("Creating Dictionary...");
    mDictionary = createDictionary(sampleStream);
  } catch (IOException e) {
    throw new TerminateToolException(-1,
        "IO error while reading training data or indexing data: " + e.getMessage(), e);
  } finally {
    try {
      sampleStream.close();
    } catch (IOException e) {
      // closing the sample stream is best effort; ignore failures here
    }
  }

  System.out.println("Saving Dictionary...");

  OutputStream out = null;
  try {
    out = new FileOutputStream(dictOutFile);
    mDictionary.serialize(out);
  } catch (IOException e) {
    throw new TerminateToolException(-1,
        "IO error while writing dictionary file: " + e.getMessage(), e);
  } finally {
    if (out != null) {
      try {
        out.close();
      } catch (IOException e) {
        // a failed close may leave a truncated or damaged dictionary file
        throw new TerminateToolException(-1,
            "Attention: Failed to correctly write dictionary: " + e.getMessage(), e);
      }
    }
  }
}
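// A minimal sketch (not part of the tool above) showing how the serialized
// dictionary can be read back with OpenNLP's Dictionary and queried for a
// name. The file name and lookup entry are placeholders.
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.StringList;

public class DictionaryLookupSketch {
  public static void main(String[] args) throws IOException {
    try (InputStream in = new FileInputStream("census-names.dict")) {
      Dictionary dict = new Dictionary(in);
      // Dictionary entries are token sequences; census names are single tokens.
      boolean known = dict.contains(new StringList("Smith"));
      System.out.println("Smith known: " + known);
    }
  }
}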
public void run(String[] args) {
  if (0 == args.length) {
    System.out.println(getHelp());
  } else {
    DoccatModel model = new DoccatModelLoader().load(new File(args[0]));
    DocumentCategorizerME doccat = new DocumentCategorizerME(model);

    ObjectStream<String> documentStream =
        new ParagraphStream(new PlainTextByLineStream(new InputStreamReader(System.in)));

    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
    perfMon.start();

    try {
      String document;
      while ((document = documentStream.read()) != null) {
        double[] prob = doccat.categorize(WhitespaceTokenizer.INSTANCE.tokenize(document));
        String category = doccat.getBestCategory(prob);

        DocumentSample sample = new DocumentSample(category, document);
        System.out.println(sample.toString());

        perfMon.incrementCounter();
      }
    } catch (IOException e) {
      CmdLineUtil.handleStdinIoError(e);
    }

    perfMon.stopAndPrintFinalResult();
  }
}
public ObjectStream<ChunkSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  language = params.getLang();

  FileInputStream sampleDataIn = CmdLineUtil.openInFile(params.getData());
  ObjectStream<String> lineStream =
      new PlainTextByLineStream(sampleDataIn.getChannel(), params.getEncoding());

  ADChunkBasedShallowParserSampleStream sampleStream =
      new ADChunkBasedShallowParserSampleStream(lineStream, params.getFunctTags(),
          params.getIsIncludePOSTags(), params.getUseCGTags(), params.getExpandME());

  if (params.getStart() != null && params.getStart() > -1) {
    sampleStream.setStart(params.getStart());
  }

  if (params.getEnd() != null && params.getEnd() > -1) {
    sampleStream.setEnd(params.getEnd());
  }

  return sampleStream;
}
/**
 * Reads the file into an {@code ObjectStream} of lines.
 *
 * @param infile the path of the file to read
 * @return the object stream, one line per read
 */
public static ObjectStream<String> readFileIntoMarkableStreamFactory(final String infile) {

  InputStreamFactory inputStreamFactory = null;
  try {
    inputStreamFactory = new MarkableFileInputStreamFactory(new File(infile));
  } catch (FileNotFoundException e) {
    // fail fast instead of continuing with a null factory
    CmdLineUtil.handleCreateObjectStreamError(e);
  }

  ObjectStream<String> lineStream = null;
  try {
    lineStream = new PlainTextByLineStream(inputStreamFactory, "UTF-8");
  } catch (IOException e) {
    CmdLineUtil.handleCreateObjectStreamError(e);
  }

  return lineStream;
}
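// A minimal usage sketch for the helper above, assuming it is in scope;
// the corpus path is a placeholder. An ObjectStream is consumed by calling
// read() until it returns null, then closed.
import java.io.IOException;
import opennlp.tools.util.ObjectStream;

public class LineStreamUsageSketch {
  public static void main(String[] args) throws IOException {
    ObjectStream<String> lines = readFileIntoMarkableStreamFactory("corpus.txt");
    try {
      String line;
      while ((line = lines.read()) != null) {
        System.out.println(line);
      }
    } finally {
      lines.close();
    }
  }
}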
public void run(String[] args) {
  Params params = validateAndParseParams(args, Params.class);

  String lang = params.getLang();
  CmdLineUtil.checkLanguageCode(lang);

  String country = params.getCountry();
  if (StringsUtil.isNullOrEmpty(country)) {
    throw new TerminateToolException(1, "Country cannot be empty. Example country: BR");
  }

  long start = System.nanoTime();

  ComponentFactory factory;
  try {
    factory = ComponentFactory.create(new Locale(lang, country));
  } catch (InitializationException e) {
    e.printStackTrace();
    throw new TerminateToolException(1, "Could not find configuration for " + lang
        + ". Only " + new Locale("pt", "BR") + " might be supported for now.");
  }

  Analyzer cogroo = factory.createPipe();

  System.out.println("Loading time [" + ((System.nanoTime() - start) / 1000000) + "ms]");

  Scanner kb = new Scanner(System.in);
  System.out.print("Enter the sentence or 'q' to quit: ");
  String input = kb.nextLine();

  while (!input.equals("q")) {
    CheckDocument document = new CheckDocument();
    document.setText(input);
    cogroo.analyze(document);

    System.out.println(TextUtils.nicePrint(document));

    System.out.print("Enter the sentence or 'q' to quit: ");
    input = kb.nextLine();
  }
}
public ObjectStream<NameSample> create(String[] args) {
  Parameters params = ArgumentParser.parse(args, Parameters.class);

  // The requested types are bit flags, so test each one independently;
  // an else-if chain would silently drop all but the first matching type.
  int typesToGenerate = 0;

  if (params.getTypes().contains("DNA")) {
    typesToGenerate |= BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
  }

  if (params.getTypes().contains("protein")) {
    typesToGenerate |= BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
  }

  if (params.getTypes().contains("cell_type")) {
    typesToGenerate |= BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
  }

  if (params.getTypes().contains("cell_line")) {
    typesToGenerate |= BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
  }

  if (params.getTypes().contains("RNA")) {
    typesToGenerate |= BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
  }

  return new BioNLP2004NameSampleStream(
      CmdLineUtil.openInFile(new File(params.getData())), typesToGenerate);
}
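// A small illustration of the bit-flag pattern used above: flags are combined
// with | and tested with &. The flag values here are hypothetical stand-ins,
// not the actual BioNLP2004NameSampleStream constants.
public class FlagSketch {
  static final int GENERATE_DNA = 0x01;
  static final int GENERATE_PROTEIN = 0x02;

  public static void main(String[] args) {
    int flags = GENERATE_DNA | GENERATE_PROTEIN; // request both entity types
    System.out.println("DNA requested: " + ((flags & GENERATE_DNA) != 0));
    System.out.println("protein requested: " + ((flags & GENERATE_PROTEIN) != 0));
  }
}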
public void run(String format, String[] args) {
  super.run(format, args);

  mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), true);

  if (mlParams != null && !TrainerFactory.isValid(mlParams.getSettings())) {
    throw new TerminateToolException(1,
        "Training parameters file '" + params.getParams() + "' is invalid!");
  }

  if (mlParams == null) {
    mlParams = ModelUtil.createDefaultTrainingParameters();
    mlParams.put(TrainingParameters.ALGORITHM_PARAM, getModelType(params.getType()).toString());
  }

  File modelOutFile = params.getModel();
  CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);

  Dictionary ngramDict = null;
  Integer ngramCutoff = params.getNgram();
  if (ngramCutoff != null) {
    System.err.print("Building ngram dictionary ... ");
    try {
      ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff);
      sampleStream.reset();
    } catch (IOException e) {
      throw new TerminateToolException(-1,
          "IO error while building NGram Dictionary: " + e.getMessage(), e);
    }
    System.err.println("done");
  }

  POSTaggerFactory postaggerFactory = null;
  try {
    postaggerFactory = POSTaggerFactory.create(params.getFactory(), ngramDict, null);
  } catch (InvalidFormatException e) {
    throw new TerminateToolException(-1, e.getMessage(), e);
  }

  if (params.getDict() != null) {
    try {
      postaggerFactory.setTagDictionary(postaggerFactory.createTagDictionary(params.getDict()));
    } catch (IOException e) {
      throw new TerminateToolException(-1,
          "IO error while loading POS Dictionary: " + e.getMessage(), e);
    }
  }

  if (params.getTagDictCutoff() != null) {
    try {
      TagDictionary dict = postaggerFactory.getTagDictionary();
      if (dict == null) {
        dict = postaggerFactory.createEmptyTagDictionary();
        postaggerFactory.setTagDictionary(dict);
      }
      if (dict instanceof MutableTagDictionary) {
        POSTaggerME.populatePOSDictionary(sampleStream, (MutableTagDictionary) dict,
            params.getTagDictCutoff());
      } else {
        throw new IllegalArgumentException(
            "Can't extend a POSDictionary that does not implement MutableTagDictionary.");
      }
      sampleStream.reset();
    } catch (IOException e) {
      throw new TerminateToolException(-1,
          "IO error while creating/extending POS Dictionary: " + e.getMessage(), e);
    }
  }

  POSModel model;
  try {
    model = POSTaggerME.train(params.getLang(), sampleStream, mlParams, postaggerFactory);
  } catch (IOException e) {
    throw new TerminateToolException(-1,
        "IO error while reading training data or indexing data: " + e.getMessage(), e);
  } finally {
    try {
      sampleStream.close();
    } catch (IOException e) {
      // closing the sample stream is best effort; ignore failures here
    }
  }

  CmdLineUtil.writeModel("pos tagger", modelOutFile, model);
}
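// A minimal sketch of building TrainingParameters programmatically, i.e. the
// kind of settings the tool falls back to when no parameters file is given.
// The algorithm name and values are illustrative, not the tool's actual defaults.
import opennlp.tools.util.TrainingParameters;

public class TrainingParamsSketch {
  public static void main(String[] args) {
    TrainingParameters mlParams = new TrainingParameters();
    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
    mlParams.put(TrainingParameters.CUTOFF_PARAM, "5");
    System.out.println(mlParams.getSettings());
  }
}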