@Test public void test1() throws Exception { AnalysisEngine dataWriterAnnotator = AnalysisEngineFactory.createEngine( Test1Annotator.class, DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectoryName, DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, MaxentBooleanOutcomeDataWriter.class.getName()); dataWriterAnnotator.process(jCas); dataWriterAnnotator.collectionProcessComplete(); File trainFile = new MaxentBooleanOutcomeClassifierBuilder().getTrainingDataFile(this.outputDirectory); String[] lines = FileUtil.loadListOfStrings(trainFile); assertEquals("true pos_NN distance=3.0 precision=1.234", lines[0]); assertEquals("false name_2PO p's=2.0", lines[1]); assertEquals("true null=0.0", lines[2]); assertEquals("false A_B_AB", lines[3]); // simply train four different models where each one writes over the previous HideOutput hider = new HideOutput(); Train.main(outputDirectoryName, "10", "1"); hider.restoreOutput(); }
/** Entry point: reads the WhiteText collection and runs the test annotator over it. */
public static void main(String[] args) throws Exception {
  CollectionReader reader = createReader(WhiteTextCollectionReader.class);
  AnalysisEngine engine = AnalysisEngineFactory.createEngine(WhiteTextCollectionReaderTest.class);
  SimplePipeline.runPipeline(reader, engine);
}
/**
 * Creates a UIMA sentence iterator rooted at the given path.
 *
 * @param path the path to the root directory or file to read from
 * @return the uima sentence iterator for the given root dir or file
 * @throws Exception if the tokenizer/sentence analysis engine cannot be created
 */
public static SentenceIterator createWithPath(String path) throws Exception {
  // Aggregate the tokenizer and sentence annotators into a single engine, then wrap it
  // in a UimaResource for the iterator to drive.
  UimaResource resource =
      new UimaResource(
          AnalysisEngineFactory.createEngine(
              AnalysisEngineFactory.createEngineDescription(
                  TokenizerAnnotator.getDescription(), SentenceAnnotator.getDescription())));
  return new UimaSentenceIterator(path, resource);
}
/**
 * Configures the Arabic pipeline (stopwords, pair features, analysis engines) and processes
 * the Arabic train and dev files.
 *
 * @throws UIMAException if the preliminary JCas or an analysis engine cannot be created
 */
public void runForArabic() throws UIMAException {
  // FIX: removed a dead store — the previous code first assigned
  // new Stopwords(Stopwords.STOPWORD_AR) and then immediately overwrote it with the
  // corpus-specific list below, so the first list was never used.
  this.stopwords = new Stopwords("semeval2015-3/arabic-corpus-specific-stopwords.txt");

  this.pfArabic = new PairFeatureFactoryArabic(this.alphabet);
  this.pfArabic.setupMeasures(RichNode.OUTPUT_PAR_TOKEN_LOWERCASE, this.stopwords);
  this.language = LANG_ARABIC;

  this.preliminaryCas = JCasFactory.createJCas();

  /** Specify the task label For Arabic there is just one task */
  this.a_labels.add("direct");
  this.a_labels.add("related");
  this.a_labels.add("irrelevant");

  /**
   * Instantiate the QCRI Analyzer, but for now we are using the analysis engines instantiated
   * later on
   */
  if (USE_QCRI_ALT_TOOLS) {
    this.analyzer = new Analyzer(new UIMANoPersistence());
    analyzer.addAE(
        AnalysisEngineFactory.createEngine(createEngineDescription(ArabicAnalyzer.class)));
  } else {
    /**
     * Whitespace tokenizer. The Stanford Segmenter for Arabic has a very bad bug and the
     * tokenization is completely wrong.
     */
    AnalysisEngine segmenter = createEngine(createEngineDescription(WhitespaceTokenizer.class));

    /** Stanford POS-Tagger */
    AnalysisEngine postagger =
        createEngine(
            createEngineDescription(
                StanfordPosTagger.class,
                StanfordPosTagger.PARAM_LANGUAGE,
                "ar",
                StanfordPosTagger.PARAM_VARIANT,
                "accurate"));

    /** Putting together the UIMA DKPro annotators */
    // NOTE(review): in this branch this.analyzer is never assigned, so the calls below
    // receive whatever analyzer already holds (possibly null) — presumably the
    // analysisEngineList is used instead; confirm in processArabicFile.
    this.analysisEngineList = new AnalysisEngine[] {segmenter, postagger};
  }

  try {
    processArabicFile(analyzer, CQA_QL_TRAIN_AR, "train");
    processArabicFile(analyzer, CQA_QL_DEV_AR, "dev");
  } catch (SimilarityException | IOException e) {
    // Best-effort: both exception types were previously handled identically by printing;
    // merged into a multi-catch without changing behavior.
    e.printStackTrace();
  }
}
/**
 * Returns a sentence segmenter, creating the shared instance lazily on first use.
 *
 * @return a sentence segmenter
 */
public static AnalysisEngine segmenter() {
  if (defaultAnalysisEngine == null) {
    try {
      defaultAnalysisEngine =
          AnalysisEngineFactory.createEngine(
              AnalysisEngineFactory.createEngineDescription(SentenceAnnotator.getDescription()));
    } catch (Exception e) {
      // Creation failures are unrecoverable here; surface them unchecked.
      throw new RuntimeException(e);
    }
  }
  return defaultAnalysisEngine;
}
@Test
public void testDataWriterDescriptor() throws UIMAException {
  AnalysisEngine engine =
      AnalysisEngineFactory.createEngine(
          ExamplePosAnnotator.getWriterDescription(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY));

  // The configured output directory should match the default, modulo path separators.
  String outputDir =
      (String) engine.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY);
  Assert.assertEquals(
      ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY, outputDir.replace(File.separatorChar, '/'));

  // The descriptor should name the Viterbi data writer factory.
  Object dataWriter =
      engine.getConfigParameterValue(
          CleartkSequenceAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME);
  Assert.assertEquals(ViterbiDataWriterFactory.class.getName(), dataWriter);

  engine.collectionProcessComplete();
}
@Test
public void testAnnotatorDescriptor() throws Exception {
  // Build the example POS model with console output suppressed.
  HideOutput hider = new HideOutput();
  try {
    BuildTestExamplePosModel.main();
  } finally {
    // FIX: restore stdout/stderr even if model building fails; previously an exception
    // left the streams hidden for every subsequent test.
    hider.restoreOutput();
  }

  String modelFileName =
      JarClassifierBuilder.getModelJarFile(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY)
          .getPath();
  AnalysisEngineDescription posTaggerDescription =
      ExamplePosAnnotator.getClassifierDescription(modelFileName);
  AnalysisEngine engine = AnalysisEngineFactory.createEngine(posTaggerDescription);

  // The classifier-jar parameter in the descriptor should point at the model we just built.
  Object classifierJar =
      engine.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH);
  Assert.assertEquals(modelFileName, classifierJar);
  engine.collectionProcessComplete();
}
@Test public void test() throws UIMAException { AnalysisEngine ae = AnalysisEngineFactory.createEngine( FeatureValueReplacer.class, tsd, FeatureValueReplacer.PARAM_ANNO_TYPE, DocumentMetadata.class.getName(), FeatureValueReplacer.PARAM_FEATURE_PATH, "sourceUri", FeatureValueReplacer.PARAM_PATTERN, "file:.+/([^/]+)$", FeatureValueReplacer.PARAM_REPLACE_BY, "$1"); JCas cas = ae.newCAS().getJCas(); cas.setDocumentText("Bla bla"); DocumentMetadata metaAnno = new DocumentMetadata(cas); metaAnno.setBegin(0); metaAnno.setEnd(0); metaAnno.setSourceUri("file:/d:/somefolder/somemore/foobar.txt"); metaAnno.addToIndexes(); ae.process(cas); metaAnno = (DocumentMetadata) cas.getAnnotationIndex(DocumentMetadata.type).iterator().next(); assertEquals("foobar.txt", metaAnno.getSourceUri()); // next trial cas = ae.newCAS().getJCas(); cas.setDocumentText("Bla bla more"); metaAnno = new DocumentMetadata(cas); metaAnno.setBegin(0); metaAnno.setEnd(0); metaAnno.setSourceUri("http://example.org/qwerty.txt"); metaAnno.addToIndexes(); ae.process(cas); metaAnno = (DocumentMetadata) cas.getAnnotationIndex(DocumentMetadata.type).iterator().next(); assertEquals("http://example.org/qwerty.txt", metaAnno.getSourceUri()); }
/** Here we test that an exception is thrown if an instance with no outcome */
@Test
public void test4() throws Exception {
  HideOutput hider = new HideOutput();
  try {
    AnalysisEngine dataWriterAnnotator =
        AnalysisEngineFactory.createEngine(
            Test4Annotator.class,
            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
            outputDirectoryName,
            DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
            MaxentBooleanOutcomeDataWriter.class.getName());

    // Processing an instance with no outcome is expected to fail.
    AnalysisEngineProcessException aepe = null;
    try {
      dataWriterAnnotator.process(jCas);
    } catch (AnalysisEngineProcessException e) {
      aepe = e;
    }
    dataWriterAnnotator.collectionProcessComplete();
    assertNotNull(aepe);
  } finally {
    // FIX: restore stdout/stderr even when createEngine or the assertion throws;
    // previously a failure left the streams hidden for every subsequent test.
    hider.restoreOutput();
  }
}
/**
 * End-to-end check of the POS annotator's feature extraction: builds a six-token sentence,
 * collects one instance per token, and verifies each instance's feature values and outcome.
 * Feature order per token: word, lower case, capital type, numeric type, last 2 chars,
 * last 3 chars, left 2 words, right 2 words (absent features are simply omitted).
 */
@Test
public void testSimpleSentence() throws Exception {
  AnalysisEngine engine =
      AnalysisEngineFactory.createEngine(
          ExamplePosAnnotator.class,
          CleartkSequenceAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
          PublicFieldSequenceDataWriter.StringFactory.class.getName(),
          DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
          ".");

  // create some tokens with part of speech tags
  tokenBuilder.buildTokens(
      jCas,
      "The Absurdis retreated in 2003.",
      "The Absurdis retreated in 2003 .", // the tokenized version of the text
      "DT NNP VBD IN CD .");

  List<Instance<String>> instances =
      PublicFieldSequenceDataWriter.StringFactory.collectInstances(engine, jCas);
  List<String> featureValues;

  // check "The"
  featureValues =
      Arrays.asList(
          "The", // word
          "the", // lower case
          "INITIAL_UPPERCASE", // capital type
          // numeric type
          "he", // last 2 chars
          "The", // last 3 chars
          "OOB2", // left 2 words
          "OOB1",
          "Absurdis", // right 2 words
          "retreated");
  Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(0)));
  Assert.assertEquals("DT", instances.get(0).getOutcome());

  // check "Absurdis"
  featureValues =
      Arrays.asList(
          "Absurdis", // word
          "absurdis", // lower case
          "INITIAL_UPPERCASE", // capital type
          // numeric type
          "is", // last 2 chars
          "dis", // last 3 chars
          "OOB1", // left 2 words
          "The",
          "retreated", // right 2 words
          "in");
  Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(1)));
  Assert.assertEquals("NNP", instances.get(1).getOutcome());

  // check "retreated"
  featureValues =
      Arrays.asList(
          "retreated", // word
          "retreated", // lower case
          "ALL_LOWERCASE", // capital type
          // numeric type
          "ed", // last 2 chars
          "ted", // last 3 chars
          "The", // left 2 words
          "Absurdis", // right 2 words
          "in",
          "2003");
  Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(2)));
  Assert.assertEquals("VBD", instances.get(2).getOutcome());

  // check "in" (only two characters, so there is no last-3-chars feature)
  featureValues =
      Arrays.asList(
          "in", // word
          "in", // lower case
          "ALL_LOWERCASE", // capital type
          // numeric type
          "in", // last 2 chars
          // last 3 chars
          "Absurdis", // left 2 words
          "retreated",
          "2003", // right 2 words
          ".");
  Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(3)));
  Assert.assertEquals("IN", instances.get(3).getOutcome());

  // check "2003" (numeric token: YEAR_DIGITS instead of a capital-type feature)
  featureValues =
      Arrays.asList(
          "2003", // word
          "2003", // lower case
          // capital type
          "YEAR_DIGITS", // numeric type
          "03", // last 2 chars
          "003", // last 3 chars
          "retreated", // left 2 words
          "in",
          ".", // right 2 words
          "OOB1");
  Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(4)));
  Assert.assertEquals("CD", instances.get(4).getOutcome());

  // check "." (punctuation: no capital/numeric/suffix features at all)
  featureValues =
      Arrays.asList(
          ".", // word
          ".", // lower case
          // capital type
          // numeric type
          // last 2 chars
          // last 3 chars
          "in", // left 2 words
          "2003",
          "OOB1", // right 2 words
          "OOB2");
  Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(5)));
  Assert.assertEquals(".", instances.get(5).getOutcome());
}