@Test
  public void test1() throws Exception {
    // Run Test1Annotator once with a MaxentBooleanOutcomeDataWriter so that training data is
    // written below outputDirectoryName.
    AnalysisEngine dataWriterAnnotator =
        AnalysisEngineFactory.createEngine(
            Test1Annotator.class,
            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
            outputDirectoryName,
            DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
            MaxentBooleanOutcomeDataWriter.class.getName());

    dataWriterAnnotator.process(jCas);
    dataWriterAnnotator.collectionProcessComplete();

    // Check the first four lines of the training-data file, in order.
    File trainFile =
        new MaxentBooleanOutcomeClassifierBuilder().getTrainingDataFile(this.outputDirectory);
    String[] lines = FileUtil.loadListOfStrings(trainFile);
    assertEquals("true pos_NN distance=3.0 precision=1.234", lines[0]);
    assertEquals("false name_2PO p's=2.0", lines[1]);
    assertEquals("true null=0.0", lines[2]);
    assertEquals("false A_B_AB", lines[3]);

    // Train a model on the data just written. Output is hidden while training runs; the finally
    // block restores it even when training throws, so later tests are not left without output.
    HideOutput hider = new HideOutput();
    try {
      Train.main(outputDirectoryName, "10", "1");
    } finally {
      hider.restoreOutput();
    }
  }
  /**
   * Runs a simple pipeline: a reader created from {@code WhiteTextCollectionReader} feeds an engine
   * created from {@code WhiteTextCollectionReaderTest}.
   *
   * @param args command-line arguments; not read by this method
   * @throws Exception if creating the reader or engine, or running the pipeline, fails
   */
  public static void main(String[] args) throws Exception {
    final CollectionReader whiteTextReader = createReader(WhiteTextCollectionReader.class);
    SimplePipeline.runPipeline(
        whiteTextReader, AnalysisEngineFactory.createEngine(WhiteTextCollectionReaderTest.class));
  }
 /**
  * Builds a UIMA-backed sentence iterator for the given path.
  *
  * <p>The iterator is given a {@code UimaResource} wrapping an analysis engine whose description
  * aggregates the {@code TokenizerAnnotator} and {@code SentenceAnnotator} descriptions.
  *
  * @param path the path to the root directory or file to read from
  * @return the UIMA sentence iterator for the given root directory or file
  * @throws Exception if the engine description, the engine, or the iterator cannot be created
  */
 public static SentenceIterator createWithPath(String path) throws Exception {
   UimaResource tokenizingResource =
       new UimaResource(
           AnalysisEngineFactory.createEngine(
               AnalysisEngineFactory.createEngineDescription(
                   TokenizerAnnotator.getDescription(), SentenceAnnotator.getDescription())));
   return new UimaSentenceIterator(path, tokenizingResource);
 }
Exemplo n.º 4
0
  /**
   * Configures this instance for Arabic and processes the Arabic train and dev files.
   *
   * <p>Sets the stopword list, the Arabic pair-feature factory, the language, the preliminary CAS
   * and the three task labels, then prepares either the QCRI analyzer or a two-engine list
   * (whitespace tokenizer and Stanford POS tagger) depending on {@code USE_QCRI_ALT_TOOLS}.
   * A {@code SimilarityException} or {@code IOException} raised while processing the files is
   * printed to the error stream and not rethrown.
   *
   * @throws UIMAException declared for the UIMA factory calls made during setup
   */
  public void runForArabic() throws UIMAException {
    // NOTE(review): the first stopword list is replaced on the very next line. It is kept here
    // because the constructor's side effects are not visible from this method -- confirm whether
    // the first assignment can be dropped.
    this.stopwords = new Stopwords(Stopwords.STOPWORD_AR);
    this.stopwords = new Stopwords("semeval2015-3/arabic-corpus-specific-stopwords.txt");

    this.pfArabic = new PairFeatureFactoryArabic(this.alphabet);
    this.pfArabic.setupMeasures(RichNode.OUTPUT_PAR_TOKEN_LOWERCASE, this.stopwords);

    this.language = LANG_ARABIC;

    this.preliminaryCas = JCasFactory.createJCas();

    // Task labels. For Arabic there is just one task.
    for (String taskLabel : new String[] {"direct", "related", "irrelevant"}) {
      this.a_labels.add(taskLabel);
    }

    if (!USE_QCRI_ALT_TOOLS) {
      // Whitespace tokenizer. The Stanford Segmenter for Arabic has a very bad bug and the
      // tokenization is completely wrong.
      AnalysisEngine whitespaceSegmenter =
          createEngine(createEngineDescription(WhitespaceTokenizer.class));
      // Stanford POS tagger.
      AnalysisEngine stanfordTagger =
          createEngine(
              createEngineDescription(
                  StanfordPosTagger.class,
                  StanfordPosTagger.PARAM_LANGUAGE,
                  "ar",
                  StanfordPosTagger.PARAM_VARIANT,
                  "accurate"));
      // Put the UIMA DKPro annotators together: segmenter first, then tagger.
      this.analysisEngineList = new AnalysisEngine[] {whitespaceSegmenter, stanfordTagger};
    } else {
      // Instantiate the QCRI analyzer with the Arabic analyzer engine attached.
      this.analyzer = new Analyzer(new UIMANoPersistence());
      analyzer.addAE(
          AnalysisEngineFactory.createEngine(createEngineDescription(ArabicAnalyzer.class)));
    }

    // NOTE(review): `analyzer` is passed on in both branches, although only the QCRI branch
    // assigns it here -- confirm what processArabicFile expects in the other case.
    try {
      processArabicFile(analyzer, CQA_QL_TRAIN_AR, "train");
      processArabicFile(analyzer, CQA_QL_DEV_AR, "dev");
    } catch (SimilarityException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  /**
   * Returns the shared sentence segmenter, creating it on first use.
   *
   * <p>The engine is built from the {@code SentenceAnnotator} description and stored in
   * {@code defaultAnalysisEngine}; later calls return the stored engine.
   *
   * @return a sentence segmenter
   * @throws RuntimeException wrapping any exception raised while creating the engine
   */
  public static AnalysisEngine segmenter() {
    if (defaultAnalysisEngine != null) {
      return defaultAnalysisEngine;
    }
    try {
      defaultAnalysisEngine =
          AnalysisEngineFactory.createEngine(
              AnalysisEngineFactory.createEngineDescription(SentenceAnnotator.getDescription()));
    } catch (Exception creationFailure) {
      throw new RuntimeException(creationFailure);
    }
    return defaultAnalysisEngine;
  }
  /**
   * Checks that an engine built from the {@code ExamplePosAnnotator} writer description reports
   * the default output directory and the Viterbi data writer factory as configuration values.
   */
  @Test
  public void testDataWriterDescriptor() throws UIMAException {
    AnalysisEngine writerEngine =
        AnalysisEngineFactory.createEngine(
            ExamplePosAnnotator.getWriterDescription(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY));

    // Compare directories with forward slashes so the check does not depend on the platform
    // separator.
    Object configuredDirectory =
        writerEngine.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY);
    String normalizedDirectory = ((String) configuredDirectory).replace(File.separatorChar, '/');
    Assert.assertEquals(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY, normalizedDirectory);

    Object configuredFactory =
        writerEngine.getConfigParameterValue(
            CleartkSequenceAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME);
    Assert.assertEquals(ViterbiDataWriterFactory.class.getName(), configuredFactory);

    writerEngine.collectionProcessComplete();
  }
  /**
   * Builds the example POS model, then checks that an engine created from the classifier
   * description reports the model jar path it was configured with.
   */
  @Test
  public void testAnnotatorDescriptor() throws Exception {
    // Hide output while the model is built; the finally block restores it even when the build
    // throws, so later tests are not left without output.
    HideOutput hider = new HideOutput();
    try {
      BuildTestExamplePosModel.main();
    } finally {
      hider.restoreOutput();
    }

    String modelFileName =
        JarClassifierBuilder.getModelJarFile(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY)
            .getPath();
    AnalysisEngineDescription posTaggerDescription =
        ExamplePosAnnotator.getClassifierDescription(modelFileName);
    AnalysisEngine engine = AnalysisEngineFactory.createEngine(posTaggerDescription);

    Object classifierJar =
        engine.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH);
    Assert.assertEquals(modelFileName, classifierJar);

    engine.collectionProcessComplete();
  }
  /**
   * Runs {@code FeatureValueReplacer} on the {@code sourceUri} feature of {@code DocumentMetadata}
   * with the pattern {@code file:.+/([^/]+)$} and replacement {@code $1}, for two source URIs.
   */
  @Test
  public void test() throws UIMAException {
    AnalysisEngine replacer =
        AnalysisEngineFactory.createEngine(
            FeatureValueReplacer.class,
            tsd,
            FeatureValueReplacer.PARAM_ANNO_TYPE,
            DocumentMetadata.class.getName(),
            FeatureValueReplacer.PARAM_FEATURE_PATH,
            "sourceUri",
            FeatureValueReplacer.PARAM_PATTERN,
            "file:.+/([^/]+)$",
            FeatureValueReplacer.PARAM_REPLACE_BY,
            "$1");

    // First trial: a file: URI is expected to be reduced to its last path segment.
    assertEquals(
        "foobar.txt",
        sourceUriAfterProcessing(replacer, "Bla bla", "file:/d:/somefolder/somemore/foobar.txt"));

    // Next trial: an http: URI is expected to come back unchanged.
    assertEquals(
        "http://example.org/qwerty.txt",
        sourceUriAfterProcessing(replacer, "Bla bla more", "http://example.org/qwerty.txt"));
  }

  /**
   * Creates a fresh CAS from the engine, indexes one zero-length {@code DocumentMetadata} carrying
   * the given source URI, processes the CAS and returns the source URI of the first indexed
   * {@code DocumentMetadata} afterwards.
   */
  private static String sourceUriAfterProcessing(
      AnalysisEngine engine, String documentText, String sourceUri) throws UIMAException {
    JCas cas = engine.newCAS().getJCas();
    cas.setDocumentText(documentText);

    DocumentMetadata seeded = new DocumentMetadata(cas);
    seeded.setBegin(0);
    seeded.setEnd(0);
    seeded.setSourceUri(sourceUri);
    seeded.addToIndexes();

    engine.process(cas);

    DocumentMetadata indexed =
        (DocumentMetadata) cas.getAnnotationIndex(DocumentMetadata.type).iterator().next();
    return indexed.getSourceUri();
  }
  /**
   * Here we test that an {@code AnalysisEngineProcessException} is thrown when an instance with
   * no outcome is processed.
   */
  @Test
  public void test4() throws Exception {

    // Output stays hidden for the whole test; the finally block restores it even when engine
    // creation, collectionProcessComplete, or the assertion fails.
    HideOutput hider = new HideOutput();
    try {
      AnalysisEngine dataWriterAnnotator =
          AnalysisEngineFactory.createEngine(
              Test4Annotator.class,
              DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
              outputDirectoryName,
              DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
              MaxentBooleanOutcomeDataWriter.class.getName());

      // Capture the exception rather than letting it propagate, so the engine can still be
      // completed before the assertion.
      AnalysisEngineProcessException aepe = null;
      try {
        dataWriterAnnotator.process(jCas);
      } catch (AnalysisEngineProcessException e) {
        aepe = e;
      }
      dataWriterAnnotator.collectionProcessComplete();
      assertNotNull(aepe);
    } finally {
      hider.restoreOutput();
    }
  }
  /**
   * Runs {@code ExamplePosAnnotator} over one tokenized, POS-tagged sentence and checks the
   * feature values and outcome of each of the six collected instances.
   *
   * <p>Expected feature values are listed in the order: word, lower case, capital type, numeric
   * type, last 2 chars, last 3 chars, left 2 words, right 2 words. A slot is simply absent from
   * the expected list when no value is expected for that token.
   */
  @Test
  public void testSimpleSentence() throws Exception {
    AnalysisEngine engine =
        AnalysisEngineFactory.createEngine(
            ExamplePosAnnotator.class,
            CleartkSequenceAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
            PublicFieldSequenceDataWriter.StringFactory.class.getName(),
            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
            ".");

    // create some tokens with part of speech tags
    tokenBuilder.buildTokens(
        jCas,
        "The Absurdis retreated in 2003.",
        "The Absurdis retreated in 2003 .", // the tokenized version of the text
        "DT NNP VBD IN CD .");

    List<Instance<String>> instances =
        PublicFieldSequenceDataWriter.StringFactory.collectInstances(engine, jCas);

    // "The": no numeric type
    assertFeaturesAndOutcome(
        instances.get(0),
        "DT",
        "The", // word
        "the", // lower case
        "INITIAL_UPPERCASE", // capital type
        "he", // last 2 chars
        "The", // last 3 chars
        "OOB2", // left 2 words
        "OOB1",
        "Absurdis", // right 2 words
        "retreated");

    // "Absurdis": no numeric type
    assertFeaturesAndOutcome(
        instances.get(1),
        "NNP",
        "Absurdis", // word
        "absurdis", // lower case
        "INITIAL_UPPERCASE", // capital type
        "is", // last 2 chars
        "dis", // last 3 chars
        "OOB1", // left 2 words
        "The",
        "retreated", // right 2 words
        "in");

    // "retreated": no numeric type
    assertFeaturesAndOutcome(
        instances.get(2),
        "VBD",
        "retreated", // word
        "retreated", // lower case
        "ALL_LOWERCASE", // capital type
        "ed", // last 2 chars
        "ted", // last 3 chars
        "The", // left 2 words
        "Absurdis",
        "in", // right 2 words
        "2003");

    // "in": no numeric type and no last 3 chars
    assertFeaturesAndOutcome(
        instances.get(3),
        "IN",
        "in", // word
        "in", // lower case
        "ALL_LOWERCASE", // capital type
        "in", // last 2 chars
        "Absurdis", // left 2 words
        "retreated",
        "2003", // right 2 words
        ".");

    // "2003": no capital type
    assertFeaturesAndOutcome(
        instances.get(4),
        "CD",
        "2003", // word
        "2003", // lower case
        "YEAR_DIGITS", // numeric type
        "03", // last 2 chars
        "003", // last 3 chars
        "retreated", // left 2 words
        "in",
        ".", // right 2 words
        "OOB1");

    // ".": no capital type, numeric type or character suffixes
    assertFeaturesAndOutcome(
        instances.get(5),
        ".",
        ".", // word
        ".", // lower case
        "in", // left 2 words
        "2003",
        "OOB1", // right 2 words
        "OOB2");
  }

  /**
   * Asserts that the instance's feature values equal {@code expectedFeatures}, in order, and then
   * that its outcome equals {@code expectedOutcome}.
   */
  private void assertFeaturesAndOutcome(
      Instance<String> instance, String expectedOutcome, String... expectedFeatures)
      throws Exception {
    Assert.assertEquals(Arrays.asList(expectedFeatures), this.getFeatureValues(instance));
    Assert.assertEquals(expectedOutcome, instance.getOutcome());
  }