@Test
  public void emptyDocumentTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader =
        CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/empty/",
            TextReader.PARAM_LANGUAGE,
            "en",
            TextReader.PARAM_PATTERNS,
            "empty*.txt");

    AnalysisEngineDescription segmenter =
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector =
        AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
      // Iterating drives the pipeline; the meta collector indexes each (empty) document.
    }
  }
 /**
  * Creates a UIMA sentence iterator for the given path.
  *
  * @param path the path to the root directory or file to read from
  * @return the UIMA sentence iterator for the given root directory or file
  * @throws Exception if the underlying analysis engine cannot be created
  */
 public static SentenceIterator createWithPath(String path) throws Exception {
   return new UimaSentenceIterator(
       path,
       new UimaResource(
           AnalysisEngineFactory.createEngine(
               AnalysisEngineFactory.createEngineDescription(
                   TokenizerAnnotator.getDescription(), SentenceAnnotator.getDescription()))));
 }
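A minimal usage sketch for the factory above; the corpus path below is a hypothetical placeholder, and iteration follows the SentenceIterator contract (hasNext/nextSentence):

   // Hypothetical usage: stream sentences from every file under a directory.
   SentenceIterator iter = createWithPath("src/test/resources/corpus/"); // hypothetical path
   while (iter.hasNext()) {
     System.out.println(iter.nextSentence());
   }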
 public static void main(String[] args) throws Exception {
   SimplePipeline.runPipeline(
       CollectionReaderFactory.createReader(
           ReaderExample.class,
           ReaderExample.PARAM_INPUT_FILE,
           "src/test/resources/test/input.txt"),
       AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
       AnalysisEngineFactory.createEngineDescription(BaselineExample.class),
       AnalysisEngineFactory.createEngineDescription(EvaluatorExample.class));
 }
  /**
   * Returns a sentence segmenter.
   *
   * @return a sentence segmenter
   */
  public static AnalysisEngine segmenter() {
    try {
      // Lazily create and cache the engine (note: this check is not thread-safe).
      if (defaultAnalysisEngine == null)
        defaultAnalysisEngine =
            AnalysisEngineFactory.createEngine(SentenceAnnotator.getDescription());

      return defaultAnalysisEngine;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
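A sketch of how the cached engine might be called; the Sentence type is assumed to be the annotation that SentenceAnnotator produces, and the surrounding method is assumed to declare the checked UIMA exceptions:

    // Hypothetical caller: segment a short document with the shared engine.
    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentText("First sentence. Second sentence.");
    segmenter().process(jcas);
    for (Sentence s : JCasUtil.select(jcas, Sentence.class)) { // assumed Sentence type
      System.out.println(s.getCoveredText());
    }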
  public static void main(String[] args) throws Exception {

    CollectionReader cr = createReader(WhiteTextCollectionReader.class);

    SimplePipeline.runPipeline(
        cr, AnalysisEngineFactory.createEngine(WhiteTextCollectionReaderTest.class));
  }
  public static void main(String[] args) throws Exception {
    JCas jCas = JCasFactory.createJCas();
    jCas.setDocumentLanguage("de");
    jCas.setDocumentText(
        "Die Fossillagerstätte Geiseltal befindet sich im ehemaligen Braunkohlerevier des Geiseltales südlich der Stadt Halle in Sachsen-Anhalt. Sie ist eine bedeutende Fundstelle heute ausgestorbener Pflanzen und Tiere aus der Zeit des Mittleren Eozäns vor 48 bis 41 Millionen Jahren. Im Geiseltal wurde nachweislich seit 1698 erstmals Kohle gefördert, die ersten Fossilien kamen aber erst Anfang des 20. Jahrhunderts eher zufällig zu Tage. Planmäßige wissenschaftliche Ausgrabungen begannen 1925 seitens der Martin-Luther-Universität Halle-Wittenberg. Unterbrochen durch den Zweiten Weltkrieg, können die Untersuchungen in zwei Forschungsphasen untergliedert werden. Aufgrund der zunehmenden Auskohlung der Rohstofflager kamen die Ausgrabungen Mitte der 1980er allmählich zum Erliegen und endeten endgültig zu Beginn des dritten Jahrtausends.");

    SimplePipeline.runPipeline(
        jCas,
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
        AnalysisEngineFactory.createEngineDescription(StanfordNamedEntityRecognizer.class),
        AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class));

    for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
      System.out.println("Found NE: " + ne.getValue() + ", " + ne.getCoveredText());
    }
  }
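Note that CasDumpWriter, when no PARAM_TARGET_LOCATION is configured, typically writes its CAS dump to standard output, so the console shows the dump followed by the named entities printed by the loop above.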
  //	@Test
  public void allAggregationStrategies_1segment_expectCorrectRanking() throws Exception {
    String testDocument = "foo bar baz";

    List<Class<? extends AggregationStrategy>> aggregationStrategies =
        new ArrayList<Class<? extends AggregationStrategy>>();
    aggregationStrategies.add(MaximumAggregation.class);

    for (Class<? extends AggregationStrategy> aggregationStrategy : aggregationStrategies) {

      AnalysisEngineDescription aed =
          AnalysisEngineFactory.createEngineDescription(
              BookIndexPhraseAggregationAnnotator.class);

      bindResource(aed, RankedPhraseAggregationAnnotator.AGGREGATION_STRATEGY, aggregationStrategy);

      AnalysisEngine ae = AnalysisEngineFactory.createEngine(aed);
      JCas jcas = setup_1segment(testDocument, ae);

      ae.process(jcas);

      List<String> expectedBookIndexPhrases = new ArrayList<String>();
      expectedBookIndexPhrases.add("bar");
      expectedBookIndexPhrases.add("foo");
      expectedBookIndexPhrases.add("baz");

      List<String> resultBookIndexPhrases = new ArrayList<String>();
      for (BookIndexPhrase b : JCasUtil.select(jcas, BookIndexPhrase.class)) {
        resultBookIndexPhrases.add(b.getPhrase());
      }

      assertEquals(expectedBookIndexPhrases, resultBookIndexPhrases);
    }
  }
  @Test
  public void test1() throws Exception {
    AnalysisEngine dataWriterAnnotator =
        AnalysisEngineFactory.createEngine(
            Test1Annotator.class,
            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
            outputDirectoryName,
            DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
            MaxentBooleanOutcomeDataWriter.class.getName());

    dataWriterAnnotator.process(jCas);
    dataWriterAnnotator.collectionProcessComplete();

    File trainFile =
        new MaxentBooleanOutcomeClassifierBuilder().getTrainingDataFile(this.outputDirectory);
    String[] lines = FileUtil.loadListOfStrings(trainFile);
    assertEquals("true pos_NN distance=3.0 precision=1.234", lines[0]);
    assertEquals("false name_2PO p's=2.0", lines[1]);
    assertEquals("true null=0.0", lines[2]);
    assertEquals("false A_B_AB", lines[3]);

    // Train a model from the data written above; HideOutput suppresses the
    // trainer's console output (Train.main takes the output directory plus
    // trainer arguments).
    HideOutput hider = new HideOutput();
    Train.main(outputDirectoryName, "10", "1");
    hider.restoreOutput();
  }
 public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
    return AnalysisEngineFactory.createEngineDescription(
       SentenceAnnotator.class,
       PARAM_SENTENCE_MODEL_PATH,
       ParamUtil.getParameterValue(PARAM_SENTENCE_MODEL_PATH, "/models/en-sent.bin"),
       PARAM_WINDOW_CLASS_NAMES,
       ParamUtil.getParameterValue(PARAM_WINDOW_CLASS_NAMES, null));
 }
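A wiring sketch for this description; the reader configuration is hypothetical and mirrors the TextReader setup used in the tests above:

   // Sketch: run the sentence annotator over plain-text files (paths are hypothetical).
   CollectionReaderDescription reader =
       CollectionReaderFactory.createReaderDescription(
           TextReader.class,
           TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/",
           TextReader.PARAM_PATTERNS, "*.txt");
   SimplePipeline.runPipeline(reader, SentenceAnnotator.getDescription());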
  public void runForArabic() throws UIMAException {
    // The corpus-specific list supersedes the generic Arabic stopword list,
    // so only this assignment takes effect.
    this.stopwords = new Stopwords("semeval2015-3/arabic-corpus-specific-stopwords.txt");

    this.pfArabic = new PairFeatureFactoryArabic(this.alphabet);
    this.pfArabic.setupMeasures(RichNode.OUTPUT_PAR_TOKEN_LOWERCASE, this.stopwords);

    this.language = LANG_ARABIC;

    this.preliminaryCas = JCasFactory.createJCas();

    /** Specify the task labels. For Arabic there is just one task. */
    this.a_labels.add("direct");
    this.a_labels.add("related");
    this.a_labels.add("irrelevant");

    /**
     * Instantiate the QCRI Analyzer; note that for now we use the analysis
     * engines instantiated below instead.
     */
    if (USE_QCRI_ALT_TOOLS) {
      this.analyzer = new Analyzer(new UIMANoPersistence());
      analyzer.addAE(
          AnalysisEngineFactory.createEngine(createEngineDescription(ArabicAnalyzer.class)));
    } else {
      /**
       * Whitespace tokenizer: the Stanford Segmenter for Arabic has a serious
       * bug that makes its tokenization completely wrong, so it is avoided here.
       */
      AnalysisEngine segmenter = createEngine(createEngineDescription(WhitespaceTokenizer.class));
      /** Stanford POS-Tagger */
      AnalysisEngine postagger =
          createEngine(
              createEngineDescription(
                  StanfordPosTagger.class,
                  StanfordPosTagger.PARAM_LANGUAGE,
                  "ar",
                  StanfordPosTagger.PARAM_VARIANT,
                  "accurate"));
      /** Putting together the UIMA DKPro annotators */
      this.analysisEngineList = new AnalysisEngine[2];
      this.analysisEngineList[0] = segmenter;
      this.analysisEngineList[1] = postagger;
    }

    try {
      processArabicFile(analyzer, CQA_QL_TRAIN_AR, "train");
      processArabicFile(analyzer, CQA_QL_DEV_AR, "dev");
    } catch (SimilarityException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
 public static AnalysisEngineDescription getDescription(String languageCode)
     throws ResourceInitializationException {
   String modelPath = String.format("/models/%s-pos-maxent.bin", languageCode);
   return AnalysisEngineFactory.createEngineDescription(
       PoStagger.class,
       UimaUtil.MODEL_PARAMETER,
       ExternalResourceFactory.createExternalResourceDescription(
           POSModelResourceImpl.class, PoStagger.class.getResource(modelPath).toString()),
       UimaUtil.SENTENCE_TYPE_PARAMETER,
       Sentence.class.getName(),
       UimaUtil.TOKEN_TYPE_PARAMETER,
       Token.class.getName(),
       UimaUtil.POS_FEATURE_PARAMETER,
       "pos");
 }
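A usage sketch, assuming the matching model (here /models/en-pos-maxent.bin) is on the classpath and that the CAS already holds the Sentence and Token annotations the tagger requires:

   // Hypothetical caller: instantiate an English tagger and process a prepared CAS.
   AnalysisEngine tagger = AnalysisEngineFactory.createEngine(getDescription("en"));
   tagger.process(jcas); // jcas is assumed to contain Sentence and Token annotations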
  @Test
  public void testDataWriterDescriptor() throws UIMAException {
    AnalysisEngine engine =
        AnalysisEngineFactory.createEngine(
            ExamplePosAnnotator.getWriterDescription(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY));

    String outputDir =
        (String) engine.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY);
    outputDir = outputDir.replace(File.separatorChar, '/');
    Assert.assertEquals(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY, outputDir);

    String expectedDataWriterFactory = ViterbiDataWriterFactory.class.getName();
    Object dataWriter =
        engine.getConfigParameterValue(
            CleartkSequenceAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME);
    Assert.assertEquals(expectedDataWriterFactory, dataWriter);
    engine.collectionProcessComplete();
  }
  public static void main(String[] args) throws Exception {
    Options options = CliFactory.parseArguments(Options.class, args);

    List<File> testFiles =
        DocumentClassificationEvaluation.getFilesFromDirectory(options.getTestDirectory());

    DocumentClassificationEvaluation evaluation =
        new DocumentClassificationEvaluation(options.getModelsDirectory());
    CollectionReader collectionReader = evaluation.getCollectionReader(testFiles);

    AggregateBuilder builder =
        DocumentClassificationEvaluation.createDocumentClassificationAggregate(
            options.getModelsDirectory(), AnnotatorMode.CLASSIFY);

    SimplePipeline.runPipeline(
        collectionReader,
        builder.createAggregateDescription(),
        AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
  }
  @Test
  public void testAnnotatorDescriptor() throws Exception {
    HideOutput hider = new HideOutput();
    BuildTestExamplePosModel.main();
    hider.restoreOutput();

    String modelFileName =
        JarClassifierBuilder.getModelJarFile(ExamplePosAnnotator.DEFAULT_OUTPUT_DIRECTORY)
            .getPath();
    AnalysisEngineDescription posTaggerDescription =
        ExamplePosAnnotator.getClassifierDescription(modelFileName);
    AnalysisEngine engine = AnalysisEngineFactory.createEngine(posTaggerDescription);

    Object classifierJar =
        engine.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH);
    Assert.assertEquals(modelFileName, classifierJar);

    engine.collectionProcessComplete();
  }
  @Override
  protected AnalysisEngine[] createAnalysisEngines() throws ResourceInitializationException {

    final ExternalResourceDescription parserChunkingDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "parserChunking", SharedOpenNLPModel.class);

    // Add in the OpenNLP implementation too, as it's a prerequisite (in theory
    // we should test OpenNLPParser in isolation, but in practice it has this as
    // a dependency, so it's better to test that they work together).

    final ExternalResourceDescription tokensDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "tokens", SharedOpenNLPModel.class);
    final ExternalResourceDescription sentencesDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "sentences", SharedOpenNLPModel.class);
    final ExternalResourceDescription posDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "posTags", SharedOpenNLPModel.class);
    final ExternalResourceDescription chunksDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "phraseChunks", SharedOpenNLPModel.class);

    return asArray(
        createAnalysisEngine(
            OpenNLP.class,
            "tokens",
            tokensDesc,
            "sentences",
            sentencesDesc,
            "posTags",
            posDesc,
            "phraseChunks",
            chunksDesc),
        createAnalysisEngine(OpenNLPParser.class, "parserChunking", parserChunkingDesc));
  }
  @Test
  public void test() throws UIMAException {
    AnalysisEngine ae =
        AnalysisEngineFactory.createEngine(
            FeatureValueReplacer.class,
            tsd,
            FeatureValueReplacer.PARAM_ANNO_TYPE,
            DocumentMetadata.class.getName(),
            FeatureValueReplacer.PARAM_FEATURE_PATH,
            "sourceUri",
            FeatureValueReplacer.PARAM_PATTERN,
            "file:.+/([^/]+)$",
            FeatureValueReplacer.PARAM_REPLACE_BY,
            "$1");
    JCas cas = ae.newCAS().getJCas();
    cas.setDocumentText("Bla bla");
    DocumentMetadata metaAnno = new DocumentMetadata(cas);
    metaAnno.setBegin(0);
    metaAnno.setEnd(0);
    metaAnno.setSourceUri("file:/d:/somefolder/somemore/foobar.txt");
    metaAnno.addToIndexes();

    ae.process(cas);

    metaAnno = (DocumentMetadata) cas.getAnnotationIndex(DocumentMetadata.type).iterator().next();
    assertEquals("foobar.txt", metaAnno.getSourceUri());

    // next trial
    cas = ae.newCAS().getJCas();
    cas.setDocumentText("Bla bla more");
    metaAnno = new DocumentMetadata(cas);
    metaAnno.setBegin(0);
    metaAnno.setEnd(0);
    metaAnno.setSourceUri("http://example.org/qwerty.txt");
    metaAnno.addToIndexes();

    ae.process(cas);

    metaAnno = (DocumentMetadata) cas.getAnnotationIndex(DocumentMetadata.type).iterator().next();
    assertEquals("http://example.org/qwerty.txt", metaAnno.getSourceUri());
  }
  /** Here we test that an exception is thrown if an instance with no outcome is written. */
  @Test
  public void test4() throws Exception {

    HideOutput hider = new HideOutput();

    AnalysisEngine dataWriterAnnotator =
        AnalysisEngineFactory.createEngine(
            Test4Annotator.class,
            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
            outputDirectoryName,
            DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
            MaxentBooleanOutcomeDataWriter.class.getName());

    AnalysisEngineProcessException aepe = null;
    try {
      dataWriterAnnotator.process(jCas);
    } catch (AnalysisEngineProcessException e) {
      aepe = e;
    }
    dataWriterAnnotator.collectionProcessComplete();
    assertNotNull(aepe);
    hider.restoreOutput();
  }
  public static AnalysisEngineDescription getDescription(
      Class<? extends Annotation> parentheticalClass,
      Class<? extends Annotation> windowClass,
      char leftParen,
      char rightParen)
      throws ResourceInitializationException {
    AnalysisEngineDescription aed =
        AnalysisEngineFactory.createEngineDescription(
            ParentheticalAnnotator.class,
            PARAM_LEFT_PARENTHESIS,
            "" + leftParen,
            PARAM_RIGHT_PARENTHESIS,
            "" + rightParen,
            PARAM_PARENTHETICAL_TYPE_NAME,
            parentheticalClass.getName());

    if (windowClass != null) {
      ConfigurationParameterFactory.addConfigurationParameters(
          aed, PARAM_WINDOW_TYPE_NAME, windowClass.getName());
    }

    return aed;
  }
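A configuration sketch; Parenthetical and Sentence below stand in for whatever annotation types a caller actually uses:

    // Hypothetical: mark round-bracket parentheticals, searching within sentences.
    AnalysisEngineDescription parens =
        getDescription(Parenthetical.class, Sentence.class, '(', ')');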
 public static AnalysisEngineDescription getDescription(String outputDir)
     throws ResourceInitializationException {
   return AnalysisEngineFactory.createEngineDescription(
       TempEval2007Writer.class, PARAM_OUTPUT_DIRECTORY_NAME, outputDir);
 }
 /**
  * @return an analysis engine description for the tokenizer component
  * @throws UIMAException if the description cannot be created
  * @throws IOException if an underlying resource cannot be read
  */
 public static AnalysisEngineDescription getAEDescription() throws UIMAException, IOException {
   return AnalysisEngineFactory.createEngineDescription(AE_TOKENIZER);
 }
  @Test
  public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader =
        CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/data/",
            TextReader.PARAM_LANGUAGE,
            "en",
            TextReader.PARAM_PATTERNS,
            "text*.txt");

    AnalysisEngineDescription segmenter =
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector =
        AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
      // Iterating drives the pipeline; the meta collector writes the n-gram index.
    }

    int i = 0;
    IndexReader index;
    try {
      index = DirectoryReader.open(FSDirectory.open(tmpDir));
      Fields fields = MultiFields.getFields(index);
      if (fields != null) {
        Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator(null);
          BytesRef text = null;
          while ((text = termsEnum.next()) != null) {

            if (text.utf8ToString().equals("this")) {
              assertEquals(2, termsEnum.docFreq());
              assertEquals(3, termsEnum.totalTermFreq());
            }

            i++;
          }
        }
      }
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
  }
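The Lucene calls in this test (FSDirectory.open on a File, MultiFields.getFields, terms.iterator(null)) follow the Lucene 4.x API; later versions changed or removed these signatures, so the test is tied to that release line.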
  @Test
  public void testSimpleSentence() throws Exception {
    AnalysisEngine engine =
        AnalysisEngineFactory.createEngine(
            ExamplePosAnnotator.class,
            CleartkSequenceAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
            PublicFieldSequenceDataWriter.StringFactory.class.getName(),
            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
            ".");

    // create some tokens with part of speech tags
    tokenBuilder.buildTokens(
        jCas,
        "The Absurdis retreated in 2003.",
        "The Absurdis retreated in 2003 .", // the tokenized version of the text
        "DT NNP VBD IN CD .");

    List<Instance<String>> instances =
        PublicFieldSequenceDataWriter.StringFactory.collectInstances(engine, jCas);

    List<String> featureValues;

    // check "The"
    featureValues =
        Arrays.asList(
            "The", // word
            "the", // lower case
            "INITIAL_UPPERCASE", // capital type
            // numeric type
            "he", // last 2 chars
            "The", // last 3 chars
            "OOB2", // left 2 words
            "OOB1",
            "Absurdis", // right 2 words
            "retreated");
    Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(0)));
    Assert.assertEquals("DT", instances.get(0).getOutcome());

    // check "Absurdis"
    featureValues =
        Arrays.asList(
            "Absurdis", // word
            "absurdis", // lower case
            "INITIAL_UPPERCASE", // capital type
            // numeric type
            "is", // last 2 chars
            "dis", // last 3 chars
            "OOB1", // left 2 words
            "The",
            "retreated", // right 2 words
            "in");
    Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(1)));
    Assert.assertEquals("NNP", instances.get(1).getOutcome());

    // check "retreated"
    featureValues =
        Arrays.asList(
            "retreated", // word
            "retreated", // lower case
            "ALL_LOWERCASE", // capital type
            // numeric type
            "ed", // last 2 chars
            "ted", // last 3 chars
            "The", // left 2 words
            "Absurdis", // right 2 words
            "in",
            "2003");
    Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(2)));
    Assert.assertEquals("VBD", instances.get(2).getOutcome());

    // check "in"
    featureValues =
        Arrays.asList(
            "in", // word
            "in", // lower case
            "ALL_LOWERCASE", // capital type
            // numeric type
            "in", // last 2 chars
            // last 3 chars
            "Absurdis", // left 2 words
            "retreated",
            "2003", // right 2 words
            ".");
    Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(3)));
    Assert.assertEquals("IN", instances.get(3).getOutcome());

    // check "2003"
    featureValues =
        Arrays.asList(
            "2003", // word
            "2003", // lower case
            // capital type
            "YEAR_DIGITS", // numeric type
            "03", // last 2 chars
            "003", // last 3 chars
            "retreated", // left 2 words
            "in",
            ".", // right 2 words
            "OOB1");
    Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(4)));
    Assert.assertEquals("CD", instances.get(4).getOutcome());

    // check "."
    featureValues =
        Arrays.asList(
            ".", // word
            ".", // lower case
            // capital type
            // numeric type
            // last 2 chars
            // last 3 chars
            "in", // left 2 words
            "2003",
            "OOB1", // right 2 words
            "OOB2");
    Assert.assertEquals(featureValues, this.getFeatureValues(instances.get(5)));
    Assert.assertEquals(".", instances.get(5).getOutcome());
  }