@Test
  public void emptyDocumentTest() throws Exception {
    // Fresh scratch directory, handed to the meta collector as its Lucene directory.
    final File luceneDir = folder.newFolder();

    // This test has no assertions: it only fails if building the descriptions or
    // iterating the pipeline throws. The descriptions are still created in the order
    // reader -> segmenter -> meta collector, since arguments are evaluated left to right.
    for (JCas ignored :
        new JCasIterable(
            CollectionReaderFactory.createReaderDescription(
                TextReader.class,
                TextReader.PARAM_SOURCE_LOCATION,
                "src/test/resources/empty/",
                TextReader.PARAM_LANGUAGE,
                "en",
                TextReader.PARAM_PATTERNS,
                "empty*.txt"),
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
            AnalysisEngineFactory.createEngineDescription(
                LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, luceneDir))) {
      // Intentionally empty: only the iteration itself matters here.
    }
  }
 /**
  * Runs the example pipeline: a {@code ReaderExample} over a fixed input file, followed by
  * {@code BreakIteratorSegmenter}, {@code BaselineExample} and {@code EvaluatorExample}.
  *
  * @param args command-line arguments; not read by this method
  * @throws Exception if creating the reader or the engine descriptions, or running the
  *     pipeline, fails
  */
 public static void main(String[] args) throws Exception {
   // Input location is fixed; it is passed to the reader via PARAM_INPUT_FILE.
   final String inputFile = "src/test/resources/test/input.txt";

   SimplePipeline.runPipeline(
       CollectionReaderFactory.createReader(
           ReaderExample.class, ReaderExample.PARAM_INPUT_FILE, inputFile),
       AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
       AnalysisEngineFactory.createEngineDescription(BaselineExample.class),
       AnalysisEngineFactory.createEngineDescription(EvaluatorExample.class));
 }
Пример #3
0
  /**
   * Runs segmentation, Stanford named entity recognition and a CAS dump over a fixed German
   * sample text, then prints every {@code NamedEntity} annotation found to standard output.
   *
   * @param args command-line arguments; not read by this method
   * @throws Exception if the CAS cannot be created or the pipeline fails
   */
  public static void main(String[] args) throws Exception {
    final String sampleText =
        "Die Fossillagerstätte Geiseltal befindet sich im ehemaligen Braunkohlerevier des Geiseltales südlich der Stadt Halle in Sachsen-Anhalt. Sie ist eine bedeutende Fundstelle heute ausgestorbener Pflanzen und Tiere aus der Zeit des Mittleren Eozäns vor 48 bis 41 Millionen Jahren. Im Geiseltal wurde nachweislich seit 1698 erstmals Kohle gefördert, die ersten Fossilien kamen aber erst Anfang des 20. Jahrhunderts eher zufällig zu Tage. Planmäßige wissenschaftliche Ausgrabungen begannen 1925 seitens der Martin-Luther-Universität Halle-Wittenberg. Unterbrochen durch den Zweiten Weltkrieg, können die Untersuchungen in zwei Forschungsphasen untergliedert werden. Aufgrund der zunehmenden Auskohlung der Rohstofflager kamen die Ausgrabungen Mitte der 1980er allmählich zum Erliegen und endeten endgültig zu Beginn des dritten Jahrtausends.";

    // The language is set before the text, exactly as in the original setup order.
    JCas document = JCasFactory.createJCas();
    document.setDocumentLanguage("de");
    document.setDocumentText(sampleText);

    SimplePipeline.runPipeline(
        document,
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
        AnalysisEngineFactory.createEngineDescription(StanfordNamedEntityRecognizer.class),
        AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class));

    // Report each entity as "<value>, <covered text>".
    for (NamedEntity entity : JCasUtil.select(document, NamedEntity.class)) {
      String value = entity.getValue();
      String covered = entity.getCoveredText();
      System.out.println("Found NE: " + value + ", " + covered);
    }
  }
 /**
  * Builds a {@code UimaSentenceIterator} over the given path, backed by an analysis engine
  * that aggregates the tokenizer and sentence annotator descriptions.
  *
  * @param path the root directory or file to read from
  * @return a sentence iterator for the given root directory or file
  * @throws Exception if building the engine description, the engine, or the iterator fails
  */
 public static SentenceIterator createWithPath(String path) throws Exception {
   // Tokenizer first, then sentence annotator, combined into one engine.
   final UimaResource resource =
       new UimaResource(
           AnalysisEngineFactory.createEngine(
               AnalysisEngineFactory.createEngineDescription(
                   TokenizerAnnotator.getDescription(), SentenceAnnotator.getDescription())));

   return new UimaSentenceIterator(path, resource);
 }
  /**
   * Returns the shared sentence segmenter, creating it on first use.
   *
   * <p>The engine is built from {@code SentenceAnnotator.getDescription()} and cached in
   * {@code defaultAnalysisEngine}; later calls return the cached instance. No synchronization
   * is performed in this method.
   *
   * @return the sentence segmenter engine
   * @throws RuntimeException wrapping any exception raised while creating the engine
   */
  public static AnalysisEngine segmenter() {
    // Fast path: the engine has already been created.
    if (defaultAnalysisEngine != null) {
      return defaultAnalysisEngine;
    }

    try {
      defaultAnalysisEngine =
          AnalysisEngineFactory.createEngine(
              AnalysisEngineFactory.createEngineDescription(SentenceAnnotator.getDescription()));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    return defaultAnalysisEngine;
  }
Пример #6
0
 /**
  * Creates a {@code PoStagger} engine description for the given language.
  *
  * <p>The model is looked up as a classpath resource, relative to {@code PoStagger}, at
  * {@code /models/<languageCode>-pos-maxent.bin}. The description is configured with
  * {@code Sentence} and {@code Token} as the sentence and token types and {@code "pos"} as the
  * POS feature.
  *
  * @param languageCode the language code used to build the model resource name
  * @return the configured engine description
  * @throws ResourceInitializationException if no model resource exists for the language code,
  *     or if the description cannot be created
  */
 public static AnalysisEngineDescription getDescription(String languageCode)
     throws ResourceInitializationException {
   String modelPath = String.format("/models/%s-pos-maxent.bin", languageCode);

   // getResource returns null for a missing resource; fail with a descriptive, declared
   // exception instead of a bare NullPointerException on toString().
   java.net.URL modelUrl = PoStagger.class.getResource(modelPath);
   if (modelUrl == null) {
     throw new ResourceInitializationException(
         new IllegalArgumentException(
             "No POS model found on the classpath at "
                 + modelPath
                 + " (language code: "
                 + languageCode
                 + ")"));
   }

   return AnalysisEngineFactory.createEngineDescription(
       PoStagger.class,
       UimaUtil.MODEL_PARAMETER,
       ExternalResourceFactory.createExternalResourceDescription(
           POSModelResourceImpl.class, modelUrl.toString()),
       UimaUtil.SENTENCE_TYPE_PARAMETER,
       Sentence.class.getName(),
       UimaUtil.TOKEN_TYPE_PARAMETER,
       Token.class.getName(),
       UimaUtil.POS_FEATURE_PARAMETER,
       "pos");
 }
Пример #7
0
  /**
   * Classifies the documents in the configured test directory with the models from the
   * configured models directory, and runs {@code PrintClassificationsAnnotator} as the last
   * pipeline step.
   *
   * @param args command-line arguments, parsed into {@code Options}
   * @throws Exception if argument parsing, pipeline construction or execution fails
   */
  public static void main(String[] args) throws Exception {
    Options cli = CliFactory.parseArguments(Options.class, args);

    // Collect the input documents first, then build a reader over them.
    List<File> documents =
        DocumentClassificationEvaluation.getFilesFromDirectory(cli.getTestDirectory());
    CollectionReader reader =
        new DocumentClassificationEvaluation(cli.getModelsDirectory())
            .getCollectionReader(documents);

    // Aggregate configured in CLASSIFY mode against the same models directory.
    AggregateBuilder aggregate =
        DocumentClassificationEvaluation.createDocumentClassificationAggregate(
            cli.getModelsDirectory(), AnnotatorMode.CLASSIFY);

    SimplePipeline.runPipeline(
        reader,
        aggregate.createAggregateDescription(),
        AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
  }
  /**
   * Creates the engines under test: an {@code OpenNLP} engine followed by an
   * {@code OpenNLPParser} engine, each wired to {@code SharedOpenNLPModel} resources.
   *
   * @return a two-element array: the {@code OpenNLP} engine, then the {@code OpenNLPParser}
   *     engine
   * @throws ResourceInitializationException if a resource description or engine cannot be
   *     created
   */
  @Override
  protected AnalysisEngine[] createAnalysisEngines() throws ResourceInitializationException {

    final ExternalResourceDescription parserChunkingDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "parserChunking", SharedOpenNLPModel.class);

    // Add in the OpenNLP implementation too, as it is a prerequisite. (In theory we should
    // test OpenNLPParser in isolation, but in practice it has OpenNLP as a dependency, so it
    // is better to test that they work together.)

    final ExternalResourceDescription tokensDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "tokens", SharedOpenNLPModel.class);
    final ExternalResourceDescription sentencesDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "sentences", SharedOpenNLPModel.class);
    final ExternalResourceDescription posDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "posTags", SharedOpenNLPModel.class);
    final ExternalResourceDescription chunksDesc =
        ExternalResourceFactory.createExternalResourceDescription(
            "phraseChunks", SharedOpenNLPModel.class);

    // A stray, argument-less AnalysisEngineFactory.createEngineDescription() call used to sit
    // here; its result was discarded, so it has been removed.

    return asArray(
        createAnalysisEngine(
            OpenNLP.class,
            "tokens",
            tokensDesc,
            "sentences",
            sentencesDesc,
            "posTags",
            posDesc,
            "phraseChunks",
            chunksDesc),
        createAnalysisEngine(OpenNLPParser.class, "parserChunking", parserChunkingDesc));
  }
  /**
   * Creates a {@code ParentheticalAnnotator} description for the given delimiter pair.
   *
   * @param parentheticalClass annotation type whose class name is passed as
   *     {@code PARAM_PARENTHETICAL_TYPE_NAME}
   * @param windowClass optional annotation type; when non-null its class name is added as
   *     {@code PARAM_WINDOW_TYPE_NAME}, when null that parameter is not set here
   * @param leftParen the opening delimiter, passed as a one-character string
   * @param rightParen the closing delimiter, passed as a one-character string
   * @return the configured engine description
   * @throws ResourceInitializationException if the description cannot be created
   */
  public static AnalysisEngineDescription getDescription(
      Class<? extends Annotation> parentheticalClass,
      Class<? extends Annotation> windowClass,
      char leftParen,
      char rightParen)
      throws ResourceInitializationException {
    final String left = String.valueOf(leftParen);
    final String right = String.valueOf(rightParen);

    AnalysisEngineDescription description =
        AnalysisEngineFactory.createEngineDescription(
            ParentheticalAnnotator.class,
            PARAM_LEFT_PARENTHESIS,
            left,
            PARAM_RIGHT_PARENTHESIS,
            right,
            PARAM_PARENTHETICAL_TYPE_NAME,
            parentheticalClass.getName());

    // Without a window type there is nothing more to configure.
    if (windowClass == null) {
      return description;
    }

    ConfigurationParameterFactory.addConfigurationParameters(
        description, PARAM_WINDOW_TYPE_NAME, windowClass.getName());
    return description;
  }
  /**
   * Runs the n-gram meta collector over the {@code text*.txt} test documents and checks the
   * resulting Lucene index: the term {@code "this"} must have a document frequency of 2 and a
   * total term frequency of 3, and the n-gram field must hold 35 terms in total.
   *
   * @throws Exception if the pipeline fails or the index cannot be read or closed
   */
  @Test
  public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader =
        CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/data/",
            TextReader.PARAM_LANGUAGE,
            "en",
            TextReader.PARAM_PATTERNS,
            "text*.txt");

    AnalysisEngineDescription segmenter =
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector =
        AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
      // Intentionally empty: only the iteration itself matters here.
    }

    int i = 0;
    // Both handles are tracked so they can be released in the finally block; previously
    // neither the reader nor the directory was ever closed.
    FSDirectory directory = null;
    IndexReader index = null;
    try {
      directory = FSDirectory.open(tmpDir);
      index = DirectoryReader.open(directory);
      Fields fields = MultiFields.getFields(index);
      if (fields != null) {
        Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator(null);
          BytesRef text;
          while ((text = termsEnum.next()) != null) {
            if ("this".equals(text.utf8ToString())) {
              assertEquals(2, termsEnum.docFreq());
              assertEquals(3, termsEnum.totalTermFreq());
            }

            i++;
          }
        }
      }
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    } finally {
      // Close the reader first, and still close the directory if that close throws.
      try {
        if (index != null) {
          index.close();
        }
      } finally {
        if (directory != null) {
          directory.close();
        }
      }
    }

    assertEquals(35, i);
  }
Пример #11
0
 /**
  * Creates the analysis engine description identified by {@code AE_TOKENIZER}.
  *
  * @return the AE description instance
  * @throws UIMAException if the description cannot be created
  * @throws IOException declared by this method; presumably raised when the underlying
  *     descriptor cannot be read (TODO confirm against the type of {@code AE_TOKENIZER})
  */
 public static AnalysisEngineDescription getAEDescription() throws UIMAException, IOException {
   final AnalysisEngineDescription tokenizerDescription =
       AnalysisEngineFactory.createEngineDescription(AE_TOKENIZER);
   return tokenizerDescription;
 }
Пример #12
0
 /**
  * Creates a {@code TempEval2007Writer} description configured with the given output
  * directory.
  *
  * @param outputDir value passed as {@code PARAM_OUTPUT_DIRECTORY_NAME}
  * @return the configured writer description
  * @throws ResourceInitializationException if the description cannot be created
  */
 public static AnalysisEngineDescription getDescription(String outputDir)
     throws ResourceInitializationException {
   final AnalysisEngineDescription writerDescription =
       AnalysisEngineFactory.createEngineDescription(
           TempEval2007Writer.class, PARAM_OUTPUT_DIRECTORY_NAME, outputDir);
   return writerDescription;
 }