Exemplo n.º 1
0
  /**
   * Copies every annotation in {@code toTransfer} into the set {@code to}. Each copy gets its own
   * feature map, filled from the features of the source annotation.
   *
   * <p>The type of a copy is the {@code newName} of the entry registered in {@code mappings} for
   * the source type; when there is no such entry, or the entry has no new name, the source type is
   * kept.
   *
   * @param toTransfer annotations to copy
   * @param to annotation set that receives the copies
   * @param newID when {@code true} the copy is added without an explicit ID; otherwise the ID of
   *     the source annotation is handed to {@code to.add}
   * @throws ExecutionException wrapping any {@code InvalidOffsetException} raised while adding
   */
  private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
      throws ExecutionException {
    for (Annotation source : toTransfer) {
      Mapping mapping = mappings.get(source.getType());

      // Fall back to the original type unless the mapping provides a replacement name.
      String targetType;
      if (mapping != null && mapping.newName != null) {
        targetType = mapping.newName;
      } else {
        targetType = source.getType();
      }

      try {
        FeatureMap copiedFeatures = Factory.newFeatureMap();
        copiedFeatures.putAll(source.getFeatures());

        if (!newID) {
          // Keep the identifier of the source annotation.
          to.add(
              source.getId(),
              source.getStartNode().getOffset(),
              source.getEndNode().getOffset(),
              targetType,
              copiedFeatures);
        } else {
          to.add(
              source.getStartNode().getOffset(),
              source.getEndNode().getOffset(),
              targetType,
              copiedFeatures);
        }
      } catch (InvalidOffsetException invalidOffset) {
        throw new ExecutionException(invalidOffset);
      }
    }
  }
Exemplo n.º 2
0
  /**
   * Generates bigram annotations over a larger span, e.g. all couples inside a span of 5 tokens.
   * This allows more variants to be matched, e.g. with adjectives in the middle. Only bigrams are
   * produced for the moment.
   *
   * <p>For every box returned by {@code generateBoxes}, the strings of its annotations are paired
   * with the string of every annotation found in the following boxes, up to {@code window - 1}
   * boxes ahead. Each pair is joined with {@code getNgramSeparator()} and stored under
   * {@code outputAnnotationFeature} on a new {@code outputAnnotationType} annotation. When
   * {@code generateIntermediateAnnotations} is set, an annotation is also added for each annotation
   * of the leading box on its own.
   *
   * @param list annotations handed to {@code generateBoxes}
   * @param outputAS annotation set that receives the generated annotations
   * @throws ExecutionException wrapping any exception raised while building the annotations
   */
  private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);
    try {
      for (int head = 0; head < boxes.size(); head++) {
        List<String> headStrings = new ArrayList<String>();
        Long spanStart = null;
        Long spanEnd = null;

        // Collect the strings of all the annotations sitting in the leading box.
        List<Annotation> headBox = boxes.get(head);
        for (Annotation headAnn : headBox) {
          // The start is taken from the leading box; the end is the furthest end seen so far.
          spanStart = headAnn.getStartNode().getOffset();
          Long headEnd = headAnn.getEndNode().getOffset();
          if (spanEnd == null || headEnd.longValue() > spanEnd.longValue()) {
            spanEnd = headEnd;
          }

          String headString = (String) headAnn.getFeatures().get(inputAnnotationFeature);
          headStrings.add(headString);

          if (this.generateIntermediateAnnotations) {
            FeatureMap singleFeatures = Factory.newFeatureMap();
            singleFeatures.put(this.outputAnnotationFeature, headString);
            outputAS.add(spanStart, spanEnd, outputAnnotationType, singleFeatures);
          }
        }

        // Pair the leading strings with every annotation of the following boxes in the window.
        int step = 1;
        while (step < window && (head + step < boxes.size())) {
          List<Annotation> followers = boxes.get(head + step);
          for (Annotation follower : followers) {
            // Stretch the end of the span if this annotation reaches further.
            Long followerEnd = follower.getEndNode().getOffset();
            if (spanEnd == null || followerEnd.longValue() > spanEnd.longValue()) {
              spanEnd = followerEnd;
            }

            String followerString = (String) follower.getFeatures().get(inputAnnotationFeature);

            for (String headString : headStrings) {
              String bigram = headString + getNgramSeparator() + followerString;

              FeatureMap bigramFeatures = Factory.newFeatureMap();
              bigramFeatures.put(this.outputAnnotationFeature, bigram);
              outputAS.add(spanStart, spanEnd, outputAnnotationType, bigramFeatures);
            }
          }
          step++;
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Exemplo n.º 3
0
  /**
   * Generates n-gram annotations from the boxes returned by {@code generateBoxes}.
   *
   * <p>Starting at each box, up to {@code ngram} consecutive boxes are combined. At every step the
   * strings built so far are joined (using {@code getNgramSeparator()}) with the
   * {@code inputAnnotationFeature} value of each annotation in the current box. A combination is
   * written to {@code outputAS} as an {@code outputAnnotationType} annotation when it has the full
   * length {@code ngram}, or at every step when {@code generateIntermediateAnnotations} is set.
   *
   * <p>NOTE(review): on the first step (a single annotation, nothing to combine with) an annotation
   * is only emitted when {@code generateIntermediateAnnotations} is set, so with {@code ngram == 1}
   * and intermediate annotations disabled nothing is produced. Confirm this is intended.
   *
   * @param list annotations handed to {@code generateBoxes}
   * @param outputAS annotation set that receives the generated annotations
   * @throws ExecutionException wrapping any exception raised while building the annotations
   */
  private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);

    try {
      // now do the actual n-grams
      for (int b = 0; b < boxes.size(); b++) {
        final int maxOrder = this.ngram.intValue();
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;
        for (int z = 0; z < maxOrder && (b + z < boxes.size()); z++) {
          // do the combination and dump what we've done at every step
          // e.g generate 1 grams as well as 2-grams
          final boolean isFullLength = (z == maxOrder - 1);
          List<Annotation> current = boxes.get(b + z);
          List<String> temptemp = new ArrayList<String>();
          for (Annotation newAnn : current) {
            // remembering positions: the start is fixed by the first annotation seen,
            // the end is the furthest end seen so far
            if (loStart == null) {
              loStart = newAnn.getStartNode().getOffset();
            }
            if (hiEnd == null) {
              hiEnd = newAnn.getEndNode().getOffset();
            } else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue()) {
              hiEnd = newAnn.getEndNode().getOffset();
            }

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
            // TODO : what if there is no such value????
            if (tempAnnotationsStartingHere.isEmpty()) {
              // nothing to combine with yet: this is the first element of the n-gram
              if (this.generateIntermediateAnnotations) {
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, newString);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
              // add it to the temp
              temptemp.add(newString);
            } else {
              for (String existing : tempAnnotationsStartingHere) {
                String combination = existing + getNgramSeparator() + newString;
                temptemp.add(combination);

                // logical (short-circuit) OR: emit every step if intermediates are wanted,
                // otherwise only the combinations that reached the full n-gram length
                if (this.generateIntermediateAnnotations || isFullLength) {
                  // create an annotation for the combination
                  FeatureMap fm = Factory.newFeatureMap();
                  fm.put(this.outputAnnotationFeature, combination);
                  outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
                }
              }
            }
          }
          // the combinations built at this step are the prefixes for the next one
          tempAnnotationsStartingHere = temptemp;
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Exemplo n.º 4
0
  /**
   * Runs the tagger over the document's tokens and stores every returned tag as an annotation.
   *
   * <p>The Token annotations of the input set are sorted with an {@code OffsetComparator} and
   * their "string" features are passed to {@code tagger.tag}. For each tag, an annotation named
   * after the tag is added to the output set; it starts where the token at the tag's start index
   * starts and ends where the token at the tag's end index ends.
   *
   * @throws ExecutionException with message "Tagger Failed" when tagging or annotating fails
   */
  public void execute() throws ExecutionException {

    AnnotationSet outputAS = document.getAnnotations(outputASName);

    AnnotationSet tokenAnnotations =
        document.getAnnotations(inputASName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
    List<Annotation> orderedTokens = new ArrayList<Annotation>(tokenAnnotations);
    Collections.sort(orderedTokens, new OffsetComparator());

    // Token strings, in the same order as the sorted token list.
    String[] tokenStrings = new String[orderedTokens.size()];
    int slot = 0;
    for (Annotation token : orderedTokens) {
      tokenStrings[slot++] = (String) token.getFeatures().get("string");
    }

    try {
      TagList tags = tagger.tag(tokenStrings);

      for (Iterator<Tag> cursor = tags.iterator(); cursor.hasNext(); ) {
        Tag tag = cursor.next();

        // The tag's indices refer to positions in the sorted token list.
        Annotation firstToken = orderedTokens.get(tag.getTokenStartIndex());
        Annotation lastToken = orderedTokens.get(tag.getTokenEndIndex());

        outputAS.add(
            firstToken.getStartNode().getOffset(),
            lastToken.getEndNode().getOffset(),
            tag.getTagname(),
            Factory.newFeatureMap());
      }
    } catch (Exception ioe) {
      throw new ExecutionException("Tagger Failed", ioe);
    }
  }
  /**
   * Copies the token annotations of the "Tokenization" set into the default annotation set and
   * then removes the "Tokenization" set from the document.
   *
   * <p>Only annotations whose type equals "Token" (ignoring case) are copied. The copy keeps the
   * original start node, end node and type, and carries three features: "string" (from "string"),
   * "root" (from "lemma") and "category" (from "POS").
   */
  public void tokenize() {
    AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
    AnnotationSet defaultAs = gateDocument.getAnnotations("");

    for (Annotation candidate : tokenizationAs) {
      // Anything that is not a token is left behind.
      if ("Token".compareToIgnoreCase(candidate.getType()) != 0) {
        continue;
      }

      FeatureMap sourceFeatures = candidate.getFeatures();
      FeatureMap copiedFeatures = Factory.newFeatureMap();
      copiedFeatures.put("string", sourceFeatures.get("string"));
      copiedFeatures.put("root", sourceFeatures.get("lemma"));
      copiedFeatures.put("category", sourceFeatures.get("POS"));

      // Add the new Token to the default annotation set
      defaultAs.add(
          candidate.getStartNode(), candidate.getEndNode(), candidate.getType(), copiedFeatures);
    }
    gateDocument.removeAnnotationSet("Tokenization");
  }
Exemplo n.º 6
0
  /**
   * Generates a GATE document from a Behemoth one.
   *
   * <p>When the Behemoth document has no text, the GATE document is first built through
   * {@code generateGATEDocFromLocalDump}; on success the text parsed by GATE is copied back into
   * {@code inputDoc} and that document is returned as is. If that attempt fails, the error is
   * logged and a document is built from an empty text instead.
   *
   * <p>Otherwise the GATE document is created from the Behemoth text, the URL (as
   * "gate.SourceURL") and the metadata entries are stored as document features, and the Behemoth
   * annotations are copied into the "Original markups" annotation set.
   *
   * @param inputDoc the Behemoth document to convert; its text is set as a side effect when it was
   *     missing and could be obtained from the local dump
   * @return the newly created GATE document
   * @throws ResourceInstantiationException if the GATE document cannot be created
   * @throws InvalidOffsetException if an annotation has offsets that GATE rejects
   * @throws IOException declared for callers; not thrown directly by the code in this method
   */
  public gate.Document generateGATEDoc(BehemothDocument inputDoc)
      throws ResourceInstantiationException, InvalidOffsetException, IOException {

    // if no text is available (e.g. Tika has not extracted it)
    // let GATE do the parsing itself from the binary content
    if (inputDoc.getText() == null) {
      try {
        gate.Document parsed = generateGATEDocFromLocalDump(inputDoc);

        // transfer the text from GATE to Behemoth
        inputDoc.setText(parsed.getContent().toString());

        return parsed;
      } catch (Exception e) {
        // best effort: fall through and build a document from whatever text we have
        LOG.error("Can't generate GATE doc from byte dump", e);
      }
    }

    // if the input document does not have any text -> create a doc with an
    // empty text
    String text = inputDoc.getText();
    if (text == null) {
      text = "";
    }

    gate.Document gatedocument = Factory.newDocument(text);

    // then the metadata as document features
    FeatureMap docFeatures = gatedocument.getFeatures();
    String docUrl = inputDoc.getUrl();
    if (docUrl != null) {
      docFeatures.put("gate.SourceURL", docUrl);
    }
    if (inputDoc.getMetadata() != null) {
      for (Entry<Writable, Writable> entry : inputDoc.getMetadata().entrySet()) {
        String skey = entry.getKey().toString().trim();
        // a missing value is stored as a null feature value
        String svalue = null;
        if (entry.getValue() != null) {
          svalue = entry.getValue().toString().trim();
        }
        docFeatures.put(skey, svalue);
      }
    }

    // finally the annotations as original markups
    // TODO change the name of the annotation set via config
    AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
    for (Annotation annot : inputDoc.getAnnotations()) {
      // add to outputAS as a GATE annotation, with its own copy of the features
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());
      outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
    }
    return gatedocument;
  }
  /**
   * Copies every annotation of the "SentenceDetection" set into the default annotation set as a
   * "Sentence" annotation (same start and end nodes), then removes the "SentenceDetection" set
   * from the document.
   *
   * <p>Each new annotation gets an empty feature map rather than {@code null}, so that code
   * reading the features of a "Sentence" annotation does not have to guard against a missing map.
   */
  public void splitter() {
    AnnotationSet sDetectionAS = gateDocument.getAnnotations("SentenceDetection");
    AnnotationSet defaultAs = gateDocument.getAnnotations("");

    for (Annotation currentSentenceAnnotation : sDetectionAS) {
      // Add the Sentence to the Annotation Set
      defaultAs.add(
          currentSentenceAnnotation.getStartNode(),
          currentSentenceAnnotation.getEndNode(),
          "Sentence",
          Factory.newFeatureMap());
    }
    gateDocument.removeAnnotationSet("SentenceDetection");
  }
Exemplo n.º 8
0
  /**
   * Creates the Lookup annotations according to a gazetteer match.
   *
   * <p>One annotation is created per lookup attached to {@code matchingState}. Its features are
   * the major type, the class and ontology (only when both are present), the minor type and the
   * languages (when present), followed by any additional features of the lookup. The annotation
   * type is the lookup's {@code annotationType}.
   *
   * @param matchingState the final FSMState that was reached while matching.
   * @param matchedRegionStart the start of the matched text region.
   * @param matchedRegionEnd the end of the matched text region; the created annotation ends at
   *     {@code matchedRegionEnd + 1}.
   * @param annotationSet the annotation set where the new annotations should be added.
   * @throws GateRuntimeException if the annotation set rejects the offsets; the original
   *     {@code InvalidOffsetException} is kept as the cause.
   */
  protected void createLookups(
      FSMState matchingState,
      long matchedRegionStart,
      long matchedRegionEnd,
      AnnotationSet annotationSet) {
    for (Object entry : matchingState.getLookupSet()) {
      Lookup currentLookup = (Lookup) entry;
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      // class and ontology only make sense as a pair
      if (null != currentLookup.oClass && null != currentLookup.ontology) {
        fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
      }

      if (null != currentLookup.minorType) {
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
      }
      if (null != currentLookup.languages) {
        fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
      }
      // added last, so additional features override the standard ones on a name clash
      if (null != currentLookup.features) {
        fm.putAll(currentLookup.features);
      }
      try {
        annotationSet.add(
            Long.valueOf(matchedRegionStart),
            Long.valueOf(matchedRegionEnd + 1),
            currentLookup.annotationType, // this pojo attribute will have Lookup as a default tag.
            fm);
      } catch (InvalidOffsetException ioe) {
        // keep the original exception as the cause instead of flattening it to a string
        throw new GateRuntimeException(ioe);
      }
    }
  }
Exemplo n.º 9
0
  /**
   * Tokenizes the document content and adds one Token annotation per span returned by the
   * tokenizer to the annotation set named {@code annotationSetName}.
   *
   * <p>Each annotation covers the span's start and end offsets and carries the covered text in
   * the standard token string feature.
   *
   * @throws ExecutionException with message "error running tokenizer" when an annotation cannot
   *     be created
   */
  public void execute() throws ExecutionException {
    AnnotationSet outputAS = document.getAnnotations(annotationSetName);

    String text = document.getContent().toString();

    Span[] tokens = tokenizer.getTokens(text);
    try {
      for (int i = 0; i < tokens.length; i++) {
        Span span = tokens[i];

        // The token string is the slice of the document text covered by the span.
        FeatureMap features = Factory.newFeatureMap();
        String covered = text.substring(span.getStart(), span.getEnd());
        features.put(ANNIEConstants.TOKEN_STRING_FEATURE_NAME, covered);

        outputAS.add(
            (long) span.getStart(),
            (long) span.getEnd(),
            ANNIEConstants.TOKEN_ANNOTATION_TYPE,
            features);
      }
    } catch (Exception e) {
      throw new ExecutionException("error running tokenizer", e);
    }
  }
 /**
  * Renames annotations: every annotation of type {@code oldType} in {@code outputAS} is replaced
  * by an annotation of type {@code newType} with the same offsets and a copy of its features.
  *
  * <p>If an annotation of type {@code newType} already covers the same span, no new annotation is
  * added and the old one is simply removed. Renaming a type to itself leaves the set untouched.
  *
  * @param outputAS output annotation set
  * @param oldType old annotation name
  * @param newType new annotation name
  */
 private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
   // Nothing to rename. Without this guard each annotation would be found as "already present"
   // under the new name (it is its own match) and would be deleted.
   if (oldType != null && oldType.equals(newType)) {
     return;
   }
   // NOTE(review): outputAS is modified inside the loop; this assumes get(type) returns a set
   // that is independent of outputAS - confirm against the GATE version in use.
   AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
   for (Annotation tmpAnn : tmpAnatomyAS) {
     Long startOffset = tmpAnn.getStartNode().getOffset();
     Long endOffset = tmpAnn.getEndNode().getOffset();
     AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
     // If we've already got an annotation of the same name in the same place, don't add a new one
     // just delete the old one
     if (existingAS.isEmpty()) {
       FeatureMap fm = Factory.newFeatureMap();
       fm.putAll(tmpAnn.getFeatures());
       try {
         outputAS.add(startOffset, endOffset, newType, fm);
         outputAS.remove(tmpAnn);
       } catch (InvalidOffsetException ie) {
         // shouldn't happen (the offsets come from an existing annotation), but report it
         // rather than hiding it; the old annotation is kept in that case
         gate.util.Err.println(ie);
       }
     } else {
       outputAS.remove(tmpAnn);
     }
   }
 }
  /**
   * Records a matched term as an annotation of type {@code outputASType}.
   *
   * <p>When {@code useNounChunk} is set and a noun chunk type is configured, the span is first
   * widened to the first and last node of the {@code nounChunkType} annotations of
   * {@code inputAS} that cover the match. If {@code inputAS} holds no {@code outputASType}
   * annotation in the resulting span, a new one is added to {@code outputAS} with the term stored
   * in its "match" feature. Otherwise the term is appended (space separated) to the "match"
   * feature of the first such annotation, or stored as its "match" feature if it had none.
   *
   * <p>NOTE(review): existing annotations are looked up in {@code inputAS} while new ones are
   * added to {@code outputAS}; confirm this is intended when the two sets differ.
   *
   * @param inputAS input annotation set
   * @param outputAS output annotation set
   * @param term String matched
   * @param outputASType type of the annotation to create or update
   * @param startOffset match start offset
   * @param endOffset match end offset
   * @param useNounChunk whether to widen the match to the covering noun chunk
   */
  private void addLookup(
      AnnotationSet inputAS,
      AnnotationSet outputAS,
      String term,
      String outputASType,
      Long startOffset,
      Long endOffset,
      boolean useNounChunk) {
    if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
      AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
      if (!nounChunkAS.isEmpty()) {
        startOffset = nounChunkAS.firstNode().getOffset();
        endOffset = nounChunkAS.lastNode().getOffset();
      }
    }
    try {
      AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
      if (diseaseAS.isEmpty()) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("match", term);
        outputAS.add(startOffset, endOffset, outputASType, fm);
      } else {
        Annotation disease = diseaseAS.iterator().next();
        FeatureMap fm = disease.getFeatures();
        String previous = (String) fm.get("match");
        // keep the term even when the existing annotation has no "match" feature yet
        fm.put("match", previous == null ? term : previous + " " + term);
      }

    } catch (InvalidOffsetException ie) {
      // shouldn't happen
      gate.util.Err.println(ie);
    }
  }
  /**
   * Runs the external sentence splitter over the document and adds a "Sentence" annotation for
   * every non-empty line it returns.
   *
   * <p>The document content (with non-breaking spaces replaced by spaces) is written to a
   * temporary file, the splitter binary is run on it, and its output file is read line by line.
   * Each trimmed line is located in the document content, searching from the end of the previous
   * sentence, and annotated in the set named {@code annotationSetName}.
   *
   * <p>Streams are always closed, and unless {@code debug} is set the temporary files are removed
   * whether or not the run succeeds.
   *
   * @throws ExecutionException if the splitter cannot be run, its output cannot be read, or a
   *     returned sentence cannot be found in the document
   */
  public void execute() throws ExecutionException {

    // get the sentence splitter file from the URL provided
    File splitter = Files.fileFromURL(splitterBinary);

    // get the document content and replace non-breaking spaces with spaces
    // TODO replace new-lines with spaces so we don't get a sentence per line
    String docContent = document.getContent().toString().replace((char) 160, ' ');

    File tmpIn = null;
    File tmpOut = null;
    try {
      // create temporary files to use with the external sentence splitter
      tmpIn = File.createTempFile("GENIA", ".txt");
      tmpOut = File.createTempFile("GENIA", ".txt");

      // store the document content in the input file
      FileOutputStream fos = new FileOutputStream(tmpIn);
      try {
        fos.write(docContent.getBytes("utf8"));
      } finally {
        fos.close();
      }

      // setup the command line to run the sentence splitter
      String[] args =
          new String[] {
            splitter.getAbsolutePath(), tmpIn.getAbsolutePath(), tmpOut.getAbsolutePath()
          };

      // run the sentence splitter over the document
      manager.runProcess(
          args, splitter.getParentFile(), (debug ? System.out : null), (debug ? System.err : null));

      // get the annotation set we are going to store results in
      AnnotationSet annotationSet = document.getAnnotations(annotationSetName);

      // we haven't found any sentence yet so start looking for the next one
      // from the beginning of the document
      int end = 0;

      // read in the output from the sentence splitter one line at a time, using the same
      // encoding the input was written with (assumes the splitter preserves it - TODO confirm)
      BufferedReader in =
          new BufferedReader(
              new java.io.InputStreamReader(new java.io.FileInputStream(tmpOut), "utf8"));
      try {
        String sentence;
        while ((sentence = in.readLine()) != null) {

          // trim the sentence so we don't annotate extraneous white space
          sentence = sentence.trim();

          // find the start of the sentence
          int start = docContent.indexOf(sentence, end);
          if (start < 0) {
            // fail with a meaningful cause instead of trying to annotate a negative offset
            throw new IllegalStateException(
                "Sentence returned by the splitter was not found in the document after offset "
                    + end
                    + ": "
                    + sentence);
          }

          // work out where the sentence ends
          end = start + sentence.length();

          if (end > start) {
            // the sentence has a length so annotate it
            annotationSet.add((long) start, (long) end, "Sentence", Factory.newFeatureMap());
          }
        }
      } finally {
        in.close();
      }

    } catch (Exception ioe) {
      throw new ExecutionException("An error occured running the splitter", ioe);
    } finally {
      // delete the temp files (kept in debug mode so they can be inspected)
      if (!debug) {
        if (tmpIn != null && !tmpIn.delete()) {
          tmpIn.deleteOnExit();
        }
        if (tmpOut != null && !tmpOut.delete()) {
          tmpOut.deleteOnExit();
        }
      }
    }
  }
 /**
  * Annotates paragraphs in a GATE document. The text between the start and end offsets is
  * scanned and the paragraph annotations are created in the set named {@code annotSetName}; if
  * that name is null they are created in the default annotation set.
  *
  * <p>The scan is a small state machine: state 1 skips whitespace until a paragraph starts,
  * state 2 reads the paragraph until a line break, and state 3 decides whether that line break
  * ended the paragraph (a second line break follows) or was just a line break inside it. A line
  * break is a {@code \n} optionally followed by {@code \r} characters.
  *
  * @param aDoc the GATE document on which paragraph detection is performed. If it is null, or
  *     its content is null, the method returns without doing anything.
  * @param startOffset the index in the document content at which paragraph detection starts
  * @param endOffset the offset at which detection ends. If it is smaller than
  *     {@code startOffset} the method returns without doing anything.
  * @param annotSetName the name of the set in which paragraph annotations are created. The
  *     annotation type created is "paragraph".
  * @throws DocumentFormatException if a paragraph annotation cannot be created
  */
 public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName)
     throws DocumentFormatException {
   // Simply return if the document is null or its content
   if (aDoc == null || aDoc.getContent() == null) return;
   // Simply return if the start is > than the end
   if (startOffset > endOffset) return;
   // Decide where to put the newly detected annotations
   AnnotationSet annotSet =
       (annotSetName == null) ? aDoc.getAnnotations() : aDoc.getAnnotations(annotSetName);
   // Extract the document content
   String content = aDoc.getContent().toString();
   // This is the offset marking the start of a para
   int startOffsetPara = startOffset;
   // This marks the end of a para
   int endOffsetPara = endOffset;
   // The initial state of the FSA
   int state = 1;
   // This field marks that a BR entity was read
   // A BR entity can be NL or NL CR, depending on the operating system (UNIX
   // or DOS)
   boolean readBR = false;
   int index = startOffset;
   while (index < endOffset) {
     // Read the current char
     char ch = content.charAt(index);
     // Test if a BR entity was read
     if (ch == '\n') {
       readBR = true;
       // If \n is followed by a \r then advance the index in order to read a
       // BR entity
       while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
     }
     switch (state) {
         // It is the initial and also a final state
         // Stay in state 1 while it reads whitespaces
       case 1:
         // A line break seen while skipping whitespace belongs to no paragraph: consume it
         // here. Otherwise the flag would still be set when state 2 is entered and the second
         // character of the paragraph would be mistaken for a line break.
         readBR = false;
         // If reads a non whitespace char then move to state 2 and record
         // the beginning of a paragraph
         if (!Character.isWhitespace(ch)) {
           state = 2;
           startOffsetPara = index;
         }
         break;
         // It can be also a final state.
       case 2:
         // Stay in state 2 while reading chars != BR entities
         if (readBR) {
           // If you find a BR char go to state 3. The possible end of the para
           // can be index. This will be confirmed by state 3. So, this is why
           // the end of a para is recorded here.
           readBR = false;
           endOffsetPara = index;
           state = 3;
         }
         break;
         // It can be also a final state
         // From state 3 there are only 2 possible ways: (state 2 or state 1)
         // To go to state 1 it needs to read a BR
         // To go to state 2 it needs to read something different than a BR
       case 3:
         if (readBR) {
           // A BR was read. Go to state 1
           readBR = false;
           state = 1;
           // Create an annotation type paragraph
           try {
             annotSet.add(
                 Long.valueOf(startOffsetPara),
                 Long.valueOf(endOffsetPara),
                 "paragraph",
                 Factory.newFeatureMap());
           } catch (gate.util.InvalidOffsetException ioe) {
             throw new DocumentFormatException("Coudn't create a paragraph annotation", ioe);
           }
         } else {
           // Go to state 2 and keep reading chars
           state = 2;
         }
         break;
     } // End switch
     // Prepare to read the next char.
     index++;
   } // End while
   endOffsetPara = index;
   // Investigate where the finite automaton has stopped
   if (state == 2 || state == 3) {
     // A paragraph was still open: close it at the position where scanning stopped
     try {
       annotSet.add(
           Long.valueOf(startOffsetPara),
           Long.valueOf(endOffsetPara),
           "paragraph",
           Factory.newFeatureMap());
     } catch (gate.util.InvalidOffsetException ioe) {
       throw new DocumentFormatException("Coudn't create a paragraph annotation", ioe);
     }
   }
 } // End annotateParagraphs();