Esempio n. 1
0
  public void annotationUpdated(AnnotationEvent e) {
    if (e.getType() == AnnotationEvent.FEATURES_UPDATED) {
      if (!disableListener) {
        Annotation annot = (Annotation) e.getSource();
        // lets find out which annotation set it belongs to
        AnnotationSet as = null;
        if (getAnnotations().contains(annot)) {
          as = getAnnotations();
        } else {
          Map<String, AnnotationSet> ass = getNamedAnnotationSets();
          if (ass == null) return;
          Iterator<String> namesIter = getNamedAnnotationSets().keySet().iterator();
          while (namesIter.hasNext()) {
            String name = namesIter.next();
            as = getNamedAnnotationSets().get(name);
            if (as.contains(annot)) {
              break;
            } else as = null;
          }
        }

        if (as == null) return;
        for (String docID : combinedDocumentIds) {
          OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
          if (od == null) continue;
          Annotation toUse = od.getOriginalAnnotation();
          toUse.setFeatures(annot.getFeatures());
        }
      }
    }
  }
Esempio n. 2
0
  private void transferAnnotations(List<Annotation> toTransfer, AnnotationSet to, boolean newID)
      throws ExecutionException {
    for (Annotation annot : toTransfer) {
      Mapping m = mappings.get(annot.getType());

      String name = (m == null || m.newName == null ? annot.getType() : m.newName);

      try {
        FeatureMap params = Factory.newFeatureMap();
        params.putAll(annot.getFeatures());
        if (newID) {
          to.add(annot.getStartNode().getOffset(), annot.getEndNode().getOffset(), name, params);
        } else {
          to.add(
              annot.getId(),
              annot.getStartNode().getOffset(),
              annot.getEndNode().getOffset(),
              name,
              params);
        }
      } catch (InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
    }
  }
  @Override
  public void execute() throws ExecutionException {
    initBeforeExecute();

    AnnotationSet tokensAndDependenciesAS = inputAS;
    TreeIndex index =
        new GateAwareTreeIndex(
            tokensAndDependenciesAS.get(null, Utils.setFromArray(new String[] {"args"})));

    QueryData data =
        new QueryData(index, new GateAnnotationsNodeAttributes(tokensAndDependenciesAS));

    Iterable<QueryMatch> results = queryObject.evaluate(data);

    int queryMatchOrd = 0;

    for (QueryMatch result : results) {
      queryMatchOrd++;
      for (NodeMatch match : result.getMatchingNodes()) {
        String name = match.getQueryNode().getName();
        if (name != null) {
          Annotation matchingAnnot = tokensAndDependenciesAS.get(match.getNodeId());
          FeatureMap fm = Factory.newFeatureMap();
          fm.put("matchingNodeId", match.getNodeId());
          fm.put(
              "queryMatchId",
              String.format("%s_%03d", buildQueryStringHash(getQueryString()), queryMatchOrd));
          outputAS.add(matchingAnnot.getStartNode(), matchingAnnot.getEndNode(), name, fm);
        }
      }
    }
  }
  public void execute() throws ExecutionException {

    AnnotationSet outputAS = document.getAnnotations(outputASName);

    List<Annotation> tokens =
        new ArrayList<Annotation>(
            document.getAnnotations(inputASName).get(ANNIEConstants.TOKEN_ANNOTATION_TYPE));
    Collections.sort(tokens, new OffsetComparator());

    String[] strings = new String[tokens.size()];

    for (int i = 0; i < tokens.size(); ++i) {
      strings[i] = (String) tokens.get(i).getFeatures().get("string");
    }

    try {
      TagList tags = tagger.tag(strings);

      Iterator<Tag> it = tags.iterator();
      while (it.hasNext()) {

        Tag tag = it.next();

        outputAS.add(
            tokens.get(tag.getTokenStartIndex()).getStartNode().getOffset(),
            tokens.get(tag.getTokenEndIndex()).getEndNode().getOffset(),
            tag.getTagname(),
            Factory.newFeatureMap());
      }
    } catch (Exception ioe) {
      throw new ExecutionException("Tagger Failed", ioe);
    }
  }
  public void tokenize() {
    AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
    AnnotationSet defaultAs = gateDocument.getAnnotations("");

    for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {

      Annotation currentTokenAnnotation = it.next();
      FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
      FeatureMap curFeaturesMap = Factory.newFeatureMap();

      if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
        curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
        curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
        curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));

        // Add the new Token to the Annotation Set

        defaultAs.add(
            currentTokenAnnotation.getStartNode(),
            currentTokenAnnotation.getEndNode(),
            currentTokenAnnotation.getType(),
            curFeaturesMap);
      }
    }
    gateDocument.removeAnnotationSet("Tokenization");
  }
Esempio n. 6
0
  /** Annotation remove event */
  public void annotationRemoved(AnnotationSetEvent ase) {
    if (!disableListener && ase.getSourceDocument() == this) {
      AnnotationSet as = (AnnotationSet) ase.getSource();
      Annotation annot = ase.getAnnotation();
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());

      boolean defaultAS = as.getName() == null;
      for (String docID : combinedDocumentIds) {
        Document aDoc = compoundDocument.getDocument(docID);

        // find out the details which refer to the deleted annotation
        OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
        if (od == null) continue;

        if (defaultAS) {
          aDoc.getAnnotations().remove(od.getOriginalAnnotation());
        } else {
          aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
        }
        removeOffsetDetails(docID, od);
        break;
      }
    }
  }
  // generate annotations for ngrams over a larger span e.g all couples inside
  // a span of 5 tokens
  // this allows to match more variants e.g. with adjectives in the middle
  // we do not generate intermediate annotations here
  // do with only bigrams for the moment
  private void generateNGramsOverWindow(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);
    try {
      for (int b = 0; b < boxes.size(); b++) {
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;

        // create a temporary list containing all the annotations
        // at position 0
        List<Annotation> headannots = boxes.get(b);
        for (Annotation newAnn : headannots) {
          // remembering positions
          loStart = newAnn.getStartNode().getOffset();
          if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
          else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
            hiEnd = newAnn.getEndNode().getOffset();

          String string = (String) newAnn.getFeatures().get(inputAnnotationFeature);
          tempAnnotationsStartingHere.add(string);

          if (this.generateIntermediateAnnotations) {
            FeatureMap fm = Factory.newFeatureMap();
            fm.put(this.outputAnnotationFeature, string);
            outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
          }
        }

        for (int z = 1; z < window && (b + z < boxes.size()); z++) {
          // generate all possible bi-grams
          List<Annotation> current = boxes.get(b + z);
          for (Annotation newAnn : current) {
            // remembering positions
            if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
            else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
              hiEnd = newAnn.getEndNode().getOffset();

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);

            // take what is in the buffer
            // and make a new annotation out of that
            for (String s : tempAnnotationsStartingHere) {
              String combination = s + getNgramSeparator() + newString;

              // create an annotation for the combination
              FeatureMap fm = Factory.newFeatureMap();
              fm.put(this.outputAnnotationFeature, combination);
              outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
            }
          }
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Esempio n. 8
0
  /**
   * Generation of a GATE document from a Behemoth one
   *
   * @param key URL of the input doc
   * @param inputDoc
   * @return
   * @throws ResourceInstantiationException
   * @throws InvalidOffsetException
   * @throws IOException
   */
  public gate.Document generateGATEDoc(BehemothDocument inputDoc)
      throws ResourceInstantiationException, InvalidOffsetException, IOException {

    gate.Document gatedocument = null;

    // if no text is available (e.g. Tika has not extracted it)
    // let GATE do the parsing itself from the binary content
    if (inputDoc.getText() == null) {
      try {
        gatedocument = generateGATEDocFromLocalDump(inputDoc);

        // transfer the text from GATE to Behemoth
        String textContent = gatedocument.getContent().toString();
        inputDoc.setText(textContent);

        return gatedocument;
      } catch (Exception e) {
        LOG.error("Can't generate GATE doc from byte dump", e);
      }
    }

    // if the input document does not have any text -> create a doc with an
    // empty text

    String text = inputDoc.getText();
    if (inputDoc.getText() == null) text = "";
    else text = inputDoc.getText();

    gatedocument = Factory.newDocument(text);

    // then the metadata as document features
    FeatureMap docFeatures = gatedocument.getFeatures();
    String docUrl = inputDoc.getUrl();
    if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
    if (inputDoc.getMetadata() != null) {
      Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
      while (iter.hasNext()) {
        Entry<Writable, Writable> entry = iter.next();
        String skey = entry.getKey().toString().trim();
        String svalue = null;
        if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
        docFeatures.put(skey, svalue);
      }
    }

    // finally the annotations as original markups
    // TODO change the name of the annotation set via config
    AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
    for (Annotation annot : inputDoc.getAnnotations()) {
      // add to outputAS as a GATE annotation
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());
      outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
    }
    return gatedocument;
  }
  private void generateNGrams(List<Annotation> list, AnnotationSet outputAS)
      throws ExecutionException {
    List<List> boxes = generateBoxes(list, outputAS);

    try {
      // now do the actual n-grams
      for (int b = 0; b < boxes.size(); b++) {
        List<String> tempAnnotationsStartingHere = new ArrayList<String>();
        Long loStart = null;
        Long hiEnd = null;
        for (int z = 0; z < this.ngram.intValue() && (b + z < boxes.size()); z++) {
          // do the combination and dump what we've done at every step
          // e.g generate 1 grams as well as 2-grams
          List<Annotation> current = boxes.get(b + z);
          List<String> temptemp = new ArrayList<String>();
          for (Annotation newAnn : current) {
            // remembering positions
            if (loStart == null) loStart = newAnn.getStartNode().getOffset();
            if (hiEnd == null) hiEnd = newAnn.getEndNode().getOffset();
            else if (newAnn.getEndNode().getOffset().longValue() > hiEnd.longValue())
              hiEnd = newAnn.getEndNode().getOffset();

            String newString = (String) newAnn.getFeatures().get(inputAnnotationFeature);
            // TODO : what if there is no such value????
            if (tempAnnotationsStartingHere.size() == 0) {
              // create an annotation for the current annotation
              if (this.generateIntermediateAnnotations) {
                FeatureMap fm = Factory.newFeatureMap();
                fm.put(this.outputAnnotationFeature, newString);
                outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
              }
              // add it to the temp
              temptemp.add(newString);
            } else
              for (String existing : tempAnnotationsStartingHere) {
                String combination = existing + getNgramSeparator() + newString;
                temptemp.add(combination);

                if (this.generateIntermediateAnnotations | z == this.ngram.intValue() - 1) {
                  // create an annotation for the combination
                  FeatureMap fm = Factory.newFeatureMap();
                  fm.put(this.outputAnnotationFeature, combination);
                  outputAS.add(loStart, hiEnd, outputAnnotationType, fm);
                }
              }
          }
          tempAnnotationsStartingHere = temptemp;
        }
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
  protected AnnotationDiffer calculateDocumentDiff(Document document, String annotTypeName) {
    AnnotationSet responsesIter = responseAS.get(annotTypeName);

    if (getKeyAnnotationsAreInDocumentFeatures()) {
      return DocumentFeaturesDiff.computeDiffWithDocFeatures(document, featureNames, responsesIter);
    }

    AnnotationSet keysIter = keyAS.get(annotTypeName);

    AnnotationDiffer differ = new AnnotationDiffer();
    differ.setSignificantFeaturesSet(new HashSet<String>(featureNames));
    differ.calculateDiff(keysIter, responsesIter); // compare

    return differ;
  }
Esempio n. 11
0
  /** Annotation added event */
  public void annotationAdded(AnnotationSetEvent ase) {

    if (!disableListener && ase.getSourceDocument() == this) {
      AnnotationSet as = (AnnotationSet) ase.getSource();
      Annotation annot = ase.getAnnotation();
      annot.addAnnotationListener(this);

      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annot.getFeatures());

      boolean defaultAS = as.getName() == null;
      for (String docID : combinedDocumentIds) {
        Document aDoc = compoundDocument.getDocument(docID);
        long stOffset = getOffsetInSrcDocument(docID, annot.getStartNode().getOffset().longValue());
        if (stOffset == -1) continue;
        long enOffset = getOffsetInSrcDocument(docID, annot.getEndNode().getOffset().longValue());
        if (enOffset == -1) continue;
        Annotation originalAnnot = null;
        try {
          Integer id = annot.getId();
          if (defaultAS) {

            aDoc.getAnnotations()
                .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
            originalAnnot = aDoc.getAnnotations().get(id);
          } else {
            aDoc.getAnnotations(as.getName())
                .add(id, new Long(stOffset), new Long(enOffset), annot.getType(), features);
            originalAnnot = aDoc.getAnnotations(as.getName()).get(id);
          }
        } catch (InvalidOffsetException ioe) {
          System.out.println(aDoc.getName() + "=" + stOffset + "=" + enOffset);
          throw new GateRuntimeException(ioe);
        }

        OffsetDetails od = new OffsetDetails();
        od.setOldStartOffset(stOffset);
        od.setOldEndOffset(enOffset);
        od.setNewStartOffset(annot.getStartNode().getOffset().longValue());
        od.setNewEndOffset(annot.getEndNode().getOffset().longValue());
        od.setOriginalAnnotation(originalAnnot);
        od.setNewAnnotation(annot);
        addNewOffsetDetails(docID, od);
        break;
      }
    }
  }
  public void splitter() {
    AnnotationSet sDetectionAS = gateDocument.getAnnotations("SentenceDetection");
    AnnotationSet defaultAs = gateDocument.getAnnotations("");

    for (Iterator<Annotation> it = sDetectionAS.iterator(); it.hasNext(); ) {

      Annotation currentSentenceAnnotation = it.next();

      // Add the Sentence to the Annotation Set
      defaultAs.add(
          currentSentenceAnnotation.getStartNode(),
          currentSentenceAnnotation.getEndNode(),
          "Sentence",
          null);
    }
    gateDocument.removeAnnotationSet("SentenceDetection");
  }
  @Override
  public void execute() throws ExecutionException {
    Document doc = getDocument();
    AnnotationSet as = doc.getAnnotations(getAnnotationSetName());
    AnnotationSet tocs = as.get(getTokenAnnotationTypeName());

    try {

      for (Annotation t : tocs) {
        String content = Utils.stringFor(doc, t);
        String val = getOrthographyValue(content);
        if (val != null) t.getFeatures().put("orth", val);
      }

    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Esempio n. 14
0
 /**
  * Sub-range access for annotation sets (mapping to getContained). Allows <code>
  * someAnnotationSet[15..20]</code>. This works with ranges whose end points are any numeric type,
  * so as well as using integer literals you can do <code>someAnnotationSet[ann.start()..ann.end()]
  * </code> (as start and end return Long).
  *
  * @see AnnotationSet#getContained(Long, Long)
  */
 @SuppressWarnings("unchecked")
 public static AnnotationSet getAt(AnnotationSet self, Range<?> range) {
   if (range.getFrom() instanceof Number) {
     return self.getContained(
         Long.valueOf(((Number) range.getFrom()).longValue()),
         Long.valueOf(((Number) range.getTo()).longValue()));
   } else if (range.getFrom() instanceof String) {
     return getAt(self, (List<String>) range);
   } else {
     throw new IllegalArgumentException("AnnotationSet.getAt expects a numeric or string range");
   }
 }
Esempio n. 15
0
  /**
   * Two AnnotationSet are equal if their name, the documents of which belong to the AnnotationSets
   * and annotations from the sets are the same
   */
  public static boolean annotationSetsEqual(AnnotationSet as1, AnnotationSet as2) {
    if (as1 == null ^ as2 == null) return false;
    if (as1 == null) return true;
    // Sets equality
    if (as1.size() != as2.size()) return false;
    try {
      if (!as1.containsAll(as2)) return false;
    } catch (ClassCastException unused) {
      return false;
    } catch (NullPointerException unused) {
      return false;
    }

    // removed to prevent infinite looping in testDocumentsEqual()
    //    // verify the documents which they belong to
    //    if (! check (as1.getDocument(), as2.getDocument())) return false;

    // verify the name of the AnnotationSets
    if (!check(as1.getName(), as2.getName())) return false;
    return true;
  } // equals
 protected void doExecute(Document theDocument) throws ExecutionException {
   interrupted = false;
   if (theDocument == null) {
     throw new ExecutionException("No document to process!");
   }
   AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
   if (containingType == null || containingType.isEmpty()) {
     annotateText(document, outputAS, 0, document.getContent().size());
   } else {
     AnnotationSet inputAS = null;
     if (inputASName == null || inputASName.isEmpty()) {
       inputAS = theDocument.getAnnotations();
     } else {
       inputAS = theDocument.getAnnotations(inputASName);
     }
     AnnotationSet containingAnns = inputAS.get(containingType);
     for (Annotation containingAnn : containingAnns) {
       annotateText(
           document, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
     }
   }
 }
Esempio n. 17
0
  public void execute() throws ExecutionException {
    AnnotationSet outputAS = document.getAnnotations(annotationSetName);

    String text = document.getContent().toString();

    Span[] tokens = tokenizer.getTokens(text);
    try {
      for (Span token : tokens) {
        FeatureMap features = Factory.newFeatureMap();
        features.put(
            ANNIEConstants.TOKEN_STRING_FEATURE_NAME,
            text.substring(token.getStart(), token.getEnd()));

        outputAS.add(
            (long) token.getStart(),
            (long) token.getEnd(),
            ANNIEConstants.TOKEN_ANNOTATION_TYPE,
            features);
      }
    } catch (Exception e) {
      throw new ExecutionException("error running tokenizer", e);
    }
  }
  public void doit(
      gate.Document doc,
      Map<String, AnnotationSet> bindings,
      gate.AnnotationSet annotations,
      gate.AnnotationSet inputAS,
      gate.AnnotationSet outputAS,
      gate.creole.ontology.Ontology ontology)
      throws JapeException {
    // your RHS Java code will be embedded here ...
    String category = (String) doc.getFeatures().get("category");
    Annotation mention = tagAnnots.iterator().next();
    String pName = (String) mention.getFeatures().get("class");

    OntoManager om = OntoManager.getInstance();
    ProductClass catClass = om.getProductClass(category);
    ProductClass pClass = om.getProductClass(pName);
    boolean sameCat =
        pClass
            .getOntClass()
            .hasSuperClass(catClass.getOntClass()); // TODO: refactor to used direct method
    if (!sameCat) {
      outputAS.remove(mention);
    }
  }
  /**
   * @param inputAS input annotation set
   * @param outputAS output annotation set
   * @param term String matched
   * @param startOffset match start offset
   * @param endOffset match end offset
   */
  private void addLookup(
      AnnotationSet inputAS,
      AnnotationSet outputAS,
      String term,
      String outputASType,
      Long startOffset,
      Long endOffset,
      boolean useNounChunk) {
    if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
      AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
      if (!nounChunkAS.isEmpty()) {
        startOffset = nounChunkAS.firstNode().getOffset();
        endOffset = nounChunkAS.lastNode().getOffset();
      }
    }
    try {
      AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
      if (diseaseAS.isEmpty()) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("match", term);
        outputAS.add(startOffset, endOffset, outputASType, fm);
      } else {
        Annotation disease = diseaseAS.iterator().next();
        FeatureMap fm = disease.getFeatures();
        String meta = (String) fm.get("match");
        if (meta != null) {
          meta = meta + " " + term;
        }
        fm.put("match", meta);
      }

    } catch (InvalidOffsetException ie) {
      // shouldn't happen
      gate.util.Err.println(ie);
    }
  }
Esempio n. 20
0
  /** Returns a list of annotations to be added to the Behemoth document from the GATE one * */
  private List<com.digitalpebble.behemoth.Annotation> convertGATEAnnotationsToBehemoth(
      AnnotationSet GATEAnnotionSet, com.digitalpebble.behemoth.BehemothDocument behemoth) {

    List<com.digitalpebble.behemoth.Annotation> beheannotations =
        new ArrayList<com.digitalpebble.behemoth.Annotation>();

    AnnotationSet resultAS = GATEAnnotionSet.get(filters.getTypes());

    // sort the GATE annotations
    List<gate.Annotation> annotationList = new ArrayList<gate.Annotation>(resultAS);
    Collections.sort(annotationList, new OffsetComparator());
    Iterator<gate.Annotation> inputASIter = annotationList.iterator();

    while (inputASIter.hasNext()) {
      gate.Annotation source = inputASIter.next();

      com.digitalpebble.behemoth.Annotation target = new com.digitalpebble.behemoth.Annotation();
      target.setType(source.getType());
      target.setStart(source.getStartNode().getOffset().longValue());
      target.setEnd(source.getEndNode().getOffset().longValue());

      // now do the features
      // is the type listed?
      Set<String> expectedfeatnames = filters.getFeatureFilters().get(source.getType());
      if (expectedfeatnames != null) {
        Iterator featurenames = source.getFeatures().keySet().iterator();
        while (featurenames.hasNext()) {
          // cast the feature name to a string which will be right in
          // 99% of cases
          String featurename = featurenames.next().toString();
          // if this feature name is not wanted just ignore it
          if (expectedfeatnames.contains(featurename) == false) continue;
          // we know that we want to keep this feature
          // let's see what the best way of representing the value
          // would be
          // TODO later => find a better way of mapping when not a
          // string
          Object originalvalue = source.getFeatures().get(featurename);
          if (originalvalue == null) originalvalue = "null";
          target.getFeatures().put(featurename, originalvalue.toString());
        }
      }
      beheannotations.add(target);
    }
    return beheannotations;
  }
Esempio n. 21
0
  /**
   * Creates the Lookup annotations according to a gazetteer match.
   *
   * @param matchingState the final FSMState that was reached while matching.
   * @param matchedRegionStart the start of the matched text region.
   * @param matchedRegionEnd the end of the matched text region.
   * @param annotationSet the annotation set where the new annotations should be added.
   */
  protected void createLookups(
      FSMState matchingState,
      long matchedRegionStart,
      long matchedRegionEnd,
      AnnotationSet annotationSet) {
    Iterator lookupIter = matchingState.getLookupSet().iterator();
    while (lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup) lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if (null != currentLookup.oClass && null != currentLookup.ontology) {
        fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
      }

      if (null != currentLookup.minorType)
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
      if (null != currentLookup.languages)
        fm.put(LOOKUP_LANGUAGE_FEATURE_NAME, currentLookup.languages);
      if (null != currentLookup.features) {
        fm.putAll(currentLookup.features);
      }
      try {
        //        if(currentLookup.annotationType==null || "".equals(currentLookup.annotationType)){
        //          annotationSet.add(new Long(matchedRegionStart),
        //                          new Long(matchedRegionEnd + 1),
        //                          LOOKUP_ANNOTATION_TYPE,
        //                          fm);
        //        }else{
        annotationSet.add(
            new Long(matchedRegionStart),
            new Long(matchedRegionEnd + 1),
            currentLookup.annotationType, // this pojo attribute will have Lookup as a default tag.
            fm);
        // }
      } catch (InvalidOffsetException ioe) {
        throw new GateRuntimeException(ioe.toString());
      }
    } // while(lookupIter.hasNext())
  }
 /**
  * Rename annotation
  *
  * @param outputAS output annotation set
  * @param oldType old annotation name
  * @param newType new annotation name
  */
 private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
   AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
   for (Annotation tmpAnn : tmpAnatomyAS) {
     Long startOffset = tmpAnn.getStartNode().getOffset();
     Long endOffset = tmpAnn.getEndNode().getOffset();
     AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
     // If we've already got an annotation of the same name in the same place, don't add a new one
     // just delete the old one
     if (existingAS.isEmpty()) {
       FeatureMap tmpFm = tmpAnn.getFeatures();
       FeatureMap fm = Factory.newFeatureMap();
       fm.putAll(tmpFm);
       try {
         outputAS.add(startOffset, endOffset, newType, fm);
         outputAS.remove(tmpAnn);
       } catch (InvalidOffsetException ie) {
         // shouldn't happen
       }
     } else {
       outputAS.remove(tmpAnn);
     }
   }
 }
  public void execute() throws ExecutionException {

    // get all the annotations we need from the input AS
    AnnotationSet inputAS =
        inputAnnotationSet == null || inputAnnotationSet.trim().length() == 0
            ? document.getAnnotations()
            : document.getAnnotations(inputAnnotationSet);
    AnnotationSet outputAS =
        outputAnnotationSet == null || outputAnnotationSet.trim().length() == 0
            ? document.getAnnotations()
            : document.getAnnotations(outputAnnotationSet);

    // no spans?
    if (getSpanAnnotationType() == null | getSpanAnnotationType().equals("")) {
      AnnotationSet inputs = inputAS.get(inputAnnotationType);
      List<Annotation> list = new ArrayList<Annotation>();
      list.addAll(inputs);
      Collections.sort(list, new OffsetComparator());
      // use window or normal
      if (window == -1) generateNGrams(list, outputAS);
      else generateNGramsOverWindow(list, outputAS);
    } else {
      // use the spans
      AnnotationSet spans = inputAS.get(getSpanAnnotationType());
      Iterator spaniter = spans.iterator();
      while (spaniter.hasNext()) {
        Annotation span = (Annotation) spaniter.next();
        AnnotationSet inputs =
            inputAS.get(
                inputAnnotationType,
                span.getStartNode().getOffset(),
                span.getEndNode().getOffset());
        List<Annotation> list = new ArrayList<Annotation>();
        list.addAll(inputs);
        Collections.sort(list, new OffsetComparator());
        if (window == -1) generateNGrams(list, outputAS);
        else generateNGramsOverWindow(list, outputAS);
      }
    }
  }
  public void execute() throws ExecutionException {

    // get the sentence splitter file from the URL provided
    File splitter = Files.fileFromURL(splitterBinary);

    // get the document content and replace non-breaking spaces with spaces
    // TODO replace new-lines with spaces so we don't get a sentence per line
    String docContent = document.getContent().toString().replace((char) 160, ' ');

    try {
      // create temporary files to use with the external sentence splitter
      File tmpIn = File.createTempFile("GENIA", ".txt");
      File tmpOut = File.createTempFile("GENIA", ".txt");

      // store the document content in the input file
      FileOutputStream fos = new FileOutputStream(tmpIn);
      fos.write(docContent.getBytes("utf8"));
      fos.close();

      // setup the command line to run the sentence splitter
      String[] args =
          new String[] {
            splitter.getAbsolutePath(), tmpIn.getAbsolutePath(), tmpOut.getAbsolutePath()
          };

      // run the sentence splitter over the docuement
      manager.runProcess(
          args, splitter.getParentFile(), (debug ? System.out : null), (debug ? System.err : null));

      // get the annotation set we are going to store results in
      AnnotationSet annotationSet = document.getAnnotations(annotationSetName);

      // we haven't found any sentence yet so start looking for the next one
      // from the beginning of the document
      int end = 0;

      // read in the output from the sentence splitter one line at a time
      BufferedReader in = new BufferedReader(new FileReader(tmpOut));
      String sentence = in.readLine();
      while (sentence != null) {

        // trim the sentence so we don't annotate extranious white space,
        // this isn't python code after all :)
        sentence = sentence.trim();

        // find the start of the sentence
        // TODO throw a sensible exception if the sentence can't be found?
        int start = docContent.indexOf(sentence, end);

        // work out where the sentence ends
        end = start + sentence.length();

        if (end > start) {
          // the sentence has a length so annotate it
          annotationSet.add((long) start, (long) end, "Sentence", Factory.newFeatureMap());
        }

        // get the next line from the output from the tagger
        sentence = in.readLine();
      }

      // delete the temp files
      if (!debug && !tmpIn.delete()) tmpIn.deleteOnExit();
      if (!debug && !tmpOut.delete()) tmpOut.deleteOnExit();

    } catch (Exception ioe) {
      throw new ExecutionException("An error occured running the splitter", ioe);
    }
  }
  /**
   * The main entry point. First we parse the command line options (see usage() method for details),
   * then we take all remaining command line parameters to be file names to process. Each file is
   * loaded, processed using the application and the results written to the output file
   * (inputFile.out.xml).
   */
  public static void main(String[] args) throws Exception {
    parseCommandLine(args);

    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.  The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus.  It has no particular
    // significance.

    ArrayList<String> files = getFilesFromDir(inputDir);
    gate.Corpus corpus = createCorpus(files);
    // Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    System.out.println("Processing " + files.size() + " files");

    // process the files one by one
    for (int i = 0; i < files.size(); i++) {

      // load the document (using the specified encoding if one was given)
      File docFile = new File(files.get(i));
      System.out.print("Processing document " + docFile + " (" + i + ") ...");
      Document doc = Factory.newDocument(docFile.toURL(), encoding);

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      application.execute();

      // remove the document from the corpus again
      corpus.clear();

      String docXMLString = null;
      // if we want to just write out specific annotation types, we must
      // extract the annotations into a Set
      if (annotTypesToWrite != null) {
        // Create a temporary Set to hold the annotations we wish to write out
        Set annotationsToWrite = new HashSet();

        // we only extract annotations from the default (unnamed) AnnotationSet
        // in this example
        AnnotationSet defaultAnnots = doc.getAnnotations("Output");
        Iterator annotTypesIt = annotTypesToWrite.iterator();
        while (annotTypesIt.hasNext()) {
          // extract all the annotations of each requested type and add them to
          // the temporary set
          AnnotationSet annotsOfThisType = defaultAnnots.get((String) annotTypesIt.next());
          if (annotsOfThisType != null) {
            annotationsToWrite.addAll(annotsOfThisType);
          }
        }

        // create the XML string using these annotations
        docXMLString = doc.toXml(annotationsToWrite, true);
      }
      // otherwise, just write out the whole document as GateXML
      else {
        docXMLString = doc.toXml();
      }

      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);

      // output the XML to <inputFile>.out.xml
      System.out.println("Writing file " + docFile.getName());
      String outputFileName = docFile.getName() + ".out.xml";
      // File outputFile = new File(docFile.getParentFile(), outputFileName);
      File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);

      // Write output files using the same encoding as the original
      FileOutputStream fos = new FileOutputStream(outputFile);
      BufferedOutputStream bos = new BufferedOutputStream(fos);
      OutputStreamWriter out;
      if (encoding == null) {
        out = new OutputStreamWriter(bos);
      } else {
        out = new OutputStreamWriter(bos, encoding);
      }

      out.write(docXMLString);

      out.close();
      System.out.println("done");
    } // for each file

    System.out.println("All done");
  } // void main(String[] args)
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;

    // quit if setup failed
    if (gracefulExit) {
      gracefulExit("Plugin was not initialised correctly. Exiting gracefully ... ");
      return;
    }

    AnnotationSet inputAS =
        (inputASName == null || inputASName.trim().length() == 0)
            ? document.getAnnotations()
            : document.getAnnotations(inputASName);
    AnnotationSet outputAS =
        (outputASName == null || outputASName.trim().length() == 0)
            ? document.getAnnotations()
            : document.getAnnotations(outputASName);

    AnnotationSet sentenceAS = null;
    if (sentenceType != null && !sentenceType.isEmpty()) {
      sentenceAS = inputAS.get(sentenceType);
    }

    // Document content
    String docContent = document.getContent().toString();
    int docLen = docContent.length();

    // For matching purposes replace all whitespace characters with a single space
    docContent = docContent.replaceAll("[\\s\\xA0\\u2007\\u202F]", " ");

    fireStatusChanged("Locating anatomy, disease and procedure mentions in " + document.getName());
    fireProgressChanged(0);

    if (sentenceAS != null) {
      for (Annotation sentence : sentenceAS) {
        Long sentStartOffset = sentence.getStartNode().getOffset();
        Long sentEndOffset = sentence.getEndNode().getOffset();

        // Converting the sentence to lower case prevents the need to use case-insenstive regex
        // matching, which should give a small performance boost
        String sentenceContent =
            docContent
                .substring(sentStartOffset.intValue(), sentEndOffset.intValue())
                .toLowerCase(Locale.ENGLISH);

        if (diseaseType != null && !diseaseType.isEmpty()) {
          doMatch(
              patternMap.get("disease_suffix"),
              sentenceContent,
              inputAS,
              outputAS,
              "suffDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_abbrevs"),
              sentenceContent,
              inputAS,
              outputAS,
              "preDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_named_syndrome"),
              sentenceContent,
              inputAS,
              outputAS,
              "namedDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_sense"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpDiseaseSense",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_sense_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpDiseaseSenseContext",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_generic_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "poDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_anatomy_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpDisease",
              sentStartOffset,
              docLen);
        }

        if (procedureType != null && !procedureType.isEmpty()) {
          doMatch(
              patternMap.get("procedure_suffix"),
              sentenceContent,
              inputAS,
              outputAS,
              "poProcedure",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("procedure_key"),
              sentenceContent,
              inputAS,
              outputAS,
              "poProcedure",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("procedure_anatomy_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpProcedure",
              sentStartOffset,
              docLen);
        }

        if (symptomType != null && !symptomType.isEmpty()) {
          doMatch(
              patternMap.get("symptom_key"),
              sentenceContent,
              inputAS,
              outputAS,
              "poSymptom",
              sentStartOffset,
              docLen);
        }

        if (testType != null && !testType.isEmpty()) {
          doMatch(
              patternMap.get("test_key"),
              sentenceContent,
              inputAS,
              outputAS,
              "poTest",
              sentStartOffset,
              docLen);
        }

        if (anatomyType != null && !anatomyType.isEmpty()) {
          doMatch(
              patternMap.get("anatomy_suffix_adjective"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatSuffAdj",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_suffix"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatSuff",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_prefix"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatPre",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_position"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatPos",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_space_region_junction"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatSpace",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_part_adjective"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatAdj",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_latin_noun"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatLatin",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_muscle"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatMuscle",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_part"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatPart",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_fluid"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatFluid",
              sentStartOffset,
              docLen);
        }
      }
      // Run JAPE transducer to clean up the output
      fireStatusChanged(
          "Processing anatomical, disease and procedure mentions in " + document.getName());
      try {
        japeTransducer.setDocument(document);
        japeTransducer.setInputASName(inputASName);
        japeTransducer.setOutputASName(outputASName);
        japeTransducer.addProgressListener(this);
        japeTransducer.execute();
      } catch (ExecutionException re) {
        gate.util.Err.println("Unable to run " + japeURL);
        gracefulExit = true;
      } finally {
        japeTransducer.setDocument(null);
      }
      // rename temporary annotations
      if (!debug) {
        renameAnnotations(outputAS, "tmpAnatomicalTerm", anatomyType);
        renameAnnotations(outputAS, "suffDisease", diseaseType);
        renameAnnotations(outputAS, "poDisease", diseaseType);
        renameAnnotations(outputAS, "preDisease", diseaseType);
        renameAnnotations(outputAS, "poProcedure", procedureType);
        renameAnnotations(outputAS, "poSymptom", symptomType);
        renameAnnotations(outputAS, "poTest", testType);
      }
    } else {
      gracefulExit("No sentences to process!");
    }

    // want list of disease key words plus symptoms such as oedema? or just diseases

    fireProcessFinished();
  } // end execute()
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", new Boolean(true));
      params.put("collectRepositioningInfo", new Boolean(true));
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set annotTypesRequired = new HashSet();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null && info != null) {
        Out.prln("OrigContent and reposInfo existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionStart = info.getOriginalPos(insertPositionStart);
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } // if - should generate
      else if (originalContent != null) {
        Out.prln("OrigContent existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = new String("StANNIE_toXML_" + count + ".HTML");
      FileWriter writer = new FileWriter(fileName);
      writer.write(xmlDocument);
      writer.close();
    } // for each doc
  } // main
Esempio n. 28
0
  @Override
  public void execute() throws ExecutionException {
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);
    AnnotationSet tagAS = document.getAnnotations(tagASName);
    AnnotationSet annotsToTransfer = null;

    boolean newID = copyAnnotations && inputAS.equals(outputAS);

    mappings.clear();

    // TODO clean this up so we don't have to repeat ourselves
    if (configURL != null) {

      BufferedReader in = null;
      try {
        in = new BomStrippingInputStreamReader(configURL.openStream());

        String line = in.readLine();
        while (line != null) {
          if (!line.trim().equals("")) {
            String[] data = line.split("=", 2);
            String oldName = data[0].trim();
            String newName = data.length == 2 ? data[1].trim() : null;
            mappings.put(oldName, new Mapping(oldName, newName));
          }
          line = in.readLine();
        }
      } catch (IOException ioe) {
        ioe.printStackTrace();
      } finally {
        IOUtils.closeQuietly(in);
      }
    } else if (annotationTypes != null) {
      for (String type : annotationTypes) {
        String[] data = type.split("=", 2);
        String oldName = data[0].trim();
        String newName = data.length == 2 ? data[1].trim() : null;

        mappings.put(oldName, new Mapping(oldName, newName));
      }
    }
    // else
    // throw new
    // ExecutionException("The annotation list and URL cannot both be null");

    if (mappings.size() > 0) {
      annotsToTransfer = inputAS.get(mappings.keySet());
    } else {
      // transfer everything
      annotsToTransfer = inputAS.get();
    }
    // in case of no one annotation from some of annotationTypes
    if (annotsToTransfer == null || annotsToTransfer.size() == 0) return;
    // check if we have a BODY annotation
    // if not, just copy all
    if (textTagName == null || textTagName.equals("")) {
      // remove from input set unless we copy only
      if (!copyAnnotations) inputAS.removeAll(annotsToTransfer);
      transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);

      return;
    }
    // get the BODY annotation
    bodyAnnotations = tagAS.get(textTagName);
    if (bodyAnnotations == null || bodyAnnotations.isEmpty()) {
      // outputAS.addAll(inputAS);
      if (transferAllUnlessFound) {
        // remove from input set unless we copy only
        if (!copyAnnotations) inputAS.removeAll(annotsToTransfer);
        transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);
      }
      return;
    }
    List<Annotation> annots2Move = new ArrayList<Annotation>();
    Iterator<Annotation> bodyIter = bodyAnnotations.iterator();
    while (bodyIter.hasNext()) {
      Annotation bodyAnn = bodyIter.next();
      Long start = bodyAnn.getStartNode().getOffset();
      Long end = bodyAnn.getEndNode().getOffset();
      // get all annotations we want transferred
      AnnotationSet annots2Copy = annotsToTransfer.getContained(start, end);
      // copy them to the new set and delete them from the old one
      annots2Move.addAll(annots2Copy);
    }
    if (!copyAnnotations) inputAS.removeAll(annots2Move);
    transferAnnotations(annots2Move, outputAS, newID);
  }
 /**
  * This method annotates paragraphs in a GATE document. The investigated text spans beetween start
  * and end offsets and the paragraph annotations are created in the annotSetName. If annotSetName
  * is null then they are creted in the default annotation set.
  *
  * @param aDoc is the gate document on which the paragraph detection would be performed.If it is
  *     null or its content it's null then the method woul simply return doing nothing.
  * @param startOffset is the index form the document content from which the paragraph detection
  *     will start
  * @param endOffset is the offset where the detection will end.
  * @param annotSetName is the name of the set in which paragraph annotation would be created.The
  *     annotation type created will be "paragraph"
  */
 public void annotateParagraphs(Document aDoc, int startOffset, int endOffset, String annotSetName)
     throws DocumentFormatException {
   // Simply return if the document is null or its content
   if (aDoc == null || aDoc.getContent() == null) return;
   // Simply return if the start is > than the end
   if (startOffset > endOffset) return;
   // Decide where to put the newly detected annotations
   AnnotationSet annotSet = null;
   if (annotSetName == null) annotSet = aDoc.getAnnotations();
   else annotSet = aDoc.getAnnotations(annotSetName);
   // Extract the document content
   String content = aDoc.getContent().toString();
   // This is the offset marking the start of a para
   int startOffsetPara = startOffset;
   // This marks the ned of a para
   int endOffsetPara = endOffset;
   // The initial sate of the FSA
   int state = 1;
   // This field marks that a BR entity was read
   // A BR entity can be NL or NL CR, depending on the operating system (UNIX
   // or DOS)
   boolean readBR = false;
   int index = startOffset;
   while (index < endOffset) {
     // Read the current char
     char ch = content.charAt(index);
     // Test if a BR entity was read
     if (ch == '\n') {
       readBR = true;
       // If \n is followed by a \r then advance the index in order to read a
       // BR entity
       while ((index + 1 < endOffset) && (content.charAt(index + 1) == '\r')) index++;
     } // End if
     switch (state) {
         // It is the initial and also a final state
         // Stay in state 1 while it reads whitespaces
       case 1:
         {
           // If reads a non whitespace char then move to state 2 and record
           // the beggining of a paragraph
           if (!Character.isWhitespace(ch)) {
             state = 2;
             startOffsetPara = index;
           } // End if
         }
         break;
         // It can be also a final state.
       case 2:
         {
           // Stay in state 2 while reading chars != BR entities
           if (readBR) {
             // If you find a BR char go to state 3. The possible end of the para
             // can be index. This will be confirmed by state 3. So, this is why
             // the end of a para is recorded here.
             readBR = false;
             endOffsetPara = index;
             state = 3;
           } // End if
         }
         break;
         // It can be also a final state
         // From state 3 there are only 2 possible ways: (state 2 or state1)
         // In state 1 it needs to read a BR
         // For state 2 it nead to read something different then a BR
       case 3:
         {
           if (readBR) {
             // A BR was read. Go to state 1
             readBR = false;
             state = 1;
             // Create an annotation type paragraph
             try {
               annotSet.add(
                   new Long(startOffsetPara),
                   new Long(endOffsetPara),
                   "paragraph",
                   Factory.newFeatureMap());
             } catch (gate.util.InvalidOffsetException ioe) {
               throw new DocumentFormatException(
                   "Coudn't create a paragraph" + " annotation", ioe);
             } // End try
           } else {
             // Go to state 2 an keep reading chars
             state = 2;
           } // End if
         }
         break;
     } // End switch
     // Prepare to read the next char.
     index++;
   } // End while
   endOffsetPara = index;
   // Investigate where the finite automata has stoped
   if (state == 2 || state == 3) {
     // Create an annotation type paragraph
     try {
       annotSet.add(
           new Long(startOffsetPara),
           // Create the final annotation using the endOffset
           new Long(endOffsetPara),
           "paragraph",
           Factory.newFeatureMap());
     } catch (gate.util.InvalidOffsetException ioe) {
       throw new DocumentFormatException("Coudn't create a paragraph" + " annotation", ioe);
     } // End try
   } // End if
 } // End annotateParagraphs();
Esempio n. 30
0
  private RetObj ProcessRecords() throws Exception {
    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    this.application.setCorpus(corpus);

    // object for returned data
    List<String> processedlines = new ArrayList<String>();
    List<String> processedText = new ArrayList<String>();

    for (int record_num = 0; record_num < this.recs.size(); ++record_num) {
      /*if( record_num % Math.ceil(((double) this.recs.size())/10.0) == 0)
           System.out.println("Thread " + this.threadID + ": "+ ((int) ((double)record_num)/((double) this.recs.size())*100.0 ) +"% complete.");
      */

      // first, split title from body and get embedded age in title..
      String title_age = "-1";
      String sep = "..THIS IS MY SEPARATION STRING..";
      String title = "";
      String body = this.recs.get(record_num);
      Boolean trimmed = false;
      int age_end = body.indexOf(",>           ");
      if (age_end >= 0 && age_end < body.length()) {
        int age_start = body.lastIndexOf("-", age_end);
        if (age_start >= 0 && age_start < age_end) {
          title_age = body.substring(age_start + 1, age_end).trim();
          if (!isInteger(title_age)) title_age = "-1";
          else {
            title = body.substring(0, age_start);
            body = body.substring(age_end + 2, body.length());
            body = title + sep + body;
            trimmed = true;
          }
        }
        if (!trimmed) {
          title = body.substring(0, age_end);
          body = body.substring(age_end + 2, body.length());
          body = title + sep + body;
          trimmed = true;
        }
      }
      // --------------------

      org.jsoup.nodes.Document htmldoc =
          Jsoup.parseBodyFragment(body.replaceAll("COMMA_GOES_HERE", ","));
      Elements links = htmldoc.select("a[href]");
      Elements media = htmldoc.select("[src]");
      Elements imports = htmldoc.select("link[href]");

      processedText.add(htmldoc.text().replace(sep, " "));
      Document doc = Factory.newDocument(htmldoc.text());

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      this.application.execute();

      // remove the document from the corpus again
      corpus.clear();

      // extract annotations
      String line = "";
      AnnotationSet Annots = doc.getAnnotations("");

      Integer FirstPersonCount = 0, ThirdPersonCount = 0;
      AnnotationSet FirstPerson = Annots.get("FirstPerson");
      if (FirstPerson != null) FirstPersonCount = FirstPerson.size();
      AnnotationSet ThirdPerson = Annots.get("ThirdPerson");
      if (ThirdPerson != null) ThirdPersonCount = ThirdPerson.size();
      line += FirstPersonCount.toString() + "," + ThirdPersonCount.toString() + ",";

      AnnotationSet Names = Annots.get("Name");
      if (Names == null || Names.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Names.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("name");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Age = Annots.get("Age");
      if (Age == null || Age.size() < 1) line += title_age + ",";
      else {
        Iterator<Annotation> Iter = Age.inDocumentOrder().iterator();
        line += title_age + ";";
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("age");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Cost = Annots.get("Cost");
      if (Cost == null || Cost.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Cost.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_type");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet height = Annots.get("height");
      if (height == null || height.size() < 1) line += ",,";
      else {
        String ft = "";
        String inch = "";
        Iterator<Annotation> Iter = height.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("feet");
          if (Feat != null) ft += Feat.toString();
          else ft += "none";
          Feat = Ann.getFeatures().get("inches");
          if (Feat != null) inch += Feat.toString();
          else inch += "none";
          if (Iter.hasNext()) {
            ft += ";";
            inch += ";";
          }
        }
        line += ft + "," + inch + ",";
      }

      AnnotationSet weight = Annots.get("weight");
      if (weight == null || weight.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = weight.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("pounds");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet measurement = Annots.get("measurement");
      if (measurement == null || measurement.size() < 1) line += ",,,,";
      else {
        String cup = "";
        String chest = "";
        String waist = "";
        String hip = "";
        Iterator<Annotation> Iter = measurement.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("cup");
          if (Feat != null) cup += Feat.toString();
          else cup += "none";
          Feat = Ann.getFeatures().get("chest");
          if (Feat != null) chest += Feat.toString();
          else chest += "none";
          Feat = Ann.getFeatures().get("waist");
          if (Feat != null) waist += Feat.toString();
          else waist += "none";
          Feat = Ann.getFeatures().get("hip");
          if (Feat != null) hip += Feat.toString();
          else hip += "none";
          if (Iter.hasNext()) {
            cup += ";";
            chest += ";";
            waist += ";";
            hip += ";";
          }
        }
        line += cup + "," + chest + "," + waist + "," + hip + ",";
      }

      AnnotationSet Ethnicity = Annots.get("Ethnicity");
      if (Ethnicity == null || Ethnicity.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Ethnicity.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet SkinColor = Annots.get("SkinColor");
      if (SkinColor == null || SkinColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = SkinColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet EyeColor = Annots.get("EyeColor");
      if (EyeColor == null || EyeColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = EyeColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet HairColor = Annots.get("HairColor");
      if (HairColor == null || HairColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = HairColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Restriction = Annots.get("Restriction");
      if (Restriction == null || Restriction.size() < 1) line += ",,,";
      else {
        String type = "";
        String ethnicity = "";
        String age = "";
        Iterator<Annotation> Iter = Restriction.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("type");
          if (Feat != null) type += Feat.toString();
          else type += "none";
          Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            ethnicity += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else ethnicity += "none";
          Feat = Ann.getFeatures().get("age");
          if (Feat != null) age += Feat.toString();
          else age += "none";
          if (Iter.hasNext()) {
            type += ";";
            ethnicity += ";";
            age += ";";
          }
        }
        line += type + "," + ethnicity + "," + age + ",";
      }

      AnnotationSet Phone = Annots.get("PhoneNumber");
      if (Phone == null || Phone.size() < 1) line += ",,,";
      else {
        String value = "";
        String state = "";
        String city = "";
        Iterator<Annotation> Iter = Phone.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) value += Feat.toString();
          else value += "none";
          Feat = Ann.getFeatures().get("state");
          if (Feat != null) state += Feat.toString();
          else state += "none";
          Feat = Ann.getFeatures().get("area");
          if (Feat != null)
            city += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else city += "none";
          if (Iter.hasNext()) {
            value += ";";
            state += ";";
            city += ";";
          }
        }
        line += value + "," + state + "," + city + ",";
      }

      String Emails = "";
      AnnotationSet Email = Annots.get("Email");
      if (Email == null || Email.size() < 1) Emails = "";
      else {
        Iterator<Annotation> Iter = Email.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("email");
          if (Feat != null)
            Emails += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Emails +=
                href.substring(7, href.length()).replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (Emails.length() > 0 && Emails.substring(Emails.length() - 1, Emails.length()).equals(";"))
        Emails = Emails.substring(0, Emails.length() - 1);
      line += Emails + ",";

      String Urls = "";
      AnnotationSet Url = Annots.get("Url");
      if (Url == null || Url.size() < 1) Urls = "";
      else {
        Iterator<Annotation> Iter = Url.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("url");
          if (Feat != null)
            Urls += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() <= 7 || !href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (imports != null) {
        for (Element l : imports) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Urls.length() > 0 && Urls.substring(Urls.length() - 1, Urls.length()).equals(";"))
        Urls = Urls.substring(0, Urls.length() - 1);
      line += Urls + ",";

      String Medias = "";
      if (media != null) {
        for (Element l : media) {
          String src = l.attr("abs:src");
          if (src == null) continue;
          Medias += src.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Medias.length() > 0 && Medias.substring(Medias.length() - 1, Medias.length()).equals(";"))
        Medias = Medias.substring(0, Medias.length() - 1);
      line += Medias;

      processedlines.add(line);
      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);
    }
    Factory.deleteResource(corpus);

    RetObj out = new RetObj(processedlines, processedText);
    return out;
  }