コード例 #1
0
  /**
   * Evaluates the configured query over the input annotations and writes one output annotation
   * per named matching node.
   *
   * <p>The tree index is built from the annotations returned by {@code inputAS.get(null, {"args"})}.
   * For every node of every query match whose query node has a non-null name, an annotation of
   * that name is added to {@code outputAS}, spanning the input annotation whose id equals the
   * matched node id. Each added annotation carries two features: {@code matchingNodeId} and
   * {@code queryMatchId}, the latter formatted as {@code <query string hash>_<ordinal>} where the
   * ordinal starts at 1 and is zero-padded to three digits.
   *
   * @throws ExecutionException declared by the overridden method
   */
  @Override
  public void execute() throws ExecutionException {
    initBeforeExecute();

    AnnotationSet sourceAnnots = inputAS;

    // Index only the annotations selected by the "args" feature-name filter.
    TreeIndex treeIndex =
        new GateAwareTreeIndex(
            sourceAnnots.get(null, Utils.setFromArray(new String[] {"args"})));
    QueryData queryData =
        new QueryData(treeIndex, new GateAnnotationsNodeAttributes(sourceAnnots));

    Iterable<QueryMatch> queryMatches = queryObject.evaluate(queryData);

    // Ordinal of the current query match; incremented before use, so the first match is 1.
    int matchOrdinal = 0;
    for (QueryMatch queryMatch : queryMatches) {
      matchOrdinal++;
      for (NodeMatch nodeMatch : queryMatch.getMatchingNodes()) {
        String annotType = nodeMatch.getQueryNode().getName();
        if (annotType == null) {
          // Unnamed query nodes produce no output annotation.
          continue;
        }

        Annotation matched = sourceAnnots.get(nodeMatch.getNodeId());
        FeatureMap features = Factory.newFeatureMap();
        features.put("matchingNodeId", nodeMatch.getNodeId());
        String matchId =
            String.format("%s_%03d", buildQueryStringHash(getQueryString()), matchOrdinal);
        features.put("queryMatchId", matchId);
        outputAS.add(matched.getStartNode(), matched.getEndNode(), annotType, features);
      }
    }
  }
コード例 #2
0
  /**
   * Computes the annotation diff for one annotation type of a document.
   *
   * <p>The response annotations of the given type are always read from {@code responseAS}. When
   * {@code getKeyAnnotationsAreInDocumentFeatures()} is true the diff is delegated to
   * {@code DocumentFeaturesDiff.computeDiffWithDocFeatures}; otherwise the key annotations of the
   * same type are read from {@code keyAS} and compared with an {@code AnnotationDiffer} whose
   * significant features are {@code featureNames}.
   *
   * @param document the document being evaluated; only passed on to the document-features diff
   * @param annotTypeName annotation type to compare
   * @return the differ holding the comparison result
   */
  protected AnnotationDiffer calculateDocumentDiff(Document document, String annotTypeName) {
    AnnotationSet responses = responseAS.get(annotTypeName);

    if (!getKeyAnnotationsAreInDocumentFeatures()) {
      // Keys live in an annotation set: compare key and response annotations directly.
      AnnotationSet keys = keyAS.get(annotTypeName);
      AnnotationDiffer result = new AnnotationDiffer();
      result.setSignificantFeaturesSet(new HashSet<String>(featureNames));
      result.calculateDiff(keys, responses);
      return result;
    }

    // Keys are stored as document features.
    return DocumentFeaturesDiff.computeDiffWithDocFeatures(document, featureNames, responses);
  }
コード例 #3
0
  /**
   * Generates n-grams from the input annotations of the current document.
   *
   * <p>The input and output annotation sets are looked up by name, falling back to the document's
   * default set when the configured name is null or blank. When no span annotation type is
   * configured (null or empty), all annotations of {@code inputAnnotationType} are sorted with
   * {@code OffsetComparator} and processed in one pass. Otherwise the same is done once per span
   * annotation, using the annotations returned by
   * {@code inputAS.get(inputAnnotationType, spanStart, spanEnd)}. A {@code window} of {@code -1}
   * selects {@code generateNGrams}; any other value selects {@code generateNGramsOverWindow}.
   *
   * @throws ExecutionException declared by the method signature
   */
  public void execute() throws ExecutionException {

    // get all the annotations we need from the input AS
    AnnotationSet inputAS =
        inputAnnotationSet == null || inputAnnotationSet.trim().length() == 0
            ? document.getAnnotations()
            : document.getAnnotations(inputAnnotationSet);
    AnnotationSet outputAS =
        outputAnnotationSet == null || outputAnnotationSet.trim().length() == 0
            ? document.getAnnotations()
            : document.getAnnotations(outputAnnotationSet);

    // no spans?
    // Short-circuit '||' is required here: with '|' the equals() call was still evaluated for a
    // null span type and threw a NullPointerException instead of taking this branch.
    if (getSpanAnnotationType() == null || getSpanAnnotationType().equals("")) {
      AnnotationSet inputs = inputAS.get(inputAnnotationType);
      List<Annotation> list = new ArrayList<Annotation>(inputs);
      Collections.sort(list, new OffsetComparator());
      // use window or normal
      if (window == -1) {
        generateNGrams(list, outputAS);
      } else {
        generateNGramsOverWindow(list, outputAS);
      }
    } else {
      // use the spans: process the input annotations of each span separately
      AnnotationSet spans = inputAS.get(getSpanAnnotationType());
      for (Annotation span : spans) {
        AnnotationSet inputs =
            inputAS.get(
                inputAnnotationType,
                span.getStartNode().getOffset(),
                span.getEndNode().getOffset());
        List<Annotation> list = new ArrayList<Annotation>(inputs);
        Collections.sort(list, new OffsetComparator());
        if (window == -1) {
          generateNGrams(list, outputAS);
        } else {
          generateNGramsOverWindow(list, outputAS);
        }
      }
    }
  }
コード例 #4
0
  /**
   * Returns a list of annotations to be added to the Behemoth document from the GATE one.
   *
   * <p>Only annotations whose type is listed in {@code filters.getTypes()} are converted. They
   * are sorted with {@code OffsetComparator} and copied with their type and start/end offsets.
   * Features are copied only when {@code filters.getFeatureFilters()} has an entry for the
   * annotation type, and then only the features named in that entry. Feature names and values
   * are stored as strings; a null value is stored as the string {@code "null"}.
   *
   * @param GATEAnnotionSet the GATE annotation set to convert
   * @param behemoth the Behemoth document the annotations are meant for; not read by this method
   * @return the converted annotations, in the order produced by {@code OffsetComparator}
   */
  private List<com.digitalpebble.behemoth.Annotation> convertGATEAnnotationsToBehemoth(
      AnnotationSet GATEAnnotionSet, com.digitalpebble.behemoth.BehemothDocument behemoth) {

    List<com.digitalpebble.behemoth.Annotation> beheannotations =
        new ArrayList<com.digitalpebble.behemoth.Annotation>();

    AnnotationSet resultAS = GATEAnnotionSet.get(filters.getTypes());

    // sort the GATE annotations
    List<gate.Annotation> annotationList = new ArrayList<gate.Annotation>(resultAS);
    Collections.sort(annotationList, new OffsetComparator());

    for (gate.Annotation source : annotationList) {
      com.digitalpebble.behemoth.Annotation target = new com.digitalpebble.behemoth.Annotation();
      target.setType(source.getType());
      target.setStart(source.getStartNode().getOffset().longValue());
      target.setEnd(source.getEndNode().getOffset().longValue());

      // now do the features
      // is the type listed?
      Set<String> expectedfeatnames = filters.getFeatureFilters().get(source.getType());
      if (expectedfeatnames != null) {
        for (Object featurekey : source.getFeatures().keySet()) {
          // the feature name is the string form of the key, which will be the key itself in
          // 99% of cases
          String featurename = featurekey.toString();
          // if this feature name is not wanted just ignore it
          if (!expectedfeatnames.contains(featurename)) {
            continue;
          }
          // Look the value up with the original key object: looking it up with the string form
          // missed the entry whenever the key was not a String and stored "null" instead.
          // TODO later => find a better way of mapping when not a string
          Object originalvalue = source.getFeatures().get(featurekey);
          target
              .getFeatures()
              .put(featurename, originalvalue == null ? "null" : originalvalue.toString());
        }
      }
      beheannotations.add(target);
    }
    return beheannotations;
  }
コード例 #5
0
  /**
   * Sets the {@code orth} feature on the token annotations of the current document.
   *
   * <p>Tokens are the annotations of type {@code getTokenAnnotationTypeName()} in the annotation
   * set named {@code getAnnotationSetName()}. For each token the covered document text is passed
   * to {@code getOrthographyValue}; a non-null result is stored under the feature {@code orth},
   * a null result leaves the token untouched.
   *
   * @throws ExecutionException wrapping any exception raised while processing the tokens
   */
  @Override
  public void execute() throws ExecutionException {
    Document currentDoc = getDocument();
    AnnotationSet tokens =
        currentDoc.getAnnotations(getAnnotationSetName()).get(getTokenAnnotationTypeName());

    try {
      for (Annotation token : tokens) {
        String orth = getOrthographyValue(Utils.stringFor(currentDoc, token));
        if (orth == null) {
          continue;
        }
        token.getFeatures().put("orth", orth);
      }
    } catch (Exception cause) {
      throw new ExecutionException(cause);
    }
  }
コード例 #6
0
 /**
  * Annotates the text of the given document.
  *
  * <p>When no containing annotation type is configured (null or empty) the whole document
  * content is passed to {@code annotateText}. Otherwise {@code annotateText} is called once for
  * the span of every annotation of type {@code containingType} in the input annotation set (the
  * default set when {@code inputASName} is null or empty). Results go to the annotation set
  * named by {@code getOutputAnnotationSet()}.
  *
  * @param theDocument the document to process
  * @throws ExecutionException if {@code theDocument} is null
  */
 protected void doExecute(Document theDocument) throws ExecutionException {
   interrupted = false;
   if (theDocument == null) {
     throw new ExecutionException("No document to process!");
   }
   AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
   // Use the parameter consistently: the annotation sets are taken from theDocument, so the text
   // handed to annotateText must come from the same document rather than from the 'document'
   // field, which was neither null-checked nor guaranteed to be the same object.
   if (containingType == null || containingType.isEmpty()) {
     annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
   } else {
     AnnotationSet inputAS = null;
     if (inputASName == null || inputASName.isEmpty()) {
       inputAS = theDocument.getAnnotations();
     } else {
       inputAS = theDocument.getAnnotations(inputASName);
     }
     AnnotationSet containingAnns = inputAS.get(containingType);
     for (Annotation containingAnn : containingAnns) {
       annotateText(
           theDocument, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
     }
   }
 }
 /**
  * Renames all annotations of one type in an annotation set.
  *
  * <p>For each annotation of type {@code oldType}, a new annotation of type {@code newType} with
  * a copy of the same features is added over the same offsets and the old annotation is
  * removed. If an annotation of type {@code newType} already covers those offsets, no new
  * annotation is added and the old one is simply removed.
  *
  * @param outputAS output annotation set
  * @param oldType old annotation name
  * @param newType new annotation name
  */
 private void renameAnnotations(AnnotationSet outputAS, String oldType, String newType) {
   AnnotationSet tmpAnatomyAS = outputAS.get(oldType);
   for (Annotation tmpAnn : tmpAnatomyAS) {
     Long startOffset = tmpAnn.getStartNode().getOffset();
     Long endOffset = tmpAnn.getEndNode().getOffset();
     AnnotationSet existingAS = outputAS.getCovering(newType, startOffset, endOffset);
     // If we've already got an annotation of the new name covering the same place, don't add a
     // new one, just delete the old one
     if (existingAS.isEmpty()) {
       FeatureMap tmpFm = tmpAnn.getFeatures();
       FeatureMap fm = Factory.newFeatureMap();
       fm.putAll(tmpFm);
       try {
         outputAS.add(startOffset, endOffset, newType, fm);
         outputAS.remove(tmpAnn);
       } catch (InvalidOffsetException ie) {
         // shouldn't happen, as the offsets come from an existing annotation; report it rather
         // than swallowing it silently. The old annotation is left in place in this case.
         gate.util.Err.println(ie);
       }
     } else {
       outputAS.remove(tmpAnn);
     }
   }
 }
  /**
   * Records a matched term as an annotation of the given type.
   *
   * <p>When noun-chunk expansion is requested and a noun chunk type is configured, the span is
   * widened to the first and last node of the noun chunk annotations covering the match. If
   * {@code inputAS} has no annotation of type {@code outputASType} in the resulting span, a new
   * one is added to {@code outputAS} with the feature {@code match} set to the term. Otherwise
   * the term is appended, space-separated, to the {@code match} feature of the first such
   * annotation.
   *
   * <p>NOTE(review): the existence check reads {@code inputAS} while new annotations are added
   * to {@code outputAS}; confirm this is intended when the two sets differ.
   *
   * @param inputAS input annotation set
   * @param outputAS output annotation set
   * @param term String matched
   * @param outputASType type of the annotation to create or update
   * @param startOffset match start offset
   * @param endOffset match end offset
   * @param useNounChunk whether to widen the span to a covering noun chunk
   */
  private void addLookup(
      AnnotationSet inputAS,
      AnnotationSet outputAS,
      String term,
      String outputASType,
      Long startOffset,
      Long endOffset,
      boolean useNounChunk) {
    if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
      AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
      if (!nounChunkAS.isEmpty()) {
        startOffset = nounChunkAS.firstNode().getOffset();
        endOffset = nounChunkAS.lastNode().getOffset();
      }
    }
    try {
      AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
      if (diseaseAS.isEmpty()) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("match", term);
        outputAS.add(startOffset, endOffset, outputASType, fm);
      } else {
        Annotation disease = diseaseAS.iterator().next();
        FeatureMap fm = disease.getFeatures();
        String meta = (String) fm.get("match");
        // An existing annotation without a "match" feature starts its list with this term;
        // previously a null value was stored and the term was lost.
        fm.put("match", meta == null ? term : meta + " " + term);
      }

    } catch (InvalidOffsetException ie) {
      // shouldn't happen
      gate.util.Err.println(ie);
    }
  }
  /**
   * Tags anatomy, disease, procedure, symptom and test mentions in the current document.
   *
   * <p>Exits early via {@code gracefulExit(String)} when the {@code gracefulExit} flag is set.
   * Otherwise, for every annotation of {@code sentenceType} in the input set, the lower-cased
   * sentence text (with whitespace characters normalised to plain spaces) is run through the
   * patterns of each enabled category using {@code doMatch}; a category is enabled when its
   * output type field is non-null and non-empty. After all sentences are processed the JAPE
   * transducer is run over the document and, unless {@code debug} is set, the temporary
   * annotation types are renamed to the configured output types. When no sentence type is
   * configured, {@code gracefulExit(String)} is called instead.
   *
   * @throws ExecutionException declared by the overridden method
   */
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;

    // quit if setup failed
    if (gracefulExit) {
      gracefulExit("Plugin was not initialised correctly. Exiting gracefully ... ");
      return;
    }

    // Blank or missing set names select the document's default annotation set.
    boolean useDefaultInput = inputASName == null || inputASName.trim().length() == 0;
    AnnotationSet inputAS =
        useDefaultInput ? document.getAnnotations() : document.getAnnotations(inputASName);
    boolean useDefaultOutput = outputASName == null || outputASName.trim().length() == 0;
    AnnotationSet outputAS =
        useDefaultOutput ? document.getAnnotations() : document.getAnnotations(outputASName);

    AnnotationSet sentenceAS =
        (sentenceType == null || sentenceType.isEmpty()) ? null : inputAS.get(sentenceType);

    // Document content and its length
    String docContent = document.getContent().toString();
    int docLen = docContent.length();

    // For matching purposes replace all whitespace characters with a single space
    docContent = docContent.replaceAll("[\\s\\xA0\\u2007\\u202F]", " ");

    fireStatusChanged("Locating anatomy, disease and procedure mentions in " + document.getName());
    fireProgressChanged(0);

    if (sentenceAS == null) {
      gracefulExit("No sentences to process!");
    } else {
      // Each rule is {key into patternMap, annotation type passed to doMatch}; the rules of a
      // category are applied in the order listed.
      final String[][] diseaseRules = {
        {"disease_suffix", "suffDisease"},
        {"disease_abbrevs", "preDisease"},
        {"disease_named_syndrome", "namedDisease"},
        {"disease_sense", "tmpDiseaseSense"},
        {"disease_sense_context", "tmpDiseaseSenseContext"},
        {"disease_generic_context", "poDisease"},
        {"disease_anatomy_context", "tmpDisease"}
      };
      final String[][] procedureRules = {
        {"procedure_suffix", "poProcedure"},
        {"procedure_key", "poProcedure"},
        {"procedure_anatomy_context", "tmpProcedure"}
      };
      final String[][] symptomRules = {{"symptom_key", "poSymptom"}};
      final String[][] testRules = {{"test_key", "poTest"}};
      final String[][] anatomyRules = {
        {"anatomy_suffix_adjective", "tmpAnatSuffAdj"},
        {"anatomy_suffix", "tmpAnatSuff"},
        {"anatomy_prefix", "tmpAnatPre"},
        {"anatomy_position", "tmpAnatPos"},
        {"anatomy_space_region_junction", "tmpAnatSpace"},
        {"anatomy_part_adjective", "tmpAnatAdj"},
        {"anatomy_latin_noun", "tmpAnatLatin"},
        {"anatomy_muscle", "tmpAnatMuscle"},
        {"anatomy_part", "tmpAnatPart"},
        {"anatomy_fluid", "tmpAnatFluid"}
      };

      for (Annotation sentence : sentenceAS) {
        Long sentStartOffset = sentence.getStartNode().getOffset();
        Long sentEndOffset = sentence.getEndNode().getOffset();

        // Converting the sentence to lower case prevents the need to use case-insensitive regex
        // matching, which should give a small performance boost
        String rawSentence =
            docContent.substring(sentStartOffset.intValue(), sentEndOffset.intValue());
        String sentenceContent = rawSentence.toLowerCase(Locale.ENGLISH);

        if (diseaseType != null && !diseaseType.isEmpty()) {
          for (String[] rule : diseaseRules) {
            doMatch(
                patternMap.get(rule[0]),
                sentenceContent,
                inputAS,
                outputAS,
                rule[1],
                sentStartOffset,
                docLen);
          }
        }

        if (procedureType != null && !procedureType.isEmpty()) {
          for (String[] rule : procedureRules) {
            doMatch(
                patternMap.get(rule[0]),
                sentenceContent,
                inputAS,
                outputAS,
                rule[1],
                sentStartOffset,
                docLen);
          }
        }

        if (symptomType != null && !symptomType.isEmpty()) {
          for (String[] rule : symptomRules) {
            doMatch(
                patternMap.get(rule[0]),
                sentenceContent,
                inputAS,
                outputAS,
                rule[1],
                sentStartOffset,
                docLen);
          }
        }

        if (testType != null && !testType.isEmpty()) {
          for (String[] rule : testRules) {
            doMatch(
                patternMap.get(rule[0]),
                sentenceContent,
                inputAS,
                outputAS,
                rule[1],
                sentStartOffset,
                docLen);
          }
        }

        if (anatomyType != null && !anatomyType.isEmpty()) {
          for (String[] rule : anatomyRules) {
            doMatch(
                patternMap.get(rule[0]),
                sentenceContent,
                inputAS,
                outputAS,
                rule[1],
                sentStartOffset,
                docLen);
          }
        }
      }

      // Run JAPE transducer to clean up the output
      fireStatusChanged(
          "Processing anatomical, disease and procedure mentions in " + document.getName());
      try {
        japeTransducer.setDocument(document);
        japeTransducer.setInputASName(inputASName);
        japeTransducer.setOutputASName(outputASName);
        japeTransducer.addProgressListener(this);
        japeTransducer.execute();
      } catch (ExecutionException re) {
        gate.util.Err.println("Unable to run " + japeURL);
        // Flag the failure so that later calls exit early.
        gracefulExit = true;
      } finally {
        japeTransducer.setDocument(null);
      }

      // rename temporary annotations
      if (!debug) {
        // Each entry is {temporary type, configured output type}.
        String[][] renames = {
          {"tmpAnatomicalTerm", anatomyType},
          {"suffDisease", diseaseType},
          {"poDisease", diseaseType},
          {"preDisease", diseaseType},
          {"poProcedure", procedureType},
          {"poSymptom", symptomType},
          {"poTest", testType}
        };
        for (String[] rename : renames) {
          renameAnnotations(outputAS, rename[0], rename[1]);
        }
      }
    }

    // want list of disease key words plus symptoms such as oedema? or just diseases

    fireProcessFinished();
  } // end execute()
コード例 #10
0
  /**
   * The main entry point. First we parse the command line options (see usage() method for details),
   * then every file returned by {@code getFilesFromDir(inputDir)} is loaded, processed using the
   * saved application and written as XML to {@code <outputDir>/<inputFileName>.out.xml}.
   *
   * <p>When {@code annotTypesToWrite} is set, only annotations of those types from the
   * annotation set named "Output" are written; otherwise the whole document is written as
   * GateXML.
   *
   * @param args command line arguments, handed to {@code parseCommandLine}
   * @throws Exception if initialisation, processing or writing fails
   */
  public static void main(String[] args) throws Exception {
    parseCommandLine(args);

    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.
    ArrayList<String> files = getFilesFromDir(inputDir);
    gate.Corpus corpus = createCorpus(files);
    application.setCorpus(corpus);

    System.out.println("Processing " + files.size() + " files");

    // process the files one by one
    for (int i = 0; i < files.size(); i++) {

      // load the document (using the specified encoding if one was given)
      File docFile = new File(files.get(i));
      System.out.print("Processing document " + docFile + " (" + i + ") ...");
      // File.toURL() is deprecated and does not escape characters that are illegal in URLs
      // (e.g. spaces in a path); going through toURI() does.
      Document doc = Factory.newDocument(docFile.toURI().toURL(), encoding);

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      application.execute();

      // remove the document from the corpus again
      corpus.clear();

      String docXMLString = null;
      // if we want to just write out specific annotation types, we must
      // extract the annotations into a Set
      if (annotTypesToWrite != null) {
        // Create a temporary Set to hold the annotations we wish to write out
        Set<gate.Annotation> annotationsToWrite = new HashSet<gate.Annotation>();

        // we only extract annotations from the annotation set named "Output"
        AnnotationSet outputAnnots = doc.getAnnotations("Output");
        for (Object annotType : annotTypesToWrite) {
          // extract all the annotations of each requested type and add them to
          // the temporary set
          AnnotationSet annotsOfThisType = outputAnnots.get((String) annotType);
          if (annotsOfThisType != null) {
            annotationsToWrite.addAll(annotsOfThisType);
          }
        }

        // create the XML string using these annotations
        docXMLString = doc.toXml(annotationsToWrite, true);
      }
      // otherwise, just write out the whole document as GateXML
      else {
        docXMLString = doc.toXml();
      }

      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);

      // output the XML to <outputDir>/<inputFile>.out.xml
      System.out.println("Writing file " + docFile.getName());
      String outputFileName = docFile.getName() + ".out.xml";
      File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);

      // Write output files using the same encoding as the original
      FileOutputStream fos = new FileOutputStream(outputFile);
      try {
        BufferedOutputStream bos = new BufferedOutputStream(fos);
        OutputStreamWriter out;
        if (encoding == null) {
          out = new OutputStreamWriter(bos);
        } else {
          out = new OutputStreamWriter(bos, encoding);
        }

        out.write(docXMLString);

        // Closing the writer flushes it and the buffered stream underneath.
        out.close();
      } finally {
        // Make sure the file handle is released even if creating the writer or writing fails;
        // closing an already closed stream has no effect.
        fos.close();
      }
      System.out.println("done");
    } // for each file

    System.out.println("All done");
  } // void main(String[] args)
コード例 #11
0
  private RetObj ProcessRecords() throws Exception {
    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    this.application.setCorpus(corpus);

    // object for returned data
    List<String> processedlines = new ArrayList<String>();
    List<String> processedText = new ArrayList<String>();

    for (int record_num = 0; record_num < this.recs.size(); ++record_num) {
      /*if( record_num % Math.ceil(((double) this.recs.size())/10.0) == 0)
           System.out.println("Thread " + this.threadID + ": "+ ((int) ((double)record_num)/((double) this.recs.size())*100.0 ) +"% complete.");
      */

      // first, split title from body and get embedded age in title..
      String title_age = "-1";
      String sep = "..THIS IS MY SEPARATION STRING..";
      String title = "";
      String body = this.recs.get(record_num);
      Boolean trimmed = false;
      int age_end = body.indexOf(",>           ");
      if (age_end >= 0 && age_end < body.length()) {
        int age_start = body.lastIndexOf("-", age_end);
        if (age_start >= 0 && age_start < age_end) {
          title_age = body.substring(age_start + 1, age_end).trim();
          if (!isInteger(title_age)) title_age = "-1";
          else {
            title = body.substring(0, age_start);
            body = body.substring(age_end + 2, body.length());
            body = title + sep + body;
            trimmed = true;
          }
        }
        if (!trimmed) {
          title = body.substring(0, age_end);
          body = body.substring(age_end + 2, body.length());
          body = title + sep + body;
          trimmed = true;
        }
      }
      // --------------------

      org.jsoup.nodes.Document htmldoc =
          Jsoup.parseBodyFragment(body.replaceAll("COMMA_GOES_HERE", ","));
      Elements links = htmldoc.select("a[href]");
      Elements media = htmldoc.select("[src]");
      Elements imports = htmldoc.select("link[href]");

      processedText.add(htmldoc.text().replace(sep, " "));
      Document doc = Factory.newDocument(htmldoc.text());

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      this.application.execute();

      // remove the document from the corpus again
      corpus.clear();

      // extract annotations
      String line = "";
      AnnotationSet Annots = doc.getAnnotations("");

      Integer FirstPersonCount = 0, ThirdPersonCount = 0;
      AnnotationSet FirstPerson = Annots.get("FirstPerson");
      if (FirstPerson != null) FirstPersonCount = FirstPerson.size();
      AnnotationSet ThirdPerson = Annots.get("ThirdPerson");
      if (ThirdPerson != null) ThirdPersonCount = ThirdPerson.size();
      line += FirstPersonCount.toString() + "," + ThirdPersonCount.toString() + ",";

      AnnotationSet Names = Annots.get("Name");
      if (Names == null || Names.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Names.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("name");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Age = Annots.get("Age");
      if (Age == null || Age.size() < 1) line += title_age + ",";
      else {
        Iterator<Annotation> Iter = Age.inDocumentOrder().iterator();
        line += title_age + ";";
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("age");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Cost = Annots.get("Cost");
      if (Cost == null || Cost.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Cost.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_type");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet height = Annots.get("height");
      if (height == null || height.size() < 1) line += ",,";
      else {
        String ft = "";
        String inch = "";
        Iterator<Annotation> Iter = height.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("feet");
          if (Feat != null) ft += Feat.toString();
          else ft += "none";
          Feat = Ann.getFeatures().get("inches");
          if (Feat != null) inch += Feat.toString();
          else inch += "none";
          if (Iter.hasNext()) {
            ft += ";";
            inch += ";";
          }
        }
        line += ft + "," + inch + ",";
      }

      AnnotationSet weight = Annots.get("weight");
      if (weight == null || weight.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = weight.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("pounds");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet measurement = Annots.get("measurement");
      if (measurement == null || measurement.size() < 1) line += ",,,,";
      else {
        String cup = "";
        String chest = "";
        String waist = "";
        String hip = "";
        Iterator<Annotation> Iter = measurement.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("cup");
          if (Feat != null) cup += Feat.toString();
          else cup += "none";
          Feat = Ann.getFeatures().get("chest");
          if (Feat != null) chest += Feat.toString();
          else chest += "none";
          Feat = Ann.getFeatures().get("waist");
          if (Feat != null) waist += Feat.toString();
          else waist += "none";
          Feat = Ann.getFeatures().get("hip");
          if (Feat != null) hip += Feat.toString();
          else hip += "none";
          if (Iter.hasNext()) {
            cup += ";";
            chest += ";";
            waist += ";";
            hip += ";";
          }
        }
        line += cup + "," + chest + "," + waist + "," + hip + ",";
      }

      AnnotationSet Ethnicity = Annots.get("Ethnicity");
      if (Ethnicity == null || Ethnicity.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Ethnicity.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet SkinColor = Annots.get("SkinColor");
      if (SkinColor == null || SkinColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = SkinColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet EyeColor = Annots.get("EyeColor");
      if (EyeColor == null || EyeColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = EyeColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet HairColor = Annots.get("HairColor");
      if (HairColor == null || HairColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = HairColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Restriction = Annots.get("Restriction");
      if (Restriction == null || Restriction.size() < 1) line += ",,,";
      else {
        String type = "";
        String ethnicity = "";
        String age = "";
        Iterator<Annotation> Iter = Restriction.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("type");
          if (Feat != null) type += Feat.toString();
          else type += "none";
          Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            ethnicity += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else ethnicity += "none";
          Feat = Ann.getFeatures().get("age");
          if (Feat != null) age += Feat.toString();
          else age += "none";
          if (Iter.hasNext()) {
            type += ";";
            ethnicity += ";";
            age += ";";
          }
        }
        line += type + "," + ethnicity + "," + age + ",";
      }

      AnnotationSet Phone = Annots.get("PhoneNumber");
      if (Phone == null || Phone.size() < 1) line += ",,,";
      else {
        String value = "";
        String state = "";
        String city = "";
        Iterator<Annotation> Iter = Phone.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) value += Feat.toString();
          else value += "none";
          Feat = Ann.getFeatures().get("state");
          if (Feat != null) state += Feat.toString();
          else state += "none";
          Feat = Ann.getFeatures().get("area");
          if (Feat != null)
            city += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else city += "none";
          if (Iter.hasNext()) {
            value += ";";
            state += ";";
            city += ";";
          }
        }
        line += value + "," + state + "," + city + ",";
      }

      String Emails = "";
      AnnotationSet Email = Annots.get("Email");
      if (Email == null || Email.size() < 1) Emails = "";
      else {
        Iterator<Annotation> Iter = Email.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("email");
          if (Feat != null)
            Emails += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Emails +=
                href.substring(7, href.length()).replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (Emails.length() > 0 && Emails.substring(Emails.length() - 1, Emails.length()).equals(";"))
        Emails = Emails.substring(0, Emails.length() - 1);
      line += Emails + ",";

      String Urls = "";
      AnnotationSet Url = Annots.get("Url");
      if (Url == null || Url.size() < 1) Urls = "";
      else {
        Iterator<Annotation> Iter = Url.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("url");
          if (Feat != null)
            Urls += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() <= 7 || !href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (imports != null) {
        for (Element l : imports) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Urls.length() > 0 && Urls.substring(Urls.length() - 1, Urls.length()).equals(";"))
        Urls = Urls.substring(0, Urls.length() - 1);
      line += Urls + ",";

      String Medias = "";
      if (media != null) {
        for (Element l : media) {
          String src = l.attr("abs:src");
          if (src == null) continue;
          Medias += src.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Medias.length() > 0 && Medias.substring(Medias.length() - 1, Medias.length()).equals(";"))
        Medias = Medias.substring(0, Medias.length() - 1);
      line += Medias;

      processedlines.add(line);
      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);
    }
    Factory.deleteResource(corpus);

    RetObj out = new RetObj(processedlines, processedText);
    return out;
  }
コード例 #12
0
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p>Every URL is loaded as a GATE document, the ANNIE pipeline is run over the resulting corpus
   * and, for the n-th document, up to two files are written to the current working directory:
   *
   * <ul>
   *   <li>{@code StANNIE_n.HTML} - the preserved original content with every Person/Location
   *       annotation wrapped in a {@code <span>} element; only written when the document carries
   *       its original content as a feature;
   *   <li>{@code StANNIE_toXML_n.HTML} - the result of {@code doc.toXml(...)} for the same
   *       annotations.
   * </ul>
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   *
   * @param args the URLs of the documents to process
   * @throws GateException propagated from the GATE calls (initialisation, resource creation,
   *     pipeline execution)
   * @throws IOException if an argument is not a well-formed URL or an output file cannot be written
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (String arg : args) {
      URL u = new URL(arg);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      // Boolean.TRUE instead of the deprecated boxing constructor
      params.put("preserveOriginalContent", Boolean.TRUE);
      params.put("collectRepositioningInfo", Boolean.TRUE);
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    }

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, write the files with the person and location names marked up
    int count = 0;
    for (Object corpusMember : corpus) {
      Document doc = (Document) corpusMember;
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set<String> annotTypesRequired = new HashSet<String>();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null) {
        if (info != null) {
          Out.prln("OrigContent and reposInfo existing. Generate file...");
        } else {
          Out.prln("OrigContent existing. Generate file...");
        }
        // info may be null here: offsets are then used without repositioning
        writeTextFile(file, insertAnnotationTags(originalContent, peopleAndPlaces, info));
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      writeTextFile(new File("StANNIE_toXML_" + count + ".HTML"), xmlDocument);
    } // for each doc
  } // main

  /**
   * Wraps each of the given annotations in a {@code <span>} element inside a copy of the original
   * document content.
   *
   * <p>The annotations are first collected with {@code SortedAnnotationList.addSortedExclusive};
   * the sizes before and after that step are printed. NOTE(review): presumably that call drops
   * overlapping annotations - confirm against {@code SortedAnnotationList}.
   *
   * @param originalContent the preserved original text of the document
   * @param annotations the annotations to mark up
   * @param info maps annotation offsets back to positions in {@code originalContent}; when
   *     {@code null} the annotation offsets are used as they are
   * @return the content with the tags inserted
   */
  private static String insertAnnotationTags(
      String originalContent, Set<Annotation> annotations, RepositioningInfo info) {
    final String startTagPart_1 = "<span GateID=\"";
    final String startTagPart_2 = "\" title=\"";
    final String startTagPart_3 = "\" style=\"background:Red;\">";
    final String endTag = "</span>";

    SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
    for (Annotation annotation : annotations) {
      sortedAnnotations.addSortedExclusive(annotation);
    }

    StringBuilder editableContent = new StringBuilder(originalContent);
    Out.prln("Unsorted annotations count: " + annotations.size());
    Out.prln("Sorted annotations count: " + sortedAnnotations.size());

    // insert the tags backward so that positions of not yet handled annotations stay valid
    for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
      Annotation currAnnot = (Annotation) sortedAnnotations.get(i);
      long insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
      if (info != null) {
        insertPositionStart = info.getOriginalPos(insertPositionStart);
      }
      long insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
      if (info != null) {
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
      }
      // -1 marks a position that could not be mapped; such annotations are skipped
      if (insertPositionEnd != -1 && insertPositionStart != -1) {
        String startTag =
            startTagPart_1
                + currAnnot.getId().toString()
                + startTagPart_2
                + currAnnot.getType()
                + startTagPart_3;
        // end tag first: inserting the start tag afterwards does not shift the end position
        editableContent.insert((int) insertPositionEnd, endTag);
        editableContent.insert((int) insertPositionStart, startTag);
      }
    }
    return editableContent.toString();
  }

  /**
   * Writes {@code content} to {@code file}, replacing any existing file, and always closes the
   * writer - also when writing fails.
   *
   * @param file the file to (over)write
   * @param content the text to store
   * @throws IOException if the file cannot be opened, written or closed
   */
  private static void writeTextFile(File file, String content) throws IOException {
    FileWriter writer = new FileWriter(file);
    try {
      writer.write(content);
    } finally {
      writer.close();
    }
  }
コード例 #13
0
  /**
   * Moves or copies annotations from the input annotation set to the output annotation set.
   *
   * <p>The annotation types to transfer are given as {@code oldName} or {@code oldName=newName}
   * entries, read from {@code configURL} when it is set (one entry per non-blank line) and from
   * {@code annotationTypes} otherwise. When no entry is configured, all annotations of the input
   * set are candidates. If {@code textTagName} is set, only candidates contained in an annotation
   * of that type (looked up in the tag set) are transferred; if no such annotation exists, the
   * candidates are transferred only when {@code transferAllUnlessFound} is true.
   *
   * <p>Unless {@code copyAnnotations} is true, the transferred annotations are removed from the
   * input set.
   *
   * @throws ExecutionException if the mapping configuration cannot be read from {@code configURL}
   */
  @Override
  public void execute() throws ExecutionException {
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);
    AnnotationSet tagAS = document.getAnnotations(tagASName);

    // passed on to transferAnnotations; true only when copying within one and the same set
    boolean newID = copyAnnotations && inputAS.equals(outputAS);

    mappings.clear();

    if (configURL != null) {
      BufferedReader in = null;
      try {
        in = new BomStrippingInputStreamReader(configURL.openStream());
        for (String line = in.readLine(); line != null; line = in.readLine()) {
          if (!line.trim().equals("")) {
            registerMapping(line);
          }
        }
      } catch (IOException ioe) {
        // Do not carry on with an empty or partial mapping table: an empty table means
        // "transfer everything", which would move annotations the configuration never named.
        throw new ExecutionException(ioe);
      } finally {
        IOUtils.closeQuietly(in);
      }
    } else if (annotationTypes != null) {
      for (String type : annotationTypes) {
        registerMapping(type);
      }
    }
    // neither a URL nor a type list: mappings stays empty and everything is transferred

    AnnotationSet annotsToTransfer;
    if (mappings.size() > 0) {
      annotsToTransfer = inputAS.get(mappings.keySet());
    } else {
      // transfer everything
      annotsToTransfer = inputAS.get();
    }

    // none of the requested annotation types is present in the input set
    if (annotsToTransfer == null || annotsToTransfer.size() == 0) {
      return;
    }

    // no covering (e.g. BODY) annotation type requested: transfer all candidates
    if (textTagName == null || textTagName.equals("")) {
      // remove from input set unless we copy only
      if (!copyAnnotations) {
        inputAS.removeAll(annotsToTransfer);
      }
      transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);
      return;
    }

    // get the covering annotations
    bodyAnnotations = tagAS.get(textTagName);
    if (bodyAnnotations == null || bodyAnnotations.isEmpty()) {
      if (transferAllUnlessFound) {
        // remove from input set unless we copy only
        if (!copyAnnotations) {
          inputAS.removeAll(annotsToTransfer);
        }
        transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);
      }
      return;
    }

    // collect the candidates that lie within any of the covering annotations
    List<Annotation> annots2Move = new ArrayList<Annotation>();
    for (Annotation bodyAnn : bodyAnnotations) {
      Long start = bodyAnn.getStartNode().getOffset();
      Long end = bodyAnn.getEndNode().getOffset();
      annots2Move.addAll(annotsToTransfer.getContained(start, end));
    }
    if (!copyAnnotations) {
      inputAS.removeAll(annots2Move);
    }
    transferAnnotations(annots2Move, outputAS, newID);
  }

  /**
   * Parses one {@code oldName} or {@code oldName=newName} entry and stores it in {@code mappings}
   * under the trimmed old name; without an {@code =} the new name is {@code null}.
   *
   * @param spec the entry to parse
   */
  private void registerMapping(String spec) {
    String[] data = spec.split("=", 2);
    String oldName = data[0].trim();
    String newName = data.length == 2 ? data[1].trim() : null;
    mappings.put(oldName, new Mapping(oldName, newName));
  }