Beispiel #1
0
  @Override
  public void setIndexDefinition(IndexDefinition definition) {
    if (definition != null) {
      this.getFeatures().put(GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY, definition);

      String className = definition.getIrEngineClassName();
      try {
        // Class aClass = Class.forName(className);
        Class<?> aClass = Class.forName(className, true, Gate.getClassLoader());
        IREngine engine = (IREngine) aClass.newInstance();
        this.indexManager = engine.getIndexmanager();
        this.indexManager.setIndexDefinition(definition);
        this.indexManager.setCorpus(this);
      } catch (Exception e) {
        e.printStackTrace(Err.getPrintWriter());
      }
      // switch (definition.getIndexType()) {
      // case GateConstants.IR_LUCENE_INVFILE:
      // this.indexManager = new LuceneIndexManager();
      // this.indexManager.setIndexDefinition(definition);
      // this.indexManager.setCorpus(this);
      // break;
      // }
      this.addedDocs = new Vector<Document>();
      this.removedDocIDs = new Vector<String>();
      this.changedDocs = new Vector<Document>();
    }
  }
Beispiel #2
0
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap parameters = Factory.newFeatureMap();
      parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
        Document lr =
            (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
        if (DEBUG) Out.prln("Loaded document :" + lr.getName());
        // change the result to the newly loaded doc
        res = lr;

        // finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex);
      }
    }

    return res;
  }
Beispiel #3
0
  @Override
  public boolean add(Document o) {
    if (o == null) return false;
    Document doc = o;

    // make it accept only docs from its own datastore
    if (doc.getDataStore() != null && !this.dataStore.equals(doc.getDataStore())) {
      Err.prln("Error: Persistent corpus can only accept documents " + "from its own datastore!");
      return false;
    } // if

    // add the document with its index in the docDataList
    // in this case, since it's going to be added to the end
    // the index will be the size of the docDataList before
    // the addition
    DocumentData docData =
        new DocumentData(doc.getName(), doc.getLRPersistenceId(), doc.getClass().getName());
    boolean result = docDataList.add(docData);
    documents.add(doc);
    documentAdded(doc);
    fireDocumentAdded(
        new CorpusEvent(
            SerialCorpusImpl.this,
            doc,
            docDataList.size() - 1,
            doc.getLRPersistenceId(),
            CorpusEvent.DOCUMENT_ADDED));

    return result;
  }
Beispiel #4
0
  /**
   * This method is called when the SAX parser encounts the end of the XML document. Here we set the
   * content of the gate Document to be the one generated inside this class (tmpDocContent). After
   * that we use the colector to generate all the annotation reffering this new gate document.
   */
  @Override
  public void endDocument() throws org.xml.sax.SAXException {

    // replace the document content with the one without markups
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // fire the status listener
    fireStatusChangedEvent("Total elements: " + elements);

    // If basicAs is null then get the default AnnotationSet,
    // based on the gate document.
    if (basicAS == null) {
      basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }

    // sort colector ascending on its id
    Collections.sort(colector);
    Set<Integer> testIdsSet = new HashSet<Integer>();
    // create all the annotations (on this new document) from the collector
    while (!colector.isEmpty()) {
      CustomObject obj = colector.getFirst();
      // Test to see if there are two annotation objects with the same id.
      if (testIdsSet.contains(obj.getId())) {
        throw new GateSaxException(
            "Found two annotations with the same Id("
                + obj.getId()
                + ").The document is inconsistent.");
      } else {
        testIdsSet.add(obj.getId());
      } // End iff
      // create a new annotation and add it to the annotation set
      try {
        // the annotation type will be conforming with markupElementsMap
        // add the annotation to the Annotation Set
        if (markupElementsMap == null) {
          basicAS.add(obj.getId(), obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
        } else {
          // get the type of the annotation from Map
          String annotationType = markupElementsMap.get(obj.getElemName());
          if (annotationType != null) {
            basicAS.add(obj.getId(), obj.getStart(), obj.getEnd(), annotationType, obj.getFM());
          }
        } // End if
      } catch (gate.util.InvalidOffsetException e) {
        Err.prln(
            "InvalidOffsetException for annot :"
                + obj.getElemName()
                + " with Id ="
                + obj.getId()
                + ". Discarded...");
      } // End try
      colector.remove(obj);
    } // End while
  } // endDocument();
  /**
   * @param inputAS input annotation set
   * @param outputAS output annotation set
   * @param term String matched
   * @param startOffset match start offset
   * @param endOffset match end offset
   */
  private void addLookup(
      AnnotationSet inputAS,
      AnnotationSet outputAS,
      String term,
      String outputASType,
      Long startOffset,
      Long endOffset,
      boolean useNounChunk) {
    if (useNounChunk && nounChunkType != null && !nounChunkType.isEmpty()) {
      AnnotationSet nounChunkAS = inputAS.getCovering(nounChunkType, startOffset, endOffset);
      if (!nounChunkAS.isEmpty()) {
        startOffset = nounChunkAS.firstNode().getOffset();
        endOffset = nounChunkAS.lastNode().getOffset();
      }
    }
    try {
      AnnotationSet diseaseAS = inputAS.get(outputASType, startOffset, endOffset);
      if (diseaseAS.isEmpty()) {
        FeatureMap fm = Factory.newFeatureMap();
        fm.put("match", term);
        outputAS.add(startOffset, endOffset, outputASType, fm);
      } else {
        Annotation disease = diseaseAS.iterator().next();
        FeatureMap fm = disease.getFeatures();
        String meta = (String) fm.get("match");
        if (meta != null) {
          meta = meta + " " + term;
        }
        fm.put("match", meta);
      }

    } catch (InvalidOffsetException ie) {
      // shouldn't happen
      gate.util.Err.println(ie);
    }
  }
 /* Set gracefulExit flag and clean up */
 private void gracefulExit(String msg) {
   gate.util.Err.println(msg);
   cleanup();
   fireProcessFinished();
 }
  @Override
  public void execute() throws ExecutionException {
    interrupted = false;

    // quit if setup failed
    if (gracefulExit) {
      gracefulExit("Plugin was not initialised correctly. Exiting gracefully ... ");
      return;
    }

    AnnotationSet inputAS =
        (inputASName == null || inputASName.trim().length() == 0)
            ? document.getAnnotations()
            : document.getAnnotations(inputASName);
    AnnotationSet outputAS =
        (outputASName == null || outputASName.trim().length() == 0)
            ? document.getAnnotations()
            : document.getAnnotations(outputASName);

    AnnotationSet sentenceAS = null;
    if (sentenceType != null && !sentenceType.isEmpty()) {
      sentenceAS = inputAS.get(sentenceType);
    }

    // Document content
    String docContent = document.getContent().toString();
    int docLen = docContent.length();

    // For matching purposes replace all whitespace characters with a single space
    docContent = docContent.replaceAll("[\\s\\xA0\\u2007\\u202F]", " ");

    fireStatusChanged("Locating anatomy, disease and procedure mentions in " + document.getName());
    fireProgressChanged(0);

    if (sentenceAS != null) {
      for (Annotation sentence : sentenceAS) {
        Long sentStartOffset = sentence.getStartNode().getOffset();
        Long sentEndOffset = sentence.getEndNode().getOffset();

        // Converting the sentence to lower case prevents the need to use case-insenstive regex
        // matching, which should give a small performance boost
        String sentenceContent =
            docContent
                .substring(sentStartOffset.intValue(), sentEndOffset.intValue())
                .toLowerCase(Locale.ENGLISH);

        if (diseaseType != null && !diseaseType.isEmpty()) {
          doMatch(
              patternMap.get("disease_suffix"),
              sentenceContent,
              inputAS,
              outputAS,
              "suffDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_abbrevs"),
              sentenceContent,
              inputAS,
              outputAS,
              "preDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_named_syndrome"),
              sentenceContent,
              inputAS,
              outputAS,
              "namedDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_sense"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpDiseaseSense",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_sense_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpDiseaseSenseContext",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_generic_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "poDisease",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("disease_anatomy_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpDisease",
              sentStartOffset,
              docLen);
        }

        if (procedureType != null && !procedureType.isEmpty()) {
          doMatch(
              patternMap.get("procedure_suffix"),
              sentenceContent,
              inputAS,
              outputAS,
              "poProcedure",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("procedure_key"),
              sentenceContent,
              inputAS,
              outputAS,
              "poProcedure",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("procedure_anatomy_context"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpProcedure",
              sentStartOffset,
              docLen);
        }

        if (symptomType != null && !symptomType.isEmpty()) {
          doMatch(
              patternMap.get("symptom_key"),
              sentenceContent,
              inputAS,
              outputAS,
              "poSymptom",
              sentStartOffset,
              docLen);
        }

        if (testType != null && !testType.isEmpty()) {
          doMatch(
              patternMap.get("test_key"),
              sentenceContent,
              inputAS,
              outputAS,
              "poTest",
              sentStartOffset,
              docLen);
        }

        if (anatomyType != null && !anatomyType.isEmpty()) {
          doMatch(
              patternMap.get("anatomy_suffix_adjective"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatSuffAdj",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_suffix"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatSuff",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_prefix"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatPre",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_position"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatPos",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_space_region_junction"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatSpace",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_part_adjective"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatAdj",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_latin_noun"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatLatin",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_muscle"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatMuscle",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_part"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatPart",
              sentStartOffset,
              docLen);
          doMatch(
              patternMap.get("anatomy_fluid"),
              sentenceContent,
              inputAS,
              outputAS,
              "tmpAnatFluid",
              sentStartOffset,
              docLen);
        }
      }
      // Run JAPE transducer to clean up the output
      fireStatusChanged(
          "Processing anatomical, disease and procedure mentions in " + document.getName());
      try {
        japeTransducer.setDocument(document);
        japeTransducer.setInputASName(inputASName);
        japeTransducer.setOutputASName(outputASName);
        japeTransducer.addProgressListener(this);
        japeTransducer.execute();
      } catch (ExecutionException re) {
        gate.util.Err.println("Unable to run " + japeURL);
        gracefulExit = true;
      } finally {
        japeTransducer.setDocument(null);
      }
      // rename temporary annotations
      if (!debug) {
        renameAnnotations(outputAS, "tmpAnatomicalTerm", anatomyType);
        renameAnnotations(outputAS, "suffDisease", diseaseType);
        renameAnnotations(outputAS, "poDisease", diseaseType);
        renameAnnotations(outputAS, "preDisease", diseaseType);
        renameAnnotations(outputAS, "poProcedure", procedureType);
        renameAnnotations(outputAS, "poSymptom", symptomType);
        renameAnnotations(outputAS, "poTest", testType);
      }
    } else {
      gracefulExit("No sentences to process!");
    }

    // want list of disease key words plus symptoms such as oedema? or just diseases

    fireProcessFinished();
  } // end execute()
  @Override
  public Resource init() throws ResourceInstantiationException {
    gracefulExit = false;

    if (configFileURL == null) {
      gracefulExit = true;
      gate.util.Err.println("No configuration file provided!");
    }

    if (japeURL == null) {
      gracefulExit = true;
      gate.util.Err.println("No JAPE grammar file provided!");
    }

    // create the init params for the JAPE transducer
    FeatureMap params = Factory.newFeatureMap();
    params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, japeURL);
    // Code borrowed from Mark Greenwood's Measurements PR
    if (japeTransducer == null) {
      // if this is the first time we are running init then actually create a
      // new transducer as we don't already have one
      FeatureMap hidden = Factory.newFeatureMap();
      Gate.setHiddenAttribute(hidden, true);
      japeTransducer =
          (Transducer) Factory.createResource("gate.creole.Transducer", params, hidden);
    } else {
      // we are being run through a call to reInit so simply re-init the
      // underlying JAPE transducer
      japeTransducer.setParameterValues(params);
      japeTransducer.reInit();
    }

    ConfigReader config = new ConfigReader(configFileURL);
    gracefulExit = config.config();

    try {
      HashMap<String, String> options = config.getOptions();

      patternMap = new HashMap<String, Pattern>();
      addSuffixPattern("disease_suffix", options);
      addWordPattern("disease_abbrevs", options);
      addWordPattern("disease_sense", options);
      addWordExtraPattern("disease_sense_context", options);
      addPossessiveWordPattern("disease_named_syndrome", options);
      addWordExtraPattern("disease_generic_context", options);
      addWordExtraPattern("disease_anatomy_context", options);
      addSuffixPluralPattern("procedure_suffix", options);
      addWordPluralPattern("procedure_key", options);
      addWordExtraPattern("procedure_anatomy_context", options);
      addWordPluralPattern("symptom_key", options);
      addWordPattern("test_key", options);

      addSuffixPattern("anatomy_suffix_adjective", options);
      addSuffixPattern("anatomy_suffix", options);
      addPrefixPattern("anatomy_prefix", options);
      addWordPattern("anatomy_position", options);
      addWordPluralPattern("anatomy_space_region_junction", options);
      addWordPattern("anatomy_part_adjective", options);
      addWordPattern("anatomy_latin_noun", options);
      addWordPattern("anatomy_muscle", options);
      addWordPluralPattern("anatomy_part", options);
      addWordPluralPattern("anatomy_fluid", options);

    } catch (NullPointerException ne) {
      gracefulExit = true;
      gate.util.Err.println(
          "Missing or unset configuration options. Please check configuration file.");
    }

    return this;
  } // end init()
Beispiel #9
0
  /**
   * This method is called when the HTML parser encounts the end of a tag that means that the tag is
   * paired by a beginning tag
   */
  @Override
  public void handleEndTag(HTML.Tag t, int pos) {
    // obj is for internal use
    CustomObject obj = null;

    // end of STYLE tag
    if (HTML.Tag.STYLE.equals(t)) {
      isInsideStyleTag = false;
    } // if

    // If the stack is not empty then we get the object from the stack
    if (!stack.isEmpty()) {
      obj = stack.pop();
      // Before adding it to the colector, we need to check if is an
      // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
      if (obj.getStart().equals(obj.getEnd())) {
        // The element had an end tag and its start was equal to its end. Hence
        // it is anEmptyAndSpan one.
        obj.getFM().put("isEmptyAndSpan", "true");
      } // End iff
      // we add it to the colector
      colector.add(obj);
    } // End if

    // If element has text between, then customize its apearance
    if (obj != null && obj.getStart().longValue() != obj.getEnd().longValue())
      // Customize the appearance of the document
      customizeAppearanceOfDocumentWithEndTag(t);

    // if t is the </HTML> tag then we reached the end of theHTMLdocument
    if (t == HTML.Tag.HTML) {
      // replace the old content with the new one
      doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

      // If basicAs is null then get the default annotation
      // set from this gate document
      if (basicAS == null)
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);

      // sort colector ascending on its id
      Collections.sort(colector);
      // iterate through colector and construct annotations
      while (!colector.isEmpty()) {
        obj = colector.getFirst();
        colector.remove(obj);
        // Construct an annotation from this obj
        try {
          if (markupElementsMap == null) {
            basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
          } else {
            String annotationType = markupElementsMap.get(obj.getElemName());
            if (annotationType != null)
              basicAS.add(obj.getStart(), obj.getEnd(), annotationType, obj.getFM());
          }
        } catch (InvalidOffsetException e) {
          Err.prln("Error creating an annot :" + obj + " Discarded...");
        } // end try
        //        }// end if
      } // while

      // notify the listener about the total amount of elements that
      // has been processed
      fireStatusChangedEvent("Total elements : " + elements);
    } // else
  } // handleEndTag