Esempio n. 1
0
  /** Test the default tokeniser */
  public void testHashGazetteer() throws Exception {
    // get a document
    Document doc =
        Factory.newDocument(new URL(TestDocument.getTestServerName() + "tests/doc0.html"));

    System.out.println(doc.getFeatures().get("gate.SourceURL"));

    // create a default gazetteer
    FeatureMap params = Factory.newFeatureMap();
    HashGazetteer gaz =
        (HashGazetteer) Factory.createResource("com.ontotext.gate.gazetteer.HashGazetteer", params);

    // runtime stuff
    gaz.setDocument(doc);
    gaz.setAnnotationSetName(GAZ_AS);
    gaz.execute();

    assertTrue(
        "the Annotation set resulting of the execution of the OntoText "
            + "Natural Gazetteer is empty.",
        !doc.getAnnotations(GAZ_AS).isEmpty());

    // check whether the annotations are as expected
    assertEquals("wrong number of lookup annotations found", 76, doc.getAnnotations(GAZ_AS).size());
  } // testHashGazetteer();
Esempio n. 2
0
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap parameters = Factory.newFeatureMap();
      parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
        Document lr =
            (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
        if (DEBUG) Out.prln("Loaded document :" + lr.getName());
        // change the result to the newly loaded doc
        res = lr;

        // finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex);
      }
    }

    return res;
  }
Esempio n. 3
0
 /**
  * Use a {@link SharedDefaultGazetteer} to duplicate this gazetteer by sharing the internal FSM
  * rather than re-loading the lists.
  */
 @Override
 public Resource duplicate(Factory.DuplicationContext ctx) throws ResourceInstantiationException {
   return Factory.createResource(
       SharedDefaultGazetteer.class.getName(),
       Utils.featureMap(SharedDefaultGazetteer.SDEF_GAZ_BOOTSTRAP_GAZETTEER_PROPERTY_NAME, this),
       Factory.duplicate(this.getFeatures(), ctx),
       this.getName());
 }
 /**
  * Loading the configurationg file and corpus for testing. And make settings as in the GATE Gui.
  */
 void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
     throws GateException, IOException {
   LogService.minVerbosityLevel = 0;
   if (LogService.minVerbosityLevel > 0)
     System.out.println("Learning Home : " + learningHome.getAbsolutePath());
   FeatureMap parameters = Factory.newFeatureMap();
   URL configFileURL = new File(configFileName).toURI().toURL();
   parameters.put("configFileURL", configFileURL);
   learningApi =
       (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);
   // Load the corpus
   corpus = Factory.newCorpus("DataSet");
   ExtensionFileFilter fileFilter = new ExtensionFileFilter();
   fileFilter.addExtension("xml");
   File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
   Arrays.sort(
       xmlFiles,
       new Comparator<File>() {
         public int compare(File a, File b) {
           return a.getName().compareTo(b.getName());
         }
       });
   for (File f : xmlFiles) {
     if (!f.isDirectory()) {
       Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
       doc.setName(f.getName());
       corpus.add(doc);
     }
   }
   //    URL tempURL = new File(corpusDirName).toURI().toURL();
   //    corpus.populate(tempURL, fileFilter, "UTF-8", false);
   // Set the inputAS
   learningApi.setInputASName(inputasN);
   learningApi.setOutputASName(outputasN);
   controller =
       (gate.creole.SerialAnalyserController)
           Factory.createResource("gate.creole.SerialAnalyserController");
   controller.setCorpus(corpus);
   controller.add(learningApi);
 }
 private static Document readDocument(String gateDocumentString)
     throws ResourceInstantiationException {
   Document gateDocument =
       (Document)
           Factory.createResource(
               "gate.corpora.DocumentImpl",
               Utils.featureMap(
                   "stringContent",
                   gateDocumentString,
                   "mimeType",
                   "text/xml",
                   "encoding",
                   "UTF-8"));
   return gateDocument;
 }
Esempio n. 6
0
  private gate.Document generateGATEDocFromLocalDump(BehemothDocument inputDoc)
      throws ResourceInstantiationException, IOException {

    // can't get that to work
    // File tempDirectory = new
    // File(this.config.get("hadoop.tmp.dir","/tmp"),this.config.get("user.name",
    // "./tmp"));
    // LOG.info("tempDirectory "+tempDirectory);
    //
    // tempDirectory.mkdirs();
    //
    // File tempInputFile = File.createTempFile("gateInput-",
    // inputDoc.getUrl(),tempDirectory);
    //
    // FileOutputStream fos = new FileOutputStream(tempInputFile);
    // OutputStream bout = new BufferedOutputStream(fos);
    // bout.write(inputDoc.getContent());
    // bout.flush();
    // bout.close();
    //
    // URL url;
    // try {
    // url = tempInputFile.toURI().toURL();
    // } catch (MalformedURLException e) {
    // // delete the input doc
    // tempInputFile.delete();
    // throw e;
    // }

    FeatureMap params = Factory.newFeatureMap();
    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, new String(inputDoc.getContent()));
    String ct = inputDoc.getContentType();
    if (ct != null) params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, ct);

    gate.Document gatedocument;
    try {
      gatedocument = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    } finally {
      // delete the input doc
      // tempInputFile.delete();
    }

    return gatedocument;
  }
  public static void main(String[] args) throws Exception {
    // Logger.getLogger(DocumentFeaturesDiff.class).setLevel(Level.ALL);

    GateUtils.initGateKeepLog();
    GateUtils.registerCzsemPlugin();

    ProcessingResource eval =
        new PRSetup.SinglePRSetup(LearningEvaluator.class)
            .putFeature("keyASName", ":-)")
            //				.putFeature("responseASName", "lemma_flex")
            .putFeature("responseASName", "flex")
            .putFeature("keyAnnotationsAreInDocumentFeatures", true)
            .putFeatureList("annotationTypes", "Lookup")
            .putFeatureList("featureNames", "meshID")
            .createPR();

    SerialAnalyserController controller =
        (SerialAnalyserController)
            Factory.createResource(SerialAnalyserController.class.getCanonicalName());

    controller.add(eval);

    Corpus corpus = Factory.newCorpus(null);
    corpus.populate(
        new File("C:\\Users\\dedek\\Desktop\\bmc\\experiment\\analyzed").toURI().toURL(),
        //				new File("C:\\Users\\dedek\\Desktop\\bmca_devel").toURI().toURL(),
        null,
        "utf8",
        false);

    System.err.println("populated");

    controller.setCorpus(corpus);

    controller.execute();
  }
  @Override
  public Resource init() throws ResourceInstantiationException {
    gracefulExit = false;

    if (configFileURL == null) {
      gracefulExit = true;
      gate.util.Err.println("No configuration file provided!");
    }

    if (japeURL == null) {
      gracefulExit = true;
      gate.util.Err.println("No JAPE grammar file provided!");
    }

    // create the init params for the JAPE transducer
    FeatureMap params = Factory.newFeatureMap();
    params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, japeURL);
    // Code borrowed from Mark Greenwood's Measurements PR
    if (japeTransducer == null) {
      // if this is the first time we are running init then actually create a
      // new transducer as we don't already have one
      FeatureMap hidden = Factory.newFeatureMap();
      Gate.setHiddenAttribute(hidden, true);
      japeTransducer =
          (Transducer) Factory.createResource("gate.creole.Transducer", params, hidden);
    } else {
      // we are being run through a call to reInit so simply re-init the
      // underlying JAPE transducer
      japeTransducer.setParameterValues(params);
      japeTransducer.reInit();
    }

    ConfigReader config = new ConfigReader(configFileURL);
    gracefulExit = config.config();

    try {
      HashMap<String, String> options = config.getOptions();

      patternMap = new HashMap<String, Pattern>();
      addSuffixPattern("disease_suffix", options);
      addWordPattern("disease_abbrevs", options);
      addWordPattern("disease_sense", options);
      addWordExtraPattern("disease_sense_context", options);
      addPossessiveWordPattern("disease_named_syndrome", options);
      addWordExtraPattern("disease_generic_context", options);
      addWordExtraPattern("disease_anatomy_context", options);
      addSuffixPluralPattern("procedure_suffix", options);
      addWordPluralPattern("procedure_key", options);
      addWordExtraPattern("procedure_anatomy_context", options);
      addWordPluralPattern("symptom_key", options);
      addWordPattern("test_key", options);

      addSuffixPattern("anatomy_suffix_adjective", options);
      addSuffixPattern("anatomy_suffix", options);
      addPrefixPattern("anatomy_prefix", options);
      addWordPattern("anatomy_position", options);
      addWordPluralPattern("anatomy_space_region_junction", options);
      addWordPattern("anatomy_part_adjective", options);
      addWordPattern("anatomy_latin_noun", options);
      addWordPattern("anatomy_muscle", options);
      addWordPluralPattern("anatomy_part", options);
      addWordPluralPattern("anatomy_fluid", options);

    } catch (NullPointerException ne) {
      gracefulExit = true;
      gate.util.Err.println(
          "Missing or unset configuration options. Please check configuration file.");
    }

    return this;
  } // end init()
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", new Boolean(true));
      params.put("collectRepositioningInfo", new Boolean(true));
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set annotTypesRequired = new HashSet();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null && info != null) {
        Out.prln("OrigContent and reposInfo existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionStart = info.getOriginalPos(insertPositionStart);
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } // if - should generate
      else if (originalContent != null) {
        Out.prln("OrigContent existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = new String("StANNIE_toXML_" + count + ".HTML");
      FileWriter writer = new FileWriter(fileName);
      writer.write(xmlDocument);
      writer.close();
    } // for each doc
  } // main