/**
   * Detects the first line-break sequence in the document content and records it as the
   * {@code GateConstants.DOCUMENT_NEW_LINE_TYPE} document feature. <br>
   * Possible values are CRLF, LFCR, CR, LF, or the empty string when the content contains no
   * line break at all.
   *
   * @param doc the document whose content is inspected and whose features are updated
   */
  protected void setNewLineProperty(Document doc) {
    String content = doc.getContent().toString();
    String newLineType = "";

    char lastch = ' ';
    for (int i = 0; i < content.length(); ++i) {
      char ch = content.charAt(i);
      // A line-break character is classified one step late, once the character after it is known.
      if (lastch == '\r') {
        newLineType = (ch == '\n') ? "CRLF" : "CR";
        break;
      }
      if (lastch == '\n') {
        newLineType = (ch == '\r') ? "LFCR" : "LF";
        break;
      }
      lastch = ch;
    } // for

    // The loop only decides when it sees the character FOLLOWING a line break, so a break that
    // is the very last character of the content would otherwise go unreported.
    if (newLineType.isEmpty()) {
      if (lastch == '\r') {
        newLineType = "CR";
      } else if (lastch == '\n') {
        newLineType = "LF";
      }
    }

    doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
  } // setNewLineProperty()
Пример #2
0
 /**
  * Unloads a document from memory.
  *
  * @param index the index of the document to be unloaded.
  * @param sync should the document be sync'ed (i.e. saved) before unloading.
  * @throws GateRuntimeException if {@code sync} is requested and the datastore reports a
  *     {@code PersistenceException}; the document is then left in memory.
  */
 public void unloadDocument(int index, boolean sync) {
   // 1. a persistent document that is not loaded needs no work at all
   if ((!isDocumentLoaded(index)) && isPersistentDocument(index)) return;

   // 2. if requested, sync the document before releasing it from memory,
   // because the creole register garbage collects all LRs which are not
   // used any more
   if (sync) {
     Document doc = documents.get(index);
     try {
       if (doc.getLRPersistenceId() == null) {
         // not adopted yet: adopt first, then sync and remember the new persistent ID
         doc = (Document) this.getDataStore().adopt(doc);
         this.getDataStore().sync(doc);
         this.setDocumentPersistentID(index, doc.getLRPersistenceId());
       } else {
         // already adopted, a plain sync is enough
         this.getDataStore().sync(doc);
       }
     } catch (PersistenceException ex) {
       // note the trailing space after "corpus": the two fragments used to run together
       throw new GateRuntimeException(
           "Error unloading document from corpus "
               + "because document sync failed: "
               + ex.getMessage(),
           ex);
     }
   }

   // 3. remove the document from memory; only reached when the saving has succeeded
   documents.set(index, null);
 }
Пример #3
0
  /**
   * Runs the GATE application over a single Behemoth document and returns the resulting GATE
   * document serialised with {@code toXml()}.
   *
   * <p>The GATE document is always removed from the corpus and deleted afterwards, whether or not
   * processing succeeded.
   *
   * @param inputDoc the Behemoth document to process
   * @param reporter optional progress/counter sink; may be {@code null}
   * @return the XML of the processed GATE document, or {@code null} if any exception was thrown
   *     while building or processing it (the exception is logged and counted, not rethrown)
   */
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    // The URL is a String that may be null; string concatenation / String.valueOf render it as
    // "null" instead of throwing the way getUrl().toString() did.
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl());

    gate.Document gatedocument = null;
    try {
      // a) create a GATE document from the Behemoth one
      gatedocument = generateGATEDoc(inputDoc);
      // b) add it to the current corpus and hand the corpus to the application
      corpus.add(gatedocument);
      this.GATEapplication.setCorpus(corpus);
      // c) process it with GATE
      this.GATEapplication.execute();

      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      // Must not throw from here, otherwise the original failure would be lost.
      LOG.error(String.valueOf(inputDoc.getUrl()), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
Пример #4
0
  /**
   * Annotation remove event: when an annotation is removed from this document, removes the
   * original annotation it was derived from in the first member document that has offset details
   * for it.
   *
   * @param ase the removal event; ignored when the listener is disabled or the event's source
   *     document is not this document
   */
  public void annotationRemoved(AnnotationSetEvent ase) {
    if (disableListener || ase.getSourceDocument() != this) {
      return;
    }

    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    // a null name selects the member document's unnamed annotation set below
    String asName = as.getName();

    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);

      // find out the details which refer to the deleted annotation
      OffsetDetails od = getOffsetDetails(docID, asName, annot);
      if (od == null) continue;

      AnnotationSet memberSet =
          (asName == null) ? aDoc.getAnnotations() : aDoc.getAnnotations(asName);
      memberSet.remove(od.getOriginalAnnotation());
      removeOffsetDetails(docID, od);
      // only the first matching member document is updated
      break;
    }
  }
  /**
   * Copies the {@code Token} annotations of the "Tokenization" annotation set into the annotation
   * set named "" and then removes the "Tokenization" set from the document.
   *
   * <p>Each copy keeps the original start/end nodes and type, and carries three features:
   * "string" (from "string"), "root" (from "lemma") and "category" (from "POS").
   */
  public void tokenize() {
    AnnotationSet sourceSet = gateDocument.getAnnotations("Tokenization");
    AnnotationSet targetSet = gateDocument.getAnnotations("");

    for (Annotation candidate : sourceSet) {
      FeatureMap sourceFeatures = candidate.getFeatures();
      FeatureMap copiedFeatures = Factory.newFeatureMap();

      // anything that is not a Token (case-insensitively) is skipped
      boolean isToken = "Token".compareToIgnoreCase(candidate.getType()) == 0;
      if (!isToken) {
        continue;
      }

      copiedFeatures.put("string", sourceFeatures.get("string"));
      copiedFeatures.put("root", sourceFeatures.get("lemma"));
      copiedFeatures.put("category", sourceFeatures.get("POS"));

      targetSet.add(
          candidate.getStartNode(), candidate.getEndNode(), candidate.getType(), copiedFeatures);
    }

    gateDocument.removeAnnotationSet("Tokenization");
  }
Пример #6
0
  /**
   * Runs the HashGazetteer over the {@code tests/doc0.html} test document and checks that it
   * produces the expected number of annotations in the {@code GAZ_AS} annotation set.
   */
  public void testHashGazetteer() throws Exception {
    // load the test document from the test server
    String docLocation = TestDocument.getTestServerName() + "tests/doc0.html";
    Document doc = Factory.newDocument(new URL(docLocation));

    System.out.println(doc.getFeatures().get("gate.SourceURL"));

    // instantiate the gazetteer with an empty parameter map
    HashGazetteer gaz =
        (HashGazetteer)
            Factory.createResource(
                "com.ontotext.gate.gazetteer.HashGazetteer", Factory.newFeatureMap());

    // point it at the document and the output annotation set, then run it
    gaz.setDocument(doc);
    gaz.setAnnotationSetName(GAZ_AS);
    gaz.execute();

    // the run must have produced something...
    boolean producedAnnotations = !doc.getAnnotations(GAZ_AS).isEmpty();
    assertTrue(
        "the Annotation set resulting of the execution of the OntoText "
            + "Natural Gazetteer is empty.",
        producedAnnotations);

    // ...and exactly the expected amount
    int lookupCount = doc.getAnnotations(GAZ_AS).size();
    assertEquals("wrong number of lookup annotations found", 76, lookupCount);
  } // testHashGazetteer();
Пример #7
0
  /**
   * Returns the document at the given position, loading it from the datastore if it is not
   * currently held in memory.
   *
   * @param index position of the document in the corpus
   * @return the document, or {@code null} when {@code index} is at or beyond the number of known
   *     documents
   * @throws GateRuntimeException if the document has to be loaded and instantiation fails
   */
  @Override
  public Document get(int index) {
    if (index >= docDataList.size()) return null;

    Document res = documents.get(index);

    if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    // already in memory: nothing to load
    if (res != null) {
      return res;
    }

    // not in memory, so it has to be instantiated from the datastore
    FeatureMap parameters = Factory.newFeatureMap();
    parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
    try {
      parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
      res = (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
      if (DEBUG) Out.prln("Loaded document :" + res.getName());

      // remember the loaded instance so the next call finds it in memory
      documents.set(index, res);
    } catch (ResourceInstantiationException ex) {
      Err.prln("Error reading document inside a serialised corpus.");
      throw new GateRuntimeException(ex);
    }

    return res;
  }
 /**
  * Runs the controller over a one-document corpus built from the given text and collects the
  * "sym" feature of every annotation in the "StockSymbols" annotation set.
  *
  * <p>The temporary document and corpus are deleted before returning, also when processing
  * fails.
  *
  * @param str the text to turn into a document
  * @return the collected "sym" feature values
  * @throws Exception whatever document creation or the controller throws
  */
 @SuppressWarnings("unchecked")
 public Set<String> processDoc(String str) throws Exception {
   Set<String> symbols = new HashSet<String>();
   Corpus sampleCorpus = null;
   Document sampleDoc = null;
   try {
     sampleCorpus = Factory.newCorpus("sample");
     sampleDoc = Factory.newDocument(str);
     sampleCorpus.add(sampleDoc);

     controller.setCorpus(sampleCorpus);
     controller.execute();

     for (Annotation stock : sampleDoc.getAnnotations("StockSymbols")) {
       symbols.add((String) stock.getFeatures().get("sym"));
     }
   } finally {
     // release the document first, then the corpus that held it
     if (sampleDoc != null) {
       Factory.deleteResource(sampleDoc);
     }
     if (sampleCorpus != null) {
       Factory.deleteResource(sampleCorpus);
     }
   }
   return symbols;
 }
 /**
  * Unpack the markup in the document. This implementation records the document's new-line type
  * and creates paragraph annotations over the whole content in the
  * {@code GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME} annotation set.
  *
  * @param doc the document to process; nothing happens when it, or its content, is {@code null}
  * @throws DocumentFormatException declared by the method signature
  */
 public void unpackMarkup(Document doc) throws DocumentFormatException {
   if (doc == null) {
     return;
   }
   if (doc.getContent() == null) {
     return;
   }

   setNewLineProperty(doc);

   // paragraphs are annotated from the first to the last character of the content
   int contentLength = doc.getContent().toString().length();
   annotateParagraphs(doc, 0, contentLength, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
 } // unpackMarkup
Пример #10
0
  /**
   * Runs the saved Persian sentiment GATE application over the given text and returns the
   * detected sentiment as JSON.
   *
   * <p>The application is expected to store a map under the {@code Doc_sentiment} document
   * feature; its {@code sentiment} entry is copied into the response. All GATE resources created
   * here (document, corpus, application) are released before returning, also on failure.
   *
   * @param text the text to analyse
   * @return a JSON object with a single {@code sentiment} entry
   * @throws IllegalStateException if the application did not set the {@code Doc_sentiment}
   *     document feature
   * @throws Exception if GATE cannot be initialised, or the application cannot be loaded or run
   */
  public JSONObject persian_sentiment(String text) throws Exception {

    oncreate();

    // NOTE(review): machine-specific absolute path; should come from configuration.
    File persianGapp = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");

    // GATE must be initialised before any GATE API is used, but only once per JVM.
    if (!Gate.isInitialised()) {
      Gate.init();
    }

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(persianGapp);

    final String featureName = "Doc_sentiment";
    Corpus corpus = null;
    Document doc = null;
    Object sentimentFeature;
    try {
      // The string parameter to newCorpus() is simply the GATE-internal name of the corpus.
      corpus = Factory.newCorpus("BatchProcessApp Corpus");
      application.setCorpus(corpus);

      doc = Factory.newDocument(text);
      corpus.add(doc);

      application.execute();

      // read the result while the document is still alive
      sentimentFeature = doc.getFeatures().get(featureName);
    } finally {
      // release everything created above, whether or not processing succeeded
      if (corpus != null) {
        corpus.clear();
      }
      if (doc != null) {
        Factory.deleteResource(doc);
      }
      if (corpus != null) {
        Factory.deleteResource(corpus);
      }
      Factory.deleteResource(application);
    }

    if (sentimentFeature == null) {
      throw new IllegalStateException(
          "GATE application did not set the '" + featureName + "' document feature");
    }
    LinkedHashMap<?, ?> originalContent = (LinkedHashMap<?, ?>) sentimentFeature;
    String sentiment = (String) originalContent.get("sentiment");

    // create Json for response to user
    JSONObject response = new JSONObject();
    response.put("sentiment", sentiment);
    return response;
  }
Пример #11
0
  /**
   * Generation of a GATE document from a Behemoth one.
   *
   * <p>When the Behemoth document has no text, GATE is first asked to parse the binary content
   * itself; on success the extracted text is copied back into {@code inputDoc} and that GATE
   * document is returned as is. If that attempt fails (the failure is logged), or when text is
   * available, a GATE document is built from the text (empty if there is none) and the URL,
   * metadata and annotations of the Behemoth document are copied onto it.
   *
   * @param inputDoc the Behemoth document to convert
   * @return the newly created GATE document; the caller is responsible for deleting it
   * @throws ResourceInstantiationException if the GATE document cannot be created
   * @throws InvalidOffsetException if an annotation of {@code inputDoc} has offsets that are not
   *     valid for the GATE document
   * @throws IOException declared by the method signature
   */
  public gate.Document generateGATEDoc(BehemothDocument inputDoc)
      throws ResourceInstantiationException, InvalidOffsetException, IOException {

    // if no text is available (e.g. Tika has not extracted it)
    // let GATE do the parsing itself from the binary content
    if (inputDoc.getText() == null) {
      gate.Document parsedDoc = null;
      try {
        parsedDoc = generateGATEDocFromLocalDump(inputDoc);

        // transfer the text from GATE to Behemoth
        inputDoc.setText(parsedDoc.getContent().toString());

        return parsedDoc;
      } catch (Exception e) {
        LOG.error("Can't generate GATE doc from byte dump", e);
        // A document that was created but could not be used is replaced below; release it so
        // it is not left behind.
        if (parsedDoc != null) Factory.deleteResource(parsedDoc);
      }
    }

    // if the input document does not have any text -> create a doc with an empty text
    String text = inputDoc.getText();
    if (text == null) text = "";

    gate.Document gatedocument = Factory.newDocument(text);

    // If anything below throws, the caller never receives the document and so cannot delete it;
    // release it here in that case.
    boolean complete = false;
    try {
      // the metadata as document features
      FeatureMap docFeatures = gatedocument.getFeatures();
      String docUrl = inputDoc.getUrl();
      if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
      if (inputDoc.getMetadata() != null) {
        for (Entry<Writable, Writable> entry : inputDoc.getMetadata().entrySet()) {
          String skey = entry.getKey().toString().trim();
          String svalue = null;
          if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
          docFeatures.put(skey, svalue);
        }
      }

      // finally the annotations as original markups
      // TODO change the name of the annotation set via config
      AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
      for (Annotation annot : inputDoc.getAnnotations()) {
        // add to outputAS as a GATE annotation, with a copy of the features
        FeatureMap features = Factory.newFeatureMap();
        features.putAll(annot.getFeatures());
        outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
      }

      complete = true;
      return gatedocument;
    } finally {
      if (!complete) Factory.deleteResource(gatedocument);
    }
  }
Пример #12
0
  /**
   * Constructor initialises all the private memeber data
   *
   * @param aDocument The gate document that will be processed
   * @param aMarkupElementsMap The map containing the elements that will transform into annotations
   * @param anAnnotationSet The annotation set that will contain annotations resulted from the
   *     processing of the gate document
   */
  public HtmlDocumentHandler(
      gate.Document aDocument,
      Map<String, String> aMarkupElementsMap,
      gate.AnnotationSet anAnnotationSet) {
    // init stack
    stack = new Stack<CustomObject>();

    // this string contains the plain text (the text without markup)
    tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

    // colector is used later to transform all custom objects into
    // annotation objects
    colector = new LinkedList<CustomObject>();

    // the Gate document
    doc = aDocument;

    // this map contains the elements name that we want to create
    // if it's null all the elements from the XML documents will be transformed
    // into Gate annotation objects
    markupElementsMap = aMarkupElementsMap;

    // init an annotation set for this gate document
    basicAS = anAnnotationSet;

    customObjectsId = 0;
  } // HtmlDocumentHandler
  /**
   * Compares response annotations against gold-standard values stored as document features and
   * sums the per-feature results.
   *
   * <p>For each feature name the gold values are read from the document feature of that name or,
   * if absent, from the feature named with an extra trailing "s".
   *
   * @param document the document whose features hold the gold-standard value lists
   * @param featureNames the feature names to evaluate
   * @param responsesAnnotations the annotations to evaluate
   * @return a differ carrying the summed correct, missing and spurious counts
   */
  @SuppressWarnings("unchecked")
  public static AnnotationDiffer computeDiffWithDocFeatures(
      Document document, List<String> featureNames, AnnotationSet responsesAnnotations) {
    FeatureMap docFeatures = document.getFeatures();

    int totalCorrect = 0;
    int totalMissing = 0;
    int totalSpurious = 0;

    for (String featureName : featureNames) {
      // singular feature name first, plural form as the fallback
      List<String> goldValues = (List<String>) docFeatures.get(featureName);
      if (goldValues == null) {
        goldValues = (List<String>) docFeatures.get(featureName + "s");
      }

      AnnotationDiffer featureDiff =
          computeDiffWithGoldStandardDataForSingleFeature(
              featureName, Utils.setFromList(goldValues), responsesAnnotations);

      totalSpurious += featureDiff.getSpurious();
      totalCorrect += featureDiff.getCorrectMatches();
      totalMissing += featureDiff.getMissing();
    }

    return new AnnotationDifferDocumentFeaturesImpl(totalCorrect, totalMissing, totalSpurious);
  }
Пример #14
0
  /**
   * Constructs a XmlDocumentHandler object.
   *
   * @param aDocument the Gate document that will be processed.
   * @param aMarkupElementsMap this map contains the elements name that we want to create.
   * @param anElement2StringMap this map contains the strings that will be added to the text
   *     contained by the key element.
   * @param anAnnotationSet is the annotation set that will be filled when the document was
   *     processed
   */
  public XmlDocumentHandler(
      gate.Document aDocument,
      Map<String, String> aMarkupElementsMap,
      Map<String, String> anElement2StringMap,
      AnnotationSet anAnnotationSet) {
    // init parent
    super();
    // init stack
    stack = new Stack<CustomObject>();

    // this string contains the plain text (the text without markup)
    tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

    // colector is used later to transform all custom objects into annotation
    // objects
    colector = new LinkedList<CustomObject>();

    // the Gate document
    doc = aDocument;

    // this map contains the elements name that we want to create
    // if it's null all the elements from the XML documents will be transformed
    // into Gate annotation objects
    markupElementsMap = aMarkupElementsMap;

    // this map contains the string that we want to insert iside the document
    // content, when a certain element is found
    // if the map is null then no string is added
    element2StringMap = anElement2StringMap;

    basicAS = anAnnotationSet;
    customObjectsId = 0;
  } // XmlDocumentHandler()/
Пример #15
0
 /**
  * Pushes the pending document changes to the index manager, if there is one.
  *
  * <p>Every document currently in memory that is modified and not already in {@code addedDocs}
  * is first recorded in {@code changedDocs}; then the added, removed and changed sets are handed
  * to {@code indexManager.sync}. An {@code IndexException} is printed, not rethrown.
  */
 private void thisResourceWritten() {
   if (indexManager == null) {
     return;
   }
   try {
     for (int pos = 0; pos < documents.size(); pos++) {
       // slots of unloaded documents hold null and are skipped
       if (documents.get(pos) == null) {
         continue;
       }
       Document loaded = documents.get(pos);
       boolean newlyAdded = addedDocs.contains(loaded);
       if (!newlyAdded && loaded.isModified()) {
         changedDocs.add(loaded);
       }
     }
     indexManager.sync(addedDocs, removedDocIDs, changedDocs);
   } catch (IndexException ie) {
     ie.printStackTrace();
   }
 }
  /**
   * Adds a "Sentence" annotation to the annotation set named "" for every annotation in the
   * "SentenceDetection" set, reusing its start and end nodes and passing no features, and then
   * removes the "SentenceDetection" set from the document.
   */
  public void splitter() {
    AnnotationSet detectedSentences = gateDocument.getAnnotations("SentenceDetection");
    AnnotationSet targetSet = gateDocument.getAnnotations("");

    for (Annotation detected : detectedSentences) {
      targetSet.add(detected.getStartNode(), detected.getEndNode(), "Sentence", null);
    }

    gateDocument.removeAnnotationSet("SentenceDetection");
  }
Пример #17
0
  /**
   * Appends a document to the end of the corpus and notifies listeners.
   *
   * @param o the document to add
   * @return {@code false} when {@code o} is {@code null} or belongs to a different datastore
   *     (an error line is printed in that case); otherwise the result of adding its entry to
   *     {@code docDataList}
   */
  @Override
  public boolean add(Document o) {
    if (o == null) return false;

    // documents that already live in a datastore are only accepted from our own one
    boolean fromForeignDataStore =
        o.getDataStore() != null && !this.dataStore.equals(o.getDataStore());
    if (fromForeignDataStore) {
      Err.prln("Error: Persistent corpus can only accept documents from its own datastore!");
      return false;
    }

    // record the document's name, persistent ID and class, then keep the instance itself
    DocumentData entry =
        new DocumentData(o.getName(), o.getLRPersistenceId(), o.getClass().getName());
    boolean added = docDataList.add(entry);
    documents.add(o);
    documentAdded(o);

    // the document went to the end, so its index is the last position of docDataList
    int newIndex = docDataList.size() - 1;
    CorpusEvent addedEvent =
        new CorpusEvent(
            SerialCorpusImpl.this,
            o,
            newIndex,
            o.getLRPersistenceId(),
            CorpusEvent.DOCUMENT_ADDED);
    fireDocumentAdded(addedEvent);

    return added;
  }
Пример #18
0
 /**
  * Reacts to a resource being unloaded. Only documents are handled: a document whose datastore
  * is not this corpus' datastore is removed from the corpus, while a document from the same
  * datastore only has its in-memory slot cleared.
  *
  * @param e the event carrying the unloaded resource
  */
 @Override
 public void resourceUnloaded(CreoleEvent e) {
   Resource res = e.getResource();
   if (!(res instanceof Document)) {
     return;
   }
   Document doc = (Document) res;
   if (DEBUG) Out.prln("resource Unloaded called ");

   // different datastore: drop it from the corpus altogether
   if (doc.getDataStore() != this.getDataStore()) {
     this.remove(doc);
     return;
   }

   // same datastore: keep the entry, just forget the in-memory instance
   int index = indexOf(res);
   if (index < 0) return;
   documents.set(index, null);
   if (DEBUG) Out.prln("corpus: document " + index + " unloaded and set to null");
 }
  /**
   * Sets the "orth" feature on the token annotations of the configured annotation set.
   *
   * <p>For each annotation of the configured token type, the covered document text is passed to
   * {@code getOrthographyValue}; a non-null result is stored as the annotation's "orth" feature.
   *
   * @throws ExecutionException wrapping any exception raised while processing the tokens
   */
  @Override
  public void execute() throws ExecutionException {
    Document doc = getDocument();
    AnnotationSet inputSet = doc.getAnnotations(getAnnotationSetName());
    AnnotationSet tokens = inputSet.get(getTokenAnnotationTypeName());

    try {
      for (Annotation token : tokens) {
        String orth = getOrthographyValue(Utils.stringFor(doc, token));
        if (orth == null) {
          continue;
        }
        token.getFeatures().put("orth", orth);
      }
    } catch (Exception e) {
      throw new ExecutionException(e);
    }
  }
Пример #20
0
  /**
   * Propagates this document's "MatchesAnnots" feature to the member documents.
   *
   * <p>Every id list found in this document's "MatchesAnnots" map is translated, per member
   * document, into the ids of the original annotations (through {@code getOffsetDetails}). Each
   * translated list is appended to the list stored under the {@code null} key of that member
   * document's own "MatchesAnnots" map, which is created when missing.
   *
   * <p>NOTE(review): lists are only ever appended, so repeated calls appear to accumulate
   * duplicates in the member documents - confirm this is intended.
   */
  public void featureMapUpdated() {
    @SuppressWarnings("unchecked")
    Map<String, List<List<Integer>>> matches =
        (Map<String, List<List<Integer>>>) this.getFeatures().get("MatchesAnnots");
    if (matches == null) return;

    for (List<List<Integer>> groups : matches.values()) {
      for (List<Integer> group : groups) {
        // original annotation ids of this group, keyed by member document id
        Map<String, List<Integer>> idsByDoc = new HashMap<String, List<Integer>>();
        for (Integer id : group) {
          for (String docID : combinedDocumentIds) {
            // look up the details that tie this id to an annotation of the member document
            OffsetDetails od = getOffsetDetails(docID, id);
            if (od != null) {
              List<Integer> originalIds = idsByDoc.get(docID);
              if (originalIds == null) {
                originalIds = new ArrayList<Integer>();
                idsByDoc.put(docID, originalIds);
              }
              originalIds.add(od.getOriginalAnnotation().getId());
            }
          }
        }

        for (Map.Entry<String, List<Integer>> perDoc : idsByDoc.entrySet()) {
          Document aDoc = compoundDocument.getDocument(perDoc.getKey());
          @SuppressWarnings("unchecked")
          Map<String, List<List<Integer>>> docMatches =
              (Map<String, List<List<Integer>>>) aDoc.getFeatures().get("MatchesAnnots");
          if (docMatches == null) {
            docMatches = new HashMap<String, List<List<Integer>>>();
            aDoc.getFeatures().put("MatchesAnnots", docMatches);
          }

          // the translated lists are collected under the null key
          List<List<Integer>> listOfList = docMatches.get(null);
          if (listOfList == null) {
            listOfList = new ArrayList<List<Integer>>();
            docMatches.put(null, listOfList);
          }
          listOfList.add(perDoc.getValue());
        }
      }
    }
  }
Пример #21
0
  /**
   * Inserts a document at the given position of the corpus and notifies listeners.
   *
   * @param index the position at which the document is inserted
   * @param o the document to insert; {@code null} is silently ignored
   */
  @Override
  public void add(int index, Document o) {
    if (o == null) return;

    // keep the descriptive entry and the instance at the same position in both lists
    DocumentData entry =
        new DocumentData(o.getName(), o.getLRPersistenceId(), o.getClass().getName());
    docDataList.add(index, entry);
    documents.add(index, o);

    documentAdded(o);

    CorpusEvent addedEvent =
        new CorpusEvent(
            SerialCorpusImpl.this, o, index, o.getLRPersistenceId(), CorpusEvent.DOCUMENT_ADDED);
    fireDocumentAdded(addedEvent);
  }
  /**
   * Tells whether the GATE document has to be handled through its content because its source URL
   * is missing or cannot be opened.
   *
   * @param doc the document to check
   * @return {@code true} if the document has content but either no source URL or one for which
   *     {@code openConnection()} fails with an {@code IOException}; {@code false} if the source
   *     URL could be opened
   * @throws DocumentFormatException if the document has neither a usable source URL nor content
   */
  protected static boolean hasContentButNoValidUrl(Document doc) throws DocumentFormatException {
    boolean hasContent = doc.getContent() != null;

    if (doc.getSourceUrl() == null) {
      if (hasContent) {
        // The doc's url is null but there is a content.
        return true;
      }
      // Neither URL nor content: report it, instead of dereferencing the null URL below.
      throw new DocumentFormatException(
          "The document doesn't have a" + " valid URL and also no content");
    }

    try {
      doc.getSourceUrl().openConnection();
    } catch (IOException ex1) {
      // The URL is not null but is not valid.
      if (!hasContent) {
        // The document content is also null. There is nothing we can do.
        throw new DocumentFormatException(
            "The document doesn't have a" + " valid URL and also no content");
      }
      return true;
    }

    return false;
  }
Пример #23
0
 /**
  * Unloads a document from memory. Nothing happens when the document cannot be found in this
  * corpus.
  *
  * @param doc the document to be unloaded
  * @param sync should the document be sync'ed (i.e. saved) before unloading.
  */
 public void unloadDocument(Document doc, boolean sync) {
   if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());

   // locate the document; an unknown document is left alone
   int index = findDocument(doc);
   if (index != -1) {
     if (DEBUG) {
       Out.prln("Index of doc: " + index);
       Out.prln("Size of corpus: " + documents.size());
     }
     // the index-based overload does the actual work
     unloadDocument(index, sync);
   }
 }
  /**
   * Removes the '\r' from every CRLF or LFCR pair in the document content and replaces the
   * document content with the result.
   *
   * @param doc the document whose content is rewritten
   */
  private void removeExtraNewLine(Document doc) {
    String original = doc.getContent().toString();
    StringBuilder cleaned = new StringBuilder(original);

    // Walk backwards: deletions then only affect positions at or after the current one, so the
    // indices still to be visited stay aligned between the original text and the buffer.
    char following = ' ';
    for (int pos = original.length() - 1; pos >= 0; pos--) {
      char current = original.charAt(pos);
      if (current == '\n' && following == '\r') {
        // LF followed by CR: drop the CR that sits right after this position
        cleaned.deleteCharAt(pos + 1);
      } else if (current == '\r' && following == '\n') {
        // CR followed by LF: drop this CR and keep treating the LF as the "following" character
        cleaned.deleteCharAt(pos);
        current = following;
      }
      following = current;
    }

    doc.setContent(new DocumentContentImpl(cleaned.toString()));
  } // removeExtraNewLine(Document doc)
Пример #25
0
  /**
   * Builds a SerialCorpus from a transient corpus; this is what adopt() uses.
   *
   * <p>The name and features of the transient corpus are taken over, a {@code DocumentData}
   * entry (document name, no persistent ID yet, document class name) is recorded for each of its
   * documents, and all of its documents are copied into this corpus. Finally the new corpus
   * registers itself as a creole listener.
   *
   * @param tCorpus the transient corpus to copy from
   */
  protected SerialCorpusImpl(Corpus tCorpus) {
    // take over the identity of the in-memory corpus
    this.setName(tCorpus.getName());
    this.setFeatures(tCorpus.getFeatures());

    // one descriptive entry per document; none of them has a persistent ID at this point
    docDataList = new ArrayList<DocumentData>();
    List<String> docNames = tCorpus.getDocumentNames();
    for (int pos = 0; pos < docNames.size(); pos++) {
      String classType = tCorpus.get(pos).getClass().getName();
      docDataList.add(new DocumentData(docNames.get(pos), null, classType));
    }

    // the documents themselves
    documents = new ArrayList<Document>(tCorpus);

    // make sure we fire events when docs are added/removed/etc
    Gate.getCreoleRegister().addCreoleListener(this);
  }
  /**
   * With stemming enabled, adding a two-token feature must register its surface string in the
   * feature dictionary and its stem string in the feature storage.
   */
  @Test
  public void testAddFeatureStemmingEnabled() {
    // offsets delimiting "First Second" inside the sentence
    Node startNode = Mockito.mock(Node.class);
    Node endNode = Mockito.mock(Node.class);
    Mockito.when(startNode.getOffset()).thenReturn(0L);
    Mockito.when(endNode.getOffset()).thenReturn(11L);

    // first token: supplies the start node
    FeatureMap firstFeatures = Mockito.mock(FeatureMap.class);
    Mockito.when(firstFeatures.get("string")).thenReturn("First");
    Mockito.when(firstFeatures.get("stem")).thenReturn("stem1");
    Annotation firstToken = Mockito.mock(Annotation.class);
    Mockito.when(firstToken.getFeatures()).thenReturn(firstFeatures);
    Mockito.when(firstToken.getStartNode()).thenReturn(startNode);

    // second token: supplies the end node
    FeatureMap secondFeatures = Mockito.mock(FeatureMap.class);
    Mockito.when(secondFeatures.get("string")).thenReturn("Second");
    Mockito.when(secondFeatures.get("stem")).thenReturn("stem2");
    Annotation secondToken = Mockito.mock(Annotation.class);
    Mockito.when(secondToken.getFeatures()).thenReturn(secondFeatures);
    Mockito.when(secondToken.getEndNode()).thenReturn(endNode);

    Document gateDocument = Mockito.mock(Document.class);
    Mockito.when(gateDocument.getName()).thenReturn("doc1");

    ArrayList<Annotation> featureAnnots = new ArrayList<Annotation>();
    featureAnnots.add(firstToken);
    featureAnnots.add(secondToken);

    Mockito.when(options.isEnableStemming()).thenReturn(true);

    featureContainer.addFeature(
        featureAnnots, "First Second Third Fourth.", gateDocument, "content");

    Assert.assertNotNull(featureContainer.getFeatureDictionary().get("First Second"));
    Assert.assertNotNull(featureContainer.getFeatureStorage().get("stem1 stem2"));
  }
Пример #27
0
 /**
  * Loads the configuration file and the corpus for testing, and makes the same settings as in
  * the GATE GUI.
  *
  * @param configFileName path of the learning configuration file
  * @param corpusDirName directory holding the corpus files; only the files accepted by the
  *     "xml" extension filter are loaded, sorted by file name
  * @param inputasN name of the input annotation set handed to the learning API
  * @param outputasN name of the output annotation set handed to the learning API
  * @throws GateException if a GATE resource cannot be created
  * @throws IOException if {@code corpusDirName} cannot be listed (for instance because it is
  *     not a directory) or a file path cannot be converted to a URL
  */
 void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
     throws GateException, IOException {
   LogService.minVerbosityLevel = 0;
   // with the level set to 0 just above, this message is not printed
   if (LogService.minVerbosityLevel > 0)
     System.out.println("Learning Home : " + learningHome.getAbsolutePath());
   FeatureMap parameters = Factory.newFeatureMap();
   URL configFileURL = new File(configFileName).toURI().toURL();
   parameters.put("configFileURL", configFileURL);
   learningApi =
       (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);
   // Load the corpus
   corpus = Factory.newCorpus("DataSet");
   ExtensionFileFilter fileFilter = new ExtensionFileFilter();
   fileFilter.addExtension("xml");
   File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
   // listFiles returns null when the path is not a directory or an I/O error occurs
   if (xmlFiles == null) {
     throw new IOException("Cannot list the corpus directory: " + corpusDirName);
   }
   // sort by name so that the corpus order does not depend on the file system
   Arrays.sort(
       xmlFiles,
       new Comparator<File>() {
         @Override
         public int compare(File a, File b) {
           return a.getName().compareTo(b.getName());
         }
       });
   for (File f : xmlFiles) {
     if (!f.isDirectory()) {
       Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
       doc.setName(f.getName());
       corpus.add(doc);
     }
   }
   // Set the input and output annotation sets
   learningApi.setInputASName(inputasN);
   learningApi.setOutputASName(outputasN);
   controller =
       (gate.creole.SerialAnalyserController)
           Factory.createResource("gate.creole.SerialAnalyserController");
   controller.setCorpus(corpus);
   controller.add(learningApi);
 }
Пример #28
0
  /**
   * Finds the position of a document in this corpus.
   *
   * <p>The in-memory documents are searched first. Failing that, the recorded document data is
   * searched for an entry with the same name, persistent ID and class name. Entries that have no
   * persistent ID are never matched by this second search, since there is no ID to compare.
   *
   * @param doc the document to look for
   * @return the index of the document, or -1 if it is not part of this corpus
   */
  public int findDocument(Document doc) {
    // first try finding the document in memory
    int index = documents.indexOf(doc);
    if (index > -1 && index < docDataList.size()) return index;

    // else try finding a document with the same name and persistent ID
    String docClassType = doc.getClass().getName();
    index = 0;
    for (DocumentData docData : docDataList) {
      Object storedId = docData.getPersistentID();
      String storedName = docData.getDocumentName();
      boolean sameName =
          (storedName == null) ? doc.getName() == null : storedName.equals(doc.getName());
      // a null stored ID used to cause a NullPointerException here
      boolean sameId = storedId != null && storedId.equals(doc.getLRPersistenceId());
      if (sameName && sameId && docClassType.equals(docData.getClassType())) {
        return index;
      }
      index++;
    }
    return -1;
  } // findDocument
Пример #29
0
  /**
   * Reads {@code inputBulgarianSpecial-1.xml} from the input directory, initialises the GATE
   * pipeline, adds the layout annotation sets to the document and writes the result to
   * {@code inputBulgarianSpecial-Final-1.xml}.
   *
   * @param args not used
   * @throws Exception if reading, processing or writing fails
   */
  public static void main(String[] args) throws Exception {
    File inputFile = new File(inputDirectory + "inputBulgarianSpecial-1.xml");
    String gateString = FileUtils.readFileToString(inputFile, "UTF-8");

    log.error("Start Pipeline");
    Pipeline pipeline = new Pipeline();
    pipeline.initGate();
    Document doc = readDocument(gateString);

    // (a sentence detection / tokenization step used to sit here; it is not run)

    log.error("Add the Layout Annotation Sets");
    new Layout().addLayout(doc);

    String outputXml = doc.toXml();
    writeFile(outputXml, "inputBulgarianSpecial-Final-1.xml");
  }
 /**
  * Annotates the given document, writing into its output annotation set.
  *
  * <p>Without a containing annotation type the whole content is annotated in one go. Otherwise
  * only the spans of the annotations of that type are annotated; they are taken from the set
  * named by {@code inputASName}, or from the document's unnamed set when no name is configured.
  *
  * @param theDocument the document to process
  * @throws ExecutionException if {@code theDocument} is {@code null}
  */
 protected void doExecute(Document theDocument) throws ExecutionException {
   interrupted = false;
   if (theDocument == null) {
     throw new ExecutionException("No document to process!");
   }
   AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
   if (containingType == null || containingType.isEmpty()) {
     // Work on the document that was passed in, not on the document field: the null check and
     // the annotation sets above already refer to the parameter.
     annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
   } else {
     AnnotationSet inputAS;
     if (inputASName == null || inputASName.isEmpty()) {
       inputAS = theDocument.getAnnotations();
     } else {
       inputAS = theDocument.getAnnotations(inputASName);
     }
     AnnotationSet containingAnns = inputAS.get(containingType);
     for (Annotation containingAnn : containingAnns) {
       annotateText(
           theDocument, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
     }
   }
 }