/**
 * Check the new line sequence and set the document property. <br>
 * Possible values are CRLF, LFCR, CR, LF.
 */
protected void setNewLineProperty(Document doc) {
  String content = doc.getContent().toString();
  String newLineType = "";

  char ch = ' ';
  char lastch = ' ';
  for (int i = 0; i < content.length(); ++i) {
    ch = content.charAt(i);
    if (lastch == '\r') {
      if (ch == '\n') {
        newLineType = "CRLF";
        break;
      } else {
        newLineType = "CR";
        break;
      }
    }
    if (lastch == '\n') {
      if (ch == '\r') {
        newLineType = "LFCR";
        break;
      } else {
        newLineType = "LF";
        break;
      }
    }
    lastch = ch;
  } // for

  doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
} // setNewLineProperty()
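/*
 * A minimal sketch of reading the stored property back. The demo class and
 * test string below are assumptions, not part of the original code; whether
 * the feature is present depends on which DocumentFormat handled the document.
 */
import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.GateConstants;

public class NewLineTypeDemo {
  public static void main(String[] args) throws Exception {
    Gate.init(); // required before any other GATE call
    // "\r\n" line breaks should be recorded as "CRLF" once unpackMarkup() runs
    Document doc = Factory.newDocument("line one\r\nline two");
    String nlType =
        (String) doc.getFeatures().get(GateConstants.DOCUMENT_NEW_LINE_TYPE);
    System.out.println("New line type: " + nlType);
  }
}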
/**
 * Unloads a document from memory.
 *
 * @param index the index of the document to be unloaded.
 * @param sync should the document be sync'ed (i.e. saved) before unloading.
 */
public void unloadDocument(int index, boolean sync) {
  // 1. check whether it has been loaded and is a persistent one;
  // if a persistent doc is not loaded, there's nothing we need to do
  if ((!isDocumentLoaded(index)) && isPersistentDocument(index)) return;

  // 2. if requested, sync the document before releasing it from memory,
  // because the CREOLE register garbage-collects all LRs which are no
  // longer used
  if (sync) {
    Document doc = documents.get(index);
    try {
      // if the document has not been adopted yet, do that first
      if (doc.getLRPersistenceId() == null) {
        doc = (Document) this.getDataStore().adopt(doc);
        this.getDataStore().sync(doc);
        this.setDocumentPersistentID(index, doc.getLRPersistenceId());
      } else {
        // if it is already adopted, just sync it
        this.getDataStore().sync(doc);
      }
    } catch (PersistenceException ex) {
      throw new GateRuntimeException(
          "Error unloading document from corpus "
              + "because document sync failed: " + ex.getMessage(),
          ex);
    }
  }

  // 3. remove the document from memory, but only if the saving has succeeded
  documents.set(index, null);
}
public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
  if (reporter != null)
    reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

  // process the text passed as value with the application:
  // a) create a GATE document based on the text value
  gate.Document gatedocument = null;
  try {
    gatedocument = generateGATEDoc(inputDoc);
    // add it to the current corpus
    corpus.add(gatedocument);
    // get the application and assign the corpus to it
    this.GATEapplication.setCorpus(corpus);
    // process it with GATE
    this.GATEapplication.execute();

    // transfer the annotations from the GATE document
    // to the Behemoth one using the filters
    if (reporter != null)
      reporter.incrCounter("GATE", "Document", 1);

    return gatedocument.toXml();
  } catch (Exception e) {
    LOG.error(inputDoc.getUrl().toString(), e);
    if (reporter != null)
      reporter.incrCounter("GATE", "Exceptions", 1);
  } finally {
    // remove the document from the corpus again
    corpus.clear();
    // and from memory
    if (gatedocument != null)
      Factory.deleteResource(gatedocument);
  }
  return null;
}
/** Annotation removed event. */
public void annotationRemoved(AnnotationSetEvent ase) {
  if (!disableListener && ase.getSourceDocument() == this) {
    AnnotationSet as = (AnnotationSet) ase.getSource();
    Annotation annot = ase.getAnnotation();
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());

    boolean defaultAS = as.getName() == null;
    for (String docID : combinedDocumentIds) {
      Document aDoc = compoundDocument.getDocument(docID);

      // find the details which refer to the deleted annotation
      OffsetDetails od = getOffsetDetails(docID, as.getName(), annot);
      if (od == null) continue;

      if (defaultAS) {
        aDoc.getAnnotations().remove(od.getOriginalAnnotation());
      } else {
        aDoc.getAnnotations(as.getName()).remove(od.getOriginalAnnotation());
      }
      removeOffsetDetails(docID, od);
      break;
    }
  }
}
public void tokenize() {
  AnnotationSet tokenizationAs = gateDocument.getAnnotations("Tokenization");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");

  for (Iterator<Annotation> it = tokenizationAs.iterator(); it.hasNext(); ) {
    Annotation currentTokenAnnotation = it.next();
    FeatureMap tokenFeaturesMap = currentTokenAnnotation.getFeatures();
    FeatureMap curFeaturesMap = Factory.newFeatureMap();

    if ("Token".compareToIgnoreCase(currentTokenAnnotation.getType()) == 0) {
      curFeaturesMap.put("string", tokenFeaturesMap.get("string"));
      curFeaturesMap.put("root", tokenFeaturesMap.get("lemma"));
      curFeaturesMap.put("category", tokenFeaturesMap.get("POS"));

      // add the new Token to the default annotation set
      defaultAs.add(
          currentTokenAnnotation.getStartNode(),
          currentTokenAnnotation.getEndNode(),
          currentTokenAnnotation.getType(),
          curFeaturesMap);
    }
  }
  gateDocument.removeAnnotationSet("Tokenization");
}
/** Test the hash gazetteer. */
public void testHashGazetteer() throws Exception {
  // get a document
  Document doc =
      Factory.newDocument(new URL(TestDocument.getTestServerName() + "tests/doc0.html"));
  System.out.println(doc.getFeatures().get("gate.SourceURL"));

  // create a default gazetteer
  FeatureMap params = Factory.newFeatureMap();
  HashGazetteer gaz =
      (HashGazetteer)
          Factory.createResource("com.ontotext.gate.gazetteer.HashGazetteer", params);

  // runtime parameters
  gaz.setDocument(doc);
  gaz.setAnnotationSetName(GAZ_AS);
  gaz.execute();

  assertTrue(
      "the annotation set resulting from the execution of the OntoText "
          + "Natural Gazetteer is empty",
      !doc.getAnnotations(GAZ_AS).isEmpty());

  // check whether the annotations are as expected
  assertEquals(
      "wrong number of lookup annotations found", 76, doc.getAnnotations(GAZ_AS).size());
} // testHashGazetteer()
@Override
public Document get(int index) {
  if (index >= docDataList.size()) return null;

  Document res = documents.get(index);

  if (DEBUG) Out.prln("SerialCorpusImpl: get(): index " + index + ", result: " + res);

  // if the document is null, then it must be fetched from the datastore
  if (res == null) {
    FeatureMap parameters = Factory.newFeatureMap();
    parameters.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
    try {
      parameters.put(DataStore.LR_ID_FEATURE_NAME, docDataList.get(index).getPersistentID());
      Document lr =
          (Document) Factory.createResource(docDataList.get(index).getClassType(), parameters);
      if (DEBUG) Out.prln("Loaded document: " + lr.getName());
      // change the result to the newly loaded doc
      res = lr;
      // finally replace the doc with the instantiated version
      documents.set(index, lr);
    } catch (ResourceInstantiationException ex) {
      Err.prln("Error reading document inside a serialised corpus.");
      throw new GateRuntimeException(ex);
    }
  }
  return res;
}
@SuppressWarnings("unchecked") public Set<String> processDoc(String str) throws Exception { Set<String> toReturn = new HashSet<String>(); Corpus c = null; Document aDoc = null; try { c = Factory.newCorpus("sample"); aDoc = Factory.newDocument(str); c.add(aDoc); controller.setCorpus(c); controller.execute(); AnnotationSet aSet = aDoc.getAnnotations("StockSymbols"); for (Annotation annot : aSet) { String symbol = (String) annot.getFeatures().get("sym"); toReturn.add(symbol); } } catch (Exception e) { throw e; } finally { if (aDoc != null) { Factory.deleteResource(aDoc); } if (c != null) { Factory.deleteResource(c); } } return toReturn; }
/**
 * Unpack the markup in the document. This converts markup from the native format (e.g. XML,
 * RTF) into annotations in GATE format. Uses the markupElementsMap to determine which elements
 * to convert, and what annotation type names to use.
 */
public void unpackMarkup(Document doc) throws DocumentFormatException {
  if (doc == null || doc.getContent() == null) return;

  setNewLineProperty(doc);

  // create paragraph annotations in the specified annotation set
  int endOffset = doc.getContent().toString().length();
  int startOffset = 0;
  annotateParagraphs(doc, startOffset, endOffset, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
} // unpackMarkup
public JSONObject persian_sentiment(String text) throws Exception {
  oncreate();
  File PersianGapp =
      new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");

  // initialise GATE - this must be done before calling any GATE APIs
  Gate.init();

  // load the saved application
  CorpusController application =
      (CorpusController) PersistenceManager.loadObjectFromFile(PersianGapp);

  // Create a Corpus to use. We recycle the same Corpus object for each
  // iteration. The string parameter to newCorpus() is simply the
  // GATE-internal name to use for the corpus. It has no particular
  // significance.
  Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
  application.setCorpus(corpus);

  // create the document from the given text and put it in the corpus
  Document doc = Factory.newDocument(text);
  corpus.add(doc);

  // run the application
  application.execute();

  String featureName = "Doc_sentiment";
  FeatureMap features = doc.getFeatures();

  // remove the document from the corpus again
  corpus.clear();
  // release the document, as it is no longer needed
  Factory.deleteResource(doc);

  LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName);
  String obj = (String) originalContent.get("sentiment");
  // BigDecimal pos = (BigDecimal) originalContent.get("positive");
  // BigDecimal neg = (BigDecimal) originalContent.get("negative");

  // create the JSON response for the user
  JSONObject obj1 = new JSONObject();
  obj1.put("sentiment", obj);
  // obj1.put("positive", pos);
  // obj1.put("negative", neg);
  // application.cleanup();
  return obj1;
}
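/*
 * A possible caller, assuming the enclosing service class (called here
 * SentimentService, a made-up name) exposes the method above and that the
 * loaded pipeline writes a "Doc_sentiment" document feature. The Persian
 * input string is illustrative only.
 */
SentimentService service = new SentimentService();
JSONObject result = service.persian_sentiment("این فیلم عالی بود");
System.out.println(result.get("sentiment"));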
/**
 * Generation of a GATE document from a Behemoth one.
 *
 * @param inputDoc the input Behemoth document; its URL is used as the source URL
 * @return the corresponding GATE document
 * @throws ResourceInstantiationException
 * @throws InvalidOffsetException
 * @throws IOException
 */
public gate.Document generateGATEDoc(BehemothDocument inputDoc)
    throws ResourceInstantiationException, InvalidOffsetException, IOException {
  gate.Document gatedocument = null;

  // if no text is available (e.g. Tika has not extracted it)
  // let GATE do the parsing itself from the binary content
  if (inputDoc.getText() == null) {
    try {
      gatedocument = generateGATEDocFromLocalDump(inputDoc);
      // transfer the text from GATE to Behemoth
      String textContent = gatedocument.getContent().toString();
      inputDoc.setText(textContent);
      return gatedocument;
    } catch (Exception e) {
      LOG.error("Can't generate GATE doc from byte dump", e);
    }
  }

  // if the input document still has no text, create a doc with empty text
  String text = inputDoc.getText();
  if (text == null) text = "";
  gatedocument = Factory.newDocument(text);

  // then the metadata as document features
  FeatureMap docFeatures = gatedocument.getFeatures();
  String docUrl = inputDoc.getUrl();
  if (docUrl != null) docFeatures.put("gate.SourceURL", docUrl);
  if (inputDoc.getMetadata() != null) {
    Iterator<Entry<Writable, Writable>> iter = inputDoc.getMetadata().entrySet().iterator();
    while (iter.hasNext()) {
      Entry<Writable, Writable> entry = iter.next();
      String skey = entry.getKey().toString().trim();
      String svalue = null;
      if (entry.getValue() != null) svalue = entry.getValue().toString().trim();
      docFeatures.put(skey, svalue);
    }
  }

  // finally the annotations as original markups
  // TODO change the name of the annotation set via config
  AnnotationSet outputAS = gatedocument.getAnnotations("Original markups");
  for (Annotation annot : inputDoc.getAnnotations()) {
    // add to outputAS as a GATE annotation
    FeatureMap features = Factory.newFeatureMap();
    features.putAll(annot.getFeatures());
    outputAS.add(annot.getStart(), annot.getEnd(), annot.getType(), features);
  }
  return gatedocument;
}
/**
 * Constructor initialises all the private member data.
 *
 * @param aDocument The GATE document that will be processed
 * @param aMarkupElementsMap The map containing the elements that will be transformed into
 *     annotations
 * @param anAnnotationSet The annotation set that will contain annotations resulting from the
 *     processing of the GATE document
 */
public HtmlDocumentHandler(
    gate.Document aDocument,
    Map<String, String> aMarkupElementsMap,
    gate.AnnotationSet anAnnotationSet) {
  // init stack
  stack = new Stack<CustomObject>();

  // this string contains the plain text (the text without markup)
  tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

  // colector is used later to transform all custom objects into
  // annotation objects
  colector = new LinkedList<CustomObject>();

  // the GATE document
  doc = aDocument;

  // this map contains the names of the elements that we want to create;
  // if it's null, all the elements from the HTML document will be transformed
  // into GATE annotation objects
  markupElementsMap = aMarkupElementsMap;

  // init an annotation set for this GATE document
  basicAS = anAnnotationSet;

  customObjectsId = 0;
} // HtmlDocumentHandler
@SuppressWarnings("unchecked") public static AnnotationDiffer computeDiffWithDocFeatures( Document document, List<String> featureNames, AnnotationSet responsesAnnotations) { FeatureMap doc_fm = document.getFeatures(); // Logger log = Logger.getLogger(DocumentFeaturesDiff.class); int correct = 0; int missing = 0; int spurious = 0; for (String feature_name : featureNames) { // int cur_correct = 0; List<String> f = (List<String>) doc_fm.get(feature_name); if (f == null) { f = (List<String>) doc_fm.get(feature_name + "s"); } AnnotationDiffer diff = computeDiffWithGoldStandardDataForSingleFeature( feature_name, Utils.setFromList(f), responsesAnnotations); spurious += diff.getSpurious(); correct += diff.getCorrectMatches(); missing += diff.getMissing(); } return new AnnotationDifferDocumentFeaturesImpl(correct, missing, spurious); }
/**
 * Constructs an XmlDocumentHandler object.
 *
 * @param aDocument the GATE document that will be processed.
 * @param aMarkupElementsMap this map contains the names of the elements that we want to create.
 * @param anElement2StringMap this map contains the strings that will be added to the text
 *     contained by the key element.
 * @param anAnnotationSet is the annotation set that will be filled while the document is
 *     processed.
 */
public XmlDocumentHandler(
    gate.Document aDocument,
    Map<String, String> aMarkupElementsMap,
    Map<String, String> anElement2StringMap,
    AnnotationSet anAnnotationSet) {
  // init parent
  super();
  // init stack
  stack = new Stack<CustomObject>();

  // this string contains the plain text (the text without markup)
  tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

  // colector is used later to transform all custom objects into annotation
  // objects
  colector = new LinkedList<CustomObject>();

  // the GATE document
  doc = aDocument;

  // this map contains the names of the elements that we want to create;
  // if it's null, all the elements from the XML document will be transformed
  // into GATE annotation objects
  markupElementsMap = aMarkupElementsMap;

  // this map contains the strings that we want to insert inside the document
  // content when a certain element is found;
  // if the map is null then no string is added
  element2StringMap = anElement2StringMap;

  basicAS = anAnnotationSet;
  customObjectsId = 0;
} // XmlDocumentHandler()
private void thisResourceWritten() {
  if (indexManager != null) {
    try {
      for (int i = 0; i < documents.size(); i++) {
        if (documents.get(i) != null) {
          Document doc = documents.get(i);
          if (!addedDocs.contains(doc) && doc.isModified()) {
            changedDocs.add(doc);
          }
        }
      }
      indexManager.sync(addedDocs, removedDocIDs, changedDocs);
    } catch (IndexException ie) {
      ie.printStackTrace();
    }
  }
}
public void splitter() {
  AnnotationSet sDetectionAS = gateDocument.getAnnotations("SentenceDetection");
  AnnotationSet defaultAs = gateDocument.getAnnotations("");

  for (Iterator<Annotation> it = sDetectionAS.iterator(); it.hasNext(); ) {
    Annotation currentSentenceAnnotation = it.next();
    // add the Sentence to the default annotation set
    defaultAs.add(
        currentSentenceAnnotation.getStartNode(),
        currentSentenceAnnotation.getEndNode(),
        "Sentence",
        null);
  }
  gateDocument.removeAnnotationSet("SentenceDetection");
}
@Override
public boolean add(Document o) {
  if (o == null) return false;
  Document doc = o;

  // accept only documents from the corpus's own datastore
  if (doc.getDataStore() != null && !this.dataStore.equals(doc.getDataStore())) {
    Err.prln("Error: Persistent corpus can only accept documents "
        + "from its own datastore!");
    return false;
  } // if

  // add the document with its index in the docDataList;
  // since it is appended at the end, the index will be the size
  // of the docDataList before the addition
  DocumentData docData =
      new DocumentData(doc.getName(), doc.getLRPersistenceId(), doc.getClass().getName());
  boolean result = docDataList.add(docData);
  documents.add(doc);
  documentAdded(doc);
  fireDocumentAdded(
      new CorpusEvent(
          SerialCorpusImpl.this,
          doc,
          docDataList.size() - 1,
          doc.getLRPersistenceId(),
          CorpusEvent.DOCUMENT_ADDED));
  return result;
}
@Override
public void resourceUnloaded(CreoleEvent e) {
  Resource res = e.getResource();
  if (res instanceof Document) {
    Document doc = (Document) res;
    if (DEBUG) Out.prln("resourceUnloaded called");

    // remove it from the corpus too, if it is a transient one
    if (doc.getDataStore() != this.getDataStore()) {
      this.remove(doc);
    } else {
      // unload all occurrences
      int index = indexOf(res);
      if (index < 0) return;
      documents.set(index, null);
      if (DEBUG) Out.prln("corpus: document " + index + " unloaded and set to null");
    } // if
  }
}
@Override
public void execute() throws ExecutionException {
  Document doc = getDocument();
  AnnotationSet as = doc.getAnnotations(getAnnotationSetName());
  AnnotationSet tocs = as.get(getTokenAnnotationTypeName());
  try {
    for (Annotation t : tocs) {
      String content = Utils.stringFor(doc, t);
      String val = getOrthographyValue(content);
      if (val != null) t.getFeatures().put("orth", val);
    }
  } catch (Exception e) {
    throw new ExecutionException(e);
  }
}
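/*
 * getOrthographyValue is not shown in this snippet. Below is a plausible
 * reconstruction, assuming the four standard GATE token orthography
 * categories (upperInitial, allCaps, lowercase, mixedCaps); the original
 * implementation may well differ.
 */
private String getOrthographyValue(String content) {
  if (content == null || content.isEmpty()
      || !Character.isLetter(content.charAt(0))) return null;
  if (content.length() > 1 && content.equals(content.toUpperCase()))
    return "allCaps";      // e.g. "USA"
  if (content.equals(content.toLowerCase()))
    return "lowercase";    // e.g. "hello"
  if (Character.isUpperCase(content.charAt(0))
      && content.substring(1).equals(content.substring(1).toLowerCase()))
    return "upperInitial"; // e.g. "London"
  return "mixedCaps";      // e.g. "McDonald"
}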
public void featureMapUpdated() {
  @SuppressWarnings("unchecked")
  Map<String, List<List<Integer>>> matches =
      (Map<String, List<List<Integer>>>) this.getFeatures().get("MatchesAnnots");
  if (matches == null) return;

  for (List<List<Integer>> topList : matches.values()) {
    for (List<Integer> list : topList) {
      Map<String, List<Integer>> newList = new HashMap<String, List<Integer>>();
      for (Integer id : list) {
        for (String docID : combinedDocumentIds) {
          // find the details which refer to this annotation
          OffsetDetails od = getOffsetDetails(docID, id);
          if (od == null) continue;

          // bingo, found it
          List<Integer> subMatches = newList.get(docID);
          if (subMatches == null) {
            subMatches = new ArrayList<Integer>();
            newList.put(docID, subMatches);
          }
          subMatches.add(od.getOriginalAnnotation().getId());
        }
      }

      for (String docID : newList.keySet()) {
        Document aDoc = compoundDocument.getDocument(docID);
        @SuppressWarnings("unchecked")
        Map<String, List<List<Integer>>> docMatches =
            (Map<String, List<List<Integer>>>) aDoc.getFeatures().get("MatchesAnnots");
        if (docMatches == null) {
          docMatches = new HashMap<String, List<List<Integer>>>();
          aDoc.getFeatures().put("MatchesAnnots", docMatches);
        }
        List<List<Integer>> listOfList = docMatches.get(null);
        if (listOfList == null) {
          listOfList = new ArrayList<List<Integer>>();
          docMatches.put(null, listOfList);
        }
        listOfList.add(newList.get(docID));
      }
    }
  }
}
@Override
public void add(int index, Document o) {
  if (o == null) return;
  Document doc = o;

  DocumentData docData =
      new DocumentData(doc.getName(), doc.getLRPersistenceId(), doc.getClass().getName());
  docDataList.add(index, docData);
  documents.add(index, doc);
  documentAdded(doc);
  fireDocumentAdded(
      new CorpusEvent(
          SerialCorpusImpl.this,
          doc,
          index,
          doc.getLRPersistenceId(),
          CorpusEvent.DOCUMENT_ADDED));
}
/**
 * Tests whether the GATE document has content but no valid URL.
 *
 * @param doc the document to check
 * @return true if the document has content but its URL is null or invalid
 * @throws DocumentFormatException if both the URL and the content are missing
 */
protected static boolean hasContentButNoValidUrl(Document doc) throws DocumentFormatException {
  try {
    if (doc.getSourceUrl() == null && doc.getContent() != null) {
      // the doc's URL is null but there is content
      return true;
    } else {
      doc.getSourceUrl().openConnection();
    }
  } catch (IOException ex1) {
    // the URL is not null but is not valid
    if (doc.getContent() == null)
      // the document content is also null; there is nothing we can do
      throw new DocumentFormatException(
          "The document doesn't have a valid URL and also no content");
    return true;
  } // end try
  return false;
}
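/*
 * To make the outcomes concrete, a small hypothetical check (this caller is
 * not from the original source). A document created from a string has
 * content but a null source URL, so the method should return true.
 */
Document doc = Factory.newDocument("some plain content");
if (hasContentButNoValidUrl(doc)) {
  // fall back to guessing the format from the content itself
}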
/**
 * Unloads a document from memory.
 *
 * @param doc the document to be unloaded
 * @param sync should the document be sync'ed (i.e. saved) before unloading.
 */
public void unloadDocument(Document doc, boolean sync) {
  if (DEBUG) Out.prln("Document to be unloaded: " + doc.getName());

  // 1. determine the index of the document; if not there, do nothing
  int index = findDocument(doc);
  if (index == -1) return;

  if (DEBUG) Out.prln("Index of doc: " + index);
  if (DEBUG) Out.prln("Size of corpus: " + documents.size());

  unloadDocument(index, sync);
}
/** Delete '\r' in the combinations CRLF or LFCR in the document content. */
private void removeExtraNewLine(Document doc) {
  String content = doc.getContent().toString();
  StringBuffer buff = new StringBuffer(content);

  char ch = ' ';
  char lastch = ' ';
  // iterate backwards so deletions don't shift the offsets still to be visited
  for (int i = content.length() - 1; i > -1; --i) {
    ch = content.charAt(i);
    if (ch == '\n' && lastch == '\r') {
      // LFCR: delete the trailing '\r'
      buff.deleteCharAt(i + 1);
    }
    if (ch == '\r' && lastch == '\n') {
      // CRLF: delete the leading '\r'
      buff.deleteCharAt(i);
      ch = lastch;
    }
    lastch = ch;
  } // for

  doc.setContent(new DocumentContentImpl(buff.toString()));
} // removeExtraNewLine(Document doc)
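/*
 * For illustration, the intended effect of the pass above. This driver is an
 * assumption: the method is private and is normally invoked from within the
 * enclosing format class.
 */
Document doc = Factory.newDocument("line one\r\nline two\n\rline three");
removeExtraNewLine(doc);
// both CRLF and LFCR collapse to a single '\n'
assert doc.getContent().toString().equals("line one\nline two\nline three");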
/**
 * Constructor to create a SerialCorpus from a transient one. This is called by adopt() to
 * store the transient corpus and re-route the method calls to it, until the corpus is sync-ed
 * on disk. After that, the transientCorpus will always be null, so the new functionality will
 * be used instead.
 */
protected SerialCorpusImpl(Corpus tCorpus) {
  // copy the corpus name and features from the one in memory
  this.setName(tCorpus.getName());
  this.setFeatures(tCorpus.getFeatures());

  docDataList = new ArrayList<DocumentData>();

  // now cache the names of all docs for future use
  List<String> docNames = tCorpus.getDocumentNames();
  for (int i = 0; i < docNames.size(); i++) {
    Document doc = tCorpus.get(i);
    docDataList.add(new DocumentData(docNames.get(i), null, doc.getClass().getName()));
  }

  // copy all the documents from the transient corpus
  documents = new ArrayList<Document>();
  documents.addAll(tCorpus);

  // make sure we fire events when docs are added/removed/etc.
  Gate.getCreoleRegister().addCreoleListener(this);
}
@Test
public void testAddFeatureStemmingEnabled() {
  Annotation mockedAnnot1 = Mockito.mock(Annotation.class);
  Annotation mockedAnnot2 = Mockito.mock(Annotation.class);
  FeatureMap mockedMap1 = Mockito.mock(FeatureMap.class);
  FeatureMap mockedMap2 = Mockito.mock(FeatureMap.class);
  Node startNode = Mockito.mock(Node.class);
  Node endNode = Mockito.mock(Node.class);
  String wholeSentence = "First Second Third Fourth.";

  Mockito.when(startNode.getOffset()).thenReturn((long) 0);
  Mockito.when(endNode.getOffset()).thenReturn((long) 11);

  Mockito.when(mockedAnnot1.getFeatures()).thenReturn(mockedMap1);
  Mockito.when(mockedMap1.get("string")).thenReturn("First");
  Mockito.when(mockedMap1.get("stem")).thenReturn("stem1");
  Mockito.when(mockedAnnot1.getStartNode()).thenReturn(startNode);

  Mockito.when(mockedAnnot2.getFeatures()).thenReturn(mockedMap2);
  Mockito.when(mockedMap2.get("string")).thenReturn("Second");
  Mockito.when(mockedMap2.get("stem")).thenReturn("stem2");
  Mockito.when(mockedAnnot2.getEndNode()).thenReturn(endNode);

  Document gateDocument = Mockito.mock(Document.class);
  Mockito.when(gateDocument.getName()).thenReturn("doc1");

  ArrayList<Annotation> featureAnnots = new ArrayList<Annotation>();
  featureAnnots.add(mockedAnnot1);
  featureAnnots.add(mockedAnnot2);

  Mockito.when(options.isEnableStemming()).thenReturn(true);

  String featureString = "First Second";
  String featureStem = "stem1 stem2";
  featureContainer.addFeature(featureAnnots, wholeSentence, gateDocument, "content");

  Assert.assertTrue(featureContainer.getFeatureDictionary().get(featureString) != null);
  Assert.assertTrue(featureContainer.getFeatureStorage().get(featureStem) != null);
}
/**
 * Load the configuration file and corpus for testing, and apply the same settings as in the
 * GATE GUI.
 */
void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
    throws GateException, IOException {
  LogService.minVerbosityLevel = 0;
  if (LogService.minVerbosityLevel > 0)
    System.out.println("Learning Home : " + learningHome.getAbsolutePath());

  FeatureMap parameters = Factory.newFeatureMap();
  URL configFileURL = new File(configFileName).toURI().toURL();
  parameters.put("configFileURL", configFileURL);
  learningApi =
      (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);

  // load the corpus
  corpus = Factory.newCorpus("DataSet");
  ExtensionFileFilter fileFilter = new ExtensionFileFilter();
  fileFilter.addExtension("xml");
  File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
  Arrays.sort(
      xmlFiles,
      new Comparator<File>() {
        public int compare(File a, File b) {
          return a.getName().compareTo(b.getName());
        }
      });
  for (File f : xmlFiles) {
    if (!f.isDirectory()) {
      Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
      doc.setName(f.getName());
      corpus.add(doc);
    }
  }
  // URL tempURL = new File(corpusDirName).toURI().toURL();
  // corpus.populate(tempURL, fileFilter, "UTF-8", false);

  // set the input and output annotation sets
  learningApi.setInputASName(inputasN);
  learningApi.setOutputASName(outputasN);

  controller =
      (gate.creole.SerialAnalyserController)
          Factory.createResource("gate.creole.SerialAnalyserController");
  controller.setCorpus(corpus);
  controller.add(learningApi);
}
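/*
 * A hypothetical test driver for the method above; the file, directory and
 * annotation-set names here are assumptions, not taken from the original
 * test suite.
 */
loadSettings("learning-config.xml", "test-corpus", "Key", "Output");
controller.execute(); // runs LearningAPIMain over the loaded corpus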
public int findDocument(Document doc) {
  boolean found = false;
  DocumentData docData = null;

  // first try finding the document in memory
  int index = documents.indexOf(doc);
  if (index > -1 && index < docDataList.size()) return index;

  // else try finding a document with the same name, persistent ID and class
  // (note: this comparison assumes the stored persistent IDs are non-null)
  Iterator<DocumentData> iter = docDataList.iterator();
  for (index = 0; iter.hasNext(); index++) {
    docData = iter.next();
    if (docData.getDocumentName().equals(doc.getName())
        && docData.getPersistentID().equals(doc.getLRPersistenceId())
        && docData.getClassType().equals(doc.getClass().getName())) {
      found = true;
      break;
    }
  }
  if (found && index < docDataList.size()) return index;
  else return -1;
} // findDocument
public static void main(String[] args) throws Exception {
  String inputFile = inputDirectory + "inputBulgarianSpecial-1.xml";
  String gateString = FileUtils.readFileToString(new File(inputFile), "UTF-8");

  log.error("Start Pipeline");
  Pipeline pl = new Pipeline();
  pl.initGate();
  Document doc = readDocument(gateString);

  /*
  log.error("Add Sentence Detection and Tokenization");
  GateOperations go = new GateOperations();
  go.addExtraInfo(doc);
  writeFile(doc.toXml(), "inputEnglish-Proc-1.xml");
  */

  log.error("Add the Layout Annotation Sets");
  Layout layout = new Layout();
  layout.addLayout(doc);
  writeFile(doc.toXml(), "inputBulgarianSpecial-Final-1.xml");
}
protected void doExecute(Document theDocument) throws ExecutionException {
  interrupted = false;
  if (theDocument == null) {
    throw new ExecutionException("No document to process!");
  }
  AnnotationSet outputAS = theDocument.getAnnotations(getOutputAnnotationSet());
  if (containingType == null || containingType.isEmpty()) {
    // use the passed-in document consistently (the original mixed the
    // "document" field with the "theDocument" parameter here)
    annotateText(theDocument, outputAS, 0, theDocument.getContent().size());
  } else {
    AnnotationSet inputAS = null;
    if (inputASName == null || inputASName.isEmpty()) {
      inputAS = theDocument.getAnnotations();
    } else {
      inputAS = theDocument.getAnnotations(inputASName);
    }
    AnnotationSet containingAnns = inputAS.get(containingType);
    for (Annotation containingAnn : containingAnns) {
      annotateText(
          theDocument, outputAS, gate.Utils.start(containingAnn), gate.Utils.end(containingAnn));
    }
  }
}