public Document getDocument(Object docID) throws ResourceInstantiationException {
  // load document from datastore
  FeatureMap docFeatures = Factory.newFeatureMap();
  docFeatures.put(DataStore.LR_ID_FEATURE_NAME, docID);
  docFeatures.put(DataStore.DATASTORE_FEATURE_NAME, ds);
  return (Document) Factory.createResource("gate.corpora.DocumentImpl", docFeatures);
}

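// A minimal usage sketch (not part of the original code) showing how getDocument(...) above
// might be driven: it assumes this sketch lives in the same class, that "ds" is the same
// datastore field the method reads, and that the storage URL is purely illustrative.
// Error handling is omitted.
private void loadAllStoredDocuments() throws Exception { // hypothetical helper
  ds = Factory.openDataStore("gate.persist.SerialDataStore", "file:/data/gate-store");
  List lrIds = ds.getLrIds("gate.corpora.DocumentImpl");
  for (Object docID : lrIds) {
    Document doc = getDocument(docID);
    // ... process the document ...
    Factory.deleteResource(doc); // unload it from memory when finished
  }
}
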
/**
 * Drops features whose weight spread across labels is below the given threshold and re-indexes
 * the remaining ones; index 0 is kept for the bias. Returns the new feature count.
 */
public int shrink(float threshold) {
  int[] indexMap = new int[getFeatureSize()];
  int i, j, k, l, count = 1; // bias
  float max, min;

  for (i = 1; i < getFeatureSize(); i++) {
    k = i * getLabelSize();
    max = weight_vector.get(k);
    min = weight_vector.get(k);

    for (j = 1; j < getLabelSize(); j++) {
      max = Math.max(max, weight_vector.get(k + j));
      min = Math.min(min, weight_vector.get(k + j));
    }

    if (Math.abs(max - min) >= threshold) indexMap[i] = count++;
  }

  WeightVector newWeights =
      new WeightVectorDynamic(
          weight_vector.getLabelSize(), count, weight_vector.getActivationFunction());
  ObjectIterator<Entry<String, Integer>> it;
  int oldIndex, newIndex;
  Entry<String, Integer> e;

  // bias weights
  for (j = 0; j < getLabelSize(); j++) newWeights.set(j, weight_vector.get(j));

  for (Object2IntMap<String> map : feature_map.getIndexMaps()) {
    it = map.entrySet().iterator();

    while (it.hasNext()) {
      e = it.next();
      oldIndex = e.getValue();
      newIndex = indexMap[oldIndex];

      if (newIndex > 0) {
        e.setValue(newIndex);
        k = oldIndex * getLabelSize();
        l = newIndex * getLabelSize();

        for (j = 0; j < getLabelSize(); j++) newWeights.set(l + j, weight_vector.get(k + j));
      } else {
        it.remove();
      }
    }
  }

  weight_vector = newWeights;
  feature_map.setSize(count);
  return count;
}

protected int binariesInstance(String line, FeatureList featureList) throws MaltChainedException {
  int y = -1;
  featureList.clear();
  try {
    String[] columns = tabPattern.split(line);

    if (columns.length == 0) {
      return -1;
    }

    try {
      y = Integer.parseInt(columns[0]);
    } catch (NumberFormatException e) {
      throw new LibException(
          "The instance file contains a non-integer value '" + columns[0] + "'", e);
    }
    for (int j = 1; j < columns.length; j++) {
      final String[] items = pipePattern.split(columns[j]);
      for (int k = 0; k < items.length; k++) {
        try {
          int colon = items[k].indexOf(':');
          if (colon == -1) {
            if (Integer.parseInt(items[k]) != -1) {
              int v = featureMap.addIndex(j, Integer.parseInt(items[k]));
              if (v != -1) {
                featureList.add(v, 1);
              }
            }
          } else {
            int index = featureMap.addIndex(j, Integer.parseInt(items[k].substring(0, colon)));
            double value;
            if (items[k].substring(colon + 1).indexOf('.') != -1) {
              value = Double.parseDouble(items[k].substring(colon + 1));
            } else {
              value = Integer.parseInt(items[k].substring(colon + 1));
            }
            featureList.add(index, value);
          }
        } catch (NumberFormatException e) {
          throw new LibException(
              "The instance file contains a non-numeric value '" + items[k] + "'", e);
        }
      }
    }
  } catch (ArrayIndexOutOfBoundsException e) {
    throw new LibException("Couldn't read from the instance file. ", e);
  }
  return y;
}

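// Illustrative only (format inferred from the parser above, not from real instance data):
// an instance line is "<int label>\t<column>\t<column>...", where each column holds
// '|'-separated items that are either a bare feature code or a "code:value" pair; a bare
// "-1" item is skipped.
String exampleLine = "3\t17\t4|9:0.5\t-1"; // hypothetical instance line
int exampleLabel = binariesInstance(exampleLine, new FeatureList()); // would return 3
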
public boolean train(DataIter trainData, boolean cachedLabels, boolean collectIds)
    throws Exception {
  // map the y-values in the training set.
  boolean labelsMapped = false;
  if (cachedLabels) {
    labelsMapped = stateMappings(trainData);
  }
  if (dict != null) dict.train(trainData, model.numStates());
  for (WordsInTrain d : otherDicts) {
    d.train(trainData, model.numStates());
  }
  boolean requiresTraining = false;
  for (int f = 0; f < features.size(); f++) {
    if (getFeature(f).requiresTraining()) {
      requiresTraining = true;
      break;
    }
  }
  if (requiresTraining) {
    for (trainData.startScan(); trainData.hasNext(); ) {
      DataSequence seq = trainData.next();
      for (int f = 0; f < features.size(); f++) {
        if (getFeature(f).requiresTraining()) {
          trainFeatureType(getFeature(f), seq);
        }
      }
    }
  }
  if (collectIds) {
    totalFeatures = featureMap.collectFeatureIdentifiers(trainData, maxMemory());
  }
  return labelsMapped;
}

public Corpus getCorpus(Object corpusID) {
  // load corpus from datastore using its persistent ID
  FeatureMap corpFeatures = Factory.newFeatureMap();
  corpFeatures.put(DataStore.LR_ID_FEATURE_NAME, corpusID);
  corpFeatures.put(DataStore.DATASTORE_FEATURE_NAME, ds);

  // tell the factory to load the Serial Corpus with the specified ID
  // from the specified datastore
  try {
    Corpus persistCorp =
        (Corpus) Factory.createResource("gate.corpora.SerialCorpusImpl", corpFeatures);
    if (DEBUG) Out.println("corpus loaded from datastore...");
    return persistCorp;
  } catch (ResourceInstantiationException e) {
    e.printStackTrace();
  }
  return null;
}

@Override
public SparseVector toSparseVector(StringVector features) {
  SparseVector x = new SparseVector();
  int index;

  // index 0 is reserved for the bias term, so only features with a positive index are added
  if (bias > 0) x.add(new SparseItem(0, bias));

  for (StringItem f : features) {
    index = feature_map.index(f.getType(), f.getValue());
    if (index > 0) x.add(index, f.getWeight());
  }

  x.sort();
  return x;
}

public boolean predict(FeatureVector featureVector, SingleDecision decision)
    throws MaltChainedException {
  if (featureVector == null) {
    throw new LibException(
        "The learner cannot predict the next class, because the feature vector cannot be found. ");
  }
  final FeatureList featureList = new FeatureList();
  final int size = featureVector.size();
  for (int i = 1; i <= size; i++) {
    final FeatureValue featureValue = featureVector.getFeatureValue(i - 1);
    if (featureValue != null && !(excludeNullValues && featureValue.isNullValue())) {
      if (featureValue instanceof SingleFeatureValue) {
        SingleFeatureValue singleFeatureValue = (SingleFeatureValue) featureValue;
        int index = featureMap.getIndex(i, singleFeatureValue.getIndexCode());
        if (index != -1 && singleFeatureValue.getValue() != 0) {
          featureList.add(index, singleFeatureValue.getValue());
        }
      } else if (featureValue instanceof MultipleFeatureValue) {
        for (Integer value : ((MultipleFeatureValue) featureValue).getCodes()) {
          int v = featureMap.getIndex(i, value);
          if (v != -1) {
            featureList.add(v, 1);
          }
        }
      }
    }
  }
  try {
    decision.getKBestList().addList(model.predict(featureList.toArray()));
    // decision.getKBestList().addList(prediction(featureList));
  } catch (OutOfMemoryError e) {
    throw new LibException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
  }
  return true;
}

protected void advance(boolean returnWithId) {
  while (true) {
    // move on to the next feature type that still has features to generate
    while (((currentFeatureType == null) || !currentFeatureType.hasNext())
        && featureIter.hasNext()) {
      currentFeatureType = featureIter.next();
    }
    if (!currentFeatureType.hasNext()) break;
    while (currentFeatureType.hasNext()) {
      featureToReturn.init();
      copyNextFeature(featureToReturn);

      featureToReturn.id = featureMap.getId(featureToReturn);

      // skip features that are not in the feature map or that fail the validity check
      if (featureToReturn.id < 0) {
        continue;
      }
      if (featureValid(data, cposStart, cposEnd, featureToReturn, model, _fixedTransitions)) {
        return;
      }
    }
  }
  // no feature left to return
  featureToReturn.id = -1;
}

public String getFeatureName(int i) {
  return featureMap.getName(i);
}

public void write(String fileName) throws IOException {
  PrintWriter out = new PrintWriter(new FileOutputStream(fileName));
  if (dict != null) dict.write(out);
  featureMap.write(out);
  out.close();
}

public void printFeatureMap(PrintWriter out) throws IOException {
  featureMap.write(out);
}

public int featureIndex(FeatureIdentifier fId) {
  return featureMap.getIndex(fId);
}

public void read(String fileName) throws IOException {
  BufferedReader in = new BufferedReader(new FileReader(fileName));
  if (dict != null) dict.read(in, model.numStates());
  totalFeatures = featureMap.read(in);
  in.close();
}

public FeatureIdentifier featureIdentifier(int id) {
  return featureMap.getIdentifier(id);
}

public String featureName(int featureIndex) {
  return featureMap.getName(featureIndex);
}

public void collectAndFreezeFeatures(DataIter trainData) throws Exception {
  featureMap.collectFeatureIdentifiers(trainData, 1);
}

public void freezeFeatures() {
  if (featureCollectMode) featureMap.freezeFeatures();
}

@Override
public int getFeatureSize() {
  return feature_map.size();
}

@Override
public void addFeatures(StringVector features) {
  for (StringItem f : features) feature_map.add(f.getType(), f.getValue());
}

/**
 * Run from the command-line, with a list of URLs as argument.
 *
 * <p><B>NOTE:</B><br>
 * This code will run with all the documents in memory - if you want to unload each from memory
 * after use, add code to store the corpus in a DataStore.
 */
public static void main(String args[]) throws GateException, IOException {

  // initialise the GATE library
  Out.prln("Initialising GATE...");
  Gate.init();
  Out.prln("...GATE initialised");

  // initialise ANNIE (this may take several minutes)
  StandAloneAnnie annie = new StandAloneAnnie();
  annie.initAnnie();

  // create a GATE corpus and add a document for each command-line
  // argument
  Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
  for (int i = 0; i < args.length; i++) {
    URL u = new URL(args[i]);
    FeatureMap params = Factory.newFeatureMap();
    params.put("sourceUrl", u);
    params.put("preserveOriginalContent", Boolean.TRUE);
    params.put("collectRepositioningInfo", Boolean.TRUE);
    Out.prln("Creating doc for " + u);
    Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
    corpus.add(doc);
  } // for each of args

  // tell the pipeline about the corpus and run it
  annie.setCorpus(corpus);
  annie.execute();

  // for each document, get an XML document with the
  // person and location names added
  Iterator iter = corpus.iterator();
  int count = 0;
  String startTagPart_1 = "<span GateID=\"";
  String startTagPart_2 = "\" title=\"";
  String startTagPart_3 = "\" style=\"background:Red;\">";
  String endTag = "</span>";

  while (iter.hasNext()) {
    Document doc = (Document) iter.next();
    AnnotationSet defaultAnnotSet = doc.getAnnotations();
    Set annotTypesRequired = new HashSet();
    annotTypesRequired.add("Person");
    annotTypesRequired.add("Location");
    Set<Annotation> peopleAndPlaces =
        new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

    FeatureMap features = doc.getFeatures();
    String originalContent =
        (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    RepositioningInfo info =
        (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

    ++count;
    File file = new File("StANNIE_" + count + ".HTML");
    Out.prln("File name: '" + file.getAbsolutePath() + "'");

    if (originalContent != null && info != null) {
      Out.prln("OrigContent and reposInfo existing. Generate file...");

      Iterator it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

      while (it.hasNext()) {
        currAnnot = (Annotation) it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while

      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;

      // insert annotation tags backward
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionStart = info.getOriginalPos(insertPositionStart);
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for

      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } // if - should generate
    else if (originalContent != null) {
      Out.prln("OrigContent existing. Generate file...");

      Iterator it = peopleAndPlaces.iterator();
      Annotation currAnnot;
      SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

      while (it.hasNext()) {
        currAnnot = (Annotation) it.next();
        sortedAnnotations.addSortedExclusive(currAnnot);
      } // while

      StringBuffer editableContent = new StringBuffer(originalContent);
      long insertPositionEnd;
      long insertPositionStart;

      // insert annotation tags backward
      Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
      Out.prln("Sorted annotations count: " + sortedAnnotations.size());
      for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
        currAnnot = (Annotation) sortedAnnotations.get(i);
        insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
        insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
        if (insertPositionEnd != -1 && insertPositionStart != -1) {
          editableContent.insert((int) insertPositionEnd, endTag);
          editableContent.insert((int) insertPositionStart, startTagPart_3);
          editableContent.insert((int) insertPositionStart, currAnnot.getType());
          editableContent.insert((int) insertPositionStart, startTagPart_2);
          editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
          editableContent.insert((int) insertPositionStart, startTagPart_1);
        } // if
      } // for

      FileWriter writer = new FileWriter(file);
      writer.write(editableContent.toString());
      writer.close();
    } else {
      Out.prln("Content : " + originalContent);
      Out.prln("Repositioning: " + info);
    }

    String xmlDocument = doc.toXml(peopleAndPlaces, false);
    String fileName = "StANNIE_toXML_" + count + ".HTML";
    FileWriter writer = new FileWriter(fileName);
    writer.write(xmlDocument);
    writer.close();
  } // for each doc
} // main
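
// A hedged sketch (not part of the original example) of the DataStore variant suggested in the
// Javadoc NOTE above: store the corpus in a serial datastore so each document can be unloaded
// from memory after it has been processed. The storage directory is illustrative, error handling
// is omitted, and note that adopt(...) takes an extra SecurityInfo argument in older GATE
// versions.
DataStore sds =
    Factory.createDataStore(
        "gate.persist.SerialDataStore", new File("annie-store").toURI().toURL().toString());
Corpus persistentCorpus = (Corpus) sds.adopt(corpus); // corpus built in main() above
sds.sync(persistentCorpus);
// ... run ANNIE over persistentCorpus, then after handling each document:
//   persistentCorpus.unloadDocument(doc);
//   Factory.deleteResource(doc);
sds.close();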