/**
 * Loads a document from the datastore referenced by {@code ds}.
 *
 * <p>A feature map carrying the persistent ID and the datastore is handed to the GATE factory,
 * which is asked to instantiate {@code gate.corpora.DocumentImpl}.
 *
 * @param docID value stored under {@code DataStore.LR_ID_FEATURE_NAME}
 * @return the resource produced by the factory, cast to {@code Document}
 * @throws ResourceInstantiationException propagated from {@code Factory.createResource}
 */
public Document getDocument(Object docID) throws ResourceInstantiationException {
    final FeatureMap loadParams = Factory.newFeatureMap();
    loadParams.put(DataStore.LR_ID_FEATURE_NAME, docID);
    loadParams.put(DataStore.DATASTORE_FEATURE_NAME, ds);

    final Document loaded =
        (Document) Factory.createResource("gate.corpora.DocumentImpl", loadParams);
    return loaded;
  }
Пример #2
0
  /**
   * Drops features whose weights barely differ across labels and compacts the model.
   *
   * <p>For every feature index except 0 (the bias slot), the largest and smallest weight over all
   * labels are compared; the feature is kept only when their difference reaches {@code threshold}.
   * Kept features are renumbered consecutively starting at 1, their weights are copied into a new
   * weight vector, and the entries of {@code feature_map} are rewritten (or removed) in place.
   *
   * @param threshold minimum spread between a feature's largest and smallest label weight for
   *     the feature to be retained
   * @return the new feature count, including the bias slot
   */
  public int shrink(float threshold) {
    // indexMap[old] == new index of a retained feature; 0 means "dropped"
    int[] indexMap = new int[getFeatureSize()];
    int i, j, k, l, count = 1; // bias
    float max, min;

    // pass 1: decide which features survive (index 0 is skipped; it is the bias)
    for (i = 1; i < getFeatureSize(); i++) {
      // weights are addressed as featureIndex * labelSize + labelIndex
      k = i * getLabelSize();
      max = weight_vector.get(k);
      min = weight_vector.get(k);

      for (j = 1; j < getLabelSize(); j++) {
        max = Math.max(max, weight_vector.get(k + j));
        min = Math.min(min, weight_vector.get(k + j));
      }

      if (Math.abs(max - min) >= threshold) indexMap[i] = count++;
    }

    // pass 2: build the compacted weight vector sized for the surviving features
    WeightVector newWeights =
        new WeightVectorDynamic(
            weight_vector.getLabelSize(), count, weight_vector.getActivationFunction());
    ObjectIterator<Entry<String, Integer>> it;
    int oldIndex, newIndex;
    Entry<String, Integer> e;

    // bias weights
    for (j = 0; j < getLabelSize(); j++) newWeights.set(j, weight_vector.get(j));

    for (Object2IntMap<String> map : feature_map.getIndexMaps()) {
      it = map.entrySet().iterator();

      while (it.hasNext()) {
        e = it.next();
        oldIndex = e.getValue();
        newIndex = indexMap[oldIndex];

        if (newIndex > 0) {
          // retained: point the entry at its new index and move its per-label weights
          e.setValue(newIndex);
          k = oldIndex * getLabelSize();
          l = newIndex * getLabelSize();

          for (j = 0; j < getLabelSize(); j++) newWeights.set(l + j, weight_vector.get(k + j));
        } else it.remove(); // dropped: remove through the iterator while iterating
      }
    }

    weight_vector = newWeights;
    feature_map.setSize(count);
    return count;
  }
Пример #3
0
  /**
   * Parses one line of the instance file into a class label and a feature list.
   *
   * <p>The line is split with {@code tabPattern}; the first column is the integer label and each
   * further column is split with {@code pipePattern} into items. An item is either a bare integer
   * code (skipped when it is -1, added with value 1 otherwise) or {@code code:value}, where the
   * value is parsed as a double when it contains a '.' and as an integer otherwise.
   *
   * @param line the raw line to parse
   * @param featureList cleared first, then filled with the parsed features
   * @return the label from the first column, or -1 when the split yields no columns
   * @throws MaltChainedException (as {@code LibException}) when a field is not numeric or an
   *     array index is out of bounds while reading
   */
  protected int binariesInstance(String line, FeatureList featureList) throws MaltChainedException {
    int label = -1;
    featureList.clear();
    try {
      final String[] columns = tabPattern.split(line);
      if (columns.length == 0) {
        return -1;
      }

      try {
        label = Integer.parseInt(columns[0]);
      } catch (NumberFormatException e) {
        throw new LibException(
            "The instance file contain a non-integer value '" + columns[0] + "'", e);
      }

      for (int col = 1; col < columns.length; col++) {
        for (final String item : pipePattern.split(columns[col])) {
          try {
            final int colon = item.indexOf(':');
            if (colon == -1) {
              // bare code: binary feature with implicit value 1
              final int code = Integer.parseInt(item);
              if (code != -1) {
                final int v = featureMap.addIndex(col, code);
                if (v != -1) {
                  featureList.add(v, 1);
                }
              }
            } else {
              // code:value pair; the index is registered before the value is parsed
              final int index = featureMap.addIndex(col, Integer.parseInt(item.substring(0, colon)));
              final String rawValue = item.substring(colon + 1);
              final double value =
                  rawValue.indexOf('.') != -1
                      ? Double.parseDouble(rawValue)
                      : Integer.parseInt(rawValue);
              featureList.add(index, value);
            }
          } catch (NumberFormatException e) {
            throw new LibException(
                "The instance file contain a non-numeric value '" + item + "'", e);
          }
        }
      }
    } catch (ArrayIndexOutOfBoundsException e) {
      throw new LibException("Couln't read from the instance file. ", e);
    }
    return label;
  }
Пример #4
0
 /**
  * Prepares the feature generator from the training data.
  *
  * <p>In order: optionally maps labels via {@code stateMappings}, trains {@code dict} (when
  * present) and every entry of {@code otherDicts}, runs one scan over the data for the feature
  * types that report {@code requiresTraining()}, and optionally collects feature identifiers.
  *
  * @param trainData the training sequences
  * @param cachedLabels when true, {@code stateMappings(trainData)} is invoked
  * @param collectIds when true, feature identifiers are collected and {@code totalFeatures} set
  * @return the result of {@code stateMappings} when {@code cachedLabels} is true, else false
  * @throws Exception propagated from the training helpers
  */
 public boolean train(DataIter trainData, boolean cachedLabels, boolean collectIds)
     throws Exception {
   // stateMappings is only invoked when cached labels were requested
   final boolean labelsMapped = cachedLabels && stateMappings(trainData);

   if (dict != null) {
     dict.train(trainData, model.numStates());
   }
   for (WordsInTrain extraDict : otherDicts) {
     extraDict.train(trainData, model.numStates());
   }

   // stop probing as soon as one feature type asks for training
   boolean anyTrainable = false;
   for (int f = 0; !anyTrainable && f < features.size(); f++) {
     anyTrainable = getFeature(f).requiresTraining();
   }

   if (anyTrainable) {
     trainData.startScan();
     while (trainData.hasNext()) {
       final DataSequence sequence = trainData.next();
       for (int f = 0; f < features.size(); f++) {
         if (getFeature(f).requiresTraining()) {
           trainFeatureType(getFeature(f), sequence);
         }
       }
     }
   }

   if (collectIds) {
     totalFeatures = featureMap.collectFeatureIdentifiers(trainData, maxMemory());
   }
   return labelsMapped;
 }
  /**
   * Loads a corpus from the datastore referenced by {@code ds} using its persistent ID.
   *
   * <p>The factory is asked to create a {@code gate.corpora.SerialCorpusImpl} from a feature map
   * holding the ID and the datastore. An instantiation failure is not propagated: its stack trace
   * is printed and {@code null} is returned.
   *
   * @param corpusID value stored under {@code DataStore.LR_ID_FEATURE_NAME}
   * @return the loaded corpus, or {@code null} when instantiation failed
   */
  public Corpus getCorpus(Object corpusID) {
    final FeatureMap loadParams = Factory.newFeatureMap();
    loadParams.put(DataStore.LR_ID_FEATURE_NAME, corpusID);
    loadParams.put(DataStore.DATASTORE_FEATURE_NAME, ds);

    Corpus loaded = null;
    try {
      loaded = (Corpus) Factory.createResource("gate.corpora.SerialCorpusImpl", loadParams);
      if (DEBUG) {
        Out.println("corpus loaded from datastore...");
      }
    } catch (ResourceInstantiationException failure) {
      // callers see a null result instead of the exception
      failure.printStackTrace();
    }
    return loaded;
  }
Пример #6
0
  /**
   * Converts string features into a sorted sparse vector.
   *
   * <p>When {@code bias} is positive an item with index 0 and value {@code bias} is added first.
   * Each feature is looked up in {@code feature_map}; only lookups that yield a positive index
   * contribute, using the feature's weight as value.
   *
   * @param features the string features to convert
   * @return the resulting vector after {@code sort()} has been called on it
   */
  @Override
  public SparseVector toSparseVector(StringVector features) {
    final SparseVector vector = new SparseVector();

    if (bias > 0) {
      vector.add(new SparseItem(0, bias));
    }

    for (StringItem item : features) {
      final int slot = feature_map.index(item.getType(), item.getValue());
      if (slot > 0) {
        vector.add(slot, item.getWeight());
      }
    }

    vector.sort();
    return vector;
  }
Пример #7
0
  /**
   * Predicts the next class for a feature vector and stores the result in the decision.
   *
   * <p>Each feature value is translated through {@code featureMap} using its 1-based position in
   * the vector. Single values contribute their own value (when the index is known and the value
   * is non-zero); multiple values contribute one entry of value 1 per known code. The model's
   * prediction for the collected list is added to the decision's k-best list.
   *
   * @param featureVector the features describing the current state; must not be null
   * @param decision receives the prediction through {@code getKBestList().addList(...)}
   * @return always {@code true}
   * @throws MaltChainedException (as {@code LibException}) when the feature vector is null or the
   *     prediction runs out of memory
   */
  public boolean predict(FeatureVector featureVector, SingleDecision decision)
      throws MaltChainedException {
    if (featureVector == null) {
      throw new LibException(
          "The learner cannot predict the next class, because the feature vector cannot be found. ");
    }

    final FeatureList collected = new FeatureList();
    final int total = featureVector.size();
    for (int pos = 0; pos < total; pos++) {
      // the feature map is addressed with 1-based positions
      final int column = pos + 1;
      final FeatureValue current = featureVector.getFeatureValue(pos);
      if (current == null) {
        continue;
      }
      if (excludeNullValues && current.isNullValue()) {
        continue;
      }

      if (current instanceof SingleFeatureValue) {
        final SingleFeatureValue single = (SingleFeatureValue) current;
        final int index = featureMap.getIndex(column, single.getIndexCode());
        if (index != -1 && single.getValue() != 0) {
          collected.add(index, single.getValue());
        }
      } else if (current instanceof MultipleFeatureValue) {
        for (Integer code : ((MultipleFeatureValue) current).getCodes()) {
          final int index = featureMap.getIndex(column, code);
          if (index != -1) {
            collected.add(index, 1);
          }
        }
      }
    }

    try {
      decision.getKBestList().addList(model.predict(collected.toArray()));
    } catch (OutOfMemoryError e) {
      throw new LibException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
    }
    return true;
  }
Пример #8
0
  /**
   * Moves on to the next usable feature and leaves it in {@code featureToReturn}.
   *
   * <p>Feature types are taken from {@code featureIter} one after another. Every feature a type
   * produces is copied into {@code featureToReturn} and given an id from {@code featureMap};
   * features with a negative id or rejected by {@code featureValid} are skipped. When nothing is
   * left, {@code featureToReturn.id} is set to -1.
   *
   * <p>Unlike the earlier version, this no longer throws a {@code NullPointerException} when
   * {@code currentFeatureType} has not been assigned yet and {@code featureIter} has no elements;
   * that situation is now treated as "no more features".
   *
   * @param returnWithId not read by this method
   */
  protected void advance(boolean returnWithId) {
    while (true) {
      // move past exhausted (or not yet assigned) feature types
      while ((currentFeatureType == null || !currentFeatureType.hasNext())
          && featureIter.hasNext()) {
        currentFeatureType = featureIter.next();
      }
      // null here means featureIter never yielded a feature type at all
      if (currentFeatureType == null || !currentFeatureType.hasNext()) {
        break;
      }
      while (currentFeatureType.hasNext()) {
        featureToReturn.init();
        copyNextFeature(featureToReturn);

        featureToReturn.id = featureMap.getId(featureToReturn);

        if (featureToReturn.id < 0) {
          continue;
        }
        if (featureValid(data, cposStart, cposEnd, featureToReturn, model, _fixedTransitions)) {
          return;
        }
      }
    }
    // nothing left: mark the feature as invalid for the caller
    featureToReturn.id = -1;
  }
Пример #9
0
 /**
  * Looks up the name of a feature.
  *
  * @param i the index passed on to {@code featureMap.getName}
  * @return whatever {@code featureMap.getName(i)} yields
  */
 public String getFeatureName(int i) {
   final String name = featureMap.getName(i);
   return name;
 }
Пример #10
0
 /**
  * Writes the dictionary (when present) followed by the feature map to a file.
  *
  * <p>The writer is closed in a {@code finally} block so the file handle is released even when
  * one of the write calls throws.
  *
  * @param fileName path of the file to create or overwrite
  * @throws IOException when the file cannot be opened, or propagated from the write calls
  */
 public void write(String fileName) throws IOException {
   PrintWriter out = new PrintWriter(new FileOutputStream(fileName));
   try {
     if (dict != null) {
       dict.write(out);
     }
     featureMap.write(out);
   } finally {
     out.close();
   }
 }
Пример #11
0
 /**
  * Writes the feature map to the given writer by delegating to {@code featureMap.write}.
  *
  * @param out destination writer; it is neither flushed nor closed here
  * @throws IOException propagated from {@code featureMap.write}
  */
 public void printFeatureMap(PrintWriter out) throws IOException {
   final PrintWriter target = out;
   featureMap.write(target);
 }
Пример #12
0
 /**
  * Looks up the index of a feature identifier.
  *
  * @param fId the identifier passed on to {@code featureMap.getIndex}
  * @return whatever {@code featureMap.getIndex(fId)} yields
  */
 public int featureIndex(FeatureIdentifier fId) {
   final int index = featureMap.getIndex(fId);
   return index;
 }
Пример #13
0
 /**
  * Reads the dictionary (when present) and then the feature map from a file.
  *
  * <p>{@code totalFeatures} is set to the value returned by {@code featureMap.read}. The reader
  * is closed in a {@code finally} block; previously it was never closed.
  * NOTE(review): assumes neither {@code dict} nor {@code featureMap} keeps the reader for later
  * use — confirm against their implementations.
  *
  * @param fileName path of the file to read
  * @throws IOException when the file cannot be opened, read, or closed
  */
 public void read(String fileName) throws IOException {
   BufferedReader in = new BufferedReader(new FileReader(fileName));
   try {
     if (dict != null) {
       dict.read(in, model.numStates());
     }
     totalFeatures = featureMap.read(in);
   } finally {
     in.close();
   }
 }
Пример #14
0
 /**
  * Looks up the feature identifier for an id.
  *
  * @param id the id passed on to {@code featureMap.getIdentifier}
  * @return whatever {@code featureMap.getIdentifier(id)} yields
  */
 public FeatureIdentifier featureIdentifier(int id) {
   final FeatureIdentifier identifier = featureMap.getIdentifier(id);
   return identifier;
 }
Пример #15
0
 /**
  * Looks up the name of a feature by its index.
  *
  * @param featureIndex the index passed on to {@code featureMap.getName}
  * @return whatever {@code featureMap.getName(featureIndex)} yields
  */
 public String featureName(int featureIndex) {
   final String name = featureMap.getName(featureIndex);
   return name;
 }
Пример #16
0
 /**
  * Collects feature identifiers from the training data.
  *
  * <p>Delegates to {@code featureMap.collectFeatureIdentifiers} with a fixed second argument of 1
  * and discards the return value.
  *
  * @param trainData the training sequences handed to the feature map
  * @throws Exception propagated from {@code featureMap.collectFeatureIdentifiers}
  */
 public void collectAndFreezeFeatures(DataIter trainData) throws Exception {
   final DataIter source = trainData;
   featureMap.collectFeatureIdentifiers(source, 1);
 }
Пример #17
0
 /**
  * Freezes the feature map, but only while {@code featureCollectMode} is set; otherwise this is
  * a no-op.
  */
 public void freezeFeatures() {
   if (!featureCollectMode) {
     return;
   }
   featureMap.freezeFeatures();
 }
Пример #18
0
 /**
  * Reports the number of features.
  *
  * @return whatever {@code feature_map.size()} yields
  */
 @Override
 public int getFeatureSize() {
   final int size = feature_map.size();
   return size;
 }
Пример #19
0
 /**
  * Registers every item of the given vector with {@code feature_map}, keyed by the item's type
  * and value.
  *
  * @param features the string features to add
  */
 @Override
 public void addFeatures(StringVector features) {
   for (StringItem item : features) {
     feature_map.add(item.getType(), item.getValue());
   }
 }
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p>For every URL a GATE document is created and added to a corpus, the ANNIE pipeline is run
   * over the corpus, and two files per document are written to the working directory:
   * {@code StANNIE_<n>.HTML} (the original content with {@code <span>} tags around Person and
   * Location annotations, only when the original content was preserved) and
   * {@code StANNIE_toXML_<n>.HTML} (the result of {@code doc.toXml}).
   *
   * <p>File writers are closed in {@code finally} blocks so that a failed write does not leak the
   * file handle. Files are written with the platform default charset, as before.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   *
   * @param args the URLs of the documents to process
   * @throws GateException propagated from GATE initialisation, resource creation or execution
   * @throws IOException when a URL is malformed or an output file cannot be written
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", Boolean.TRUE);
      params.put("collectRepositioningInfo", Boolean.TRUE);
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator<?> iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set<String> annotTypesRequired = new HashSet<String>();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null) {
        // With repositioning info, annotation offsets are mapped back to positions in the
        // original content; without it they are used as they are.
        final boolean reposition = info != null;
        Out.prln(
            reposition
                ? "OrigContent and reposInfo existing. Generate file..."
                : "OrigContent existing. Generate file...");

        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
        for (Annotation annot : peopleAndPlaces) {
          sortedAnnotations.addSortedExclusive(annot);
        }

        StringBuilder editableContent = new StringBuilder(originalContent);
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        // insert annotation tags backward (last list entry first)
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          Annotation currAnnot = (Annotation) sortedAnnotations.get(i);
          long insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          long insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (reposition) {
            insertPositionStart = info.getOriginalPos(insertPositionStart);
            insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          }
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            String startTag =
                startTagPart_1
                    + currAnnot.getId().toString()
                    + startTagPart_2
                    + currAnnot.getType()
                    + startTagPart_3;
            // the end tag goes in first so the start position is not shifted by it
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTag);
          } // if
        } // for

        FileWriter htmlWriter = new FileWriter(file);
        try {
          htmlWriter.write(editableContent.toString());
        } finally {
          htmlWriter.close();
        }
      } // if - should generate
      else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = "StANNIE_toXML_" + count + ".HTML";
      FileWriter writer = new FileWriter(fileName);
      try {
        writer.write(xmlDocument);
      } finally {
        writer.close();
      }
    } // for each doc
  } // main