/**
  * Builds the feature label for a node: the node's word, a hyphen, and then the node's distance
  * from the focus word.
  *
  * @param node the dependency tree node whose word starts the label
  * @param index the node's distance from the focus word
  * @return the label in the form {@code word-index}
  */
 protected String getFeature(DependencyTreeNode node, int index) {
   StringBuilder feature = new StringBuilder();
   feature.append(node.word()).append('-').append(index);
   return feature.toString();
 }
Пример #2
0
  /**
   * {@inheritDoc}
   *
   * <p>Reads every dependency tree that {@code parser} returns for {@code document}, accumulates
   * co-occurrence and relation counts in document-local maps, closes the reader, and then merges
   * the local counts into {@code preferenceVectors} and {@code relationVectors}.
   *
   * <p>The reader is closed even when parsing fails part-way through, so a failed document does
   * not leak the reader.  In that case none of the local counts are merged.
   *
   * @param document the reader supplying the dependency-parsed text; this method always closes it
   * @throws IOException if closing {@code document} fails, or if an I/O error is raised while the
   *     trees are being read
   */
  public void processDocument(BufferedReader document) throws IOException {
    // Local maps to record occurrence counts.  They are merged into the
    // shared structures only after the whole document has been read.
    Map<Pair<String>, Double> localLemmaCounts = new HashMap<Pair<String>, Double>();
    Map<RelationTuple, SparseDoubleVector> localTuples =
        new HashMap<RelationTuple, SparseDoubleVector>();

    try {
      // Iterate over all of the parseable dependency parsed sentences in
      // the document.
      DependencyTreeNode[] nodes;
      while ((nodes = parser.readNextTree(document)) != null) {

        // Skip empty sentences.
        if (nodes.length == 0) {
          continue;
        }

        // Examine the paths for each word in the sentence.
        for (int i = 0; i < nodes.length; ++i) {
          // Reject words that are not nouns, verbs, or adjectives.
          if (!isContentWordPos(nodes[i].pos())) {
            continue;
          }

          String focusWord = nodes[i].word();

          // Skip words that are rejected by the semantic filter.
          if (!acceptWord(focusWord)) {
            continue;
          }
          int focusIndex = termBasis.getDimension(focusWord);

          // Create the path iterator for all acceptable paths rooted at
          // the focus word in the sentence.
          // NOTE(review): the trailing 1 is presumably the maximum path
          // length -- confirm against FilteredDependencyIterator.
          Iterator<DependencyPath> pathIter =
              new FilteredDependencyIterator(nodes[i], acceptor, 1);

          while (pathIter.hasNext()) {
            DependencyPath path = pathIter.next();
            DependencyTreeNode last = path.last();

            // Reject words that are not nouns, verbs, or adjectives.
            if (!isContentWordPos(last.pos())) {
              continue;
            }

            // Get the feature index for the co-occurring word.
            String otherTerm = last.word();

            // Skip any filtered features.
            if (otherTerm.equals(EMPTY_STRING)) {
              continue;
            }

            int featureIndex = termBasis.getDimension(otherTerm);

            // Record one more co-occurrence of (focusWord, otherTerm).
            // This count is recorded before, and independently of, the
            // head-word check below.
            Pair<String> p = new Pair<String>(focusWord, otherTerm);
            Double curCount = localLemmaCounts.get(p);
            localLemmaCounts.put(p, (curCount == null) ? 1.0 : curCount + 1);

            // The first relation on the path is the one used to build
            // the RelationTuple key.
            DependencyRelation relation = path.iterator().next();

            // Skip relations that do not have the focusWord as the
            // head word in the relation.  The inverse relation will
            // eventually be encountered and we'll account for it then.
            if (!relation.headNode().word().equals(focusWord)) {
              continue;
            }

            // Create a RelationTuple as a local key that records this
            // relation tuple occurrence.  If there is not a local
            // relation vector, create it.  Then add an occurrence count
            // of 1.
            RelationTuple relationKey =
                new RelationTuple(focusIndex, relation.relation().intern());
            SparseDoubleVector relationVector = localTuples.get(relationKey);
            if (relationVector == null) {
              relationVector = new CompactSparseVector();
              localTuples.put(relationKey, relationVector);
            }
            relationVector.add(featureIndex, 1);
          }
        }
      }
    } finally {
      // Close the reader on both the normal and the failure path so that
      // an exception while parsing cannot leak it.
      document.close();
    }

    // Once the document has been processed, update the co-occurrence matrix
    // accordingly.
    for (Map.Entry<Pair<String>, Double> e : localLemmaCounts.entrySet()) {
      // Push the local co-occurrence counts to the larger mapping.
      Pair<String> p = e.getKey();

      // Get the preference vectors for the current focus word.  If they do
      // not exist, create them while holding the lock on this instance,
      // re-checking first so that only one thread creates the entry.
      // NOTE(review): the first lookup happens outside the lock; whether
      // that is safe depends on the map type of preferenceVectors, which is
      // declared elsewhere -- confirm.
      SelectionalPreference preference = preferenceVectors.get(p.x);
      if (preference == null) {
        synchronized (this) {
          preference = preferenceVectors.get(p.x);
          if (preference == null) {
            preference = new SelectionalPreference(combinor);
            preferenceVectors.put(p.x, preference);
          }
        }
      }
      // Add the local count.
      synchronized (preference) {
        preference.lemmaVector.add(termBasis.getDimension(p.y), e.getValue());
      }
    }

    // Push the relation tuple counts to the larger counts.
    for (Map.Entry<RelationTuple, SparseDoubleVector> r : localTuples.entrySet()) {
      // Get the global counts for this relation tuple.  If they do not
      // exist, create a new vector while holding the lock on this instance,
      // re-checking first so that only one thread creates the entry.
      SparseDoubleVector relationCounts = relationVectors.get(r.getKey());
      if (relationCounts == null) {
        synchronized (this) {
          relationCounts = relationVectors.get(r.getKey());
          if (relationCounts == null) {
            relationCounts = new CompactSparseVector();
            relationVectors.put(r.getKey(), relationCounts);
          }
        }
      }

      // Update the counts.
      synchronized (relationCounts) {
        VectorMath.add(relationCounts, r.getValue());
      }
    }
  }

  /**
   * Returns {@code true} when {@code pos} starts with "N", "J", or "V", the tag prefixes that
   * {@link #processDocument(BufferedReader)} accepts as nouns, adjectives, and verbs.
   */
  private boolean isContentWordPos(String pos) {
    return pos.startsWith("N") || pos.startsWith("J") || pos.startsWith("V");
  }