/**
 * Returns a string with the node's word plus its distance from the focus
 * word, with a hyphen between the two.
 *
 * @param node the dependency tree node whose word forms the feature
 * @param index the node's distance from the focus word
 * @return the feature string of the form {@code word-index}
 */
protected String getFeature(DependencyTreeNode node, int index) {
    return String.format("%s-%d", node.word(), index);
}
/** {@inheritDoc} */ public void processDocument(BufferedReader document) throws IOException { // Local maps to record occurrence counts. Map<Pair<String>, Double> localLemmaCounts = new HashMap<Pair<String>, Double>(); Map<RelationTuple, SparseDoubleVector> localTuples = new HashMap<RelationTuple, SparseDoubleVector>(); // Iterate over all of the parseable dependency parsed sentences in the // document. for (DependencyTreeNode[] nodes = null; (nodes = parser.readNextTree(document)) != null; ) { // Skip empty documents. if (nodes.length == 0) continue; // Examine the paths for each word in the sentence. for (int i = 0; i < nodes.length; ++i) { // Reject words that are not nouns, verbs, or adjectives. if (!(nodes[i].pos().startsWith("N") || nodes[i].pos().startsWith("J") || nodes[i].pos().startsWith("V"))) continue; String focusWord = nodes[i].word(); // Skip words that are rejected by the semantic filter. if (!acceptWord(focusWord)) continue; int focusIndex = termBasis.getDimension(focusWord); // Create the path iterator for all acceptable paths rooted at // the focus word in the sentence. Iterator<DependencyPath> pathIter = new FilteredDependencyIterator(nodes[i], acceptor, 1); while (pathIter.hasNext()) { DependencyPath path = pathIter.next(); DependencyTreeNode last = path.last(); // Reject words that are not nouns, verbs, or adjectives. if (!(last.pos().startsWith("N") || last.pos().startsWith("J") || last.pos().startsWith("V"))) continue; // Get the feature index for the co-occurring word. String otherTerm = last.word(); // Skip any filtered features. if (otherTerm.equals(EMPTY_STRING)) continue; int featureIndex = termBasis.getDimension(otherTerm); Pair<String> p = new Pair<String>(focusWord, otherTerm); Double curCount = localLemmaCounts.get(p); localLemmaCounts.put(p, (curCount == null) ? 1 : 1 + curCount); // Create a RelationTuple as a local key that records this // relation tuple occurrence. If there is not a local // relation vector, create it. 
Then add an occurrence count // of 1. DependencyRelation relation = path.iterator().next(); // Skip relations that do not have the focusWord as the // head word in the relation. The inverse relation will // eventually be encountered and we'll account for it then. if (!relation.headNode().word().equals(focusWord)) continue; RelationTuple relationKey = new RelationTuple(focusIndex, relation.relation().intern()); SparseDoubleVector relationVector = localTuples.get(relationKey); if (relationVector == null) { relationVector = new CompactSparseVector(); localTuples.put(relationKey, relationVector); } relationVector.add(featureIndex, 1); } } } document.close(); // Once the document has been processed, update the co-occurrence matrix // accordingly. for (Map.Entry<Pair<String>, Double> e : localLemmaCounts.entrySet()) { // Push the local co-occurrence counts to the larger mapping. Pair<String> p = e.getKey(); // Get the prefernce vectors for the current focus word. If they do // not exist, create it in a thread safe manner. SelectionalPreference preference = preferenceVectors.get(p.x); if (preference == null) { synchronized (this) { preference = preferenceVectors.get(p.x); if (preference == null) { preference = new SelectionalPreference(combinor); preferenceVectors.put(p.x, preference); } } } // Add the local count. synchronized (preference) { preference.lemmaVector.add(termBasis.getDimension(p.y), e.getValue()); } } // Push the relation tuple counts to the larger counts. for (Map.Entry<RelationTuple, SparseDoubleVector> r : localTuples.entrySet()) { // Get the global counts for this relation tuple. If it does not // exist, create a new one in a thread safe manner. 
SparseDoubleVector relationCounts = relationVectors.get(r.getKey()); if (relationCounts == null) { synchronized (this) { relationCounts = relationVectors.get(r.getKey()); if (relationCounts == null) { relationCounts = new CompactSparseVector(); relationVectors.put(r.getKey(), relationCounts); } } } // Update the counts. synchronized (relationCounts) { VectorMath.add(relationCounts, r.getValue()); } } }