/**
 * Saves the {@link BasisMapping} created from the {@link OccurrenceCounter}.
 *
 * @param sspace the semantic space whose words define the dimensions to save
 * @param outputFile the file to which the serialized mapping is written
 * @throws IOException if the mapping cannot be written to {@code outputFile}
 */
protected void saveSSpace(SemanticSpace sspace, File outputFile)
        throws IOException {
    BasisMapping<String, String> savedTerms = new StringBasisMapping();
    // Assign a dimension to every word in the space so the mapping is
    // fully populated before serialization.
    for (String term : sspace.getWords())
        savedTerms.getDimension(term);

    ObjectOutputStream ouStream =
        new ObjectOutputStream(new FileOutputStream(outputFile));
    // Guarantee the stream (and underlying file handle) is closed even
    // if writeObject throws; the original leaked it on failure.
    try {
        ouStream.writeObject(savedTerms);
    } finally {
        ouStream.close();
    }
}
/** {@inheritDoc} */
public void processSpace(Properties properties) {
    // Apply the configured transform to the raw co-occurrence counts.
    SparseMatrix transformed =
        (SparseMatrix) transform.transform(cooccurrenceMatrix);

    // Score each known term as the sum of its transformed
    // co-occurrence values across all non-zero columns.
    for (String word : basis.keySet()) {
        int row = basis.getDimension(word);
        SparseDoubleVector rowVector = transformed.getRowVector(row);
        double total = 0;
        for (int col : rowVector.getNonZeroIndices())
            total += rowVector.get(col);
        wordScores.put(word, total);
    }
}
/** {@inheritDoc} */ public void processDocument(BufferedReader document) throws IOException { Queue<String> nextWords = new ArrayDeque<String>(); Queue<String> prevWords = new ArrayDeque<String>(); Iterator<String> documentTokens = IteratorFactory.tokenizeOrdered(document); String focus = null; // Rather than updating the matrix every time an occurrence is seen, // keep a thread-local count of what needs to be modified in the // matrix and update after the document has been processed. This // saves potential contention from concurrent writes. Map<Pair<Integer>, Double> matrixEntryToCount = new HashMap<Pair<Integer>, Double>(); // Load the first windowSize words into the Queue for (int i = 0; i < windowSize && documentTokens.hasNext(); i++) nextWords.offer(documentTokens.next()); while (!nextWords.isEmpty()) { // Load the top of the nextWords Queue into the focus word focus = nextWords.remove(); // Add the next word to nextWords queue (if possible) if (documentTokens.hasNext()) nextWords.offer(documentTokens.next()); // If the filter does not accept this word, skip the semantic // processing, continue with the next word if (focus.equals(IteratorFactory.EMPTY_TOKEN)) { int focusIndex = basis.getDimension(focus); countOccurrences(nextWords, focusIndex, 1, matrixEntryToCount); countOccurrences(prevWords, focusIndex, -prevWords.size(), matrixEntryToCount); } // last, put this focus word in the prev words and shift off the // front if it is larger than the window prevWords.offer(focus); if (prevWords.size() > windowSize) prevWords.remove(); } // Once the document has been processed, update the co-occurrence // matrix accordingly. for (Map.Entry<Pair<Integer>, Double> e : matrixEntryToCount.entrySet()) { Pair<Integer> p = e.getKey(); cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue()); } }
/** * Adds a occurnce count for each term in {@code words} according to it's distance from the * focus word. */ private void countOccurrences( Queue<String> words, int focusIndex, int wordDistance, Map<Pair<Integer>, Double> entryCounts) { // Iterate through the words occurring after and add values for (String term : words) { // skip adding co-occurence values for words that are not // accepted by the filter if (!term.equals(IteratorFactory.EMPTY_TOKEN)) { int index = basis.getDimension(term); // Get the current number of times that the focus word has // co-occurred with this word appearing after it. Weight // the word appropriately based on distance Pair<Integer> p = new Pair<Integer>(focusIndex, index); double value = weighting.weight(wordDistance, windowSize); Double curCount = entryCounts.get(p); entryCounts.put(p, (curCount == null) ? value : value + curCount); } wordDistance++; } }