示例#1
0
  /**
   * Saves the {@link BasisMapping} created from the {@link OccurrenceCounter}.
   *
   * <p>Every word in {@code sspace} is first registered with a fresh
   * {@link StringBasisMapping} so the serialized mapping covers the entire
   * vocabulary, then the mapping is written to {@code outputFile} using Java
   * serialization.
   *
   * @param sspace the semantic space whose vocabulary is recorded
   * @param outputFile the file the basis mapping is serialized to
   * @throws IOException if the mapping cannot be written to {@code outputFile}
   */
  protected void saveSSpace(SemanticSpace sspace, File outputFile) throws IOException {
    BasisMapping<String, String> savedTerms = new StringBasisMapping();
    for (String term : sspace.getWords()) savedTerms.getDimension(term);

    // try-with-resources guarantees the stream is flushed and closed even if
    // writeObject throws; the original leaked the stream on failure.
    try (ObjectOutputStream ouStream =
        new ObjectOutputStream(new FileOutputStream(outputFile))) {
      ouStream.writeObject(savedTerms);
    }
  }
示例#2
0
    /** {@inheritDoc} */
    public void processSpace(Properties properties) {
      // Apply the configured transform to the raw co-occurrence counts.
      SparseMatrix transformed = (SparseMatrix) transform.transform(cooccurrenceMatrix);

      // Score each known term as the sum of the non-zero entries in its
      // transformed co-occurrence row.
      for (String word : basis.keySet()) {
        SparseDoubleVector row = transformed.getRowVector(basis.getDimension(word));

        double total = 0;
        for (int nonZero : row.getNonZeroIndices()) total += row.get(nonZero);

        wordScores.put(word, total);
      }
    }
示例#3
0
    /**
     * {@inheritDoc}
     *
     * <p>Slides a window of up to {@code windowSize} tokens on either side of
     * each focus word and accumulates distance-weighted co-occurrence counts.
     * Counts are buffered in a document-local map and applied to the shared
     * matrix in a single pass at the end, which avoids write contention when
     * many documents are processed concurrently.
     */
    public void processDocument(BufferedReader document) throws IOException {
      Queue<String> nextWords = new ArrayDeque<String>();
      Queue<String> prevWords = new ArrayDeque<String>();

      Iterator<String> documentTokens = IteratorFactory.tokenizeOrdered(document);

      String focus = null;

      // Rather than updating the matrix every time an occurrence is seen,
      // keep a thread-local count of what needs to be modified in the
      // matrix and update after the document has been processed.  This
      // saves potential contention from concurrent writes.
      Map<Pair<Integer>, Double> matrixEntryToCount = new HashMap<Pair<Integer>, Double>();

      // Load the first windowSize words into the Queue
      for (int i = 0; i < windowSize && documentTokens.hasNext(); i++)
        nextWords.offer(documentTokens.next());

      while (!nextWords.isEmpty()) {
        // Load the top of the nextWords Queue into the focus word
        focus = nextWords.remove();

        // Add the next word to nextWords queue (if possible)
        if (documentTokens.hasNext()) nextWords.offer(documentTokens.next());

        // BUG FIX: the original condition was inverted -- it counted
        // co-occurrences only for tokens the filter had REJECTED
        // (EMPTY_TOKEN) and skipped every accepted word.  Process only
        // tokens accepted by the filter, matching the identical check in
        // countOccurrences.
        if (!focus.equals(IteratorFactory.EMPTY_TOKEN)) {
          int focusIndex = basis.getDimension(focus);

          // Words after the focus start at distance 1; words before it start
          // at -prevWords.size() (the oldest) and increase toward -1.
          countOccurrences(nextWords, focusIndex, 1, matrixEntryToCount);
          countOccurrences(prevWords, focusIndex, -prevWords.size(), matrixEntryToCount);
        }

        // last, put this focus word in the prev words and shift off the
        // front if it is larger than the window
        prevWords.offer(focus);
        if (prevWords.size() > windowSize) prevWords.remove();
      }

      // Once the document has been processed, update the co-occurrence
      // matrix accordingly.
      for (Map.Entry<Pair<Integer>, Double> e : matrixEntryToCount.entrySet()) {
        Pair<Integer> p = e.getKey();
        cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue());
      }
    }
示例#4
0
    /**
     * Adds an occurrence count for each term in {@code words} according to its
     * distance from the focus word.
     *
     * @param words the context words, ordered by increasing distance
     * @param focusIndex the basis dimension of the focus word
     * @param wordDistance the signed distance of the first word in {@code
     *     words} from the focus word
     * @param entryCounts the running (row, column) to count map to update
     */
    private void countOccurrences(
        Queue<String> words,
        int focusIndex,
        int wordDistance,
        Map<Pair<Integer>, Double> entryCounts) {
      for (String word : words) {
        // Filtered-out tokens contribute no count, but the distance still
        // advances so the remaining words keep their true offsets.
        if (!word.equals(IteratorFactory.EMPTY_TOKEN)) {
          // Weight this co-occurrence by its distance from the focus word
          // and fold it into any count already recorded for the pair.
          Pair<Integer> key = new Pair<Integer>(focusIndex, basis.getDimension(word));
          double weight = weighting.weight(wordDistance, windowSize);
          Double previous = entryCounts.get(key);
          entryCounts.put(key, (previous == null) ? weight : previous + weight);
        }
        wordDistance++;
      }
    }