/**
 * Computes {@code 0.4 * (words / sentences + 100 * complexWords / words)} for a document, which
 * is the shape of the Gunning fog readability formula.
 *
 * <p>A "complex word" is an event produced by {@code syllablesDriver} whose text parses to an
 * integer of at least 3 (presumably a per-word syllable count, judging by the driver's name;
 * confirm against the driver). The events are only counted; the event set returned by the driver
 * is not modified.
 *
 * <p>The arithmetic is done in {@code double}, so a document with zero words or zero sentences
 * produces {@code NaN} or an infinite value rather than an exception.
 *
 * @param doc the document to score
 * @return the value of the formula above
 * @throws EventGenerationException propagated from the underlying counters and event driver
 * @throws NumberFormatException if an event's text is not a parsable integer
 */
public double getValue(Document doc) throws EventGenerationException {
   double wordCount = wordCounter.getValue(doc);
   double sentenceCount = sentenceCounter.getValue(doc);
   EventSet syllables = syllablesDriver.createEventSet(doc);
   // Count qualifying events in a single pass instead of deleting the non-qualifying ones:
   // removal by value inside an index loop is quadratic and mutates the driver's result.
   int complexWords = 0;
   for (int i = 0; i < syllables.size(); i++) {
     if (Integer.parseInt(syllables.eventAt(i).toString()) >= 3) {
       complexWords++;
     }
   }
   // Widen to double before the multiplication so the expression stays floating-point throughout.
   double complexWordsCount = complexWords;
   return 0.4 * (wordCount / sentenceCount + 100 * complexWordsCount / wordCount);
 }
Beispiel #2
0
 /**
  * Feeds every position of the given event set into the dictionary.
  *
  * <p>For each index the event at that index is looked up in the root level: if it is already
  * there the sequence is inserted below the root, otherwise it is inserted at the root. Once all
  * positions have been processed the root's {@code key} is set to {@code null}.
  *
  * @param e the event sequence to insert
  */
 public void build(EventSet e) {
   int position = 0;
   while (position < e.size()) {
     Event head = e.eventAt(position);
     if (root.isEventInLevel(head)) {
       insertBelowRoot(head, e, position);
     } else {
       insertAtRoot(head, e, position);
     }
     position++;
   }
   root.key = null;
 }
Beispiel #3
0
 /**
  * Adds {@code start} to the root level and then chains the events that follow {@code offset}
  * beneath it, one level per event.
  *
  * @param start the event to add to the root level
  * @param e the event sequence being inserted
  * @param offset index in {@code e} from which the chain is built
  */
 private void insertAtRoot(Event start, EventSet e, int offset) {
   root.addEventToLevel(start);
   XEDictionaryNode cursor = root;
   for (int idx = offset; idx < e.size() - 1; idx++) {
     // step into the node for the current event, then register its successor there
     cursor = cursor.get(e.eventAt(idx));
     cursor.addEventToLevel(e.eventAt(idx + 1));
   }
 }
Beispiel #4
0
 /**
  * Inserts the sequence beginning at {@code offset} when its first event is already present in
  * the root level.
  *
  * <p>The method steps into the root's node for the event at {@code offset}, keeps descending
  * while each following event is already present at the current level, and then adds the events
  * that were not matched, one level per event. If every following event was already present
  * (including the case where {@code offset} is the last index) nothing is added.
  *
  * @param start the event at {@code offset}; not read here, the event is taken from {@code e}
  * @param e the event sequence being inserted
  * @param offset index in {@code e} of the event that already exists in the root level
  */
 private void insertBelowRoot(Event start, EventSet e, int offset) {
   XEDictionaryNode node = root.get(e.eventAt(offset));

   // Walk down the part of the sequence that the dictionary already contains. Afterwards
   // `next` is the index of the first event that is NOT yet present, or e.size() if the
   // whole remainder matched.
   int next = offset + 1;
   while (next < e.size() && node.isEventInLevel(e.eventAt(next))) {
     node = node.get(e.eventAt(next));
     next++;
   }

   // Append only the unmatched tail. Starting from `next` (rather than from the last matched
   // index) avoids re-adding the final matched event as a child of its own node.
   for (int i = next; i < e.size(); i++) {
     node.addEventToLevel(e.eventAt(i));
     node = node.get(e.eventAt(i));
   }
 }
Beispiel #5
0
  /**
   * Averages the dictionary match length over every window of {@code e2}.
   *
   * <p>A dictionary is built from the first window of {@code e1}; each window of {@code e2} is
   * then looked up in it and the returned match lengths are averaged. The window size is capped
   * at {@code e1.size()}. When {@code e2} is shorter than the window no lookup takes place and
   * the division {@code 0.0 / 0} yields {@code NaN}.
   *
   * @param e1 event set whose leading window seeds the dictionary
   * @param e2 event set whose windows are looked up
   * @param windowSize requested window length
   * @return the mean of the match lengths, or {@code NaN} if there were no windows
   */
  private double meanEntropy(EventSet e1, EventSet e2, int windowSize) {
    // never use a window longer than the first event set
    int span = Math.min(windowSize, e1.size());

    XEDictionary xed = new XEDictionary();
    xed.build(window(e1, 0, span));

    double matchLengthSum = 0;
    int start = 0;
    while (start <= e2.size() - span) {
      matchLengthSum += xed.find(window(e2, start, span));
      start++;
    }

    // `start` now equals the number of windows that were looked up
    return matchLengthSum / start;
  }
Beispiel #6
0
 /**
  * Follows the given events down from the root for as long as each one is present at the
  * current level.
  *
  * @param e the event sequence to look up
  * @return how many leading events of {@code e} were matched; {@code e.size()} if all were
  */
 public int find(EventSet e) {
   XEDictionaryNode cursor = root;
   int depth = 0;
   // stop at the first event that is missing from the current level, or at the end of e
   while (depth < e.size() && cursor.isEventInLevel(e.eventAt(depth))) {
     cursor = cursor.get(e.eventAt(depth));
     depth++;
   }
   return depth;
 }
  /**
   * Culls the test set using the relevant Events extracted from the training data.<br>
   *
   * <p>The last element of {@code eventSetsToCull} is treated as metadata: it is detached before
   * culling and appended again afterwards, both to {@code eventSetsToCull} and to the returned
   * list. The remaining elements are processed by index {@code i}:
   *
   * <ul>
   *   <li>if {@code cfd.featureDriverAt(i).isCalcHist()} is true, every event of the test set
   *       that does not equal some event of {@code relevantEvents.get(i)} is removed from that
   *       test set in place, and the test set is added to the result;
   *   <li>otherwise the test set is added to the result unchanged, but only if some element of
   *       {@code relevantEvents} has the same event set ID.
   * </ul>
   *
   * @param relevantEvents the features from the EventSets which are going to be evaluated
   * @param eventSetsToCull The test documents to be culled; its event sets are modified in place
   * @param cfd the feature drivers, indexed in parallel with {@code eventSetsToCull}
   * @return the culled test documents, followed by the metadata event set
   * @throws Exception kept in the signature for compatibility with existing callers
   */
  public List<EventSet> cullWithRespectToTraining(
      List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd)
      throws Exception {
    int numOfFeatureClasses = eventSetsToCull.size() - 1; // -1 to compensate for removing metadata
    List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>();

    // remove the metadata prior to culling
    EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1);

    // make sure all unknown sets would have only events that appear in the
    // known sets
    // UNLESS the event set contains a sole numeric value event - in that
    // case take it anyway
    for (int i = 0; i < numOfFeatureClasses; i++) {
      boolean histogram = cfd.featureDriverAt(i).isCalcHist();
      EventSet unknown = eventSetsToCull.get(i);

      if (histogram) {
        EventSet relevantSet = relevantEvents.get(i);

        // hasNext() is tested before every next(), so an empty set is skipped and the final
        // event is examined like any other.
        Iterator<Event> iterator = unknown.iterator();
        while (iterator.hasNext()) {
          Event candidate = iterator.next();

          // compare against the whole relevant set, independent of the test set's size
          boolean keep = false;
          for (Event known : relevantSet) {
            if (candidate.equals(known)) {
              keep = true;
              break;
            }
          }

          // remove the feature if it isn't relevant
          if (!keep) {
            iterator.remove();
          }
        }

        // add the culled event set
        culledUnknownEventSets.add(unknown);

      } else { // one unique numeric event
        // add non-histogram if it is in the relevantEventSets list
        boolean isRelevant = false;

        for (EventSet res : relevantEvents) {
          if (res.getEventSetID().equals(unknown.getEventSetID())) {
            isRelevant = true;
            break;
          }
        }

        if (isRelevant) {
          culledUnknownEventSets.add(unknown);
        }
      }
    }

    // put the metadata back on the input list and append it to the result
    eventSetsToCull.add(metadata);
    culledUnknownEventSets.add(metadata);

    return culledUnknownEventSets;
  }