/**
 * Computes the Gunning Fog readability index for the given document:
 * 0.4 * (words per sentence + 100 * complexWords / words).
 * "Complex" words are those whose syllable count is 3 or greater.
 *
 * @param doc the document to score
 * @return the Gunning Fog index value
 * @throws EventGenerationException if any underlying driver fails
 */
public double getValue(Document doc) throws EventGenerationException {
    double wordCount = wordCounter.getValue(doc);
    double sentenceCount = sentenceCounter.getValue(doc);
    EventSet syllables = syllablesDriver.createEventSet(doc);
    // Walk backwards so removals do not disturb indices still to be visited;
    // events hold the per-word syllable count as a numeric string.
    for (int idx = syllables.size() - 1; idx >= 0; idx--) {
        if (Integer.parseInt(syllables.eventAt(idx).toString()) < 3) {
            syllables.removeEvent(syllables.eventAt(idx));
        }
    }
    // Only the complex (>= 3 syllable) words remain at this point.
    double complexWordsCount = syllables.size();
    return 0.4 * (wordCount / sentenceCount + 100 * complexWordsCount / wordCount);
}
/**
 * Builds the dictionary tree from the given event sequence: every suffix
 * of {@code e} is inserted, branching at the root depending on whether the
 * suffix's first event has already been seen at the top level.
 *
 * @param e the event sequence to index
 */
public void build(EventSet e) {
    for (int pos = 0; pos < e.size(); pos++) {
        if (root.isEventInLevel(e.eventAt(pos))) {
            // first event already known at the root: descend and extend
            insertBelowRoot(e.eventAt(pos), e, pos);
        } else {
            // unseen first event: start a fresh branch at the root
            insertAtRoot(e.eventAt(pos), e, pos);
        }
    }
    root.key = null;
}
private void insertAtRoot(Event start, EventSet e, int offset) { root.addEventToLevel(start); XEDictionaryNode node; node = root; int j = offset; while (j < e.size() - 1) { node = node.get(e.eventAt(j)); j++; // System.out.println("Adding Event: " + e.eventAt(j)); node.addEventToLevel(e.eventAt(j)); } }
private void insertBelowRoot(Event start, EventSet e, int offset) { XEDictionaryNode node; node = root; // System.out.println("Event at offset: " + e.eventAt(offset)); node = node.get(e.eventAt(offset)); int j = offset; boolean matches = true; // match the events up to a given level while (matches && (j < e.size() - 1)) { j++; if (node.isEventInLevel(e.eventAt(j))) { // System.out.println("Match at level: " + e.eventAt(j)); node = node.get(e.eventAt(j)); } else { matches = false; } } for (int i = j; i < e.size(); i++) { // System.out.println("Adding Event: " + e.eventAt(i)); node.addEventToLevel(e.eventAt(i)); node = node.get(e.eventAt(i)); } }
private double meanEntropy(EventSet e1, EventSet e2, int windowSize) { double totalEntropy = 0; int trials = 0; if (windowSize > e1.size() - 1) { windowSize = e1.size(); } // for (int j = 0; j <= e1.size() - windowSize; j++) { XEDictionary xed = new XEDictionary(); EventSet dictionary; dictionary = window(e1, 0, windowSize); xed.build(dictionary); for (int i = 0; i <= e2.size() - windowSize; i++) { totalEntropy += xed.find(window(e2, i, windowSize)); trials++; } // } return totalEntropy / trials; }
/**
 * Returns the length of the longest prefix of {@code e} that can be
 * followed through the dictionary tree starting at the root.
 *
 * @param e the event sequence to match
 * @return number of leading events of {@code e} found in the dictionary
 */
public int find(EventSet e) {
    XEDictionaryNode cursor = root;
    int depth = 0;
    // Follow the sequence until it runs out or leaves the tree.
    while (depth < e.size() && cursor.isEventInLevel(e.eventAt(depth))) {
        cursor = cursor.get(e.eventAt(depth));
        depth++;
    }
    return depth;
}
/** * Culls the test set using the relevant Events extracted from the training data.<br> * * @param relevantEvents the features from the EventSets which are going to be evaluated * @param eventSetsToCull The test documents to be culled * @return the culled test documents * @throws Exception */ public List<EventSet> cullWithRespectToTraining( List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd) throws Exception { List<EventSet> relevant = relevantEvents; int numOfFeatureClasses = eventSetsToCull.size() - 1; // -1 to compensate for removing metadata int i; List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>(); // remove the metadata prior to culling EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1); // make sure all unknown sets would have only events that appear in the // known sets // UNLESS the event set contains a sole numeric value event - in that // case take it anyway for (i = 0; i < numOfFeatureClasses; i++) { if (cfd.featureDriverAt(i).isCalcHist()) { // initialize set of relevant events EventSet es = relevant.get(i); Set<String> relevantEventsString = new HashSet<String>(es.size()); for (Event e : es) relevantEventsString.add(e.getEvent()); // remove all non-relevant events from unknown event sets EventSet unknown; Event e; unknown = eventSetsToCull.get(i); Iterator<Event> iterator = unknown.iterator(); Event next = null; // the test doc may not contain a given feature (ie it might not // have any semi-colons) if (iterator.hasNext()) next = (Event) iterator.next(); // while it has more of a feature while (iterator.hasNext()) { // copy the feature e = next; boolean remove = true; // check to see if the feature is relevant for (int l = 0; l < unknown.size(); l++) { try { if (e.equals(relevantEvents.get(i).eventAt(l))) { remove = false; // if it is, break break; } } catch (IndexOutOfBoundsException iobe) { remove = true; // it is not relevant if we reach this point. 
break; } } // remove the feature if it isn't relevant if (remove) { iterator.remove(); } // grab the next feature next = iterator.next(); } // add the culled event set culledUnknownEventSets.add(unknown); } else { // one unique numeric event // add non-histogram if it is in the relevantEventSets list boolean isRelevant = false; for (EventSet res : relevantEvents) { if (res.getEventSetID().equals(eventSetsToCull.get(i).getEventSetID())) { isRelevant = true; break; } } if (isRelevant) culledUnknownEventSets.add(eventSetsToCull.get(i)); } } eventSetsToCull.add(metadata); culledUnknownEventSets.add(metadata); return culledUnknownEventSets; }