/** * Culls the test set using the relevant Events extracted from the training data.<br> * * @param relevantEvents the features from the EventSets which are going to be evaluated * @param eventSetsToCull The test documents to be culled * @return the culled test documents * @throws Exception */ public List<EventSet> cullWithRespectToTraining( List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd) throws Exception { List<EventSet> relevant = relevantEvents; int numOfFeatureClasses = eventSetsToCull.size() - 1; // -1 to compensate for removing metadata int i; List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>(); // remove the metadata prior to culling EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1); // make sure all unknown sets would have only events that appear in the // known sets // UNLESS the event set contains a sole numeric value event - in that // case take it anyway for (i = 0; i < numOfFeatureClasses; i++) { if (cfd.featureDriverAt(i).isCalcHist()) { // initialize set of relevant events EventSet es = relevant.get(i); Set<String> relevantEventsString = new HashSet<String>(es.size()); for (Event e : es) relevantEventsString.add(e.getEvent()); // remove all non-relevant events from unknown event sets EventSet unknown; Event e; unknown = eventSetsToCull.get(i); Iterator<Event> iterator = unknown.iterator(); Event next = null; // the test doc may not contain a given feature (ie it might not // have any semi-colons) if (iterator.hasNext()) next = (Event) iterator.next(); // while it has more of a feature while (iterator.hasNext()) { // copy the feature e = next; boolean remove = true; // check to see if the feature is relevant for (int l = 0; l < unknown.size(); l++) { try { if (e.equals(relevantEvents.get(i).eventAt(l))) { remove = false; // if it is, break break; } } catch (IndexOutOfBoundsException iobe) { remove = true; // it is not relevant if we reach this point. break; } } // remove the feature if it isn't relevant if (remove) { iterator.remove(); } // grab the next feature next = iterator.next(); } // add the culled event set culledUnknownEventSets.add(unknown); } else { // one unique numeric event // add non-histogram if it is in the relevantEventSets list boolean isRelevant = false; for (EventSet res : relevantEvents) { if (res.getEventSetID().equals(eventSetsToCull.get(i).getEventSetID())) { isRelevant = true; break; } } if (isRelevant) culledUnknownEventSets.add(eventSetsToCull.get(i)); } } eventSetsToCull.add(metadata); culledUnknownEventSets.add(metadata); return culledUnknownEventSets; }
/** * Extracts a list of all features to be used for analysis. * * @param culledEventSets * @param relevantEvents * @param cumulativeFeatureDriver * @return * @throws Exception */ public List<String> getFeatureList( List<List<EventSet>> culledEventSets, List<EventSet> relevantEvents, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metdata prior to generating attribute list ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize useful things int numOfFeatureClasses = relevantEvents.size(); List<EventSet> list; List<String> features = new ArrayList<String>(numOfFeatureClasses); // initialize list of sets of events, which will eventually become the // attributes List<EventSet> allEvents = new ArrayList<EventSet>(numOfFeatureClasses); // Neither the doc title nor the author is in the List<List<EventSet>>, so this should work fine for (int currEventSet = 0; currEventSet < numOfFeatureClasses; currEventSet++) { // initialize relevant list of event sets and histograms list = new ArrayList<EventSet>(); for (int i = 0; i < numOfFeatureClasses; i++) list.add(relevantEvents.get(i)); // initialize eventSet EventSet events = new EventSet(); events.setEventSetID(relevantEvents.get(currEventSet).getEventSetID()); if (cumulativeFeatureDriver.featureDriverAt(currEventSet).isCalcHist()) { // histogram feature // generate event histograms and unique event list EventSet eventSet = list.get(currEventSet); for (Event event : eventSet) { events.addEvent(event); } allEvents.add(events); } else { // one unique numeric event // generate sole event (give placeholder value) Event event = new Event("{-}"); events.addEvent(event); allEvents.add(events); } } // Adds all of the events to the fast vector int featureIndex = 0; for (EventSet es : allEvents) { Iterator<Event> iterator = es.iterator(); if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { if (iterator.hasNext()) { // grab first event; there should be at least one Event nextEvent = (Event) iterator.next(); // get and add all middle events if they exist while (iterator.hasNext()) { features.add(nextEvent.getEvent()); nextEvent = (Event) iterator.next(); } // add the last event features.add(nextEvent.getEvent()); } } else { features.add(es.getEventSetID()); } featureIndex++; } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return features; }