/** * Determines which EventSets to use for the given documents based on the chosen cullers.<br> * * @param eventSets A List which contains Lists of EventSets (represents a list of documents' * EventSets * @param cumulativeFeatureDriver the driver with the culling functionality * @return The culled List of Lists of EventSets created from eventSets * @throws Exception */ public List<List<EventSet>> cull( List<List<EventSet>> eventSets, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // a hacky workaround for the bug in the eventCuller. Fix that // later then remove these ArrayList<String> IDs = new ArrayList<String>(); for (EventSet es : eventSets.get(0)) { IDs.add(es.getEventSetID()); } // remove the metdata prior to culling ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : eventSets) { docMetaData.add(les.remove(les.size() - 1)); } // cull the events List<List<EventSet>> culledEventSets = CumulativeEventCuller.cull(eventSets, cumulativeFeatureDriver); // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } // a hacky workaround for the bug in the eventCuller. Fix that // later then remove these for (int j1 = 0; j1 < culledEventSets.size(); j1++) { for (int iterator = 0; iterator < culledEventSets.get(j1).size(); iterator++) { culledEventSets.get(j1).get(iterator).setEventSetID(IDs.get(iterator)); } } // return culled events return culledEventSets; }
/** * Culls the test set using the relevant Events extracted from the training data.<br> * * @param relevantEvents the features from the EventSets which are going to be evaluated * @param eventSetsToCull The test documents to be culled * @return the culled test documents * @throws Exception */ public List<EventSet> cullWithRespectToTraining( List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd) throws Exception { List<EventSet> relevant = relevantEvents; int numOfFeatureClasses = eventSetsToCull.size() - 1; // -1 to compensate for removing metadata int i; List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>(); // remove the metadata prior to culling EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1); // make sure all unknown sets would have only events that appear in the // known sets // UNLESS the event set contains a sole numeric value event - in that // case take it anyway for (i = 0; i < numOfFeatureClasses; i++) { if (cfd.featureDriverAt(i).isCalcHist()) { // initialize set of relevant events EventSet es = relevant.get(i); Set<String> relevantEventsString = new HashSet<String>(es.size()); for (Event e : es) relevantEventsString.add(e.getEvent()); // remove all non-relevant events from unknown event sets EventSet unknown; Event e; unknown = eventSetsToCull.get(i); Iterator<Event> iterator = unknown.iterator(); Event next = null; // the test doc may not contain a given feature (ie it might not // have any semi-colons) if (iterator.hasNext()) next = (Event) iterator.next(); // while it has more of a feature while (iterator.hasNext()) { // copy the feature e = next; boolean remove = true; // check to see if the feature is relevant for (int l = 0; l < unknown.size(); l++) { try { if (e.equals(relevantEvents.get(i).eventAt(l))) { remove = false; // if it is, break break; } } catch (IndexOutOfBoundsException iobe) { remove = true; // it is not relevant if we reach this point. break; } } // remove the feature if it isn't relevant if (remove) { iterator.remove(); } // grab the next feature next = iterator.next(); } // add the culled event set culledUnknownEventSets.add(unknown); } else { // one unique numeric event // add non-histogram if it is in the relevantEventSets list boolean isRelevant = false; for (EventSet res : relevantEvents) { if (res.getEventSetID().equals(eventSetsToCull.get(i).getEventSetID())) { isRelevant = true; break; } } if (isRelevant) culledUnknownEventSets.add(eventSetsToCull.get(i)); } } eventSetsToCull.add(metadata); culledUnknownEventSets.add(metadata); return culledUnknownEventSets; }
/** * Converts the extracted document information into a JStylo DataMap * * @param features * @param relevantEvents * @param cumulativeFeatureDriver * @param documentData * @return */ public ConcurrentHashMap<Integer, FeatureData> createDocMap( List<String> features, List<EventSet> relevantEvents, CumulativeFeatureDriver cumulativeFeatureDriver, List<EventSet> documentData) { // generate training instances ConcurrentHashMap<Integer, FeatureData> documentMap = new ConcurrentHashMap<Integer, FeatureData>(); // remove metadata event EventSet metadata = documentData.remove(documentData.size() - 1); // go through all eventSets in the document for (EventSet es : documentData) { // initialize relevant information ArrayList<Integer> indices = new ArrayList<Integer>(); ArrayList<Event> events = new ArrayList<Event>(); EventHistogram currHistogram = new EventHistogram(); // whether or not we actually need this eventSet boolean eventSetIsRelevant = false; // find out if it is a histogram or not if (cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).isCalcHist()) { // find the event set in the list of relevant events for (EventSet res : relevantEvents) { if (es.getEventSetID().equals(res.getEventSetID())) { eventSetIsRelevant = true; break; } } // if it is relevant if (eventSetIsRelevant) { // find the indices of the events // and count all of the events for (Event e : es) { int currIndex = 0; boolean hasInner = false; // for the events n the set for (EventSet res : relevantEvents) { boolean found = false; for (Event re : res) { hasInner = true; // if they are the same event if (e.getEvent().equals(re.getEvent())) { boolean inList = false; for (Event el : events) { if (el.getEvent().equals(e.getEvent())) { inList = true; break; } } if (!inList) { indices.add(currIndex); events.add(e); } // Old location revert if change breaks currHistogram.add(e); found = true; } if (found) { break; } currIndex++; } if (found) { break; } // if there's no inner, it was a non-hist feature. // increment by one if (!hasInner) { currIndex++; } } } // calculate/add the histograms int index = 0; for (Integer i : indices) { documentMap.put( i, new FeatureData( cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(), cumulativeFeatureDriver .featureDriverAt(documentData.indexOf(es)) .getNormBaseline() .getTitle(), currHistogram.getAbsoluteFrequency(events.get(index)))); index++; } } } else { // non histogram feature // initialize the index int nonHistIndex = 0; // find the indices of the events // and count all of the events for (EventSet res : relevantEvents) { if (es.getEventSetID().equals(res.getEventSetID())) { break; } // count to find the index boolean hasInner = false; for (@SuppressWarnings("unused") Event re : res) { hasInner = true; nonHistIndex++; } // if ther's no inner feature, increment by one; we just passed a non-histogram if (!hasInner) nonHistIndex++; } // Extract and add the event String eventString = es.eventAt(0).getEvent(); int startIndex = eventString.indexOf("{"); int endIndex = eventString.indexOf("}"); eventString = eventString.substring(startIndex + 1, endIndex); FeatureData fd = new FeatureData( cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(), cumulativeFeatureDriver .featureDriverAt(documentData.indexOf(es)) .getNormBaseline() .getTitle(), Math.round((float) Double.parseDouble(eventString))); documentMap.put(nonHistIndex, fd); } } // add metadata back. Not sure if necessary documentData.add(metadata); return documentMap; }
/** * Extracts a list of all features to be used for analysis. * * @param culledEventSets * @param relevantEvents * @param cumulativeFeatureDriver * @return * @throws Exception */ public List<String> getFeatureList( List<List<EventSet>> culledEventSets, List<EventSet> relevantEvents, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metdata prior to generating attribute list ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize useful things int numOfFeatureClasses = relevantEvents.size(); List<EventSet> list; List<String> features = new ArrayList<String>(numOfFeatureClasses); // initialize list of sets of events, which will eventually become the // attributes List<EventSet> allEvents = new ArrayList<EventSet>(numOfFeatureClasses); // Neither the doc title nor the author is in the List<List<EventSet>>, so this should work fine for (int currEventSet = 0; currEventSet < numOfFeatureClasses; currEventSet++) { // initialize relevant list of event sets and histograms list = new ArrayList<EventSet>(); for (int i = 0; i < numOfFeatureClasses; i++) list.add(relevantEvents.get(i)); // initialize eventSet EventSet events = new EventSet(); events.setEventSetID(relevantEvents.get(currEventSet).getEventSetID()); if (cumulativeFeatureDriver.featureDriverAt(currEventSet).isCalcHist()) { // histogram feature // generate event histograms and unique event list EventSet eventSet = list.get(currEventSet); for (Event event : eventSet) { events.addEvent(event); } allEvents.add(events); } else { // one unique numeric event // generate sole event (give placeholder value) Event event = new Event("{-}"); events.addEvent(event); allEvents.add(events); } } // Adds all of the events to the fast vector int featureIndex = 0; for (EventSet es : allEvents) { Iterator<Event> iterator = es.iterator(); if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { if (iterator.hasNext()) { // grab first event; there should be at least one Event nextEvent = (Event) iterator.next(); // get and add all middle events if they exist while (iterator.hasNext()) { features.add(nextEvent.getEvent()); nextEvent = (Event) iterator.next(); } // add the last event features.add(nextEvent.getEvent()); } } else { features.add(es.getEventSetID()); } featureIndex++; } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return features; }
/** * Goes over the culled List of Lists of EventSets and determines which events are histograms and * which have a single numerical value.<br> * Uses the information to prepare a List of EventSets to extract from the test document(s). * * @param culledEventSets The culled List of Lists of EventSets * @param cumulativeFeatureDriver The driver used to extract the EventSets * @return The List of EventSet to extract from the test document(s) * @throws Exception */ public List<EventSet> getRelevantEvents( List<List<EventSet>> culledEventSets, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metadata prior to generating the relevantEvents ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize the EventSet list List<EventSet> relevantEvents = new LinkedList<EventSet>(); // iterate over the List of Lists for (List<EventSet> l : culledEventSets) { // iterate over each inner list's eventSets int featureIndex = 0; for (EventSet esToAdd : l) { // whether or not to add the event set to the list (if false, it // is already on the list) boolean add = true; ; for (EventSet esl : relevantEvents) { // this should compare the category/name of the event set if (esToAdd.getEventSetID().equals(esl.getEventSetID())) { add = false; break; } } // this event set isn't on the list at all, just add it (which // also adds its internal events) to the list if (add) { EventSet temp = new EventSet(); temp.setEventSetID(esToAdd.getEventSetID()); // for all of the events for (Event e : esToAdd) { boolean absent = true; // check to see if it's been added yet or not for (Event ae : temp) { if (ae.getEvent().equals(e.getEvent())) { absent = false; break; } } // if it has not been added, add it if (absent) { if (!cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) temp.addEvent(new Event("{-}")); else temp.addEvent(e); } } relevantEvents.add(temp); } else { // go through this eventSet and add any events to the // relevant EventSet if they aren't already there. if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { for (Event e : esToAdd) { boolean toAdd = true; // for all events in the relecant list for (Event re : relevantEvents.get(featureIndex)) { // if it's already there, don't add it if (e.getEvent().equals(re.getEvent())) { toAdd = false; break; } } // add it if it isn't there if (toAdd) { relevantEvents.get(featureIndex).addEvent(e); } } } } featureIndex++; } } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return relevantEvents; }