Exemplo n.º 1
0
  /**
   * Culls the test set using the relevant Events extracted from the training data.<br>
   *
   * <p>The last element of {@code eventSetsToCull} is treated as the document metadata: it is
   * never culled and is appended unchanged as the last element of the result. For every other
   * index {@code i}:
   *
   * <ul>
   *   <li>if {@code cfd.featureDriverAt(i).isCalcHist()} is true, every event whose {@code
   *       getEvent()} string does not occur in {@code relevantEvents.get(i)} is removed from
   *       the event set <em>in place</em>, and that event set is added to the result;
   *   <li>otherwise the event set is added to the result only if some element of {@code
   *       relevantEvents} has an equal event set ID.
   * </ul>
   *
   * @param relevantEvents the features from the EventSets which are going to be evaluated;
   *     index {@code i} is paired with index {@code i} of {@code eventSetsToCull}
   * @param eventSetsToCull the test document's event sets to be culled, with the metadata event
   *     set in the last position; the list itself is not modified, but the histogram event sets
   *     it contains are culled in place
   * @param cfd the feature drivers; the driver at index {@code i} decides how the event set at
   *     index {@code i} is culled
   * @return the culled test document's event sets, followed by the metadata event set
   * @throws IndexOutOfBoundsException if {@code eventSetsToCull} is empty
   * @throws Exception kept in the signature for callers; anything raised by the feature drivers
   *     or event sets propagates unchanged
   */
  public List<EventSet> cullWithRespectToTraining(
      List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd)
      throws Exception {
    // The metadata occupies the last slot. Read it instead of removing/re-adding it so the
    // caller's list is never left without its metadata if something below throws.
    int numOfFeatureClasses = eventSetsToCull.size() - 1;
    EventSet metadata = eventSetsToCull.get(numOfFeatureClasses);

    List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>();

    // make sure all unknown sets would have only events that appear in the
    // known sets
    // UNLESS the event set contains a sole numeric value event - in that
    // case take it anyway
    for (int i = 0; i < numOfFeatureClasses; i++) {
      EventSet unknown = eventSetsToCull.get(i);

      if (cfd.featureDriverAt(i).isCalcHist()) {
        // collect the strings of all relevant events for this feature class
        EventSet relevantSet = relevantEvents.get(i);
        Set<String> relevantEventStrings = new HashSet<String>(relevantSet.size());
        for (Event relevantEvent : relevantSet) {
          relevantEventStrings.add(relevantEvent.getEvent());
        }

        // Drop every event that is not relevant. Each event returned by next() is tested,
        // including the last one; the test doc may also contain no events for this feature
        // at all (ie it might not have any semi-colons), in which case nothing happens.
        Iterator<Event> iterator = unknown.iterator();
        while (iterator.hasNext()) {
          Event candidate = iterator.next();
          if (!relevantEventStrings.contains(candidate.getEvent())) {
            iterator.remove();
          }
        }

        // add the culled event set
        culledUnknownEventSets.add(unknown);

      } else { // one unique numeric event
        // add non-histogram if it is in the relevantEventSets list
        boolean isRelevant = false;
        for (EventSet res : relevantEvents) {
          if (res.getEventSetID().equals(unknown.getEventSetID())) {
            isRelevant = true;
            break;
          }
        }

        if (isRelevant) {
          culledUnknownEventSets.add(unknown);
        }
      }
    }

    culledUnknownEventSets.add(metadata);
    return culledUnknownEventSets;
  }
Exemplo n.º 2
0
  /**
   * Extracts a list of all features to be used for analysis.
   *
   * <p>The list is built from {@code relevantEvents} alone, one feature class at a time and in
   * index order:
   *
   * <ul>
   *   <li>if {@code cumulativeFeatureDriver.featureDriverAt(i).isCalcHist()} is true, the {@code
   *       getEvent()} string of every event in {@code relevantEvents.get(i)} is appended, in
   *       iteration order;
   *   <li>otherwise a single entry, the event set ID of {@code relevantEvents.get(i)}, is
   *       appended.
   * </ul>
   *
   * @param culledEventSets the culled event sets of each document; not read or modified by this
   *     method, retained so existing callers keep compiling
   * @param relevantEvents the relevant event sets, one per feature class
   * @param cumulativeFeatureDriver the feature drivers; the driver at index {@code i} decides
   *     how the relevant event set at index {@code i} contributes to the list
   * @return the feature names, in feature-class order
   * @throws Exception kept in the signature for callers; anything raised by the feature drivers
   *     or event sets propagates unchanged
   */
  public List<String> getFeatureList(
      List<List<EventSet>> culledEventSets,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {
    // NOTE: the documents' metadata is deliberately left in place. Nothing below reads
    // culledEventSets, so temporarily stripping the metadata only risked leaving the caller's
    // lists without it if an exception interrupted this method.
    int numOfFeatureClasses = relevantEvents.size();
    List<String> features = new ArrayList<String>(numOfFeatureClasses);

    for (int featureIndex = 0; featureIndex < numOfFeatureClasses; featureIndex++) {
      EventSet relevantSet = relevantEvents.get(featureIndex);

      if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
        // histogram feature: one attribute per relevant event
        for (Event event : relevantSet) {
          features.add(event.getEvent());
        }
      } else {
        // one unique numeric event: the attribute is named after the event set itself
        features.add(relevantSet.getEventSetID());
      }
    }

    return features;
  }