Пример #1
0
  /**
   * Culls the EventSets of all documents with the cullers configured on the feature driver.<br>
   * The last EventSet of every document is treated as metadata: it is taken out of the document's
   * list before culling and appended to the corresponding culled list afterwards.
   *
   * @param eventSets one List of EventSets per document; the last entry of each inner list
   *     (the metadata) is removed from that list by this method
   * @param cumulativeFeatureDriver the driver with the culling functionality
   * @return the List of Lists of EventSets produced by the culler, each with its document's
   *     metadata EventSet appended again
   * @throws Exception if culling fails
   */
  public List<List<EventSet>> cull(
      List<List<EventSet>> eventSets, CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // Workaround for a bug in the event culler (remove once the culler is fixed):
    // remember the EventSet IDs of the first document, metadata included, so that
    // they can be written back onto the culled sets below.
    List<EventSet> firstDocument = eventSets.get(0);
    ArrayList<String> originalIds = new ArrayList<String>();
    for (EventSet set : firstDocument) {
      String id = set.getEventSetID();
      originalIds.add(id);
    }

    // Detach the trailing metadata EventSet of every document before culling.
    ArrayList<EventSet> detachedMetadata = new ArrayList<EventSet>();
    for (List<EventSet> documentSets : eventSets) {
      int lastPosition = documentSets.size() - 1;
      EventSet metadata = documentSets.remove(lastPosition);
      detachedMetadata.add(metadata);
    }

    // Run the actual culling.
    List<List<EventSet>> culled = CumulativeEventCuller.cull(eventSets, cumulativeFeatureDriver);

    // Re-attach the metadata; the n-th culled document receives the n-th detached EventSet.
    int documentNumber = 0;
    for (List<EventSet> documentSets : culled) {
      documentSets.add(detachedMetadata.get(documentNumber++));
    }

    // Second half of the workaround: overwrite each culled set's ID with the ID that
    // the first document had at the same position.
    for (List<EventSet> documentSets : culled) {
      int position = 0;
      for (EventSet set : documentSets) {
        set.setEventSetID(originalIds.get(position));
        position++;
      }
    }

    return culled;
  }
Пример #2
0
  /**
   * Culls the test set using the relevant Events extracted from the training data.<br>
   * The last element of {@code eventSetsToCull} is treated as the document's metadata: it is
   * removed while culling, put back onto {@code eventSetsToCull} before the method exits (also
   * when an exception is thrown), and appended to the returned list.
   *
   * <p>For a histogram feature (per {@code cfd.featureDriverAt(i).isCalcHist()}) every event
   * whose string is not present in {@code relevantEvents.get(i)} is removed in place from the
   * test EventSet, which is then added to the result. A non-histogram EventSet is added to the
   * result unchanged, but only if some relevant EventSet has the same EventSet ID.
   *
   * @param relevantEvents the features from the EventSets which are going to be evaluated;
   *     for histogram features the entry at index {@code i} is matched against the test
   *     EventSet at index {@code i}
   * @param eventSetsToCull the test document's EventSets followed by its metadata EventSet;
   *     histogram EventSets in this list are modified in place
   * @param cfd the feature driver; {@code featureDriverAt(i)} decides whether the EventSet at
   *     index {@code i} is a histogram feature
   * @return the culled test document's EventSets followed by the metadata EventSet
   * @throws Exception declared for callers; this method does not throw checked exceptions itself
   */
  public List<EventSet> cullWithRespectToTraining(
      List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd)
      throws Exception {
    List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>();

    // remove the metadata prior to culling
    EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1);

    try {
      int numOfFeatureClasses = eventSetsToCull.size();

      for (int i = 0; i < numOfFeatureClasses; i++) {
        EventSet unknown = eventSetsToCull.get(i);

        if (cfd.featureDriverAt(i).isCalcHist()) {
          // collect the strings of all relevant events for this feature class
          EventSet relevantSet = relevantEvents.get(i);
          Set<String> relevantEventsString = new HashSet<String>(relevantSet.size());
          for (Event relevantEvent : relevantSet) {
            relevantEventsString.add(relevantEvent.getEvent());
          }

          // Drop every event that does not appear in the training data. Each event is
          // examined exactly once, including the last one; the test document may also
          // contain no events at all for this feature, in which case nothing happens.
          Iterator<Event> iterator = unknown.iterator();
          while (iterator.hasNext()) {
            Event candidate = iterator.next();
            if (!relevantEventsString.contains(candidate.getEvent())) {
              iterator.remove();
            }
          }

          // add the culled event set
          culledUnknownEventSets.add(unknown);

        } else { // one unique numeric event
          // keep the non-histogram set only if its ID is among the relevant EventSets
          boolean isRelevant = false;
          for (EventSet res : relevantEvents) {
            if (res.getEventSetID().equals(unknown.getEventSetID())) {
              isRelevant = true;
              break;
            }
          }

          if (isRelevant) {
            culledUnknownEventSets.add(unknown);
          }
        }
      }
    } finally {
      // always hand the caller's list back with its metadata in place
      eventSetsToCull.add(metadata);
    }

    culledUnknownEventSets.add(metadata);
    return culledUnknownEventSets;
  }
Пример #3
0
  /**
   * Converts the extracted document information into a JStylo DataMap.<br>
   * The last element of {@code documentData} is treated as metadata: it is removed while the map
   * is built and put back before the method exits (also when an exception is thrown).
   *
   * <p>Map keys are positions in the flattened sequence of events of {@code relevantEvents}
   * (events counted in list order; a relevant EventSet without events counts as one position,
   * subject to the review note in the histogram branch below).
   *
   * <ul>
   *   <li>Histogram feature (per {@code featureDriverAt(position).isCalcHist()}): if an EventSet
   *       with the same ID exists in {@code relevantEvents}, every distinct event of the document
   *       that matches a relevant event is stored at that event's position with its absolute
   *       frequency in the document.
   *   <li>Non-histogram feature: the text between the first <code>{</code> and the first
   *       <code>}</code> of the set's first event is parsed as a number, rounded, and stored at
   *       the position of the relevant EventSet with the same ID.
   * </ul>
   *
   * @param features not read by this method
   * @param relevantEvents the relevant EventSets that define the key positions
   * @param cumulativeFeatureDriver the driver; the feature driver at position {@code i} is used
   *     for the {@code i}-th EventSet of {@code documentData}
   * @param documentData the document's EventSets followed by its metadata EventSet
   * @return a map from feature position to the FeatureData computed for this document
   */
  public ConcurrentHashMap<Integer, FeatureData> createDocMap(
      List<String> features,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver,
      List<EventSet> documentData) {

    ConcurrentHashMap<Integer, FeatureData> documentMap =
        new ConcurrentHashMap<Integer, FeatureData>();

    // remove metadata event
    EventSet metadata = documentData.remove(documentData.size() - 1);

    try {
      // Position of the current EventSet in documentData. Tracked explicitly instead of
      // calling documentData.indexOf(es), which is a linear scan and relies on equals().
      int eventSetIndex = 0;

      for (EventSet es : documentData) {

        if (cumulativeFeatureDriver.featureDriverAt(eventSetIndex).isCalcHist()) {

          // whether or not we actually need this eventSet
          boolean eventSetIsRelevant = false;
          for (EventSet res : relevantEvents) {
            if (es.getEventSetID().equals(res.getEventSetID())) {
              eventSetIsRelevant = true;
              break;
            }
          }

          if (eventSetIsRelevant) {
            // parallel lists: indices.get(k) is the map key for events.get(k)
            ArrayList<Integer> indices = new ArrayList<Integer>();
            ArrayList<Event> events = new ArrayList<Event>();
            EventHistogram currHistogram = new EventHistogram();

            for (Event e : es) {
              int currIndex = 0;
              // NOTE(review): hasInner is only reset per document event, not per relevant
              // EventSet (the non-histogram branch resets it per set). Once one relevant set
              // had events, later empty sets no longer advance currIndex. Kept as in the
              // original; confirm which behavior is intended.
              boolean hasInner = false;
              boolean found = false;

              // walk the flattened relevant events until the first one with the same string
              for (EventSet res : relevantEvents) {
                for (Event re : res) {
                  hasInner = true;
                  if (e.getEvent().equals(re.getEvent())) {
                    found = true;
                    break;
                  }
                  currIndex++;
                }
                if (found) {
                  break;
                }

                // no events seen so far: count the set itself as one position
                if (!hasInner) {
                  currIndex++;
                }
              }

              if (found) {
                // register the event's position once per distinct event string
                boolean inList = false;
                for (Event el : events) {
                  if (el.getEvent().equals(e.getEvent())) {
                    inList = true;
                    break;
                  }
                }
                if (!inList) {
                  indices.add(currIndex);
                  events.add(e);
                }
                // every matching occurrence counts towards the frequency
                currHistogram.add(e);
              }
            }

            // calculate/add the histograms
            for (int k = 0; k < indices.size(); k++) {
              documentMap.put(
                  indices.get(k),
                  new FeatureData(
                      cumulativeFeatureDriver.featureDriverAt(eventSetIndex).getName(),
                      cumulativeFeatureDriver
                          .featureDriverAt(eventSetIndex)
                          .getNormBaseline()
                          .getTitle(),
                      currHistogram.getAbsoluteFrequency(events.get(k))));
            }
          }
        } else { // non histogram feature

          // Count the positions taken by all relevant EventSets that precede the one with
          // this set's ID. If no relevant set has this ID, the count covers all of them.
          int nonHistIndex = 0;
          for (EventSet res : relevantEvents) {

            if (es.getEventSetID().equals(res.getEventSetID())) {
              break;
            }

            boolean hasInner = false;
            for (@SuppressWarnings("unused") Event re : res) {
              hasInner = true;
              nonHistIndex++;
            }

            // a set without events still occupies one position
            if (!hasInner) {
              nonHistIndex++;
            }
          }

          // Extract the numeric value written between the braces of the first event
          String eventString = es.eventAt(0).getEvent();
          int startIndex = eventString.indexOf("{");
          int endIndex = eventString.indexOf("}");
          eventString = eventString.substring(startIndex + 1, endIndex);

          FeatureData fd =
              new FeatureData(
                  cumulativeFeatureDriver.featureDriverAt(eventSetIndex).getName(),
                  cumulativeFeatureDriver
                      .featureDriverAt(eventSetIndex)
                      .getNormBaseline()
                      .getTitle(),
                  Math.round((float) Double.parseDouble(eventString)));
          documentMap.put(nonHistIndex, fd);
        }

        eventSetIndex++;
      }
    } finally {
      // add metadata back so the caller's list is left as it was passed in
      documentData.add(metadata);
    }

    return documentMap;
  }
Пример #4
0
  /**
   * Builds the ordered list of feature names to be used for analysis.<br>
   * For every relevant EventSet, in order: a histogram feature (per
   * {@code featureDriverAt(position).isCalcHist()}) contributes the string of each of its events;
   * any other feature contributes a single entry, its EventSet ID.
   *
   * <p>The last EventSet of every list in {@code culledEventSets} is removed at the start of the
   * call and appended again at the end; the lists are not otherwise read.
   *
   * @param culledEventSets one List of EventSets per document, each ending with its metadata
   * @param relevantEvents the relevant EventSets, one per feature class
   * @param cumulativeFeatureDriver the driver; the feature driver at position {@code i} is
   *     consulted for the {@code i}-th relevant EventSet
   * @return the feature names in order
   * @throws Exception declared for callers; this method does not throw checked exceptions itself
   */
  public List<String> getFeatureList(
      List<List<EventSet>> culledEventSets,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // detach the trailing metadata EventSet of every document
    ArrayList<EventSet> detachedMetadata = new ArrayList<EventSet>();
    for (List<EventSet> documentSets : culledEventSets) {
      int lastPosition = documentSets.size() - 1;
      detachedMetadata.add(documentSets.remove(lastPosition));
    }

    int featureClassCount = relevantEvents.size();
    List<String> features = new ArrayList<String>(featureClassCount);

    // first pass: build one attribute EventSet per feature class
    List<EventSet> attributeSets = new ArrayList<EventSet>(featureClassCount);
    for (int classIndex = 0; classIndex < featureClassCount; classIndex++) {
      EventSet source = relevantEvents.get(classIndex);

      EventSet attributes = new EventSet();
      attributes.setEventSetID(source.getEventSetID());

      boolean histogram = cumulativeFeatureDriver.featureDriverAt(classIndex).isCalcHist();
      if (histogram) {
        // copy all events of the relevant set
        for (Event event : source) {
          attributes.addEvent(event);
        }
      } else {
        // single numeric feature: one placeholder event
        attributes.addEvent(new Event("{-}"));
      }
      attributeSets.add(attributes);
    }

    // second pass: flatten the attribute sets into feature names
    for (int classIndex = 0; classIndex < attributeSets.size(); classIndex++) {
      EventSet attributes = attributeSets.get(classIndex);
      if (!cumulativeFeatureDriver.featureDriverAt(classIndex).isCalcHist()) {
        features.add(attributes.getEventSetID());
        continue;
      }
      for (Event event : attributes) {
        features.add(event.getEvent());
      }
    }

    // re-attach the metadata, n-th detached EventSet to the n-th document
    int documentNumber = 0;
    for (List<EventSet> documentSets : culledEventSets) {
      documentSets.add(detachedMetadata.get(documentNumber));
      documentNumber++;
    }

    return features;
  }
Пример #5
0
  /**
   * Goes over the culled List of Lists of EventSets and determines which events are histograms and
   * which have a single numerical value.<br>
   * Uses the information to prepare a List of EventSets to extract from the test document(s).
   *
   * <p>The result holds one EventSet per distinct EventSet ID, in order of first appearance. For a
   * histogram feature (per {@code featureDriverAt(position).isCalcHist()}) it accumulates every
   * distinct event string seen for that ID across all documents. For any other feature a
   * placeholder event <code>{-}</code> is stored when the ID is first seen, and later documents
   * add nothing.
   *
   * <p>The last EventSet of every document list is treated as metadata: it is removed at the
   * start of the call and appended again at the end.
   *
   * @param culledEventSets The culled List of Lists of EventSets, each ending with its metadata
   * @param cumulativeFeatureDriver The driver used to extract the EventSets; the feature driver
   *     at position {@code i} is consulted for the {@code i}-th EventSet of each document
   * @return The List of EventSet to extract from the test document(s)
   * @throws Exception declared for callers; this method does not throw checked exceptions itself
   */
  public List<EventSet> getRelevantEvents(
      List<List<EventSet>> culledEventSets, CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // remove the metadata prior to generating the relevantEvents
    ArrayList<EventSet> docMetaData = new ArrayList<EventSet>();
    for (List<EventSet> les : culledEventSets) {
      docMetaData.add(les.remove(les.size() - 1));
    }

    // initialize the EventSet list
    List<EventSet> relevantEvents = new LinkedList<EventSet>();

    for (List<EventSet> documentSets : culledEventSets) {
      // position of the current EventSet within its document; selects the feature driver
      int featureIndex = 0;
      for (EventSet esToAdd : documentSets) {

        // look for an already collected EventSet with the same ID
        EventSet existing = null;
        for (EventSet esl : relevantEvents) {
          if (esToAdd.getEventSetID().equals(esl.getEventSetID())) {
            existing = esl;
            break;
          }
        }

        if (existing == null) {
          // first time this ID is seen: create its relevant EventSet
          EventSet temp = new EventSet();
          temp.setEventSetID(esToAdd.getEventSetID());

          for (Event e : esToAdd) {
            if (!hasEventWithSameString(temp, e)) {
              if (!cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
                // non-histogram feature: store a placeholder instead of the value
                temp.addEvent(new Event("{-}"));
              } else {
                temp.addEvent(e);
              }
            }
          }

          relevantEvents.add(temp);
        } else if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
          // Merge new events into the set that actually carries this ID. Using the
          // matched set (rather than the one at position featureIndex) stays correct
          // even if a document's EventSets are not in the same order as relevantEvents.
          for (Event e : esToAdd) {
            if (!hasEventWithSameString(existing, e)) {
              existing.addEvent(e);
            }
          }
        }
        featureIndex++;
      }
    }

    // add the metadata back in
    int index = 0;
    for (List<EventSet> les : culledEventSets) {
      les.add(docMetaData.get(index));
      index++;
    }

    return relevantEvents;
  }

  /**
   * Tells whether {@code set} already holds an event whose string equals that of
   * {@code candidate}.
   *
   * @param set the EventSet to search
   * @param candidate the event whose string is looked for
   * @return {@code true} if an event with the same string is present
   */
  private boolean hasEventWithSameString(EventSet set, Event candidate) {
    for (Event present : set) {
      if (present.getEvent().equals(candidate.getEvent())) {
        return true;
      }
    }
    return false;
  }