예제 #1
0
  /**
   * Loads the cached features for a given document.
   *
   * <p>The cache file is read line by line: the first line is the path of the document that was
   * used when the cache was created, the second line is that document's last-modified time stamp,
   * and every following group of lines is one event set. A group starts with the event set ID,
   * continues with one event per line, and ends with a line containing only {@code ","} (or with
   * the end of the file).
   *
   * @param document the document the cache belongs to; its author and title are stamped onto every
   *     EventSet that is read back
   * @param documentFile The cache file for the document.
   * @return the cached features, or {@code null} if the cache file is missing, unreadable,
   *     truncated, malformed, or no longer matches the document's path and time stamp
   */
  private List<EventSet> getCachedFeatures(Document document, File documentFile) {
    if (!documentFile.exists() || documentFile.isDirectory() || !documentFile.canRead()) {
      return null;
    }

    BufferedReader reader;
    try {
      reader = new BufferedReader(new FileReader(documentFile));
    } catch (FileNotFoundException e) {
      // The file can disappear between the checks above and the open. Treat that as a cache
      // miss instead of carrying a null reader into the parsing code below.
      e.printStackTrace();
      return null;
    }

    try {
      // cachedPath is the path to the document that was used when the cache for that
      // document was created. cachedStamp is the last modified time stamp on the
      // document that was cached.
      String cachedPath = reader.readLine();
      String cachedStamp = reader.readLine();
      if (cachedPath == null || cachedStamp == null) {
        // header is incomplete, so the cache cannot be validated
        return null;
      }
      long cachedLastModified = Long.parseLong(cachedStamp);

      File currDoc = new File(document.getFilePath());
      if (!currDoc.getCanonicalPath().equals(cachedPath)
          || currDoc.lastModified() != cachedLastModified) {
        // cache is invalid
        return null;
      }

      List<EventSet> generatedEvents = new ArrayList<EventSet>();
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.isEmpty()) {
          continue;
        }
        EventSet es = new EventSet();
        es.setAuthor(document.getAuthor());
        es.setDocumentName(document.getTitle());
        es.setEventSetID(line);

        // NOTE(review): the previous code tested `line.isEmpty()` inside this loop, which could
        // never be true here, so blank lines have always been read back as empty events. That
        // behavior is kept; confirm whether blank event lines should be skipped instead.
        String event;
        while ((event = reader.readLine()) != null) {
          if (event.equals(",")) { // delimiter for event sets
            break;
          }
          es.addEvent(new Event(event));
        }

        generatedEvents.add(es);
      }
      return generatedEvents;
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    } catch (NumberFormatException e) {
      // the time stamp line is not a number: the cache file is corrupt
      e.printStackTrace();
      return null;
    } finally {
      // always release the file handle, whichever way the parsing ended
      try {
        reader.close();
      } catch (IOException ignored) {
        // nothing useful can be done if closing a read-only stream fails
      }
    }
  }
예제 #2
0
  /**
   * Extracts a list of all features to be used for analysis.
   *
   * <p>The result is built by walking {@code relevantEvents} in order. For the event set at
   * position {@code i}, the feature driver at position {@code i} decides what is emitted: if it
   * calculates a histogram, the string of every event in the set is added; otherwise the event
   * set ID is added as a single feature.
   *
   * @param culledEventSets not read and not modified by this method; kept so existing callers
   *     continue to compile
   * @param relevantEvents the event sets to turn into feature names, one per feature driver and
   *     in the same order as the drivers
   * @param cumulativeFeatureDriver supplies, per position, whether the feature is a histogram
   * @return the feature names, in the order described above
   * @throws Exception declared for compatibility with existing callers
   */
  public List<String> getFeatureList(
      List<List<EventSet>> culledEventSets,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // The metadata entries at the tail of each culledEventSets list used to be removed here and
    // re-appended before returning. Nothing in between read culledEventSets, and a failure
    // mid-way left the caller's lists without their metadata, so the argument is now left alone.

    List<String> features = new ArrayList<String>(relevantEvents.size());

    // Walk the list with a running index instead of get(i): relevantEvents may be a linked list.
    int featureIndex = 0;
    for (EventSet relevant : relevantEvents) {
      if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
        // histogram feature: every event becomes its own feature
        for (Event event : relevant) {
          features.add(event.getEvent());
        }
      } else {
        // one unique numeric event: the event set itself is the feature
        features.add(relevant.getEventSetID());
      }
      featureIndex++;
    }

    return features;
  }
예제 #3
0
  /**
   * Extracts the List of EventSets from a document using the provided CumulativeFeatureDriver.<br>
   *
   * <p>When caching is enabled and a valid cache entry exists for the document, the cached list is
   * returned as-is. Otherwise the events are extracted and a final {@code "<DOCUMENT METADATA>"}
   * EventSet is appended to the list.
   *
   * @param document the document to have features extracted and made into event sets
   * @param cumulativeFeatureDriver the driver containing the features to be extracted and the
   *     functionality to do so
   * @param loadDocContents whether or not the document contents are already loaded into the
   *     object; when {@code false}, {@code document.load()} is called before each metadata count
   * @param isUsingCache whether to look for previously cached features first; also forwarded to
   *     the feature driver
   * @return the List of EventSets for the document
   * @throws Exception if event extraction or any of the metadata counts fails; the failure is
   *     logged before being rethrown
   */
  public List<EventSet> extractEventSets(
      Document document,
      CumulativeFeatureDriver cumulativeFeatureDriver,
      boolean loadDocContents,
      boolean isUsingCache)
      throws Exception {

    if (isUsingCache) {
      File cacheDir =
          new File(JSANConstants.JSAN_CACHE + "_" + cumulativeFeatureDriver.getName() + "/");

      File authorDir;
      if (document.getAuthor().equals(JSANConstants.DUMMY_NAME)) {
        authorDir = new File(cacheDir, "you");
      } else {
        authorDir = new File(cacheDir, "_" + document.getAuthor());
      }

      File documentFile = new File(authorDir, document.getTitle() + ".cache");
      List<EventSet> cachedEvents = getCachedFeatures(document, documentFile);
      if (cachedEvents != null) {
        // return the cached features
        return cachedEvents;
      }
      // delete the cache for this document! It is invalid
      documentFile.delete();
      // program will continue as normal, extracting events
    }

    // Extract the Events from the documents
    List<EventSet> generatedEvents;
    try {
      generatedEvents =
          cumulativeFeatureDriver.createEventSets(document, loadDocContents, isUsingCache);
    } catch (Exception e) {
      LOG.error("Failed to extract events from a document!", e);
      throw e;
    }

    // create metadata event to store document information
    EventSet documentInfo = new EventSet();
    documentInfo.setEventSetID("<DOCUMENT METADATA>");

    /*
     * Metadata Event format:
     *
     * EventSetID: "<DOCUMENT METADATA>" Event at Index:
     * 0 : author
     * 1 : title
     * 2 : Sentences in document
     * 3 : Words in document
     * 4 : Characters in document
     * 5 : Letters in document
     */

    // Extract document title and author
    documentInfo.addEvent(new Event(document.getAuthor()));
    documentInfo.addEvent(new Event(document.getTitle()));

    // Extract normalization baselines; the order must match the index table above
    documentInfo.addEvent(
        countAsEvent(
            document,
            new SentenceCounterEventDriver(),
            loadDocContents,
            "Failed to extract num sentences from document"));
    documentInfo.addEvent(
        countAsEvent(
            document,
            new WordCounterEventDriver(),
            loadDocContents,
            "Failed to extract num words from document"));
    documentInfo.addEvent(
        countAsEvent(
            document,
            new CharCounterEventDriver(),
            loadDocContents,
            "Failed to extract num characters from document"));
    documentInfo.addEvent(
        countAsEvent(
            document,
            new LetterCounterEventDriver(),
            loadDocContents,
            "Failed to extract num letters from document"));

    // add the metadata EventSet to the List<EventSet>
    generatedEvents.add(documentInfo);

    return generatedEvents;
  }

  /**
   * Runs one numeric counter over the document and wraps the value, cast to an int, in an Event.
   *
   * @param doc the document to count over
   * @param counter the counter to run
   * @param loadDocContents when {@code false}, {@code doc.load()} is called before counting
   * @param failureMessage the message to log if loading or counting throws
   * @return an Event whose text is the counted value as a decimal integer
   * @throws Exception whatever loading or counting threw, after it has been logged
   */
  private Event countAsEvent(
      Document doc,
      SingleNumericEventDriver counter,
      boolean loadDocContents,
      String failureMessage)
      throws Exception {
    try {
      if (!loadDocContents) {
        doc.load();
      }
      return new Event("" + (int) counter.getValue(doc));
    } catch (Exception e) {
      LOG.error(failureMessage, e);
      throw e;
    }
  }
예제 #4
0
  /**
   * Goes over the culled List of Lists of EventSets and determines which events are histograms and
   * which have a single numerical value.<br>
   * Uses the information to prepare a List of EventSets to extract from the test document(s).
   *
   * <p>The last EventSet of every inner list is treated as document metadata: it is taken off
   * while the relevant events are collected and is put back before this method returns, also when
   * it returns by throwing.
   *
   * <p>Event sets are matched across documents by their event set ID. The first time an ID is
   * seen a new EventSet is created for it; for histogram features later occurrences contribute
   * any events that are not present yet. Non-histogram features carry the placeholder event
   * {@code "{-}"} instead of the document's own events.
   *
   * @param culledEventSets The culled List of Lists of EventSets; the inner lists must be
   *     modifiable and non-empty
   * @param cumulativeFeatureDriver The driver used to extract the EventSets; the driver at
   *     position {@code i} describes the EventSet at position {@code i} of every inner list
   * @return The List of EventSet to extract from the test document(s)
   * @throws Exception declared for compatibility with existing callers
   */
  public List<EventSet> getRelevantEvents(
      List<List<EventSet>> culledEventSets, CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    List<EventSet> docMetaData = new ArrayList<EventSet>();
    List<EventSet> relevantEvents = new LinkedList<EventSet>();

    try {
      // remove the metadata prior to generating the relevantEvents
      for (List<EventSet> les : culledEventSets) {
        docMetaData.add(les.remove(les.size() - 1));
      }

      for (List<EventSet> documentEventSets : culledEventSets) {
        int featureIndex = 0;
        for (EventSet esToAdd : documentEventSets) {
          // look for an event set with the same category/name that is already on the list
          EventSet existing = null;
          for (EventSet candidate : relevantEvents) {
            if (esToAdd.getEventSetID().equals(candidate.getEventSetID())) {
              existing = candidate;
              break;
            }
          }

          if (existing == null) {
            // this event set isn't on the list at all, so add it together with its events
            EventSet fresh = new EventSet();
            fresh.setEventSetID(esToAdd.getEventSetID());
            for (Event e : esToAdd) {
              if (!containsEvent(fresh, e)) {
                if (!cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
                  fresh.addEvent(new Event("{-}"));
                } else {
                  fresh.addEvent(e);
                }
              }
            }
            relevantEvents.add(fresh);
          } else if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
            // Merge into the set that matched by ID. The previous code merged into
            // relevantEvents.get(featureIndex), which is only the same set when every document
            // lists its event sets in the same order.
            for (Event e : esToAdd) {
              if (!containsEvent(existing, e)) {
                existing.addEvent(e);
              }
            }
          }
          featureIndex++;
        }
      }
    } finally {
      // add the metadata back in, for exactly the lists it was taken from
      if (!docMetaData.isEmpty()) {
        int index = 0;
        for (List<EventSet> les : culledEventSets) {
          if (index >= docMetaData.size()) {
            break;
          }
          les.add(docMetaData.get(index));
          index++;
        }
      }
    }

    return relevantEvents;
  }

  /**
   * Tells whether an event set already holds an event with the same event string.
   *
   * @param set the event set to search
   * @param candidate the event to look for
   * @return {@code true} if some event in {@code set} has the same string as {@code candidate}
   */
  private boolean containsEvent(EventSet set, Event candidate) {
    for (Event present : set) {
      if (present.getEvent().equals(candidate.getEvent())) {
        return true;
      }
    }
    return false;
  }