/**
   * Loads the cached features for a given document.
   *
   * @param document the document whose cached features should be loaded
   * @param documentFile The cache file for the document.
   * @return the cached features if possible; null if no cache exists or it cannot be read
   */
  private List<EventSet> getCachedFeatures(Document document, File documentFile) {
    List<EventSet> generatedEvents = null;
    BufferedReader reader = null;

    if (documentFile.exists() && !documentFile.isDirectory() && documentFile.canRead()) {
      try {
        reader = new BufferedReader(new FileReader(documentFile));
      } catch (FileNotFoundException e) {
        // shouldn't ever get here, since we checked exists()/canRead() above,
        // but bail out rather than dereference a null reader below
        e.printStackTrace();
        return null;
      }
    } else {
      return null;
    }

    try {
      // cachedPath is the path to the document that was used when the cache for that
      // document was created; cachedLastModified is the last-modified timestamp on the
      // document that was cached.
      String cachedPath = reader.readLine();
      long cachedLastModified = Long.parseLong(reader.readLine());

      String path = document.getFilePath();
      File currDoc = new File(path);
      long lastModified = currDoc.lastModified();

      if (!(currDoc.getCanonicalPath().equals(cachedPath) && lastModified == cachedLastModified)) {
        // cache is invalid
        reader.close();
        return null;
      }
      String line = null;
      generatedEvents = new ArrayList<EventSet>();
      while ((line = reader.readLine()) != null) {
        if (line.isEmpty()) continue;
        EventSet es = new EventSet();
        es.setAuthor(document.getAuthor());
        es.setDocumentName(document.getTitle());
        es.setEventSetID(line);

        String event = null;
        while ((event = reader.readLine()) != null) {
          if (event.isEmpty()) continue;
          if (event.equals(",")) break; // delimiter between event sets
          es.addEvent(new Event(event));
        }

        generatedEvents.add(es);
      }
      reader.close();
    } catch (IOException | NumberFormatException e) {
      // a malformed or truncated timestamp line is treated like an unreadable cache
      e.printStackTrace();
      return null;
    }

    return generatedEvents;
  }
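
The loader above implies a simple line-oriented cache format: line 1 is the document's canonical path, line 2 its last-modified timestamp, and each EventSet follows as an ID line with one event per line, separated by a lone "," line. Below is a minimal sketch of the matching writer; the method name writeCachedFeatures and its placement are assumptions, not part of the original source, and it uses only the Document/EventSet/Event calls that appear elsewhere in this file (java.io imports elided, as in the surrounding snippets).

  // Hypothetical counterpart to getCachedFeatures(): writes the format
  // that the loader above parses.
  private void writeCachedFeatures(Document document, File documentFile, List<EventSet> eventSets)
      throws IOException {
    File doc = new File(document.getFilePath());
    documentFile.getParentFile().mkdirs(); // ensure the author directory exists
    try (PrintWriter writer = new PrintWriter(new FileWriter(documentFile))) {
      writer.println(doc.getCanonicalPath()); // line 1: canonical path
      writer.println(doc.lastModified()); // line 2: last-modified timestamp
      for (EventSet es : eventSets) {
        writer.println(es.getEventSetID()); // event set header
        for (Event e : es) {
          writer.println(e.getEvent()); // one event per line
        }
        writer.println(","); // delimiter between event sets
      }
    }
  }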
Example #2
 public void build(EventSet e) {
   for (int i = 0; i < e.size(); i++) {
     Event start = e.eventAt(i);
     if (!root.isEventInLevel(start)) {
       insertAtRoot(start, e, i);
     } else {
       insertBelowRoot(start, e, i);
     }
   }
   root.key = null;
 }
Example #3
 private void insertAtRoot(Event start, EventSet e, int offset) {
   root.addEventToLevel(start);
   XEDictionaryNode node;
   node = root;
   int j = offset;
   while (j < e.size() - 1) {
     node = node.get(e.eventAt(j));
     j++;
     // System.out.println("Adding Event: " + e.eventAt(j));
     node.addEventToLevel(e.eventAt(j));
   }
 }
Example #4
 public int find(EventSet e) {
   int matchlength = 0;
   boolean mismatch = false; // true once the trie stops matching
   XEDictionaryNode node = root;
   while ((matchlength < e.size()) && !mismatch) {
     if (node.isEventInLevel(e.eventAt(matchlength))) {
       node = node.get(e.eventAt(matchlength));
       matchlength++;
     } else {
       mismatch = true;
     }
   }
   return matchlength;
 }
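
Together, build() and find() turn the dictionary into a trie lookup: find() returns the length of the longest prefix of the query that follows an existing path. A small usage sketch, with placeholder event values:

 // Hypothetical usage of XEDictionary: build from one sequence, then see
 // how far a query sequence stays inside the trie.
 EventSet training = new EventSet();
 for (String s : new String[] {"a", "b", "a", "c"}) {
   training.addEvent(new Event(s));
 }
 EventSet query = new EventSet();
 for (String s : new String[] {"a", "b", "x"}) {
   query.addEvent(new Event(s));
 }

 XEDictionary xed = new XEDictionary();
 xed.build(training);
 int matchlength = xed.find(query); // 2: "a" and "b" match, "x" does not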
Example #5
  private double meanEntropy(EventSet e1, EventSet e2, int windowSize) {

    double totalEntropy = 0;
    int trials = 0;

    if (windowSize > e1.size() - 1) {
      windowSize = e1.size();
    }

    //		for (int j = 0; j <= e1.size() - windowSize; j++) {
    XEDictionary xed = new XEDictionary();
    EventSet dictionary;
    dictionary = window(e1, 0, windowSize);
    xed.build(dictionary);

    for (int i = 0; i <= e2.size() - windowSize; i++) {
      totalEntropy += xed.find(window(e2, i, windowSize));
      trials++;
    }
    //		}
    return totalEntropy / trials;
  }
  /**
   * Determines which EventSets to use for the given documents based on the chosen cullers.<br>
   *
   * @param eventSets A List which contains Lists of EventSets (each inner List represents a
   *     single document's EventSets)
   * @param cumulativeFeatureDriver the driver with the culling functionality
   * @return The culled List of Lists of EventSets created from eventSets
   * @throws Exception
   */
  public List<List<EventSet>> cull(
      List<List<EventSet>> eventSets, CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // a hacky workaround for the bug in the event culler; fix that
    // later, then remove this
    ArrayList<String> IDs = new ArrayList<String>();
    for (EventSet es : eventSets.get(0)) {
      IDs.add(es.getEventSetID());
    }

    // remove the metadata prior to culling
    ArrayList<EventSet> docMetaData = new ArrayList<EventSet>();
    for (List<EventSet> les : eventSets) {
      docMetaData.add(les.remove(les.size() - 1));
    }

    // cull the events
    List<List<EventSet>> culledEventSets =
        CumulativeEventCuller.cull(eventSets, cumulativeFeatureDriver);

    // add the metadata back in
    int index = 0;
    for (List<EventSet> les : culledEventSets) {
      les.add(docMetaData.get(index));
      index++;
    }

    // a hacky workaround for the bug in the event culler; fix that
    // later, then remove this
    for (int j1 = 0; j1 < culledEventSets.size(); j1++) {
      for (int j2 = 0; j2 < culledEventSets.get(j1).size(); j2++) {
        culledEventSets.get(j1).get(j2).setEventSetID(IDs.get(j2));
      }
    }

    // return culled events
    return culledEventSets;
  }
  @SuppressWarnings("static-access")
  @Override
  public EventSet createEventSet(Document doc) {
    EventSet es = new EventSet(doc.getAuthor());
    char[] text = doc.getProcessedText();
    String stringText = new String(text);

    // use MaxentPOSTagsEventDriver's tagger
    // initialize tagger and return empty event set if encountered a problem
    if (tagger == null) {
      tagger = MaxentPOSTagsEventDriver.initTagger();
      if (tagger == null) return es;
    }

    List<List<HasWord>> sentences =
        tagger.tokenizeText(new BufferedReader(new StringReader(stringText)));
    ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>();
    for (List<HasWord> sentence : sentences) tagged.addAll(tagger.tagSentence(sentence));

    int i, j, n;
    try {
      n = Integer.parseInt(getParameter("N"));
    } catch (NumberFormatException e) {
      n = 2;
    }
    String curr;
    for (i = 0; i < tagged.size() - n + 1; i++) {
      curr = "(" + tagged.get(i).tag() + ")";
      for (j = 1; j < n; j++) {
        curr += "-(" + tagged.get(i + j).tag() + ")";
      }
      es.addEvent(new Event(curr));
    }

    sentences.clear();
    sentences = null;
    return es;
  }
 public double getValue(Document doc) throws EventGenerationException {
   double wordCount = wordCounter.getValue(doc);
   double sentenceCount = sentenceCounter.getValue(doc);
   EventSet syllables = syllablesDriver.createEventSet(doc);
   for (int i = syllables.size() - 1; i >= 0; i--) {
     if (Integer.parseInt(syllables.eventAt(i).toString()) < 3) {
       syllables.removeEvent(syllables.eventAt(i));
     }
   }
   double complexWordsCount = syllables.size();
   return 0.4 * (wordCount / sentenceCount + 100 * complexWordsCount / wordCount);
 }
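
The return value is the Gunning fog index: 0.4 x (average sentence length + 100 x the share of complex words), where a complex word is one of three or more syllables (the loop above removes everything shorter). As a worked example: a 100-word document with 5 sentences and 10 complex words scores 0.4 x (100/5 + 100 x 10/100) = 0.4 x (20 + 10) = 12.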
Example #9
 private void insertBelowRoot(Event start, EventSet e, int offset) {
   XEDictionaryNode node;
   node = root;
   // System.out.println("Event at offset: " + e.eventAt(offset));
   node = node.get(e.eventAt(offset));
   int j = offset;
   boolean matches = true; // match the events up to a given level
   while (matches && (j < e.size() - 1)) {
     j++;
     if (node.isEventInLevel(e.eventAt(j))) {
       // System.out.println("Match at level: " + e.eventAt(j));
       node = node.get(e.eventAt(j));
     } else {
       matches = false;
     }
   }
   for (int i = j; i < e.size(); i++) {
     // System.out.println("Adding Event: " + e.eventAt(i));
     node.addEventToLevel(e.eventAt(i));
     node = node.get(e.eventAt(i));
   }
 }
  /**
   * Culls the test set using the relevant Events extracted from the training data.<br>
   *
    * @param relevantEvents the features from the EventSets which are going to be evaluated
    * @param eventSetsToCull The test documents to be culled
    * @param cfd the CumulativeFeatureDriver describing the features
    * @return the culled test documents
    * @throws Exception
   */
  public List<EventSet> cullWithRespectToTraining(
      List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd)
      throws Exception {
    List<EventSet> relevant = relevantEvents;
    int numOfFeatureClasses = eventSetsToCull.size() - 1; // -1 to compensate for removing metadata
    int i;
    List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>();

    // remove the metadata prior to culling
    EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1);

    // make sure all unknown sets contain only events that appear in the
    // known sets, UNLESS the event set contains a sole numeric value
    // event - in that case take it anyway
    for (i = 0; i < numOfFeatureClasses; i++) {
      if (cfd.featureDriverAt(i).isCalcHist()) {
        // initialize set of relevant events
        EventSet es = relevant.get(i);
        Set<String> relevantEventsString = new HashSet<String>(es.size());
        for (Event e : es) relevantEventsString.add(e.getEvent());

        // remove all non-relevant events from unknown event sets
        EventSet unknown;
        Event e;
        unknown = eventSetsToCull.get(i);
        Iterator<Event> iterator = unknown.iterator();
        Event next = null;

        // the test doc may not contain a given feature (i.e., it might not
        // have any semicolons)
        if (iterator.hasNext()) next = iterator.next();

        // check every event in the unknown set, including the last one
        while (next != null) {
          // copy the event
          e = next;
          boolean remove = true;

          // check to see if the event is relevant
          EventSet relevantSet = relevantEvents.get(i);
          for (int l = 0; l < relevantSet.size(); l++) {
            if (e.equals(relevantSet.eventAt(l))) {
              remove = false; // it is relevant; keep it
              break;
            }
          }

          // remove the event if it isn't relevant
          if (remove) {
            iterator.remove();
          }

          // grab the next event, if any
          next = iterator.hasNext() ? iterator.next() : null;
        }

        // add the culled event set
        culledUnknownEventSets.add(unknown);

      } else { // one unique numeric event
        // add non-histogram if it is in the relevantEventSets list
        boolean isRelevant = false;

        for (EventSet res : relevantEvents) {
          if (res.getEventSetID().equals(eventSetsToCull.get(i).getEventSetID())) {
            isRelevant = true;
            break;
          }
        }

        if (isRelevant) culledUnknownEventSets.add(eventSetsToCull.get(i));
      }
    }
    eventSetsToCull.add(metadata);
    culledUnknownEventSets.add(metadata);

    return culledUnknownEventSets;
  }
  /**
   * Extracts the List of EventSets from a document using the provided CumulativeFeatureDriver.<br>
   *
   * @param document the document to have features extracted and made into event sets
   * @param cumulativeFeatureDriver the driver containing the features to be extracted and the
   *     functionality to do so
    * @param loadDocContents whether or not the document contents are already loaded into the object
    * @param isUsingCache whether or not to read and maintain the feature cache
    * @return the List of EventSets for the document
    * @throws Exception if event extraction fails
    */
  public List<EventSet> extractEventSets(
      Document document,
      CumulativeFeatureDriver cumulativeFeatureDriver,
      boolean loadDocContents,
      boolean isUsingCache)
      throws Exception {

    List<EventSet> generatedEvents = new ArrayList<EventSet>();

    if (isUsingCache) {
      File cacheDir =
          new File(JSANConstants.JSAN_CACHE + "_" + cumulativeFeatureDriver.getName() + "/");

      File authorDir = null;
      if (document.getAuthor().equals(JSANConstants.DUMMY_NAME)) {
        authorDir = new File(cacheDir, "you");
      } else {
        authorDir = new File(cacheDir, "_" + document.getAuthor());
      }

      File documentFile = new File(authorDir, document.getTitle() + ".cache");
      generatedEvents = getCachedFeatures(document, documentFile);
      if (generatedEvents == null) {
        // delete the cache for this document! It is invalid
        documentFile.delete();
        // program will continue as normal, extracting events
      } else {
        // return the cached features
        return generatedEvents;
      }
    }

    // Extract the Events from the documents
    try {
      generatedEvents =
          cumulativeFeatureDriver.createEventSets(document, loadDocContents, isUsingCache);
    } catch (Exception e) {
      LOG.error("Failed to extract events from a document!", e);
      throw e;
    }
    // create metadata event to store document information
    EventSet documentInfo = new EventSet();
    documentInfo.setEventSetID("<DOCUMENT METADATA>");

    /*
     * Metadata EventSet format:
     *
     * EventSetID: "<DOCUMENT METADATA>"
     * Event at index:
     *   0 : author
     *   1 : title
     *   2 : sentences in document
     *   3 : words in document
     *   4 : characters in document
     *   5 : letters in document
     */

    // Extract document title and author
    Event authorEvent = new Event(document.getAuthor());
    // Event titleEvent = new Event(document.getFilePath());
    Event titleEvent = new Event(document.getTitle());
    documentInfo.addEvent(authorEvent);
    documentInfo.addEvent(titleEvent);

    // Extract normalization baselines
    // Sentences in doc
    {
      Document doc = document;
      SingleNumericEventDriver counter = new SentenceCounterEventDriver();
      Event tempEvent = null;

      try {
        if (!loadDocContents) doc.load();
        tempEvent = new Event("" + (int) counter.getValue(doc));
      } catch (Exception e) {
        LOG.error("Failed to extract num sentences from document", e);
        throw e;
      }

      documentInfo.addEvent(tempEvent);
    }

    // Words in doc
    {
      Document doc = document;
      SingleNumericEventDriver counter = new WordCounterEventDriver();
      Event tempEvent = null;
      try {
        if (!loadDocContents) doc.load();
        tempEvent = new Event("" + (int) counter.getValue(doc));
      } catch (Exception e) {
        LOG.error("Failed to extract num words from document", e);
        throw e;
      }
      documentInfo.addEvent(tempEvent);
    }

    // Characters in doc
    {
      Document doc = document;
      SingleNumericEventDriver counter = new CharCounterEventDriver();
      Event tempEvent = null;
      try {
        if (!loadDocContents) doc.load();
        tempEvent = new Event("" + (int) counter.getValue(doc));
      } catch (Exception e) {
        LOG.error("Failed to extract num characters from document", e);
        throw e;
      }
      documentInfo.addEvent(tempEvent);
    }

    // Letters in doc
    {
      Document doc = document;
      SingleNumericEventDriver counter = new LetterCounterEventDriver();
      Event tempEvent = null;
      try {
        if (!loadDocContents) doc.load();
        tempEvent = new Event("" + (int) counter.getValue(doc));
      } catch (Exception e) {
        LOG.error("Failed to extract num letters from document", e);
        throw e;
      }
      documentInfo.addEvent(tempEvent);
    }

    // add the metadata EventSet to the List<EventSet>
    generatedEvents.add(documentInfo);

    // return the List<EventSet>
    return generatedEvents;
  }
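Callers depend on the metadata EventSet being the last element of the returned list. A minimal read-back sketch using the index layout from the comment block above; exception handling is elided, and document and cfd are assumed to already exist:

  // Hypothetical read-back of the metadata EventSet appended above.
  List<EventSet> events = extractEventSets(document, cfd, false, false);
  EventSet meta = events.get(events.size() - 1);
  String author = meta.eventAt(0).getEvent();
  String title = meta.eventAt(1).getEvent();
  int sentences = Integer.parseInt(meta.eventAt(2).getEvent());
  int words = Integer.parseInt(meta.eventAt(3).getEvent());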
  /**
   * Converts the extracted document information into a JStylo DataMap.
   *
   * @param features the complete list of feature (attribute) names
   * @param relevantEvents the List of EventSets to be extracted per document
   * @param cumulativeFeatureDriver the driver used to extract the EventSets
   * @param documentData the document's extracted EventSets, with the metadata EventSet last
   * @return a map from feature index to the document's FeatureData for that feature
   */
  public ConcurrentHashMap<Integer, FeatureData> createDocMap(
      List<String> features,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver,
      List<EventSet> documentData) {

    // generate training instances
    ConcurrentHashMap<Integer, FeatureData> documentMap =
        new ConcurrentHashMap<Integer, FeatureData>();

    // remove metadata event
    EventSet metadata = documentData.remove(documentData.size() - 1);

    // go through all eventSets in the document
    for (EventSet es : documentData) {

      // initialize relevant information
      ArrayList<Integer> indices = new ArrayList<Integer>();
      ArrayList<Event> events = new ArrayList<Event>();
      EventHistogram currHistogram = new EventHistogram();

      // whether or not we actually need this eventSet
      boolean eventSetIsRelevant = false;

      // find out if it is a histogram or not
      if (cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).isCalcHist()) {

        // find the event set in the list of relevant events
        for (EventSet res : relevantEvents) {
          if (es.getEventSetID().equals(res.getEventSetID())) {
            eventSetIsRelevant = true;
            break;
          }
        }

        // if it is relevant
        if (eventSetIsRelevant) {

          // find the indices of the events
          // and count all of the events
          for (Event e : es) {
            int currIndex = 0;
            boolean hasInner = false;

            // for the events in the set
            for (EventSet res : relevantEvents) {
              boolean found = false;
              for (Event re : res) {
                hasInner = true;

                // if they are the same event
                if (e.getEvent().equals(re.getEvent())) {
                  boolean inList = false;
                  for (Event el : events) {
                    if (el.getEvent().equals(e.getEvent())) {
                      inList = true;
                      break;
                    }
                  }

                  if (!inList) {
                    indices.add(currIndex);
                    events.add(e);
                  }
                  // old location; revert if this change breaks anything
                  currHistogram.add(e);
                  found = true;
                }
                if (found) {
                  break;
                }
                currIndex++;
              }
              if (found) {
                break;
              }

              // if there's no inner, it was a non-hist feature.
              // increment by one
              if (!hasInner) {
                currIndex++;
              }
            }
          }
          // calculate/add the histograms
          int index = 0;
          for (Integer i : indices) {
            documentMap.put(
                i,
                new FeatureData(
                    cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(),
                    cumulativeFeatureDriver
                        .featureDriverAt(documentData.indexOf(es))
                        .getNormBaseline()
                        .getTitle(),
                    currHistogram.getAbsoluteFrequency(events.get(index))));
            index++;
          }
        }
      } else { // non histogram feature

        // initialize the index
        int nonHistIndex = 0;

        // find the indices of the events
        // and count all of the events
        for (EventSet res : relevantEvents) {

          if (es.getEventSetID().equals(res.getEventSetID())) {
            break;
          }

          // count to find the index
          boolean hasInner = false;
          for (@SuppressWarnings("unused") Event re : res) {
            hasInner = true;
            nonHistIndex++;
          }

          // if there's no inner feature, increment by one; we just passed a non-histogram
          if (!hasInner) nonHistIndex++;
        }

        // Extract and add the event
        String eventString = es.eventAt(0).getEvent();
        int startIndex = eventString.indexOf("{");
        int endIndex = eventString.indexOf("}");
        eventString = eventString.substring(startIndex + 1, endIndex);

        FeatureData fd =
            new FeatureData(
                cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(),
                cumulativeFeatureDriver
                    .featureDriverAt(documentData.indexOf(es))
                    .getNormBaseline()
                    .getTitle(),
                Math.round((float) Double.parseDouble(eventString)));
        documentMap.put(nonHistIndex, fd);
      }
    }
    // add metadata back. Not sure if necessary
    documentData.add(metadata);

    return documentMap;
  }
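The returned map is keyed by each feature's index in the flattened attribute list (the same ordering getFeatureList produces), so a document that lacks a feature simply has no entry at that index. A hedged consumption sketch; features, relevantEvents, cfd, and documentData are assumed to come from the earlier pipeline steps:

  // Hypothetical consumption of the document map; indices line up with the
  // List<String> returned by getFeatureList for the same relevantEvents.
  ConcurrentHashMap<Integer, FeatureData> docMap =
      createDocMap(features, relevantEvents, cfd, documentData);
  for (int featureIndex = 0; featureIndex < features.size(); featureIndex++) {
    FeatureData fd = docMap.get(featureIndex); // null if absent from this doc
    if (fd != null) {
      System.out.println(features.get(featureIndex) + " -> " + fd);
    }
  }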
  /**
   * Extracts a list of all features to be used for analysis.
   *
    * @param culledEventSets the culled List of Lists of EventSets
    * @param relevantEvents the List of EventSets to extract from the test document(s)
    * @param cumulativeFeatureDriver the driver used to extract the EventSets
    * @return the list of feature (attribute) names to be used for analysis
   * @throws Exception
   */
  public List<String> getFeatureList(
      List<List<EventSet>> culledEventSets,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // remove the metadata prior to generating the attribute list
    ArrayList<EventSet> docMetaData = new ArrayList<EventSet>();
    for (List<EventSet> les : culledEventSets) {
      docMetaData.add(les.remove(les.size() - 1));
    }

    // initialize useful things
    int numOfFeatureClasses = relevantEvents.size();
    List<EventSet> list;

    List<String> features = new ArrayList<String>(numOfFeatureClasses);

    // initialize list of sets of events, which will eventually become the
    // attributes
    List<EventSet> allEvents = new ArrayList<EventSet>(numOfFeatureClasses);

    // Neither the doc title nor the author is in the List<List<EventSet>>, so this should work fine
    for (int currEventSet = 0; currEventSet < numOfFeatureClasses; currEventSet++) {
      // initialize relevant list of event sets and histograms
      list = new ArrayList<EventSet>();
      for (int i = 0; i < numOfFeatureClasses; i++) list.add(relevantEvents.get(i));

      // initialize eventSet
      EventSet events = new EventSet();
      events.setEventSetID(relevantEvents.get(currEventSet).getEventSetID());

      if (cumulativeFeatureDriver.featureDriverAt(currEventSet).isCalcHist()) { // histogram feature

        // generate event histograms and unique event list
        EventSet eventSet = list.get(currEventSet);
        for (Event event : eventSet) {
          events.addEvent(event);
        }
        allEvents.add(events);

      } else { // one unique numeric event

        // generate sole event (give placeholder value)
        Event event = new Event("{-}");
        events.addEvent(event);
        allEvents.add(events);
      }
    }

    // add all of the events to the feature list
    int featureIndex = 0;
    for (EventSet es : allEvents) {
      Iterator<Event> iterator = es.iterator();
      if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
        if (iterator.hasNext()) {
          // grab first event; there should be at least one
          Event nextEvent = (Event) iterator.next();
          // get and add all middle events if they exist
          while (iterator.hasNext()) {
            features.add(nextEvent.getEvent());
            nextEvent = (Event) iterator.next();
          }
          // add the last event
          features.add(nextEvent.getEvent());
        }
      } else {
        features.add(es.getEventSetID());
      }
      featureIndex++;
    }

    // add the metadata back in
    int index = 0;
    for (List<EventSet> les : culledEventSets) {
      les.add(docMetaData.get(index));
      index++;
    }

    return features;
  }
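These methods are designed to run in a fixed order: extract, cull, derive the relevant events, build the attribute list, then map each document onto it. A condensed pipeline sketch, assuming extractor is an instance of this class and trainingDocs, testDoc, and cfd are already constructed (error handling elided):

  // Hypothetical end-to-end ordering of the extraction pipeline.
  List<List<EventSet>> eventSets = new ArrayList<List<EventSet>>();
  for (Document d : trainingDocs) {
    eventSets.add(extractor.extractEventSets(d, cfd, false, false));
  }
  List<List<EventSet>> culled = extractor.cull(eventSets, cfd);
  List<EventSet> relevantEvents = extractor.getRelevantEvents(culled, cfd);
  List<String> features = extractor.getFeatureList(culled, relevantEvents, cfd);

  // test side: extract, cull against training, then build the data map
  List<EventSet> testSets = extractor.extractEventSets(testDoc, cfd, false, false);
  testSets = extractor.cullWithRespectToTraining(relevantEvents, testSets, cfd);
  ConcurrentHashMap<Integer, FeatureData> docMap =
      extractor.createDocMap(features, relevantEvents, cfd, testSets);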
  /**
   * Test method for {@link
   * com.jgaap.eventDrivers.PorterStemmerEventDriver#createEventSet(com.jgaap.generics.Document)}.
   *
   * @throws EventGenerationException
   */
  @Test
  public void testCreateEventSetDocumentSet() throws EventGenerationException {
    /* test case 1 -- no punctuation */
    Document doc = new Document();
    doc.readStringText(
        "tests Tested TESTING TeSt "
            + "867-5309 "
            + "a aaron abaissiez abandon abandoned abase abash abate abated abatement "
            + "abatements abates abbess abbey abbeys abbominable abbot abbots abbreviated "
            + "abed abel aberga abergavenny abet abetting abhominable abhor abhorr abhorred "
            + "abhorring abhors abhorson abide abides abilities ability abject abjectly "
            + "abjects abjur abjure able abler aboard abode aboded abodements aboding "
            + "abominable abominably abominations abortive abortives abound abounding "
            + "about above abr abraham abram abreast abridg abridge abridged abridgment "
            + "abroach abroad abrogate abrook abrupt abruption abruptly absence absent "
            + "absey absolute absolutely absolv absolver abstains abstemious abstinence "
            + "abstract absurd absyrtus abundance abundant abundantly abus abuse abused "
            + "abuser abuses abusing abutting aby abysm ac academe academes accent accents "
            + "accept acceptable acceptance accepted accepts access accessary accessible "
            + "accidence accident accidental accidentally accidents accite accited accites "
            + "acclamations accommodate accommodated accommodation accommodations "
            + "accommodo accompanied accompany accompanying accomplices accomplish "
            + "accomplished accomplishing accomplishment accompt accord accordant accorded "
            + "accordeth according accordingly accords accost accosted account accountant "
            + "accounted accounts accoutred accoutrement accoutrements accrue");

    EventSet sampleEventSet = new PorterStemmerEventDriver().createEventSet(doc);
    EventSet expectedEventSet = new EventSet();
    Vector<Event> tmp = new Vector<Event>();

    tmp.add(new Event("test"));
    tmp.add(new Event("Test"));
    tmp.add(new Event("TEST"));
    tmp.add(new Event("TeSt"));

    tmp.add(new Event("867-5309"));

    tmp.add(new Event("a"));
    tmp.add(new Event("aaron"));
    tmp.add(new Event("abaissiez"));
    tmp.add(new Event("abandon"));
    tmp.add(new Event("abandon"));
    tmp.add(new Event("abas"));
    tmp.add(new Event("abash"));
    tmp.add(new Event("abat"));
    tmp.add(new Event("abat"));
    tmp.add(new Event("abat"));
    tmp.add(new Event("abat"));
    tmp.add(new Event("abat"));
    tmp.add(new Event("abbess"));
    tmp.add(new Event("abbei"));
    tmp.add(new Event("abbei"));
    tmp.add(new Event("abbomin"));
    tmp.add(new Event("abbot"));
    tmp.add(new Event("abbot"));
    tmp.add(new Event("abbrevi"));
    tmp.add(new Event("ab"));
    tmp.add(new Event("abel"));
    tmp.add(new Event("aberga"));
    tmp.add(new Event("abergavenni"));
    tmp.add(new Event("abet"));
    tmp.add(new Event("abet"));
    tmp.add(new Event("abhomin"));
    tmp.add(new Event("abhor"));
    tmp.add(new Event("abhorr"));
    tmp.add(new Event("abhor"));
    tmp.add(new Event("abhor"));
    tmp.add(new Event("abhor"));
    tmp.add(new Event("abhorson"));
    tmp.add(new Event("abid"));
    tmp.add(new Event("abid"));
    tmp.add(new Event("abil"));
    tmp.add(new Event("abil"));
    tmp.add(new Event("abject"));
    tmp.add(new Event("abjectli"));
    tmp.add(new Event("abject"));
    tmp.add(new Event("abjur"));
    tmp.add(new Event("abjur"));
    tmp.add(new Event("abl"));
    tmp.add(new Event("abler"));
    tmp.add(new Event("aboard"));
    tmp.add(new Event("abod"));
    tmp.add(new Event("abod"));
    tmp.add(new Event("abod"));
    tmp.add(new Event("abod"));
    tmp.add(new Event("abomin"));
    tmp.add(new Event("abomin"));
    tmp.add(new Event("abomin"));
    tmp.add(new Event("abort"));
    tmp.add(new Event("abort"));
    tmp.add(new Event("abound"));
    tmp.add(new Event("abound"));
    tmp.add(new Event("about"));
    tmp.add(new Event("abov"));
    tmp.add(new Event("abr"));
    tmp.add(new Event("abraham"));
    tmp.add(new Event("abram"));
    tmp.add(new Event("abreast"));
    tmp.add(new Event("abridg"));
    tmp.add(new Event("abridg"));
    tmp.add(new Event("abridg"));
    tmp.add(new Event("abridg"));
    tmp.add(new Event("abroach"));
    tmp.add(new Event("abroad"));
    tmp.add(new Event("abrog"));
    tmp.add(new Event("abrook"));
    tmp.add(new Event("abrupt"));
    tmp.add(new Event("abrupt"));
    tmp.add(new Event("abruptli"));
    tmp.add(new Event("absenc"));
    tmp.add(new Event("absent"));
    tmp.add(new Event("absei"));
    tmp.add(new Event("absolut"));
    tmp.add(new Event("absolut"));
    tmp.add(new Event("absolv"));
    tmp.add(new Event("absolv"));
    tmp.add(new Event("abstain"));
    tmp.add(new Event("abstemi"));
    tmp.add(new Event("abstin"));
    tmp.add(new Event("abstract"));
    tmp.add(new Event("absurd"));
    tmp.add(new Event("absyrtu"));
    tmp.add(new Event("abund"));
    tmp.add(new Event("abund"));
    tmp.add(new Event("abundantli"));
    tmp.add(new Event("abu"));
    tmp.add(new Event("abus"));
    tmp.add(new Event("abus"));
    tmp.add(new Event("abus"));
    tmp.add(new Event("abus"));
    tmp.add(new Event("abus"));
    tmp.add(new Event("abut"));
    tmp.add(new Event("abi"));
    tmp.add(new Event("abysm"));
    tmp.add(new Event("ac"));
    tmp.add(new Event("academ"));
    tmp.add(new Event("academ"));
    tmp.add(new Event("accent"));
    tmp.add(new Event("accent"));
    tmp.add(new Event("accept"));
    tmp.add(new Event("accept"));
    tmp.add(new Event("accept"));
    tmp.add(new Event("accept"));
    tmp.add(new Event("accept"));
    tmp.add(new Event("access"));
    tmp.add(new Event("accessari"));
    tmp.add(new Event("access"));
    tmp.add(new Event("accid"));
    tmp.add(new Event("accid"));
    tmp.add(new Event("accident"));
    tmp.add(new Event("accident"));
    tmp.add(new Event("accid"));
    tmp.add(new Event("accit"));
    tmp.add(new Event("accit"));
    tmp.add(new Event("accit"));
    tmp.add(new Event("acclam"));
    tmp.add(new Event("accommod"));
    tmp.add(new Event("accommod"));
    tmp.add(new Event("accommod"));
    tmp.add(new Event("accommod"));
    tmp.add(new Event("accommodo"));
    tmp.add(new Event("accompani"));
    tmp.add(new Event("accompani"));
    tmp.add(new Event("accompani"));
    tmp.add(new Event("accomplic"));
    tmp.add(new Event("accomplish"));
    tmp.add(new Event("accomplish"));
    tmp.add(new Event("accomplish"));
    tmp.add(new Event("accomplish"));
    tmp.add(new Event("accompt"));
    tmp.add(new Event("accord"));
    tmp.add(new Event("accord"));
    tmp.add(new Event("accord"));
    tmp.add(new Event("accordeth"));
    tmp.add(new Event("accord"));
    tmp.add(new Event("accordingli"));
    tmp.add(new Event("accord"));
    tmp.add(new Event("accost"));
    tmp.add(new Event("accost"));
    tmp.add(new Event("account"));
    tmp.add(new Event("account"));
    tmp.add(new Event("account"));
    tmp.add(new Event("account"));
    tmp.add(new Event("accoutr"));
    tmp.add(new Event("accoutr"));
    tmp.add(new Event("accoutr"));
    tmp.add(new Event("accru"));
    expectedEventSet.addEvents(tmp);

    // System.out.println("Expected is " + expectedEventSet.toString());
    // System.out.println("Actual is " + sampleEventSet.toString());
    assertTrue(expectedEventSet.equals(sampleEventSet));
  }
Example #15
  private EventSet window(EventSet e1, int offset, int windowSize) {

    return e1.subset(offset, offset + windowSize);
  }
  /**
   * Goes over the culled List of Lists of EventSets and determines which events are histograms and
   * which have a single numerical value.<br>
   * Uses the information to prepare a List of EventSets to extract from the test document(s).
   *
   * @param culledEventSets The culled List of Lists of EventSets
   * @param cumulativeFeatureDriver The driver used to extract the EventSets
   * @return The List of EventSets to extract from the test document(s)
   * @throws Exception
   */
  public List<EventSet> getRelevantEvents(
      List<List<EventSet>> culledEventSets, CumulativeFeatureDriver cumulativeFeatureDriver)
      throws Exception {

    // remove the metadata prior to generating the relevantEvents
    ArrayList<EventSet> docMetaData = new ArrayList<EventSet>();
    for (List<EventSet> les : culledEventSets) {
      docMetaData.add(les.remove(les.size() - 1));
    }

    // initialize the EventSet list
    List<EventSet> relevantEvents = new LinkedList<EventSet>();

    // iterate over the List of Lists
    for (List<EventSet> l : culledEventSets) {
      // iterate over each inner list's eventSets
      int featureIndex = 0;
      for (EventSet esToAdd : l) {
        // whether or not to add the event set to the list (if false, it
        // is already on the list)
        boolean add = true;

        for (EventSet esl : relevantEvents) {
          // this should compare the category/name of the event set
          if (esToAdd.getEventSetID().equals(esl.getEventSetID())) {
            add = false;
            break;
          }
        }

        // this event set isn't on the list at all, just add it (which
        // also adds its internal events) to the list
        if (add) {
          EventSet temp = new EventSet();
          temp.setEventSetID(esToAdd.getEventSetID());

          // for all of the events
          for (Event e : esToAdd) {
            boolean absent = true;
            // check to see if it's been added yet or not
            for (Event ae : temp) {
              if (ae.getEvent().equals(e.getEvent())) {
                absent = false;
                break;
              }
            }
            // if it has not been added, add it
            if (absent) {
              if (!cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist())
                temp.addEvent(new Event("{-}"));
              else temp.addEvent(e);
            }
          }

          relevantEvents.add(temp);
        } else {
          // go through this eventSet and add any events to the
          // relevant EventSet if they aren't already there.
          if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) {
            for (Event e : esToAdd) {
              boolean toAdd = true;
              // for all events in the relevant list
              for (Event re : relevantEvents.get(featureIndex)) {
                // if it's already there, don't add it
                if (e.getEvent().equals(re.getEvent())) {
                  toAdd = false;
                  break;
                }
              }
              // add it if it isn't there
              if (toAdd) {
                relevantEvents.get(featureIndex).addEvent(e);
              }
            }
          }
        }
        featureIndex++;
      }
    }

    // add the metadata back in
    int index = 0;
    for (List<EventSet> les : culledEventSets) {
      les.add(docMetaData.get(index));
      index++;
    }

    return relevantEvents;
  }
  /**
   * Returns KC distance between event sets es1 and es2
   *
   * @param es1 The first EventSet
   * @param es2 The second EventSet
   * @return the KC distance between them
   */
  @Override
  public double distance(EventSet es1, EventSet es2) {

    EventHistogram h1 = es1.getHistogram();
    EventHistogram h2 = es2.getHistogram();

    Set<Event> s = new HashSet<Event>();

    List<Pair<Event, Double>> l1 = new ArrayList<Pair<Event, Double>>();
    List<Pair<Event, Double>> l2 = new ArrayList<Pair<Event, Double>>();

    HashMap<Event, Integer> hm1 = new HashMap<Event, Integer>();
    HashMap<Event, Integer> hm2 = new HashMap<Event, Integer>();

    double oldfreq = Double.POSITIVE_INFINITY;

    double correlation = 0.0;

    s.addAll(es1.uniqueEvents());
    s.addAll(es2.uniqueEvents());

    // System.out.println(h1.toString());
    // System.out.println(h2.toString());

    /* make lists of the histograms */
    for (Event e : h1) {
      l1.add(new Pair<Event, Double>(e, h1.getRelativeFrequency(e), 2));
    }
    for (Event e : h2) {
      l2.add(new Pair<Event, Double>(e, h2.getRelativeFrequency(e), 2));
    }

    /* sort the list so the most frequent items are at the top */
    /* NOTE : THIS MAY BE USEFUL ELSEWHERE : SAVE THIS CODE */
    Collections.sort(l1);
    Collections.reverse(l1);
    Collections.sort(l2);
    Collections.reverse(l2);

    /* DEBUGGING STUFF
    for (Pair <Event,Double> p : l1) {
    	System.out.println("L1: " + p.toString());
    }
    for (Pair <Event,Double> p : l1) {
    	System.out.println("L2: " + p.toString());
    }
    */

    /* Convert lists into a hashmap of event:rank pairs */
    int rank = 0;
    int count = 0;
    for (Pair<Event, Double> p : l1) {
      Event e = (Event) (p.getFirst());
      double f = (Double) (p.getSecond());
      count++;
      if (f != oldfreq) {
        rank = count;
        oldfreq = f;
      }
      hm1.put(e, rank);
    }

    /* reset and do second list */
    rank = 0;
    count = 0;
    oldfreq = Double.POSITIVE_INFINITY; // must be reset too, or l2's first rank can be wrong
    for (Pair<Event, Double> p : l2) {
      Event e = (Event) (p.getFirst());
      double f = (Double) (p.getSecond());
      count++;
      if (f != oldfreq) {
        rank = count;
        oldfreq = f;
      }
      hm2.put(e, rank);
    }

    /* More debugging stuff
    System.out.println(hm1.toString());
    System.out.println(hm2.toString());
    System.out.println(s.toString());
    */

    Integer x1, x2, y1, y2;
    Set<Event> s2 = new HashSet<Event>(s);
    for (Event e1 : s) {
      // s2.remove(e1);
      for (Event e2 : s2) {

        if (e1.equals(e2)) continue;

        /* get ranks of events e1 and e2 in both x and y distributions */
        x1 = hm1.get(e1);
        /* if not present, rank is size + 1 */
        if (x1 == null) x1 = hm1.size() + 1;

        x2 = hm2.get(e1);
        if (x2 == null) x2 = hm2.size() + 1;

        y1 = hm1.get(e2);
        /* if not present, rank is size + 1 */
        // an earlier version was broken here: it assigned x1 instead of y1
        if (y1 == null) y1 = hm1.size() + 1;

        y2 = hm2.get(e2);
        if (y2 == null) y2 = hm2.size() + 1;

        /* more debugging stuff
        System.out.println(e1.toString() + " is ("+x1+","+x2+")");
        System.out.println(e2.toString() + " is ("+y1+","+y2+")");
        System.out.println(sgn(x1.compareTo(y1)) + " " +
        		   sgn(x2.compareTo(y2)) );
        System.out.println("");
        */

        correlation += (sgn(x1.compareTo(y1)) * sgn(x2.compareTo(y2)));
        //				System.out.println(correlation);
      }
    }

    // System.out.println(correlation);
    correlation /= (hm1.size() * (hm2.size() - 1));
    // System.out.println(correlation);
    // System.out.println("---");

    return 1.0 - correlation;
  }
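The sum accumulated above is the numerator of Kendall's rank correlation (tau) over the two frequency rankings, and the method returns 1 - tau as the distance. A self-contained illustration of the sgn-concordance step on plain rank arrays (the rank values are made up for the example):

  // Concordance counting as used in distance(): for every ordered pair of
  // items, multiply the signs of their rank differences under each ranking.
  int[] ranksX = {1, 2, 3, 4}; // ranks of events A..D in document 1
  int[] ranksY = {2, 1, 3, 4}; // ranks of the same events in document 2
  double correlation = 0.0;
  int n = ranksX.length;
  for (int a = 0; a < n; a++) {
    for (int b = 0; b < n; b++) {
      if (a == b) continue;
      correlation += Math.signum(ranksX[a] - ranksX[b]) * Math.signum(ranksY[a] - ranksY[b]);
    }
  }
  correlation /= (double) (n * (n - 1)); // tau = (5 concordant - 1 discordant) / 6 = 2/3
  System.out.println(1.0 - correlation); // distance = 1/3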
  @Override
  public EventSet createEventSet(Document ds) throws EventGenerationException {
    String param;
    HashMap<String, String> transform = new HashMap<String, String>();
    boolean whitelist = false;

    String line;
    String[] words;

    if (!(param = (getParameter("underlyingEvents"))).equals("")) {
      try {
        underlyingEvents = EventDriverFactory.getEventDriver(param);
      } catch (Exception e) {
        System.out.println("Error: cannot create EventDriver " + param);
        System.out.println(" -- Using NaiveWordEventSet");
        underlyingEvents = new NaiveWordEventDriver();
      }
    } else { // no underlyingEventsParameter, use NaiveWordEventSet
      underlyingEvents = new NaiveWordEventDriver();
    }

    if (!(param = (getParameter("filename"))).equals("")) {
      filename = param;
    } else { // no filename parameter
      filename = null;
    }

    if (!(param = (getParameter("implicitWhiteList"))).equals("")) {
      if (param.equalsIgnoreCase("true")) {
        whitelist = true;
      }
    } else { // no implicitWhiteList parameter
      whitelist = false;
    }

    EventSet es = underlyingEvents.createEventSet(ds);

    EventSet newEs = new EventSet();
    newEs.setAuthor(es.getAuthor());
    newEs.setNewEventSetID(es.getAuthor());

    BufferedReader br = null;

    if (filename != null) {
      try {
        FileInputStream fis = new FileInputStream(filename);
        br = new BufferedReader(new InputStreamReader(fis));

        while ((line = br.readLine()) != null) {
          if (line.length() > 0) {
            String sep = line.substring(0, 1);
            words = line.substring(1).split(sep, -1);
            if (words.length > 1) {
              transform.put(words[0], words[1]);
              System.out.println("Adding \"" + words[0] + "\" : \"" + words[1] + "\"");
            }
          }
        }

      } catch (IOException e) {
        // catch io errors from FileInputStream or readLine()
        System.out.println("Cannot open/read " + filename);
        System.out.println("IOException error! " + e.getMessage());
        transform = null;
      } finally {
        // if the file opened okay, make sure we close it
        if (br != null) {
          try {
            br.close();
          } catch (IOException ioe) {
            // ignore failure to close
          }
        }
      }
    } else {
      transform = null;
    }

    for (Event e : es) {
      String s = e.toString();
      if (transform == null) {
        newEs.addEvent(e);
      } else if (transform.containsKey(s)) {
        String newS = transform.get(s);
        if (newS.length() > 0) {
          newEs.addEvent(new Event(newS));
        }
      } else if (!whitelist) { // s is not in the transformation list
        // add only if no implicit whitelisting
        newEs.addEvent(e);
      } // otherwise add nothing
    }
    return newEs;
  }
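
The transform file read above is line-oriented: the first character of each line is taken as the separator, and the rest of the line splits into a source event and its replacement. A hypothetical example file:

  ,color,colour
  /and/
  :multi word:one

The first line maps "color" to "colour"; the second maps "and" to the empty string, which drops the event (the length check above filters it out); the third shows that any single character can act as the separator. Note that the separator is passed to String.split as a regular expression, so regex metacharacters such as "." or "|" make unsafe separators.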