/** * Loads the cached features for a given document * * @param document * @param documentFile The cache file for the document. * @return the cached features if possible. Null if a cache doesn't exist or it fails to get them. * @throws Exception */ private List<EventSet> getCachedFeatures(Document document, File documentFile) { List<EventSet> generatedEvents = null; BufferedReader reader = null; if (documentFile.exists() && !documentFile.isDirectory() && documentFile.canRead()) { try { reader = new BufferedReader(new FileReader(documentFile)); } catch (FileNotFoundException e) { // shouldn't ever get here.. just put this here so I can keep track // of exceptions below. e.printStackTrace(); } } else { return null; } try { // cachedPath is the path to the document that was used when the cache for that // document was created. cachedLastModified is the last modified time stamp on the // document that was cached. String cachedPath = reader.readLine(); long cachedLastModified = Long.parseLong(reader.readLine()); String path = document.getFilePath(); File currDoc = new File(path); long lastModified = currDoc.lastModified(); if (!(currDoc.getCanonicalPath().equals(cachedPath) && lastModified == cachedLastModified)) { // cache is invalid reader.close(); return null; } String line = null; generatedEvents = new ArrayList<EventSet>(); while ((line = reader.readLine()) != null) { if (line.isEmpty()) continue; EventSet es = new EventSet(); es.setAuthor(document.getAuthor()); es.setDocumentName(document.getTitle()); es.setEventSetID(line); String event = null; while ((event = reader.readLine()) != null) { if (line.isEmpty()) continue; if (event.equals(",")) // delimiter for event sets break; es.addEvent(new Event(event)); } generatedEvents.add(es); } reader.close(); } catch (IOException e) { e.printStackTrace(); return null; } return generatedEvents; }
/**
 * Builds the dictionary from the given event set: each position is inserted starting
 * either at the root (first occurrence of that event) or below it.
 */
public void build(EventSet e) {
  int size = e.size();
  for (int idx = 0; idx < size; idx++) {
    Event first = e.eventAt(idx);
    if (root.isEventInLevel(first)) {
      insertBelowRoot(first, e, idx);
    } else {
      insertAtRoot(first, e, idx);
    }
  }
  root.key = null;
}
private void insertAtRoot(Event start, EventSet e, int offset) { root.addEventToLevel(start); XEDictionaryNode node; node = root; int j = offset; while (j < e.size() - 1) { node = node.get(e.eventAt(j)); j++; // System.out.println("Adding Event: " + e.eventAt(j)); node.addEventToLevel(e.eventAt(j)); } }
/**
 * Returns the length of the longest prefix of {@code e} that can be matched by
 * walking the dictionary down from the root.
 */
public int find(EventSet e) {
  XEDictionaryNode node = root;
  int depth = 0;
  // descend while the next event of the prefix exists at the current level
  while (depth < e.size() && node.isEventInLevel(e.eventAt(depth))) {
    node = node.get(e.eventAt(depth));
    depth++;
  }
  return depth;
}
private double meanEntropy(EventSet e1, EventSet e2, int windowSize) { double totalEntropy = 0; int trials = 0; if (windowSize > e1.size() - 1) { windowSize = e1.size(); } // for (int j = 0; j <= e1.size() - windowSize; j++) { XEDictionary xed = new XEDictionary(); EventSet dictionary; dictionary = window(e1, 0, windowSize); xed.build(dictionary); for (int i = 0; i <= e2.size() - windowSize; i++) { totalEntropy += xed.find(window(e2, i, windowSize)); trials++; } // } return totalEntropy / trials; }
/** * Determines which EventSets to use for the given documents based on the chosen cullers.<br> * * @param eventSets A List which contains Lists of EventSets (represents a list of documents' * EventSets * @param cumulativeFeatureDriver the driver with the culling functionality * @return The culled List of Lists of EventSets created from eventSets * @throws Exception */ public List<List<EventSet>> cull( List<List<EventSet>> eventSets, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // a hacky workaround for the bug in the eventCuller. Fix that // later then remove these ArrayList<String> IDs = new ArrayList<String>(); for (EventSet es : eventSets.get(0)) { IDs.add(es.getEventSetID()); } // remove the metdata prior to culling ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : eventSets) { docMetaData.add(les.remove(les.size() - 1)); } // cull the events List<List<EventSet>> culledEventSets = CumulativeEventCuller.cull(eventSets, cumulativeFeatureDriver); // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } // a hacky workaround for the bug in the eventCuller. Fix that // later then remove these for (int j1 = 0; j1 < culledEventSets.size(); j1++) { for (int iterator = 0; iterator < culledEventSets.get(j1).size(); iterator++) { culledEventSets.get(j1).get(iterator).setEventSetID(IDs.get(iterator)); } } // return culled events return culledEventSets; }
@SuppressWarnings("static-access") @Override public EventSet createEventSet(Document doc) { EventSet es = new EventSet(doc.getAuthor()); char[] text = doc.getProcessedText(); String stringText = new String(text); // use MaxentPOSTagsEventDriver's tagger // initialize tagger and return empty event set if encountered a problem if (tagger == null) { tagger = MaxentPOSTagsEventDriver.initTagger(); if (tagger == null) return es; } List<List<HasWord>> sentences = tagger.tokenizeText(new BufferedReader(new StringReader(stringText))); ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>(); for (List<HasWord> sentence : sentences) tagged.addAll(tagger.tagSentence(sentence)); int i, j, n; try { n = Integer.parseInt(getParameter("N")); } catch (NumberFormatException e) { n = 2; } String curr; for (i = 0; i < tagged.size() - n + 1; i++) { curr = "(" + tagged.get(i).tag() + ")"; for (j = 1; j < n; j++) { curr += "-(" + tagged.get(i + j).tag() + ")"; } es.addEvent(new Event(curr)); } sentences.clear(); sentences = null; return es; }
/**
 * Computes the Gunning-Fog readability index:
 * 0.4 * (words/sentences + 100 * complexWords/words), where a "complex" word has
 * three or more syllables.
 */
public double getValue(Document doc) throws EventGenerationException {
  double wordCount = wordCounter.getValue(doc);
  double sentenceCount = sentenceCounter.getValue(doc);

  // drop every word with fewer than 3 syllables; what remains is the complex-word
  // count (iterate backwards so removal doesn't disturb the positions still to visit)
  EventSet syllables = syllablesDriver.createEventSet(doc);
  for (int i = syllables.size() - 1; i >= 0; i--) {
    int syllableCount = Integer.parseInt(syllables.eventAt(i).toString());
    if (syllableCount < 3) {
      syllables.removeEvent(syllables.eventAt(i));
    }
  }
  double complexWordsCount = syllables.size();

  return 0.4 * (wordCount / sentenceCount + 100 * complexWordsCount / wordCount);
}
private void insertBelowRoot(Event start, EventSet e, int offset) { XEDictionaryNode node; node = root; // System.out.println("Event at offset: " + e.eventAt(offset)); node = node.get(e.eventAt(offset)); int j = offset; boolean matches = true; // match the events up to a given level while (matches && (j < e.size() - 1)) { j++; if (node.isEventInLevel(e.eventAt(j))) { // System.out.println("Match at level: " + e.eventAt(j)); node = node.get(e.eventAt(j)); } else { matches = false; } } for (int i = j; i < e.size(); i++) { // System.out.println("Adding Event: " + e.eventAt(i)); node.addEventToLevel(e.eventAt(i)); node = node.get(e.eventAt(i)); } }
/**
 * Culls the test set using the relevant Events extracted from the training data.<br>
 * Histogram features keep only the events that also appear in the corresponding relevant
 * EventSet; non-histogram (single numeric) features are kept only when their EventSet ID
 * appears in the relevant list. The metadata EventSet (last element) is detached during
 * culling and re-attached to both the input list and the result.
 *
 * @param relevantEvents the features from the EventSets which are going to be evaluated
 * @param eventSetsToCull The test documents to be culled
 * @param cfd the driver used to decide which features are histograms
 * @return the culled test documents
 * @throws Exception
 */
public List<EventSet> cullWithRespectToTraining(
    List<EventSet> relevantEvents, List<EventSet> eventSetsToCull, CumulativeFeatureDriver cfd)
    throws Exception {
  List<EventSet> relevant = relevantEvents;
  // -1 to compensate for removing metadata
  int numOfFeatureClasses = eventSetsToCull.size() - 1;
  int i;
  List<EventSet> culledUnknownEventSets = new LinkedList<EventSet>();

  // remove the metadata prior to culling (it is always the last EventSet)
  EventSet metadata = eventSetsToCull.remove(eventSetsToCull.size() - 1);

  // make sure all unknown sets would have only events that appear in the
  // known sets
  // UNLESS the event set contains a sole numeric value event - in that
  // case take it anyway
  for (i = 0; i < numOfFeatureClasses; i++) {
    if (cfd.featureDriverAt(i).isCalcHist()) {
      // initialize set of relevant events
      // NOTE(review): relevantEventsString is built but never read below — the
      // membership test uses eventAt(l) on the relevant set instead. Dead code?
      EventSet es = relevant.get(i);
      Set<String> relevantEventsString = new HashSet<String>(es.size());
      for (Event e : es) relevantEventsString.add(e.getEvent());

      // remove all non-relevant events from unknown event sets
      EventSet unknown;
      Event e;
      unknown = eventSetsToCull.get(i);
      Iterator<Event> iterator = unknown.iterator();
      Event next = null;

      // the test doc may not contain a given feature (ie it might not
      // have any semi-colons)
      if (iterator.hasNext()) next = (Event) iterator.next();

      // while it has more of a feature
      // NOTE(review): because of the one-element look-ahead, the final event in the
      // set is fetched but never checked or removed — confirm this is intended.
      while (iterator.hasNext()) {
        // copy the feature
        e = next;
        boolean remove = true;

        // check to see if the feature is relevant
        // NOTE(review): the loop bound is unknown.size() but the index is applied to
        // relevantEvents.get(i); the IndexOutOfBoundsException catch below compensates
        // when the relevant set is shorter than the unknown set.
        for (int l = 0; l < unknown.size(); l++) {
          try {
            if (e.equals(relevantEvents.get(i).eventAt(l))) {
              remove = false;
              // if it is, break
              break;
            }
          } catch (IndexOutOfBoundsException iobe) {
            remove = true; // it is not relevant if we reach this point.
            break;
          }
        }

        // remove the feature if it isn't relevant
        // (Iterator.remove() removes the element last returned by next(), i.e. e)
        if (remove) {
          iterator.remove();
        }

        // grab the next feature
        next = iterator.next();
      }

      // add the culled event set
      culledUnknownEventSets.add(unknown);

    } else { // one unique numeric event
      // add non-histogram if it is in the relevantEventSets list
      boolean isRelevant = false;
      for (EventSet res : relevantEvents) {
        if (res.getEventSetID().equals(eventSetsToCull.get(i).getEventSetID())) {
          isRelevant = true;
          break;
        }
      }
      if (isRelevant) culledUnknownEventSets.add(eventSetsToCull.get(i));
    }
  }

  // restore the metadata on the input list and append it to the result
  eventSetsToCull.add(metadata);
  culledUnknownEventSets.add(metadata);
  return culledUnknownEventSets;
}
/** * Extracts the List of EventSets from a document using the provided CumulativeFeatureDriver.<br> * * @param document the document to have features extracted and made into event sets * @param cumulativeFeatureDriver the driver containing the features to be extracted and the * functionality to do so * @param loadDocContents whether or not the document contents are already loaded into the object * @return the List of EventSets for the document */ public List<EventSet> extractEventSets( Document document, CumulativeFeatureDriver cumulativeFeatureDriver, boolean loadDocContents, boolean isUsingCache) throws Exception { List<EventSet> generatedEvents = new ArrayList<EventSet>(); if (isUsingCache) { File cacheDir = new File(JSANConstants.JSAN_CACHE + "_" + cumulativeFeatureDriver.getName() + "/"); File authorDir = null; if (document.getAuthor().equals(JSANConstants.DUMMY_NAME)) { authorDir = new File(cacheDir, "you"); } else { authorDir = new File(cacheDir, "_" + document.getAuthor()); } File documentFile = new File(authorDir, document.getTitle() + ".cache"); generatedEvents = getCachedFeatures(document, documentFile); if (generatedEvents == null) { // delete the cache for this document! 
It is invalid documentFile.delete(); // program will continue as normal, extracting events } else { // return the cached features return generatedEvents; } } // Extract the Events from the documents try { generatedEvents = cumulativeFeatureDriver.createEventSets(document, loadDocContents, isUsingCache); } catch (Exception e) { LOG.error("Failed to extract events from a document!", e); throw e; } // create metadata event to store document information EventSet documentInfo = new EventSet(); documentInfo.setEventSetID("<DOCUMENT METADATA>"); /* * Metadata Event format: * * EventSetID: "<DOCUMENT METADATA>" Event at Index: * 0 : author * 1 : title * 2 : Sentences in document * 3 : Words in document * 4 : Characters in document * 5 : Letters in document */ // Extract document title and author Event authorEvent = new Event(document.getAuthor()); // Event titleEvent = new Event(document.getFilePath()); Event titleEvent = new Event(document.getTitle()); documentInfo.addEvent(authorEvent); documentInfo.addEvent(titleEvent); // Extract normalization baselines // Sentences in doc { Document doc = null; SingleNumericEventDriver counter = new SentenceCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num sentences from document", e); throw e; } documentInfo.addEvent(tempEvent); } // Words in doc { Document doc = null; SingleNumericEventDriver counter = new WordCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num words from document", e); throw e; } documentInfo.addEvent(tempEvent); } // Characters in doc { Document doc = null; SingleNumericEventDriver counter = new CharCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) 
doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num characters from document", e); throw e; } documentInfo.addEvent(tempEvent); } // Letters in doc { Document doc = null; SingleNumericEventDriver counter = new LetterCounterEventDriver(); doc = document; Event tempEvent = null; try { if (!loadDocContents) doc.load(); tempEvent = new Event("" + (int) counter.getValue(doc)); } catch (Exception e) { LOG.error("Failed to extract num letters from document", e); throw e; } documentInfo.addEvent(tempEvent); } // add the metadata EventSet to the List<EventSet> generatedEvents.add(documentInfo); // return the List<EventSet> return generatedEvents; }
/**
 * Converts the extracted document information into a JStylo DataMap: a map from feature
 * (attribute) index to its FeatureData value for this document. Histogram features map each
 * relevant event to its absolute frequency; non-histogram features map their single numeric
 * value (extracted from a "{value}" formatted event string).
 *
 * @param features the full attribute name list (NOTE(review): this parameter is never read
 *     in this method — confirm whether it can be dropped from callers)
 * @param relevantEvents the relevant events, in attribute order
 * @param cumulativeFeatureDriver driver used to query isCalcHist/name/baseline per feature
 * @param documentData the document's EventSets; the trailing metadata EventSet is removed
 *     during processing and re-appended before returning
 * @return map of attribute index to FeatureData for this document
 */
public ConcurrentHashMap<Integer, FeatureData> createDocMap(
    List<String> features,
    List<EventSet> relevantEvents,
    CumulativeFeatureDriver cumulativeFeatureDriver,
    List<EventSet> documentData) {

  // generate training instances
  ConcurrentHashMap<Integer, FeatureData> documentMap =
      new ConcurrentHashMap<Integer, FeatureData>();

  // remove metadata event (always the last EventSet)
  EventSet metadata = documentData.remove(documentData.size() - 1);

  // go through all eventSets in the document
  // NOTE(review): documentData.indexOf(es) is used repeatedly below to find the feature
  // index; this relies on EventSet equality semantics and on there being no duplicate
  // EventSets in the list — verify both assumptions.
  for (EventSet es : documentData) {
    // initialize relevant information
    ArrayList<Integer> indices = new ArrayList<Integer>();
    ArrayList<Event> events = new ArrayList<Event>();
    EventHistogram currHistogram = new EventHistogram();

    // whether or not we actually need this eventSet
    boolean eventSetIsRelevant = false;

    // find out if it is a histogram or not
    if (cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).isCalcHist()) {

      // find the event set in the list of relevant events
      for (EventSet res : relevantEvents) {
        if (es.getEventSetID().equals(res.getEventSetID())) {
          eventSetIsRelevant = true;
          break;
        }
      }

      // if it is relevant
      if (eventSetIsRelevant) {

        // find the indices of the events
        // and count all of the events
        for (Event e : es) {
          // currIndex walks the global attribute index across all relevant EventSets
          int currIndex = 0;
          boolean hasInner = false;

          // for the events in the set
          for (EventSet res : relevantEvents) {
            boolean found = false;
            for (Event re : res) {
              hasInner = true;

              // if they are the same event
              if (e.getEvent().equals(re.getEvent())) {
                boolean inList = false;
                for (Event el : events) {
                  if (el.getEvent().equals(e.getEvent())) {
                    inList = true;
                    break;
                  }
                }

                // record the attribute index only the first time this event is seen
                if (!inList) {
                  indices.add(currIndex);
                  events.add(e);
                }

                // Old location revert if change breaks
                currHistogram.add(e);
                found = true;
              }
              if (found) {
                break;
              }
              currIndex++;
            }
            if (found) {
              break;
            }
            // if there's no inner event, it was a non-histogram feature:
            // it still occupies one attribute slot, so increment by one
            if (!hasInner) {
              currIndex++;
            }
          }
        }

        // calculate/add the histograms: one FeatureData per distinct event seen
        int index = 0;
        for (Integer i : indices) {
          documentMap.put(
              i,
              new FeatureData(
                  cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(),
                  cumulativeFeatureDriver
                      .featureDriverAt(documentData.indexOf(es))
                      .getNormBaseline()
                      .getTitle(),
                  currHistogram.getAbsoluteFrequency(events.get(index))));
          index++;
        }
      }
    } else { // non histogram feature

      // initialize the index
      int nonHistIndex = 0;

      // find the attribute index of this feature by walking the relevant EventSets
      // that precede it (each contributes its event count, or 1 if empty)
      for (EventSet res : relevantEvents) {
        if (es.getEventSetID().equals(res.getEventSetID())) {
          break;
        }

        // count to find the index
        boolean hasInner = false;
        for (@SuppressWarnings("unused") Event re : res) {
          hasInner = true;
          nonHistIndex++;
        }

        // if there's no inner feature, increment by one; we just passed a non-histogram
        if (!hasInner) nonHistIndex++;
      }

      // Extract and add the event: the value is stored inside "{...}" braces
      String eventString = es.eventAt(0).getEvent();
      int startIndex = eventString.indexOf("{");
      int endIndex = eventString.indexOf("}");
      eventString = eventString.substring(startIndex + 1, endIndex);

      FeatureData fd =
          new FeatureData(
              cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(),
              cumulativeFeatureDriver
                  .featureDriverAt(documentData.indexOf(es))
                  .getNormBaseline()
                  .getTitle(),
              Math.round((float) Double.parseDouble(eventString)));
      documentMap.put(nonHistIndex, fd);
    }
  }

  // add metadata back. Not sure if necessary
  documentData.add(metadata);

  return documentMap;
}
/** * Extracts a list of all features to be used for analysis. * * @param culledEventSets * @param relevantEvents * @param cumulativeFeatureDriver * @return * @throws Exception */ public List<String> getFeatureList( List<List<EventSet>> culledEventSets, List<EventSet> relevantEvents, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metdata prior to generating attribute list ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize useful things int numOfFeatureClasses = relevantEvents.size(); List<EventSet> list; List<String> features = new ArrayList<String>(numOfFeatureClasses); // initialize list of sets of events, which will eventually become the // attributes List<EventSet> allEvents = new ArrayList<EventSet>(numOfFeatureClasses); // Neither the doc title nor the author is in the List<List<EventSet>>, so this should work fine for (int currEventSet = 0; currEventSet < numOfFeatureClasses; currEventSet++) { // initialize relevant list of event sets and histograms list = new ArrayList<EventSet>(); for (int i = 0; i < numOfFeatureClasses; i++) list.add(relevantEvents.get(i)); // initialize eventSet EventSet events = new EventSet(); events.setEventSetID(relevantEvents.get(currEventSet).getEventSetID()); if (cumulativeFeatureDriver.featureDriverAt(currEventSet).isCalcHist()) { // histogram feature // generate event histograms and unique event list EventSet eventSet = list.get(currEventSet); for (Event event : eventSet) { events.addEvent(event); } allEvents.add(events); } else { // one unique numeric event // generate sole event (give placeholder value) Event event = new Event("{-}"); events.addEvent(event); allEvents.add(events); } } // Adds all of the events to the fast vector int featureIndex = 0; for (EventSet es : allEvents) { Iterator<Event> iterator = es.iterator(); if 
(cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { if (iterator.hasNext()) { // grab first event; there should be at least one Event nextEvent = (Event) iterator.next(); // get and add all middle events if they exist while (iterator.hasNext()) { features.add(nextEvent.getEvent()); nextEvent = (Event) iterator.next(); } // add the last event features.add(nextEvent.getEvent()); } } else { features.add(es.getEventSetID()); } featureIndex++; } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return features; }
/**
 * Test method for {@link
 * com.jgaap.eventDrivers.PorterStemmerEventDriver#createEventSet(com.jgaap.generics.Document)}.
 *
 * Feeds a fixed, punctuation-free word list through the Porter stemmer driver and compares
 * the produced event set against the expected stems, in order (e.g. "abbey" -> "abbei",
 * "abandoned" -> "abandon"). Case and the numeric token "867-5309" pass through unstemmed.
 *
 * @throws EventGenerationException
 */
@Test
public void testCreateEventSetDocumentSet() throws EventGenerationException {
  /* test case 1 -- no punctuation */
  Document doc = new Document();
  doc.readStringText(
      "tests Tested TESTING TeSt "
          + "867-5309 "
          + "a aaron abaissiez abandon abandoned abase abash abate abated abatement "
          + "abatements abates abbess abbey abbeys abbominable abbot abbots abbreviated "
          + "abed abel aberga abergavenny abet abetting abhominable abhor abhorr abhorred "
          + "abhorring abhors abhorson abide abides abilities ability abject abjectly "
          + "abjects abjur abjure able abler aboard abode aboded abodements aboding "
          + "abominable abominably abominations abortive abortives abound abounding "
          + "about above abr abraham abram abreast abridg abridge abridged abridgment "
          + "abroach abroad abrogate abrook abrupt abruption abruptly absence absent "
          + "absey absolute absolutely absolv absolver abstains abstemious abstinence "
          + "abstract absurd absyrtus abundance abundant abundantly abus abuse abused "
          + "abuser abuses abusing abutting aby abysm ac academe academes accent accents "
          + "accept acceptable acceptance accepted accepts access accessary accessible "
          + "accidence accident accidental accidentally accidents accite accited accites "
          + "acclamations accommodate accommodated accommodation accommodations "
          + "accommodo accompanied accompany accompanying accomplices accomplish "
          + "accomplished accomplishing accomplishment accompt accord accordant accorded "
          + "accordeth according accordingly accords accost accosted account accountant "
          + "accounted accounts accoutred accoutrement accoutrements accrue");
  EventSet sampleEventSet = new PorterStemmerEventDriver().createEventSet(doc);
  EventSet expectedEventSet = new EventSet();
  // Expected stems, one per input word, in input order.
  Vector<Event> tmp = new Vector<Event>();
  tmp.add(new Event("test"));
  tmp.add(new Event("Test"));
  tmp.add(new Event("TEST"));
  tmp.add(new Event("TeSt"));
  tmp.add(new Event("867-5309"));
  tmp.add(new Event("a"));
  tmp.add(new Event("aaron"));
  tmp.add(new Event("abaissiez"));
  tmp.add(new Event("abandon"));
  tmp.add(new Event("abandon"));
  tmp.add(new Event("abas"));
  tmp.add(new Event("abash"));
  tmp.add(new Event("abat"));
  tmp.add(new Event("abat"));
  tmp.add(new Event("abat"));
  tmp.add(new Event("abat"));
  tmp.add(new Event("abat"));
  tmp.add(new Event("abbess"));
  tmp.add(new Event("abbei"));
  tmp.add(new Event("abbei"));
  tmp.add(new Event("abbomin"));
  tmp.add(new Event("abbot"));
  tmp.add(new Event("abbot"));
  tmp.add(new Event("abbrevi"));
  tmp.add(new Event("ab"));
  tmp.add(new Event("abel"));
  tmp.add(new Event("aberga"));
  tmp.add(new Event("abergavenni"));
  tmp.add(new Event("abet"));
  tmp.add(new Event("abet"));
  tmp.add(new Event("abhomin"));
  tmp.add(new Event("abhor"));
  tmp.add(new Event("abhorr"));
  tmp.add(new Event("abhor"));
  tmp.add(new Event("abhor"));
  tmp.add(new Event("abhor"));
  tmp.add(new Event("abhorson"));
  tmp.add(new Event("abid"));
  tmp.add(new Event("abid"));
  tmp.add(new Event("abil"));
  tmp.add(new Event("abil"));
  tmp.add(new Event("abject"));
  tmp.add(new Event("abjectli"));
  tmp.add(new Event("abject"));
  tmp.add(new Event("abjur"));
  tmp.add(new Event("abjur"));
  tmp.add(new Event("abl"));
  tmp.add(new Event("abler"));
  tmp.add(new Event("aboard"));
  tmp.add(new Event("abod"));
  tmp.add(new Event("abod"));
  tmp.add(new Event("abod"));
  tmp.add(new Event("abod"));
  tmp.add(new Event("abomin"));
  tmp.add(new Event("abomin"));
  tmp.add(new Event("abomin"));
  tmp.add(new Event("abort"));
  tmp.add(new Event("abort"));
  tmp.add(new Event("abound"));
  tmp.add(new Event("abound"));
  tmp.add(new Event("about"));
  tmp.add(new Event("abov"));
  tmp.add(new Event("abr"));
  tmp.add(new Event("abraham"));
  tmp.add(new Event("abram"));
  tmp.add(new Event("abreast"));
  tmp.add(new Event("abridg"));
  tmp.add(new Event("abridg"));
  tmp.add(new Event("abridg"));
  tmp.add(new Event("abridg"));
  tmp.add(new Event("abroach"));
  tmp.add(new Event("abroad"));
  tmp.add(new Event("abrog"));
  tmp.add(new Event("abrook"));
  tmp.add(new Event("abrupt"));
  tmp.add(new Event("abrupt"));
  tmp.add(new Event("abruptli"));
  tmp.add(new Event("absenc"));
  tmp.add(new Event("absent"));
  tmp.add(new Event("absei"));
  tmp.add(new Event("absolut"));
  tmp.add(new Event("absolut"));
  tmp.add(new Event("absolv"));
  tmp.add(new Event("absolv"));
  tmp.add(new Event("abstain"));
  tmp.add(new Event("abstemi"));
  tmp.add(new Event("abstin"));
  tmp.add(new Event("abstract"));
  tmp.add(new Event("absurd"));
  tmp.add(new Event("absyrtu"));
  tmp.add(new Event("abund"));
  tmp.add(new Event("abund"));
  tmp.add(new Event("abundantli"));
  tmp.add(new Event("abu"));
  tmp.add(new Event("abus"));
  tmp.add(new Event("abus"));
  tmp.add(new Event("abus"));
  tmp.add(new Event("abus"));
  tmp.add(new Event("abus"));
  tmp.add(new Event("abut"));
  tmp.add(new Event("abi"));
  tmp.add(new Event("abysm"));
  tmp.add(new Event("ac"));
  tmp.add(new Event("academ"));
  tmp.add(new Event("academ"));
  tmp.add(new Event("accent"));
  tmp.add(new Event("accent"));
  tmp.add(new Event("accept"));
  tmp.add(new Event("accept"));
  tmp.add(new Event("accept"));
  tmp.add(new Event("accept"));
  tmp.add(new Event("accept"));
  tmp.add(new Event("access"));
  tmp.add(new Event("accessari"));
  tmp.add(new Event("access"));
  tmp.add(new Event("accid"));
  tmp.add(new Event("accid"));
  tmp.add(new Event("accident"));
  tmp.add(new Event("accident"));
  tmp.add(new Event("accid"));
  tmp.add(new Event("accit"));
  tmp.add(new Event("accit"));
  tmp.add(new Event("accit"));
  tmp.add(new Event("acclam"));
  tmp.add(new Event("accommod"));
  tmp.add(new Event("accommod"));
  tmp.add(new Event("accommod"));
  tmp.add(new Event("accommod"));
  tmp.add(new Event("accommodo"));
  tmp.add(new Event("accompani"));
  tmp.add(new Event("accompani"));
  tmp.add(new Event("accompani"));
  tmp.add(new Event("accomplic"));
  tmp.add(new Event("accomplish"));
  tmp.add(new Event("accomplish"));
  tmp.add(new Event("accomplish"));
  tmp.add(new Event("accomplish"));
  tmp.add(new Event("accompt"));
  tmp.add(new Event("accord"));
  tmp.add(new Event("accord"));
  tmp.add(new Event("accord"));
  tmp.add(new Event("accordeth"));
  tmp.add(new Event("accord"));
  tmp.add(new Event("accordingli"));
  tmp.add(new Event("accord"));
  tmp.add(new Event("accost"));
  tmp.add(new Event("accost"));
  tmp.add(new Event("account"));
  tmp.add(new Event("account"));
  tmp.add(new Event("account"));
  tmp.add(new Event("account"));
  tmp.add(new Event("accoutr"));
  tmp.add(new Event("accoutr"));
  tmp.add(new Event("accoutr"));
  tmp.add(new Event("accru"));
  expectedEventSet.addEvents(tmp);
  // System.out.println("Expected is " + expectedEventSet.toString());
  // System.out.println("Actual is " + sampleEventSet.toString());
  assertTrue(expectedEventSet.equals(sampleEventSet));
}
/**
 * Returns the subsequence of {@code e1} of length {@code windowSize} starting at
 * {@code offset}.
 */
private EventSet window(EventSet e1, int offset, int windowSize) {
  int end = offset + windowSize;
  return e1.subset(offset, end);
}
/** * Goes over the culled List of Lists of EventSets and determines which events are histograms and * which have a single numerical value.<br> * Uses the information to prepare a List of EventSets to extract from the test document(s). * * @param culledEventSets The culled List of Lists of EventSets * @param cumulativeFeatureDriver The driver used to extract the EventSets * @return The List of EventSet to extract from the test document(s) * @throws Exception */ public List<EventSet> getRelevantEvents( List<List<EventSet>> culledEventSets, CumulativeFeatureDriver cumulativeFeatureDriver) throws Exception { // remove the metadata prior to generating the relevantEvents ArrayList<EventSet> docMetaData = new ArrayList<EventSet>(); for (List<EventSet> les : culledEventSets) { docMetaData.add(les.remove(les.size() - 1)); } // initialize the EventSet list List<EventSet> relevantEvents = new LinkedList<EventSet>(); // iterate over the List of Lists for (List<EventSet> l : culledEventSets) { // iterate over each inner list's eventSets int featureIndex = 0; for (EventSet esToAdd : l) { // whether or not to add the event set to the list (if false, it // is already on the list) boolean add = true; ; for (EventSet esl : relevantEvents) { // this should compare the category/name of the event set if (esToAdd.getEventSetID().equals(esl.getEventSetID())) { add = false; break; } } // this event set isn't on the list at all, just add it (which // also adds its internal events) to the list if (add) { EventSet temp = new EventSet(); temp.setEventSetID(esToAdd.getEventSetID()); // for all of the events for (Event e : esToAdd) { boolean absent = true; // check to see if it's been added yet or not for (Event ae : temp) { if (ae.getEvent().equals(e.getEvent())) { absent = false; break; } } // if it has not been added, add it if (absent) { if (!cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) temp.addEvent(new Event("{-}")); else temp.addEvent(e); } } 
relevantEvents.add(temp); } else { // go through this eventSet and add any events to the // relevant EventSet if they aren't already there. if (cumulativeFeatureDriver.featureDriverAt(featureIndex).isCalcHist()) { for (Event e : esToAdd) { boolean toAdd = true; // for all events in the relecant list for (Event re : relevantEvents.get(featureIndex)) { // if it's already there, don't add it if (e.getEvent().equals(re.getEvent())) { toAdd = false; break; } } // add it if it isn't there if (toAdd) { relevantEvents.get(featureIndex).addEvent(e); } } } } featureIndex++; } } // add the metadata back in int index = 0; for (List<EventSet> les : culledEventSets) { les.add(docMetaData.get(index)); index++; } return relevantEvents; }
/**
 * Returns KC (Kendall-style rank correlation) distance between event sets es1 and es2:
 * events are ranked by relative frequency in each set, concordant/discordant rank pairs
 * are counted, and 1 - correlation is returned.
 *
 * @param es1 The first EventSet
 * @param es2 The second EventSet
 * @return the KC distance between them
 */
@Override
public double distance(EventSet es1, EventSet es2) {

  EventHistogram h1 = es1.getHistogram();
  EventHistogram h2 = es2.getHistogram();
  Set<Event> s = new HashSet<Event>();
  List<Pair<Event, Double>> l1 = new ArrayList<Pair<Event, Double>>();
  List<Pair<Event, Double>> l2 = new ArrayList<Pair<Event, Double>>();
  HashMap<Event, Integer> hm1 = new HashMap<Event, Integer>();
  HashMap<Event, Integer> hm2 = new HashMap<Event, Integer>();
  double oldfreq = Double.POSITIVE_INFINITY;
  double correlation = 0.0;

  // union of all events seen in either set
  s.addAll(es1.uniqueEvents());
  s.addAll(es2.uniqueEvents());

  // System.out.println(h1.toString());
  // System.out.println(h2.toString());

  /* make lists of the histograms */
  for (Event e : h1) {
    l1.add(new Pair<Event, Double>(e, h1.getRelativeFrequency(e), 2));
  }
  for (Event e : h2) {
    l2.add(new Pair<Event, Double>(e, h2.getRelativeFrequency(e), 2));
  }

  /* sort the list so the most frequent items are at the top */
  /* NOTE : THIS MAY BE USEFUL ELSEWHERE : SAVE THIS CODE */
  Collections.sort(l1);
  Collections.reverse(l1);
  Collections.sort(l2);
  Collections.reverse(l2);

  /* DEBUGGING STUFF
  for (Pair <Event,Double> p : l1) { System.out.println("L1: " + p.toString()); }
  for (Pair <Event,Double> p : l1) { System.out.println("L2: " + p.toString()); }
  */

  /* Convert lists into a hashmap of event:rank pairs
   * (ties share the rank of the first tied element) */
  int rank = 0;
  int count = 0;
  for (Pair<Event, Double> p : l1) {
    Event e = (Event) (p.getFirst());
    double f = (Double) (p.getSecond());
    count++;
    if (f != oldfreq) {
      rank = count;
      oldfreq = f;
    }
    hm1.put(e, rank);
  }

  /* reset and do second list */
  // NOTE(review): oldfreq is NOT reset here — if the last frequency of l1 equals the
  // first frequency of l2, the first rank of the second list is computed from stale
  // state. Confirm whether this tie-handling carry-over is intended.
  rank = 0;
  count = 0;
  for (Pair<Event, Double> p : l2) {
    Event e = (Event) (p.getFirst());
    double f = (Double) (p.getSecond());
    count++;
    if (f != oldfreq) {
      rank = count;
      oldfreq = f;
    }
    hm2.put(e, rank);
  }

  /* More debugging stuff
  System.out.println(hm1.toString());
  System.out.println(hm2.toString());
  System.out.println(s.toString());
  */

  Integer x1, x2, y1, y2;
  Set<Event> s2 = new HashSet<Event>(s);
  // NOTE(review): s2 is a full copy and the remove below is commented out, so every
  // ordered pair (e1,e2), e1 != e2, is visited — each unordered pair counted twice.
  for (Event e1 : s) {
    // s2.remove(e1);
    for (Event e2 : s2) {
      if (e1.equals(e2)) continue;

      /* get ranks of events e1 and e2 in both x and y distributions */
      x1 = hm1.get(e1);
      /* if not present, rank is size + 1 */
      if (x1 == null) x1 = hm1.size() + 1;
      x2 = hm2.get(e1);
      if (x2 == null) x2 = hm2.size() + 1;
      y1 = hm1.get(e2);
      /* if not present, rank is size + 1 */
      // broke because if (y1 == null) x1 = hm1.size()+1; x1 should be y1
      if (y1 == null) y1 = hm1.size() + 1;
      y2 = hm2.get(e2);
      if (y2 == null) y2 = hm2.size() + 1;

      /* more debugging stuff
      System.out.println(e1.toString() + " is ("+x1+","+x2+")");
      System.out.println(e2.toString() + " is ("+y1+","+y2+")");
      System.out.println(sgn(x1.compareTo(y1)) + " " + sgn(x2.compareTo(y2)) );
      System.out.println("");
      */

      // +1 for concordant rank order, -1 for discordant
      correlation += (sgn(x1.compareTo(y1)) * sgn(x2.compareTo(y2)));
      // System.out.println(correlation);
    }
  }

  // System.out.println(correlation);
  // NOTE(review): the normalization uses hm1/hm2 sizes while the sum runs over |s|^2
  // pairs; when the sets differ this is not the textbook n(n-1) denominator — verify.
  correlation /= (hm1.size() * (hm2.size() - 1));
  // System.out.println(correlation);
  // System.out.println("---");
  return 1.0 - correlation;
}
/**
 * Creates an event set by running an underlying event driver over the document and then
 * applying an optional transformation table loaded from a file.
 *
 * Parameters read from the driver configuration:
 * - "underlyingEvents": name of the underlying EventDriver (default: NaiveWordEventDriver)
 * - "filename": transformation file; each non-empty line's first character is the
 *   separator, and the remainder is split into (from, to) pairs
 * - "implicitWhiteList": when "true", events absent from the table are dropped instead
 *   of passed through
 *
 * If no file is given or it cannot be read, the transform is null and all events pass
 * through unchanged.
 */
@Override
public EventSet createEventSet(Document ds) throws EventGenerationException {
  String param;
  HashMap<String, String> transform = new HashMap<String, String>();
  boolean whitelist = false;
  String line;
  String[] words;

  // resolve the underlying event driver, falling back to NaiveWordEventDriver
  if (!(param = (getParameter("underlyingEvents"))).equals("")) {
    try {
      underlyingEvents = EventDriverFactory.getEventDriver(param);
    } catch (Exception e) {
      // the configured driver couldn't be constructed; fall back rather than fail
      System.out.println("Error: cannot create EventDriver " + param);
      System.out.println(" -- Using NaiveWordEventSet");
      underlyingEvents = new NaiveWordEventDriver();
    }
  } else {
    // no underlyingEvents parameter, use NaiveWordEventSet
    underlyingEvents = new NaiveWordEventDriver();
  }

  // resolve the transformation-file name, if any
  if (!(param = (getParameter("filename"))).equals("")) {
    filename = param;
  } else {
    // no filename parameter given
    filename = null;
  }

  // resolve the implicit-whitelist flag (defaults to false)
  if (!(param = (getParameter("implicitWhiteList"))).equals("")) {
    if (param.equalsIgnoreCase("true")) {
      whitelist = true;
    }
  } else {
    // no implicitWhiteList parameter given
    whitelist = false;
  }

  EventSet es = underlyingEvents.createEventSet(ds);
  EventSet newEs = new EventSet();
  newEs.setAuthor(es.getAuthor());
  // NOTE(review): the event-set ID is being set to the AUTHOR, not to a set ID or the
  // underlying set's ID — looks suspicious; confirm against EventSet.setNewEventSetID.
  newEs.setNewEventSetID(es.getAuthor());

  // load the transformation table: each line is "<sep><from><sep><to>..."
  BufferedReader br = null;
  if (filename != null) {
    try {
      FileInputStream fis = new FileInputStream(filename);
      br = new BufferedReader(new InputStreamReader(fis));
      while ((line = br.readLine()) != null) {
        if (line.length() > 0) {
          // the first character of the line is the field separator
          String sep = line.substring(0, 1);
          words = line.substring(1).split(sep, -1);
          if (words.length > 1) {
            transform.put(words[0], words[1]);
            System.out.println("Adding \"" + words[0] + "\" : \"" + words[1] + "\"");
          }
        }
      }
    } catch (IOException e) {
      // catch io errors from FileInputStream or readLine();
      // a null transform means "pass everything through unchanged"
      System.out.println("Cannot open/read " + filename);
      System.out.println("IOException error! " + e.getMessage());
      transform = null;
    } finally {
      // if the file opened okay, make sure we close it
      if (br != null) {
        try {
          br.close();
        } catch (IOException ioe) {
          // ignored: nothing useful to do if close fails
        }
      }
    }
  } else {
    transform = null;
  }

  // apply the transformation (or pass-through) to every underlying event
  for (Event e : es) {
    String s = e.toString();
    if (transform == null) {
      // no table: copy events through unchanged
      newEs.addEvent(e);
    } else if (transform.containsKey(s)) {
      String newS = transform.get(s);
      // an empty replacement string deletes the event
      if (newS.length() > 0) {
        newEs.addEvent(new Event(newS));
      }
    } else
      // s is not in transformation list
      if (whitelist == false) { // add only if no implicit whitelisting
        newEs.addEvent(e);
      } // otherwise add nothing
  }
  return newEs;
}