/** * Returns KC distance between event sets es1 and es2 * * @param es1 The first EventSet * @param es2 The second EventSet * @return the KC distance between them */ @Override public double distance(EventSet es1, EventSet es2) { EventHistogram h1 = es1.getHistogram(); EventHistogram h2 = es2.getHistogram(); Set<Event> s = new HashSet<Event>(); List<Pair<Event, Double>> l1 = new ArrayList<Pair<Event, Double>>(); List<Pair<Event, Double>> l2 = new ArrayList<Pair<Event, Double>>(); HashMap<Event, Integer> hm1 = new HashMap<Event, Integer>(); HashMap<Event, Integer> hm2 = new HashMap<Event, Integer>(); double oldfreq = Double.POSITIVE_INFINITY; double correlation = 0.0; s.addAll(es1.uniqueEvents()); s.addAll(es2.uniqueEvents()); // System.out.println(h1.toString()); // System.out.println(h2.toString()); /* make lists of the histograms */ for (Event e : h1) { l1.add(new Pair<Event, Double>(e, h1.getRelativeFrequency(e), 2)); } for (Event e : h2) { l2.add(new Pair<Event, Double>(e, h2.getRelativeFrequency(e), 2)); } /* sort the list so the most frequent items are at the top */ /* NOTE : THIS MAY BE USEFUL ELSEWHERE : SAVE THIS CODE */ Collections.sort(l1); Collections.reverse(l1); Collections.sort(l2); Collections.reverse(l2); /* DEBUGGING STUFF for (Pair <Event,Double> p : l1) { System.out.println("L1: " + p.toString()); } for (Pair <Event,Double> p : l1) { System.out.println("L2: " + p.toString()); } */ /* Convert lists into a hashmap of event:rank pairs */ int rank = 0; int count = 0; for (Pair<Event, Double> p : l1) { Event e = (Event) (p.getFirst()); double f = (Double) (p.getSecond()); count++; if (f != oldfreq) { rank = count; oldfreq = f; } hm1.put(e, rank); } /* reset and do second list */ rank = 0; count = 0; for (Pair<Event, Double> p : l2) { Event e = (Event) (p.getFirst()); double f = (Double) (p.getSecond()); count++; if (f != oldfreq) { rank = count; oldfreq = f; } hm2.put(e, rank); } /* More debugging stuff System.out.println(hm1.toString()); System.out.println(hm2.toString()); System.out.println(s.toString()); */ Integer x1, x2, y1, y2; Set<Event> s2 = new HashSet<Event>(s); for (Event e1 : s) { // s2.remove(e1); for (Event e2 : s2) { if (e1.equals(e2)) continue; /* get ranks of events e1 and e2 in both x and y distributions */ x1 = hm1.get(e1); /* if not present, rank is size + 1 */ if (x1 == null) x1 = hm1.size() + 1; x2 = hm2.get(e1); if (x2 == null) x2 = hm2.size() + 1; y1 = hm1.get(e2); /* if not present, rank is size + 1 */ // broke because if (y1 == null) x1 = hm1.size()+1; x1 should be y1 if (y1 == null) y1 = hm1.size() + 1; y2 = hm2.get(e2); if (y2 == null) y2 = hm2.size() + 1; /* more debugging stuff System.out.println(e1.toString() + " is ("+x1+","+x2+")"); System.out.println(e2.toString() + " is ("+y1+","+y2+")"); System.out.println(sgn(x1.compareTo(y1)) + " " + sgn(x2.compareTo(y2)) ); System.out.println(""); */ correlation += (sgn(x1.compareTo(y1)) * sgn(x2.compareTo(y2))); // System.out.println(correlation); } } // System.out.println(correlation); correlation /= (hm1.size() * (hm2.size() - 1)); // System.out.println(correlation); // System.out.println("---"); return 1.0 - correlation; }
/** * Converts the extracted document information into a JStylo DataMap * * @param features * @param relevantEvents * @param cumulativeFeatureDriver * @param documentData * @return */ public ConcurrentHashMap<Integer, FeatureData> createDocMap( List<String> features, List<EventSet> relevantEvents, CumulativeFeatureDriver cumulativeFeatureDriver, List<EventSet> documentData) { // generate training instances ConcurrentHashMap<Integer, FeatureData> documentMap = new ConcurrentHashMap<Integer, FeatureData>(); // remove metadata event EventSet metadata = documentData.remove(documentData.size() - 1); // go through all eventSets in the document for (EventSet es : documentData) { // initialize relevant information ArrayList<Integer> indices = new ArrayList<Integer>(); ArrayList<Event> events = new ArrayList<Event>(); EventHistogram currHistogram = new EventHistogram(); // whether or not we actually need this eventSet boolean eventSetIsRelevant = false; // find out if it is a histogram or not if (cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).isCalcHist()) { // find the event set in the list of relevant events for (EventSet res : relevantEvents) { if (es.getEventSetID().equals(res.getEventSetID())) { eventSetIsRelevant = true; break; } } // if it is relevant if (eventSetIsRelevant) { // find the indices of the events // and count all of the events for (Event e : es) { int currIndex = 0; boolean hasInner = false; // for the events n the set for (EventSet res : relevantEvents) { boolean found = false; for (Event re : res) { hasInner = true; // if they are the same event if (e.getEvent().equals(re.getEvent())) { boolean inList = false; for (Event el : events) { if (el.getEvent().equals(e.getEvent())) { inList = true; break; } } if (!inList) { indices.add(currIndex); events.add(e); } // Old location revert if change breaks currHistogram.add(e); found = true; } if (found) { break; } currIndex++; } if (found) { break; } // if there's no inner, it was a non-hist feature. // increment by one if (!hasInner) { currIndex++; } } } // calculate/add the histograms int index = 0; for (Integer i : indices) { documentMap.put( i, new FeatureData( cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(), cumulativeFeatureDriver .featureDriverAt(documentData.indexOf(es)) .getNormBaseline() .getTitle(), currHistogram.getAbsoluteFrequency(events.get(index)))); index++; } } } else { // non histogram feature // initialize the index int nonHistIndex = 0; // find the indices of the events // and count all of the events for (EventSet res : relevantEvents) { if (es.getEventSetID().equals(res.getEventSetID())) { break; } // count to find the index boolean hasInner = false; for (@SuppressWarnings("unused") Event re : res) { hasInner = true; nonHistIndex++; } // if ther's no inner feature, increment by one; we just passed a non-histogram if (!hasInner) nonHistIndex++; } // Extract and add the event String eventString = es.eventAt(0).getEvent(); int startIndex = eventString.indexOf("{"); int endIndex = eventString.indexOf("}"); eventString = eventString.substring(startIndex + 1, endIndex); FeatureData fd = new FeatureData( cumulativeFeatureDriver.featureDriverAt(documentData.indexOf(es)).getName(), cumulativeFeatureDriver .featureDriverAt(documentData.indexOf(es)) .getNormBaseline() .getTitle(), Math.round((float) Double.parseDouble(eventString))); documentMap.put(nonHistIndex, fd); } } // add metadata back. Not sure if necessary documentData.add(metadata); return documentMap; }