예제 #1
0
  /**
   * Returns the Kendall correlation (KC) distance between event sets es1 and es2.
   *
   * <p>Within each event set the events are ranked by descending relative frequency. Events with
   * equal frequency share a rank (e.g. 1, 2, 2, 4), and an event that does not occur in a set is
   * given rank {@code size + 1}, where {@code size} is the number of ranked events in that set.
   * For every ordered pair of distinct events in the union of both sets, the product of the signs
   * of the two rank comparisons is summed; the sum is divided by the number of ordered pairs to
   * obtain the correlation.
   *
   * @param es1 The first EventSet
   * @param es2 The second EventSet
   * @return the KC distance between them, {@code 1 - correlation}; {@code 0.0} when the union of
   *     both sets holds fewer than two distinct events
   */
  @Override
  public double distance(EventSet es1, EventSet es2) {

    HashMap<Event, Integer> ranks1 = rankByFrequency(es1.getHistogram());
    HashMap<Event, Integer> ranks2 = rankByFrequency(es2.getHistogram());

    Set<Event> union = new HashSet<Event>();
    union.addAll(es1.uniqueEvents());
    union.addAll(es2.uniqueEvents());

    int n = union.size();
    if (n < 2) {
      // No pair of events exists, so no pair can be ordered differently; under the missing-rank
      // rule both rankings are identical. Returning 0 avoids the 0/0 = NaN of the division below.
      return 0.0;
    }

    double correlation = 0.0;
    for (Event e1 : union) {
      /* ranks of e1 in both distributions; invariant over the inner loop */
      Integer x1 = rankOf(ranks1, e1);
      Integer x2 = rankOf(ranks2, e1);

      for (Event e2 : union) {
        if (e1.equals(e2)) {
          continue;
        }

        Integer y1 = rankOf(ranks1, e2);
        Integer y2 = rankOf(ranks2, e2);

        correlation += (sgn(x1.compareTo(y1)) * sgn(x2.compareTo(y2)));
      }
    }

    // The sum above covers every ordered pair of distinct events in the union, so that is the
    // count to normalise by; this keeps the correlation within [-1, 1] even when the two sets
    // contain different events. The double cast avoids int overflow for large unions.
    correlation /= ((double) n * (n - 1));

    return 1.0 - correlation;
  }

  /**
   * Builds an event-to-rank map for a histogram, most frequent event first.
   *
   * @param histogram the histogram whose events are ranked
   * @return map from each event in the histogram to its 1-based rank; tied frequencies share the
   *     rank of the first event in the tie
   */
  private HashMap<Event, Integer> rankByFrequency(EventHistogram histogram) {
    List<Pair<Event, Double>> byFrequency = new ArrayList<Pair<Event, Double>>();
    for (Event e : histogram) {
      // NOTE(review): the trailing 2 presumably tells Pair to compare on its second element
      // (the frequency) -- confirm against Pair.
      byFrequency.add(new Pair<Event, Double>(e, histogram.getRelativeFrequency(e), 2));
    }

    /* sort the list so the most frequent items are at the top */
    Collections.sort(byFrequency);
    Collections.reverse(byFrequency);

    HashMap<Event, Integer> ranks = new HashMap<Event, Integer>();
    // Starts fresh for every histogram so the first entry always receives rank 1.
    double previousFrequency = Double.POSITIVE_INFINITY;
    int rank = 0;
    int position = 0;
    for (Pair<Event, Double> p : byFrequency) {
      Event e = (Event) (p.getFirst());
      double frequency = (Double) (p.getSecond());
      position++;
      if (frequency != previousFrequency) {
        rank = position;
        previousFrequency = frequency;
      }
      ranks.put(e, rank);
    }
    return ranks;
  }

  /**
   * Looks up the rank of an event, falling back to {@code size + 1} when the event is not ranked.
   *
   * @param ranks event-to-rank map of one event set
   * @param e the event to look up
   * @return the stored rank, or {@code ranks.size() + 1} if the event is absent
   */
  private Integer rankOf(HashMap<Event, Integer> ranks, Event e) {
    Integer rank = ranks.get(e);
    if (rank == null) {
      rank = ranks.size() + 1;
    }
    return rank;
  }
예제 #2
0
  /**
   * Converts the extracted document information into a JStylo DataMap.
   *
   * <p>Every feature is keyed by its slot in the flattened list of relevant events: a relevant
   * event set occupies one slot per event it contains, or a single slot when it contains no events
   * (treated here as a non-histogram feature).
   *
   * <p>The last element of {@code documentData} is treated as the metadata event set and is
   * skipped. The list itself is never modified.
   *
   * @param features not read by this method
   * @param relevantEvents the relevant event sets that define the feature slots
   * @param cumulativeFeatureDriver supplies, by position, the feature driver of each event set in
   *     {@code documentData}
   * @param documentData the document's event sets, followed by one metadata event set
   * @return map from feature slot to the feature's data; empty when {@code documentData} holds no
   *     event sets besides the metadata entry
   */
  public ConcurrentHashMap<Integer, FeatureData> createDocMap(
      List<String> features,
      List<EventSet> relevantEvents,
      CumulativeFeatureDriver cumulativeFeatureDriver,
      List<EventSet> documentData) {

    ConcurrentHashMap<Integer, FeatureData> documentMap =
        new ConcurrentHashMap<Integer, FeatureData>();

    // Iterate a view that excludes the trailing metadata entry instead of removing and re-adding
    // it: the caller's list stays intact even if an exception is thrown part-way through.
    int featureSetCount = Math.max(0, documentData.size() - 1);
    int nextPosition = 0;

    for (EventSet es : documentData.subList(0, featureSetCount)) {
      // Position of this event set, which is also the index of its feature driver.
      int driverIndex = nextPosition++;

      if (cumulativeFeatureDriver.featureDriverAt(driverIndex).isCalcHist()) {

        // skip event sets that are not among the relevant ones
        if (!hasRelevantEventSet(relevantEvents, es)) {
          continue;
        }

        // slots[i] is the feature slot of distinctEvents[i]
        ArrayList<Integer> slots = new ArrayList<Integer>();
        ArrayList<Event> distinctEvents = new ArrayList<Event>();
        EventHistogram currHistogram = new EventHistogram();

        // find the slot of each relevant event and count its occurrences
        for (Event e : es) {
          int slot = slotOfRelevantEvent(relevantEvents, e);
          if (slot < 0) {
            continue; // not a relevant event
          }

          boolean inList = false;
          for (Event el : distinctEvents) {
            if (el.getEvent().equals(e.getEvent())) {
              inList = true;
              break;
            }
          }
          if (!inList) {
            slots.add(slot);
            distinctEvents.add(e);
          }
          currHistogram.add(e);
        }

        // add one feature per distinct relevant event, valued by its absolute frequency
        for (int i = 0; i < slots.size(); i++) {
          documentMap.put(
              slots.get(i),
              new FeatureData(
                  cumulativeFeatureDriver.featureDriverAt(driverIndex).getName(),
                  cumulativeFeatureDriver
                      .featureDriverAt(driverIndex)
                      .getNormBaseline()
                      .getTitle(),
                  currHistogram.getAbsoluteFrequency(distinctEvents.get(i))));
        }
      } else { // non histogram feature

        int nonHistIndex = slotOffsetOf(relevantEvents, es);

        // The value is the text between the first '{' and the first '}' of the first event.
        String eventString = es.eventAt(0).getEvent();
        int startIndex = eventString.indexOf("{");
        int endIndex = eventString.indexOf("}");
        eventString = eventString.substring(startIndex + 1, endIndex);

        FeatureData fd =
            new FeatureData(
                cumulativeFeatureDriver.featureDriverAt(driverIndex).getName(),
                cumulativeFeatureDriver
                    .featureDriverAt(driverIndex)
                    .getNormBaseline()
                    .getTitle(),
                Math.round((float) Double.parseDouble(eventString)));
        documentMap.put(nonHistIndex, fd);
      }
    }

    return documentMap;
  }

  /** Returns whether any relevant event set carries the same event set ID as {@code es}. */
  private boolean hasRelevantEventSet(List<EventSet> relevantEvents, EventSet es) {
    for (EventSet res : relevantEvents) {
      if (es.getEventSetID().equals(res.getEventSetID())) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the feature slot of the first relevant event whose event string equals that of
   * {@code e}, or -1 if there is none.
   */
  private int slotOfRelevantEvent(List<EventSet> relevantEvents, Event e) {
    int slot = 0;
    for (EventSet res : relevantEvents) {
      // Tracked per event set so that every empty set advances the slot by one, matching the
      // slot computation used for non-histogram features.
      boolean hasInner = false;
      for (Event re : res) {
        hasInner = true;
        if (e.getEvent().equals(re.getEvent())) {
          return slot;
        }
        slot++;
      }
      // if there's no inner, it was a non-hist feature; it occupies one slot
      if (!hasInner) {
        slot++;
      }
    }
    return -1;
  }

  /**
   * Returns the number of feature slots occupied by the relevant event sets that precede the one
   * sharing {@code es}'s event set ID, or by all of them when no ID matches.
   */
  private int slotOffsetOf(List<EventSet> relevantEvents, EventSet es) {
    int offset = 0;
    for (EventSet res : relevantEvents) {
      if (es.getEventSetID().equals(res.getEventSetID())) {
        break;
      }

      boolean hasInner = false;
      for (@SuppressWarnings("unused") Event re : res) {
        hasInner = true;
        offset++;
      }

      // an event set without events is a non-histogram feature and occupies one slot
      if (!hasInner) {
        offset++;
      }
    }
    return offset;
  }