Exemple #1
0
  /**
   * Builds the protovis chart markup for one pair of outgoing/incoming date lists.
   *
   * <p>Both lists are bucketed over {@code intervals} via {@code CalendarUtil.computeHistogram};
   * the two histograms, the raw list sizes and the first/last entry of {@code intervals} are then
   * handed to {@code getProtoVizBox} with focusOnly switched off.
   *
   * @param chartSpec passed through to {@code getProtoVizBox}
   * @param outDates dates of outgoing messages
   * @param inDates dates of incoming messages
   * @param intervals interval boundaries; must contain at least one entry
   * @param normalizeCount normalizer passed through to {@code getProtoVizBox}
   * @param width passed through to {@code getProtoVizBox}
   * @param height passed through to {@code getProtoVizBox}
   * @param showTotals passed through to {@code getProtoVizBox}
   * @param inNOut passed through to {@code getProtoVizBox}
   * @param browseParams passed through to {@code getProtoVizBox}
   * @return whatever {@code getProtoVizBox} produces for these inputs
   */
  public static String getProtovisForDates(
      String[] chartSpec,
      List<Date> outDates,
      List<Date> inDates,
      List<Date> intervals,
      int normalizeCount,
      int width,
      int height,
      boolean showTotals,
      boolean inNOut,
      String browseParams) {
    // the incoming histogram is computed first, then the outgoing one
    final int[] incomingCounts = CalendarUtil.computeHistogram(inDates, intervals);
    final int[] outgoingCounts = CalendarUtil.computeHistogram(outDates, intervals);

    final int totalOutgoing = outDates.size();
    final int totalIncoming = inDates.size();
    final Date rangeStart = intervals.get(0);
    final Date rangeEnd = intervals.get(intervals.size() - 1);
    final boolean focusOnly = false;

    return getProtoVizBox(
        chartSpec,
        totalOutgoing,
        totalIncoming,
        outgoingCounts,
        incomingCounts,
        normalizeCount,
        width,
        height,
        showTotals,
        inNOut,
        rangeStart,
        rangeEnd,
        focusOnly,
        browseParams);
  }
Exemple #2
0
  /**
   * Returns the largest single-bucket count found in either the incoming or the outgoing
   * histogram computed over {@code intervals}.
   *
   * @return the peak bucket count, or {@code Integer.MIN_VALUE} if both histograms are empty
   */
  public static int findMaxInOrOutInAnInterval(
      List<Date> inDates, List<Date> outDates, List<Date> intervals) {
    int peak = Integer.MIN_VALUE;

    for (int count : CalendarUtil.computeHistogram(inDates, intervals)) {
      peak = Math.max(peak, count);
    }
    for (int count : CalendarUtil.computeHistogram(outDates, intervals)) {
      peak = Math.max(peak, count);
    }

    return peak;
  }
Exemple #3
0
 /**
  * Returns the largest bucket count in the histogram of {@code dates} over {@code intervals}.
  *
  * @return the peak bucket count; {@code Integer.MIN_VALUE} when {@code dates} is null or the
  *     histogram has no buckets
  */
 private static int findNormalizingMax(List<Date> dates, List<Date> intervals) {
   int peak = Integer.MIN_VALUE;
   if (dates != null) {
     for (int count : CalendarUtil.computeHistogram(dates, intervals)) {
       peak = Math.max(peak, count);
     }
   }
   return peak;
 }
Exemple #4
0
 /**
  * Renders this entry as HTML table rows: for every element of {@code clues} one summary row
  * (linked entity, last-seen date, message count, thread count, clue score) followed by one row
  * holding the clue text and its stats, or "No clue" when the element is null.
  *
  * <p>Note: {@code link}, {@code displayEntity} and the clue text are inserted verbatim, without
  * HTML escaping.
  *
  * @return the concatenated rows; the empty string when {@code clues} is null or empty
  */
 public String toHTMLString() {
   // clues is only assigned when a clue was actually generated, so it may legitimately be null
   if (clues == null) return "";

   StringBuilder html = new StringBuilder();
   for (Clue clue : clues) {
     html.append("<tr><td><a href='")
         .append(link)
         .append("' target='_blank'>")
         .append(displayEntity)
         .append("</a></td><td>")
         .append(edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate))
         .append("</td><td>")
         .append(nMessages)
         .append("</td><td>")
         .append(nThreads)
         .append("</td><td>")
         .append(clue != null ? clue.clueStats.finalScore : "-")
         .append("</td></tr>")
         .append("<tr><td class=\"clue\" colspan=\"6\">");

     if (clue != null) {
       html.append(clue.clue)
           .append("<br/><br/><div class=\"stats\"> stats: ")
           .append(Util.fieldsToString(clue.clueStats, false))
           .append("</div>"); // close the stats div only where it was opened
     } else {
       html.append("No clue");
     }

     html.append("<br/><br/></td></tr><br>");
   }
   return html.toString();
 }
Exemple #5
0
  /**
   * Computes consecutive 30-day intervals, working backwards from {@code latestDate} until the
   * day containing {@code earliestDate} is covered. The most recent interval is element 0.
   *
   * <p>Every interval runs from 00:00:00.000 of its first day to 23:59:59.999 of its last day, so
   * that neighbouring intervals leave no sub-second gap between them.
   *
   * @param earliestDate oldest date that must fall inside some interval; may be null
   * @param latestDate date whose day closes the most recent interval; may be null
   * @return the intervals as (start, end) pairs, newest first; empty if either argument is null
   *     or if the end of {@code latestDate}'s day lies before {@code earliestDate}
   */
  private static List<Pair<Date, Date>> computeDateIntervals(Date earliestDate, Date latestDate) {
    final int daysPerInterval = 30;
    List<Pair<Date, Date>> intervals = new ArrayList<Pair<Date, Date>>();

    JSPHelper.log.info("computing time intervals");
    // check for missing dates before anything tries to format or compare them
    if (earliestDate == null || latestDate == null) return intervals;
    JSPHelper.log.info(
        "closing = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate));

    Calendar cal = new GregorianCalendar();
    cal.setTime(latestDate);
    // move to the very end of the closing day
    cal.set(Calendar.HOUR_OF_DAY, 23);
    cal.set(Calendar.MINUTE, 59);
    cal.set(Calendar.SECOND, 59);
    cal.set(Calendar.MILLISECOND, 999);

    // Compare against the END of the closing day: comparing against its start would stop one
    // interval early whenever earliestDate falls on the day just before the last interval.
    while (!cal.getTime().before(earliestDate)) {
      Date endDate = cal.getTime();

      // scroll back by daysPerInterval days; 1- because we want from 0:00 of first date to
      // 23:59 of last date
      cal.add(Calendar.DATE, 1 - daysPerInterval);
      cal.set(Calendar.HOUR_OF_DAY, 0);
      cal.set(Calendar.MINUTE, 0);
      cal.set(Calendar.SECOND, 0);
      cal.set(Calendar.MILLISECOND, 0);
      Date startDate = cal.getTime();

      intervals.add(new Pair<Date, Date>(startDate, endDate));

      // the next (older) interval closes at the end of the day before startDate
      cal.add(Calendar.DATE, -1);
      cal.set(Calendar.HOUR_OF_DAY, 23);
      cal.set(Calendar.MINUTE, 59);
      cal.set(Calendar.SECOND, 59);
      cal.set(Calendar.MILLISECOND, 999);
    }
    return intervals;
  }
Exemple #6
0
  /**
   * Produces one protovis activity chart (incoming and outgoing message counts over time) per
   * group, all drawn against a single normalizing maximum taken across the groups. Optionally the
   * group members' names and a link to the group's messages are appended below each chart.
   *
   * @param addressBook supplies the owner's addresses, canonicalizes participant addresses and
   *     classifies each message as sent and/or received
   * @param groups the groups to chart; each message is assigned to the group chosen by {@code
   *     Group.bestFit}, or to none when that returns -1
   * @param allDocs the messages to distribute over the groups; also determines the overall date
   *     range
   * @param nIntervals number of intervals the overall date range is divided into
   * @param width chart width passed to {@code getProtoVizBox}
   * @param height chart height passed to {@code getProtoVizBox}
   * @param generateNames whether to append member names and the messages link to each chart
   * @return one markup string per entry of {@code groups}, in the same order
   */
  public static List<String> getProtovisForGroups(
      AddressBook addressBook,
      List<SimilarGroup<String>> groups,
      Collection<EmailDocument> allDocs,
      int nIntervals,
      int width,
      int height,
      boolean generateNames) {
    final int nGroups = groups.size();

    // per-group buckets of message dates, split by direction
    List<Date>[] received = new ArrayList[nGroups];
    List<Date>[] sent = new ArrayList[nGroups];
    for (int g = 0; g < nGroups; g++) {
      received[g] = new ArrayList<Date>();
      sent[g] = new ArrayList<Date>();
    }

    // assign every message to its best-fitting group, if there is one
    for (EmailDocument message : allDocs) {
      List<String> participants =
          addressBook.convertToCanonicalAddrs(
              message.getParticipatingAddrsExcept(addressBook.getOwnAddrs()));
      Collections.sort(participants);

      int g = Group.bestFit(groups, new Group<String>(participants));
      if (g == -1) continue;

      int direction = message.sentOrReceived(addressBook);
      if ((direction & EmailDocument.RECEIVED_MASK) != 0) received[g].add(message.date);
      if ((direction & EmailDocument.SENT_MASK) != 0) sent[g].add(message.date);
    }

    // one shared time axis for all groups
    Pair<Date, Date> span = EmailUtils.getFirstLast(allDocs);
    List<Date> intervals =
        CalendarUtil.divideIntoIntervals(span.getFirst(), span.getSecond(), nIntervals);

    // one shared normalizer: the largest per-group normalizing maximum
    int normalizer = Integer.MIN_VALUE;
    for (int g = 0; g < nGroups; g++) {
      normalizer =
          Math.max(normalizer, normalizingMax(received[g], sent[g], intervals, /* inNOut */ true));
    }

    List<String> charts = new ArrayList<String>();
    for (int g = 0; g < nGroups; g++) {
      int[] receivedCounts = CalendarUtil.computeHistogram(received[g], intervals);
      int[] sentCounts = CalendarUtil.computeHistogram(sent[g], intervals);
      String messagesUrl = JSPHelper.getURLForGroupMessages(g);

      StringBuilder markup = new StringBuilder();
      markup.append(
          getProtoVizBox(
              null,
              sent[g].size(),
              received[g].size(),
              sentCounts,
              receivedCounts,
              normalizer,
              width,
              height,
              true,
              true,
              intervals.get(0),
              intervals.get(intervals.size() - 1),
              true /*focusOnly*/,
              "'" + messagesUrl + "'"));

      if (generateNames) {
        // member names, one per line, followed by a link to the group's messages
        markup.append("<br/><span style=\"font-size:small\">");
        for (String member : groups.get(g).elements) {
          markup.append(Util.strippedEmailAddress(member)).append("<br/>");
        }
        markup
            .append("<a href=\"")
            .append(messagesUrl)
            .append(
                "\" target=\"_new\"><img title=\"Messages\" src=\"/muse/images/email.jpg\" width=\"25\"/>")
            .append("</a>");
        markup.append("</span>");
      }

      charts.add(markup.toString());
    }

    return charts;
  }
Exemple #7
0
  /**
   * Emits the HTML/JavaScript snippet that draws an outgoing/incoming frequency chart.
   *
   * <p>When {@code chartSpec} is null the snippet is a script calling {@code draw_protovis_box},
   * which draws at the current location. Otherwise {@code chartSpec[0]} must be a "#"-prefixed
   * div id: an empty div with that id is emitted first, followed by a script calling {@code
   * draw_chart} for that div (see filter.jsp / filter_common.html for the slider-related entries
   * chartSpec[1..2] = dateSliderBar, dateSliderText; they are not used here).
   *
   * <p>Each data series is padded with a leading and a trailing 0 so the area chart is nicely
   * bounded; the start month passed to JavaScript is therefore one less than the month of {@code
   * firstDate}.
   *
   * @param chartSpec null, or an array whose first element is the "#"-prefixed chart div id
   * @param out outgoing counts per interval
   * @param in incoming counts per interval; its length is soft-asserted to equal the number of
   *     months between the two dates
   * @param normalizer normalizing value handed to the JavaScript draw function
   * @param width chart width
   * @param height chart height
   * @param inNOut whether the incoming series is emitted; when false the JavaScript receives
   *     {@code null} in its place
   * @param firstDate start of the charted range, inclusive
   * @param lastDate end of the charted range, exclusive
   * @param focusOnly passed to {@code draw_chart}
   * @param browseParams JavaScript expression stored in {@code browse_params}; {@code ''} is used
   *     when null or empty
   * @return the generated markup
   */
  public static String getProtoVizMark(
      String[] chartSpec,
      int[] out,
      int[] in,
      int normalizer,
      int width,
      int height,
      boolean inNOut,
      Date firstDate,
      Date lastDate,
      boolean focusOnly,
      String browseParams) {
    final String browseExpr = Util.nullOrEmpty(browseParams) ? "''" : browseParams;

    // JavaScript array literals, zero-padded at both ends
    final String outgoingSeries = zeroPaddedSeries(out);
    final String incomingSeries = inNOut ? zeroPaddedSeries(in) : null;

    // Calendar is used because Date.getYear() is deprecated
    Calendar first = new GregorianCalendar();
    first.setTime(firstDate); // inclusive
    Calendar last = new GregorianCalendar();
    last.setTime(lastDate); // exclusive
    final int startYear = first.get(Calendar.YEAR);
    // one month earlier than firstDate, to account for the left dummy pad
    final int startMonth = first.get(Calendar.MONTH) - 1;
    final int nMonths = CalendarUtil.getDiffInMonths(firstDate, lastDate);
    Util.softAssert(nMonths == in.length);

    StringBuilder markup = new StringBuilder();

    if (chartSpec == null) {
      markup
          .append("<script type=\"text/javascript\">")
          .append("draw_protovis_box(")
          .append(incomingSeries)
          .append(", ")
          .append(outgoingSeries)
          .append(", ")
          .append(width)
          .append(", ")
          .append(height)
          .append(", ")
          .append(normalizer)
          .append(", ")
          .append(startYear)
          .append(",")
          .append(startMonth)
          .append(", ")
          .append(last.get(Calendar.YEAR))
          .append(",")
          .append(last.get(Calendar.MONTH))
          .append(");");
    } else {
      final String canvasSelector = chartSpec[0];
      Util.ASSERT(canvasSelector.startsWith("#"));
      final String canvasId = canvasSelector.substring(canvasSelector.startsWith("#") ? 1 : 0);

      // the canvas div must appear before the protovis invocation
      markup.append("<div id='").append(canvasId).append("'></div>");

      markup
          .append("<script type=\"text/javascript\">")
          .append("draw_chart('")
          .append(canvasSelector)
          .append("'")
          .append(", data_bottom['")
          .append(canvasId)
          .append("']=")
          .append(incomingSeries)
          .append(", data_top['")
          .append(canvasId)
          .append("']=")
          .append(outgoingSeries)
          .append(", ")
          .append(normalizer)
          .append(", ")
          .append(startYear)
          .append(",")
          .append(startMonth)
          .append(", ")
          .append(width)
          .append(", ")
          .append(height)
          .append(", ")
          .append(focusOnly)
          .append(", browse_params['")
          .append(canvasId)
          .append("']=")
          .append(browseExpr)
          .append(")");
    }

    markup.append("</script>");
    return markup.toString();
  }

  /** Formats counts as a JavaScript array literal with an extra 0 before and after the data. */
  private static String zeroPaddedSeries(int[] counts) {
    StringBuilder series = new StringBuilder("[0");
    for (int count : counts) {
      series.append(',').append(count);
    }
    return series.append(",0]").toString();
  }
Exemple #8
0
  /**
   * Generates person-name memory questions from the given archive and stores them, sorted and
   * numbered, in {@code questions}.
   *
   * <p>Outline:
   *
   * <ol>
   *   <li>Hash every (canonicalized) sentence in the corpus; sentences seen more than once are
   *       marked taboo for clue generation.
   *   <li>For every contact on a sent message, record the latest date the contact was seen.
   *   <li>Compute date intervals going back from the current date (interval 0 is the most
   *       recent) and file every contact under the interval containing its last-seen date.
   *   <li>Per interval, try to create a clue for every contact whose best name consists only of
   *       letters and spaces, keep the first {@code ceil(numClues / #intervals)} clues in sorted
   *       order, and turn each into a {@code MemoryQuestion}.
   * </ol>
   *
   * @param archive the archive to draw messages and contacts from
   * @param nerModel passed on to the clue generator
   * @param allDocs currently unused; the documents are taken from {@code archive.getAllDocs()}
   * @param lex passed on to the clue generator
   * @param numClues total number of questions aimed for across all intervals
   * @throws IOException declared for the archive / clue-generation calls made here
   */
  public void generatePersonNameQuestions(
      Archive archive,
      NERModel nerModel,
      Collection<EmailDocument> allDocs,
      Lexicon lex,
      int numClues)
      throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException,
          ParseException {
    this.archive = archive;
    questions = new ArrayList<>();
    ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex);

    tabooCluesSet = new LinkedHashSet<>();
    archive.assignThreadIds();

    List<ClueEvaluator> evaluators = getDefaultEvals();

    List<Document> docs = archive.getAllDocs();
    Multimap<Contact, EmailDocument> contactToMessages = LinkedHashMultimap.create();
    Multimap<Contact, Long> contactToThreadIds = LinkedHashMultimap.create();

    // sort by date
    Collections.sort(docs);

    Date earliestDate = null, latestDate = null;
    Map<Contact, Date> contactToLatestDate = new LinkedHashMap<>();

    // compute earliest and latest date across all messages in corpus
    for (Document doc : docs) {
      EmailDocument ed = (EmailDocument) doc;

      if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date;
      if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date;
    }
    JSPHelper.log.info(
        "===================\nStarting to generate person names memory questions from "
            + docs.size()
            + " messages with "
            + numClues
            + " questions"
            + ", earliest date = "
            + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate)
            + " latest date = "
            + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate));

    Set<Integer> tabooSentenceHashes = new LinkedHashSet<>();

    // create hashes of all sentences seen at least twice (case insensitive, lower cased)
    {
      Set<Integer> hashesSeen = new LinkedHashSet<>();
      for (Document d : docs) {
        String contents = archive.getContents(d, true);
        String cleanedContents = EmailUtils.cleanupEmailMessage(contents);
        SentenceTokenizer st = new SentenceTokenizer(cleanedContents);
        while (st.hasMoreSentences()) {
          String sentence = canonicalizeSentence(st.nextSentence());
          int hashCode = sentence.hashCode();
          // add() returns false when the hash was already present, i.e. a repeated sentence
          if (!hashesSeen.add(hashCode)) {
            tabooSentenceHashes.add(hashCode);
            log.info("Marking sentence as taboo: " + sentence);
          }
        }
      }
    }

    // compute contactToLatestDate that contact has been seen on
    for (Document doc : docs) {
      EmailDocument ed = (EmailDocument) doc;
      // discard doc if it is not a sent mail
      if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue;

      for (Contact c : ed.getParticipatingContactsExceptOwn(archive.addressBook)) {
        Date currentLatestDate = contactToLatestDate.get(c);
        if (currentLatestDate == null || currentLatestDate.before(ed.date))
          contactToLatestDate.put(c, ed.date);
        contactToMessages.put(c, ed);
        contactToThreadIds.put(c, ed.threadID);
      }
    }

    log.info("We are considering " + contactToLatestDate.size() + " contacts");

    Date currentDate = new Date();
    List<Pair<Date, Date>> intervals =
        computeDateIntervals(earliestDate, currentDate); // go back from current date
    // intervals[0] is the most recent.
    JSPHelper.log.info("done computing " + intervals.size() + " intervals");
    for (Pair<Date, Date> p : intervals)
      JSPHelper.log.info(
          "Interval: "
              + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst())
              + " - "
              + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond()));

    // ceil(numClues / #intervals), or 0 when there is nothing to distribute
    int cluesPerInterval =
        (numClues > 0 && intervals.size() > 0)
            ? (numClues + intervals.size() - 1) / intervals.size()
            : 0;
    JSPHelper.log.info(
        "Will try to generate " + Util.pluralize(cluesPerInterval, "questions") + " per interval");

    Multimap<Integer, Contact> intervalToContacts = LinkedHashMultimap.create();

    // nSent is the number of sentences allowed in a clue text
    int nSent = 2;
    for (Contact c : contactToLatestDate.keySet()) {
      Date lastSeenDate = contactToLatestDate.get(c);

      // which interval does this date belong to? we'll assign this contact in that interval in the
      // intervalToContacts map
      int interval = -1;
      for (int i = 0; i < intervals.size(); i++) {
        Date intervalStart = intervals.get(i).getFirst();
        Date intervalEnd = intervals.get(i).getSecond();
        // inclusive on both ends
        if (!lastSeenDate.before(intervalStart) && !lastSeenDate.after(intervalEnd)) {
          interval = i;
          break;
        }
      }

      if (interval < 0) {
        JSPHelper.log.info(
            "What, no interval!? for "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate));
        continue;
      }

      intervalToContacts.put(interval, c);
    }

    log.info("Interval information (interval 0 is the most recent):");
    for (int interval = 0; interval < intervals.size(); interval++) {
      // Multimap.get never returns null; an unknown key yields an empty collection
      int nContactsForThisInterval = intervalToContacts.get(interval).size();
      log.info(
          "In interval "
              + interval
              + " there are "
              + Util.pluralize(nContactsForThisInterval, "candidate contact")
              + " who were last seen in this interval");
    }

    for (int interval = 0; interval < intervals.size(); interval++) {
      Date intervalStart = intervals.get(interval).getFirst();
      Date intervalEnd = intervals.get(interval).getSecond();
      Collection<Contact> candidateContactsForThisInterval = intervalToContacts.get(interval);
      // an interval without contacts shows up as an empty collection, not as null
      if (candidateContactsForThisInterval.isEmpty()) {
        log.info("Skipping interval " + interval + " because there are no contacts");
        continue;
      }

      Map<Clue, Contact> clueToContact = new LinkedHashMap<>();
      log.info("=======\nGenerating questions for interval " + interval);

      outer:
      for (Contact c : candidateContactsForThisInterval) {
        String name = c.pickBestName();
        // skip contacts without a usable name; could also check if alphanumeric only
        if (name == null || name.length() < 2) continue outer;

        // ignore contact if name does not contain all alphabets. Even a period is not allowed. only
        // space is allowed.
        for (char ch : name.toCharArray()) {
          if (!Character.isAlphabetic(ch) && !Character.isSpaceChar(ch)) continue outer;
        }

        Clue clue =
            cluer.createPersonNameClue(
                c,
                evaluators,
                nerModel,
                intervalStart,
                intervalEnd,
                nSent,
                archive,
                tabooSentenceHashes);
        if (clue != null) clueToContact.put(clue, c);
      }

      List<Clue> clueList = new ArrayList<>(clueToContact.keySet());
      Collections.sort(clueList);
      List<Clue> selectedClues = new ArrayList<>();
      for (int i = 0; i < cluesPerInterval && i < clueList.size(); i++) {
        selectedClues.add(clueList.get(i));
      }

      log.info(
          "For interval "
              + interval
              + " selected "
              + selectedClues.size()
              + " contacts out of "
              + clueList.size()
              + " possible candidates.");

      for (Clue selectedClue : selectedClues) {
        Contact c = clueToContact.get(selectedClue);
        String name = c.pickBestName();

        // e.g. "2 words: 4 letters, 5 letters"; built with a separator so that an empty length
        // list yields an empty description instead of a failing substring()
        List<Integer> lengthList = Crossword.convertToWord(name).getSecond();
        StringBuilder lengthDescr = new StringBuilder();
        if (lengthList.size() > 1) lengthDescr.append(lengthList.size()).append(" words: ");
        String separator = "";
        for (Integer wordLength : lengthList) {
          lengthDescr.append(separator).append(Util.pluralize(wordLength, "letter"));
          separator = ", ";
        }

        // NOTE(review): ci is filled in but never attached to the question or stored anywhere
        ClueInfo ci = new ClueInfo();
        ci.lastSeenDate = contactToLatestDate.get(c);
        ci.nMessages = contactToMessages.get(c).size();
        ci.nThreads = contactToThreadIds.get(c).size();

        questions.add(new MemoryQuestion(this, name, selectedClue, 1, lengthDescr.toString()));
      }
    }

    log.info(questions.size() + " questions generated");

    log.info("Top candidates are:");

    // sort q's by clue score
    Collections.sort(questions);

    int count = 0;
    for (MemoryQuestion mq : questions) {
      mq.setQuestionNum(count++);
    }

    // log the questions as well, just in case we don't get to the final point due to user fatigue
    // or crashes
    logStats("questions.final", false);
  }
Exemple #9
0
  /**
   * Generates the list of questions and stores it, sorted and numbered, in the current instance
   * of MemoryStudy.
   *
   * <p>Two kinds of questions are handled. The non-person test is a fill-in-the-blank question
   * whose blank is to be completed with the correct non-person entity. The person test asks to
   * guess a correspondent based on some distinctive sentences in the mail; for it only sent
   * messages that did not go to mailing lists are considered, and only their "to" recipients.
   *
   * <p>Entities are canonicalized, the date of the last message mentioning each is recorded
   * (messages are processed in sorted order), and every entity is filed under the 30-day
   * interval containing that date (interval 0 is the most recent). Entities that fall into no
   * interval are logged and skipped.
   *
   * @param archive the archive to draw messages and entities from
   * @param nerModel passed on to the clue generator
   * @param allDocs currently unused; the documents are taken from {@code archive.getAllDocs()}
   * @param lex passed on to the clue generator
   * @param maxInt max. number of entities considered from an interval
   * @param personTest true for the person-name test, false for the fill-in-the-blank test
   * @throws IOException declared for the archive / clue-generation calls made here
   */
  public void generateQuestions(
      Archive archive,
      NERModel nerModel,
      Collection<EmailDocument> allDocs,
      Lexicon lex,
      int maxInt,
      boolean personTest)
      throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException,
          ParseException {
    this.archive = archive;
    questions = new ArrayList<>();
    ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex);

    double CUTOFF = 0.001;
    tabooCluesSet = new LinkedHashSet<>();
    archive.assignThreadIds();

    List<Document> docs = archive.getAllDocs();
    Map<String, Date> entityToLastDate = new LinkedHashMap<>();
    Multimap<String, EmailDocument> entityToMessages = LinkedHashMultimap.create();
    Multimap<String, Long> entityToThreads = LinkedHashMultimap.create();
    Multimap<String, String> ceToDisplayEntity = LinkedHashMultimap.create();

    int di = 0;

    // sort by date
    Collections.sort(docs);

    Set<String> ownerNames = archive.ownerNames;
    Date earliestDate = null, latestDate = null;
    Set<String> allEntities = new LinkedHashSet<>();
    for (Document doc : docs) {
      EmailDocument ed = (EmailDocument) doc;
      if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date;
      if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date;

      List<String> entities = new ArrayList<>();
      if (!personTest) {
        entities.addAll(
            Arrays.stream(archive.getAllNamesInDoc(doc, true))
                .filter(n -> n.typeScore > CUTOFF)
                .map(n -> n.text)
                .collect(Collectors.toList()));
      } else {
        // do not consider mailing lists
        if (ed.sentToMailingLists != null && ed.sentToMailingLists.length > 0) continue;
        // discard doc if it is not a sent mail
        if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue;

        if (ed.to != null) {
          for (Address addr : ed.to) {
            Contact c = archive.addressBook.lookupByAddress(addr);
            // NOTE(review): presumably the lookup can miss; skip rather than dereference null
            if (c == null) continue;
            String name = c.pickBestName();
            if (name == null) continue;
            if (!ownerNames.contains(name) && !DictUtils.hasDictionaryWord(name)) {
              entities.add(name);
            }
          }
        }
      }
      allEntities.addAll(entities);

      // get entities
      for (String e : entities) {
        if (Util.nullOrEmpty(e)) continue;
        e = e.replaceAll("^\\W+|\\W+$", "");
        if (e.length() > 10 && e.toUpperCase().equals(e))
          continue; // all upper case, more than 10 letters, you're out.

        String ce = DictUtils.canonicalize(e); // canonicalize
        if (ce == null) {
          JSPHelper.log.info("Dropping entity: " + e);
          continue;
        }

        ceToDisplayEntity.put(ce, e);
        // docs are processed in sorted order, so the last write wins as the last-seen date
        entityToLastDate.put(ce, ed.date);
        entityToMessages.put(ce, ed);
        entityToThreads.put(ce, ed.threadID);
      }

      if ((++di) % 1000 == 0) log.info(di + " of " + docs.size() + " messages processed...<br/>");
    }
    log.info(
        "Considered #"
            + allEntities.size()
            + " unique entities and #"
            + ceToDisplayEntity.size()
            + " good ones in #"
            + docs.size()
            + " docs<br>");
    log.info("Owner Names: " + ownerNames);
    JSPHelper.log.info(
        "Considered #"
            + allEntities.size()
            + " unique entities and #"
            + ceToDisplayEntity.size()
            + " good ones in #"
            + docs.size()
            + "docs");

    JSPHelper.log.info(
        "earliest date = "
            + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate));
    JSPHelper.log.info(
        "latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate));

    // Compute date intervals, working backwards from latestDate until earliestDate is covered.
    // Each interval runs from 00:00:00.000 of its first day to 23:59:59.999 of its last day.
    int DAYS_PER_INTERVAL = 30;
    List<Pair<Date, Date>> intervals = new ArrayList<Pair<Date, Date>>();
    {
      JSPHelper.log.info("computing time intervals");
      JSPHelper.log.info(
          "closing = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate));

      // both dates are null when the archive has no documents; then there are no intervals
      if (earliestDate != null && latestDate != null) {
        Calendar cal = new GregorianCalendar();
        cal.setTime(latestDate); // this is the time of the last sighting of any term
        cal.set(Calendar.HOUR_OF_DAY, 23);
        cal.set(Calendar.MINUTE, 59);
        cal.set(Calendar.SECOND, 59);
        cal.set(Calendar.MILLISECOND, 999);

        // Compare against the END of the closing day, so that the day containing earliestDate
        // (and a corpus whose messages all share one date) still gets an interval.
        while (!cal.getTime().before(earliestDate)) {
          Date endDate = cal.getTime();

          // 1- because we want from 0:00 of first date to 23:59 of last date
          cal.add(Calendar.DATE, 1 - DAYS_PER_INTERVAL);
          cal.set(Calendar.HOUR_OF_DAY, 0);
          cal.set(Calendar.MINUTE, 0);
          cal.set(Calendar.SECOND, 0);
          cal.set(Calendar.MILLISECOND, 0);
          Date startDate = cal.getTime();

          intervals.add(new Pair<Date, Date>(startDate, endDate));

          // the next (older) interval closes at the end of the day before startDate
          cal.add(Calendar.DATE, -1);
          cal.set(Calendar.HOUR_OF_DAY, 23);
          cal.set(Calendar.MINUTE, 59);
          cal.set(Calendar.SECOND, 59);
          cal.set(Calendar.MILLISECOND, 999);
        }
      }
      JSPHelper.log.info("done computing intervals, #time intervals: " + intervals.size());
      for (Pair<Date, Date> p : intervals)
        JSPHelper.log.info(
            "Interval: "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst())
                + " - "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond()));
    }

    // initialize clueInfos to empty lists, one per interval
    // NOTE(review): clueInfos is filled below but not read anywhere in this method
    List<List<ClueInfo>> clueInfos = new ArrayList<>();
    for (int i = 0; i < intervals.size(); i++) {
      clueInfos.add(new ArrayList<ClueInfo>());
    }

    Map<Integer, Integer> intervalCount = new LinkedHashMap<>();
    // nSent is the number of sentences allowed in a clue text
    int nvalidclues = 0, nSent = 2;
    // generate clueInfos for each entity
    for (String ce : entityToLastDate.keySet()) {
      Date lastSeenDate = entityToLastDate.get(ce);

      // compute fullAnswer: the first display form of the entity with stop words removed
      String fullAnswer = "";
      {
        List<String> tokens = Util.tokenize(ceToDisplayEntity.get(ce).iterator().next());
        for (String t : tokens) {
          if (EnglishDictionary.stopWords.contains(t.toLowerCase())) continue;
          fullAnswer += t + " ";
        }
        fullAnswer = fullAnswer.trim();
      }
      // dont want the answer to be scored low just because it has extra non-word chars in the begin
      // or end
      fullAnswer = fullAnswer.replaceAll("^\\W+|\\W+$", "");

      // which interval does this date belong to?
      int interval = -1;
      Date intervalStart = null, intervalEnd = null;
      for (int i = 0; i < intervals.size(); i++) {
        Date start = intervals.get(i).getFirst();
        Date end = intervals.get(i).getSecond();
        // inclusive on both ends
        if (!lastSeenDate.before(start) && !lastSeenDate.after(end)) {
          interval = i;
          intervalStart = start;
          intervalEnd = end;
          break;
        }
      }
      if (interval < 0) {
        JSPHelper.log.info(
            "What, no interval!? for "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate));
        // without an interval there is no date range to clue on and no clueInfos slot to use
        continue;
      }

      // cap the number of entities taken from one interval at maxInt
      int seenInInterval = intervalCount.getOrDefault(interval, 0);
      if (seenInInterval >= maxInt) continue;
      intervalCount.put(interval, seenInInterval + 1);

      // e.g. "2 words: 4 letters, 5 letters"; built with a separator so that an empty length
      // list yields an empty description instead of a failing substring()
      List<Integer> lengthList = Crossword.convertToWord(fullAnswer).getSecond();
      StringBuilder lengthDescr = new StringBuilder();
      if (lengthList.size() > 1) lengthDescr.append(lengthList.size()).append(" words: ");
      String separator = "";
      for (Integer wordLength : lengthList) {
        lengthDescr.append(separator).append(Util.pluralize(wordLength, "letter"));
        separator = ", ";
      }

      ClueInfo ci = new ClueInfo();
      ci.link = "../browse?term=\"" + fullAnswer + "\"&sort_by=recent&searchType=original";
      ci.lastSeenDate = lastSeenDate;
      ci.nMessages = entityToMessages.get(ce).size();
      ci.nThreads = entityToThreads.get(ce).size();

      // TODO: we are doing default initialisation of evaluators by setting it to null below, it is
      // more appropriate to consider it as an argument for this method
      Clue clue =
          cluer.createClue(
              fullAnswer,
              (personTest
                  ? ArchiveCluer.QuestionType.GUESS_CORRESPONDENT
                  : ArchiveCluer.QuestionType.FILL_IN_THE_BLANK),
              null,
              tabooCluesSet,
              null,
              intervalStart,
              intervalEnd,
              nSent,
              archive);

      if (clue == null) {
        JSPHelper.log.warn("Did not find any clue for: " + fullAnswer);
      } else {
        ci.clues = new Clue[] {clue};
        // is the times value of the clue important?
        questions.add(new MemoryQuestion(this, fullAnswer, clue, 1, lengthDescr.toString()));
        nvalidclues++;
        // makes sure that the clue with the same statement is not generated again
        tabooCluesSet.add(clue.clue);
      }
      clueInfos.get(interval).add(ci);
    }
    log.info("Found valid clues for " + nvalidclues + " answers");
    JSPHelper.log.info("Found valid clues for " + nvalidclues + " answers");

    log.info("Top candidates are:");
    for (MemoryQuestion mq : questions)
      log.info(mq.correctAnswer + " times=" + mq.stats.nMessagesWithAnswer);

    // sort q's by clue score
    Collections.sort(questions);

    int count = 0;
    for (MemoryQuestion mq : questions) {
      mq.setQuestionNum(count++);
    }

    // log the questions as well, just in case we don't get to the final point due to user fatigue
    // or crashes
    logStats("questions.final", false);
  }