public static String getProtovisForDates( String[] chartSpec, List<Date> outDates, List<Date> inDates, List<Date> intervals, int normalizeCount, int width, int height, boolean showTotals, boolean inNOut, String browseParams) { int[] inGram = CalendarUtil.computeHistogram(inDates, intervals); // double[] normalizedInGram = Util.normalizeHistogramToBase(inGram, normalizeCount); int[] outGram = CalendarUtil.computeHistogram(outDates, intervals); // double[] normalizedOutGram = Util.normalizeHistogramToBase(outGram, normalizeCount); return getProtoVizBox( chartSpec, outDates.size(), inDates.size(), outGram, inGram, normalizeCount, width, height, showTotals, inNOut, intervals.get(0), intervals.get(intervals.size() - 1), false /*focusOnly*/, browseParams); }
/**
 * Returns the largest single-interval count found in either the "in" or the "out" histogram.
 *
 * @param inDates dates bucketed into the first histogram
 * @param outDates dates bucketed into the second histogram
 * @param intervals interval boundaries passed to {@code CalendarUtil.computeHistogram}
 * @return the maximum bucket value over both histograms, or {@code Integer.MIN_VALUE} when both
 *     histograms are empty
 */
public static int findMaxInOrOutInAnInterval(
    List<Date> inDates, List<Date> outDates, List<Date> intervals) {
  int largest = Integer.MIN_VALUE;

  int[] incomingCounts = CalendarUtil.computeHistogram(inDates, intervals);
  for (int count : incomingCounts) {
    largest = Math.max(largest, count);
  }

  int[] outgoingCounts = CalendarUtil.computeHistogram(outDates, intervals);
  for (int count : outgoingCounts) {
    largest = Math.max(largest, count);
  }

  return largest;
}
/**
 * Returns the largest bucket of the histogram of {@code dates} over {@code intervals}.
 *
 * @param dates dates to bucket; may be {@code null}
 * @param intervals interval boundaries passed to {@code CalendarUtil.computeHistogram}
 * @return the maximum bucket value, or {@code Integer.MIN_VALUE} when {@code dates} is {@code
 *     null} or the histogram has no buckets
 */
private static int findNormalizingMax(List<Date> dates, List<Date> intervals) {
  int peak = Integer.MIN_VALUE;
  if (dates != null) {
    int[] counts = CalendarUtil.computeHistogram(dates, intervals);
    for (int i = 0; i < counts.length; i++) {
      peak = (counts[i] > peak) ? counts[i] : peak;
    }
  }
  return peak;
}
/**
 * Renders this entry as HTML table rows, two rows per element of {@code clues}.
 *
 * <p>The first row holds the linked display entity, the formatted last-seen date, the message and
 * thread counts and the clue's final score ("-" for a null clue). The second row holds the clue
 * text followed by its stats, or "No clue" for a null clue.
 *
 * @return the concatenated rows; an empty string when {@code clues} is {@code null} or empty
 */
public String toHTMLString() {
  // clues stays null when no clue could be generated for the entry; render nothing instead of
  // failing with a NullPointerException.
  if (clues == null) {
    return "";
  }

  StringBuilder html = new StringBuilder();
  for (Clue clue : clues) {
    html.append("<tr><td><a href='")
        .append(link)
        .append("' target='_blank'>")
        .append(displayEntity)
        .append("</a></td><td>")
        .append(edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate))
        .append("</td><td>")
        .append(nMessages)
        .append("</td><td>")
        .append(nThreads)
        .append("</td><td>");
    if (clue != null) {
      html.append(clue.clueStats.finalScore);
    } else {
      html.append("-");
    }
    html.append("</td></tr>").append("<tr><td class=\"clue\" colspan=\"6\">");

    if (clue != null) {
      html.append(clue.clue)
          .append("<br/><br/><div class=\"stats\"> stats: ")
          .append(Util.fieldsToString(clue.clueStats, false))
          .append("</div>");
    } else {
      // no <div> was opened in this branch, so none is closed
      html.append("No clue");
    }
    html.append("<br/><br/></td></tr><br>");
  }
  return html.toString();
}
// Compute date intervals, working backwards from latestDate, until earliestDate is covered // most recent interval is interval 0. private static List<Pair<Date, Date>> computeDateIntervals(Date earliestDate, Date latestDate) { int DAYS_PER_INTERVAL = 30; List<Pair<Date, Date>> intervals = new ArrayList<Pair<Date, Date>>(); { JSPHelper.log.info("computing time intervals"); Date closingDate = latestDate; JSPHelper.log.info( "closing = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(closingDate)); if (earliestDate == null || closingDate == null) return intervals; while (earliestDate.before(closingDate)) { Calendar cal = new GregorianCalendar(); cal.setTime(closingDate); // scroll to the beginning of this month cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); Date endDate = cal.getTime(); // scroll back by DAYS_PER_INTERVAL days cal.add( Calendar.DATE, (1 - DAYS_PER_INTERVAL)); // 1- because we want from 0:00 of first date to 23:59 of // last date cal.set(Calendar.HOUR_OF_DAY, 0); cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); Date startDate = cal.getTime(); intervals.add(new Pair<Date, Date>(startDate, endDate)); // ok we got an interval // closing date for the next interval is 1 day before endDate cal.add(Calendar.DATE, -1); closingDate = cal.getTime(); } } return intervals; }
/** * generates protovis string for group activity (in/out) chart + all names in the group. * normalized across all groups. optionally group members names are included. * * @return */ public static List<String> getProtovisForGroups( AddressBook addressBook, List<SimilarGroup<String>> groups, Collection<EmailDocument> allDocs, int nIntervals, int width, int height, boolean generateNames) { // compute in/out dates for each group List<Date>[] inDates = new ArrayList[groups.size()]; List<Date>[] outDates = new ArrayList[groups.size()]; for (int i = 0; i < groups.size(); i++) { inDates[i] = new ArrayList<Date>(); outDates[i] = new ArrayList<Date>(); } for (EmailDocument ed : allDocs) { List<String> rawEmailAddrs = ed.getParticipatingAddrsExcept(addressBook.getOwnAddrs()); List<String> canonicalEmailAddrs = addressBook.convertToCanonicalAddrs(rawEmailAddrs); Collections.sort(canonicalEmailAddrs); Group<String> emailGroup = new Group<String>(canonicalEmailAddrs); int x = Group.bestFit(groups, emailGroup); if (x != -1) { int sentOrReceived = ed.sentOrReceived(addressBook); if ((sentOrReceived & EmailDocument.RECEIVED_MASK) != 0) inDates[x].add(ed.date); if ((sentOrReceived & EmailDocument.SENT_MASK) != 0) outDates[x].add(ed.date); } } // find normalizing max int max = Integer.MIN_VALUE; Pair<Date, Date> p = EmailUtils.getFirstLast(allDocs); Date globalStart = p.getFirst(); Date globalEnd = p.getSecond(); List<Date> intervals = CalendarUtil.divideIntoIntervals(globalStart, globalEnd, nIntervals); for (int i = 0; i < groups.size(); i++) { int x = normalizingMax(inDates[i], outDates[i], intervals, /* inNOut */ true); if (x >= max) max = x; } // generate protovis List<String> result = new ArrayList<String>(); for (int i = 0; i < groups.size(); i++) { int[] inGram = CalendarUtil.computeHistogram(inDates[i], intervals); // double[] normalizedInGram = Util.normalizeHistogramToBase(inGram, max); int[] outGram = CalendarUtil.computeHistogram(outDates[i], intervals); // double[] 
normalizedOutGram = Util.normalizeHistogramToBase(outGram, max); String url = JSPHelper.getURLForGroupMessages(i); StringBuilder sb = new StringBuilder(); sb.append( getProtoVizBox( null, outDates[i].size(), inDates[i].size(), outGram, inGram, max, width, height, true, true, intervals.get(0), intervals.get(intervals.size() - 1), true /*focusOnly*/, "'" + url + "'")); // add names to the mark if needed if (generateNames) { sb.append("<br/><span style=\"font-size:small\">"); for (String str : groups.get(i).elements) sb.append(Util.strippedEmailAddress(str) + "<br/>"); sb.append( "<a href=\"" + url + "\" target=\"_new\"><img title=\"Messages\" src=\"/muse/images/email.jpg\" width=\"25\"/>" + "</a>"); sb.append("</span>"); } result.add(sb.toString()); } return result; }
/** * gets protoviz javascript mark to plot normalized frequencies for a contact: out and in. all * out's and in's should be between 0 and 1 height of bar is sqrt of comm. volume chartSpec can be * null in which case it will draw the chart at the current location. if chartSpec is specified, * it will draw the chart inside the specified div "chartCanvas" with date slider for zooming * where the slider is specified by dateSlider and dateSliderText (see filter.jsp / * filter_common.html for detail on make_date_slider()). chartSpec[0..2] = chartCanvas, * dateSliderBar, dateSliderText. firstDate is inclusive, lastDate is exclusive. */ public static String getProtoVizMark( String[] chartSpec, int[] out, int[] in, int normalizer, int width, int height, boolean inNOut, Date firstDate, Date lastDate, boolean focusOnly, String browseParams) { // example outcome // // <script type="text/javascript+protovis"> // var w = 100, h = 40; // new pv.Panel().width(w).height(h) // .add(pv.Rule).bottom(h/2).lineWidth(2).left(200).right(200) // .add(pv.Bar).data([0.1,0.2,0.3,0.4,0.5,1.0]).width(4) // .left(function() 5 * this.index) // .height(function(d) Math.round(h/2 * d)) // .bottom(h/2) // .add(pv.Bar).data([0.3,0.4,0.5,1.0]).width(4) // .left(function() 5 * this.index) // .height(function(d) Math.round(h/2*d)) // .bottom(function(d) h/2 - Math.round(h/2 * d)) // .root.render(); // // </script> if (Util.nullOrEmpty(browseParams)) browseParams = "''"; // add padding to make first and last data always 0 to nicely contain/bound the area chart StringBuilder outgoingData = new StringBuilder("[0"); // bar for incoming counts for (int x = 0; x < out.length; x++) // outgoingData.append ((x==0 ? "" : ",") + out[x]); outgoingData.append("," + out[x]); outgoingData.append(",0]"); StringBuilder incomingData = null; if (inNOut) { incomingData = new StringBuilder("[0"); for (int x = 0; x < in.length; x++) // incomingData.append ((x==0 ? 
"" : ",") + in[x]); incomingData.append("," + in[x]); incomingData.append(",0]"); } // Date.getYear() is deprecated. Calendar cFirst = new GregorianCalendar(); cFirst.setTime(firstDate); // inclusive Calendar cLast = new GregorianCalendar(); cLast.setTime(lastDate); // exclusive int cFirst_year = cFirst.get(Calendar.YEAR); int cFirst_month = cFirst.get(Calendar.MONTH) - 1; // to represent the left dummy pad since cFirst is inclusive int nMonths = CalendarUtil.getDiffInMonths(firstDate, lastDate); Util.softAssert(nMonths == in.length); StringBuilder result = new StringBuilder(""); String chartDivId = null; if (chartSpec != null) { Util.ASSERT(chartSpec[0].startsWith("#")); chartDivId = chartSpec[0].substring(chartSpec[0].startsWith("#") ? 1 : 0); // the canvas div must appear before the protovis invocation // if (focusOnly) result.append("<a href='#?custom=true&id=" + chartDivId + "' // rel='subchart'>"); // should probably inject 'subchart' tag here but we don't have info for // 'title' so we inject at an upper layer instead. result.append("<div id='" + chartDivId + "'></div>"); // if (focusOnly) result.append("</a>"); } result.append("<script type=\"text/javascript\">"); if (chartSpec == null) { result.append( "draw_protovis_box(" + incomingData + ", " + outgoingData + ", " + width + ", " + height + ", " + normalizer + ", " + cFirst_year + "," + cFirst_month + ", " + cLast.get(Calendar.YEAR) + "," + cLast.get(Calendar.MONTH) + ");"); } else { result.append( "draw_chart('" + chartSpec[0] + "'" + ", data_bottom['" + chartDivId + "']=" + incomingData + ", data_top['" + chartDivId + "']=" + outgoingData + ", " + normalizer + ", " + cFirst_year + "," + cFirst_month + ", " + width + ", " + height + ", " + focusOnly + ", browse_params['" + chartDivId + "']=" + browseParams + ")"); // int lastMonth_inclusive = nMonths - 1; // slider will be created with range [0, // lastMonth_inclusive] inclusive on both ends for the total of nMonths. 
// result.append("make_date_slider('" + chartSpec[1] + "','" + chartSpec[2] + "'"); // result.append( "," + lastMonth_inclusive + "," + cFirst_year + "," + // cFirst_month); // global range // result.append( "," + lastMonth_inclusive + "," + cFirst_year + "," + // cFirst_month); // current range // result.append( ", get_date_change_func('" + chartSpec[0] + "'" // + ", " + incomingData + ", " + outgoingData // + ", " + width + ", " + height // + ", " + cFirst_year + "," + cFirst_month // + ")"); // result.append( ");"); } result.append("</script>"); return result.toString(); }
/** Generates person names tests from the given archive. @throws IOException */ public void generatePersonNameQuestions( Archive archive, NERModel nerModel, Collection<EmailDocument> allDocs, Lexicon lex, int numClues) throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException, ParseException { this.archive = archive; questions = new ArrayList<>(); ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex); tabooCluesSet = new LinkedHashSet<>(); archive.assignThreadIds(); List<ClueEvaluator> evaluators = getDefaultEvals(); List<Document> docs = archive.getAllDocs(); Multimap<Contact, EmailDocument> contactToMessages = LinkedHashMultimap.create(); Multimap<Contact, Long> contactToThreadIds = LinkedHashMultimap.create(); // sort by date Collections.sort(docs); Date earliestDate = null, latestDate = null; Map<Contact, Date> contactToLatestDate = new LinkedHashMap<>(); // compute earliest and latest date across all messages in corpus for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date; if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date; } JSPHelper.log.info( "===================\nStarting to generate person names memory questions from " + docs.size() + " messages with " + numClues + " questions" + ", earliest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate) + " latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate)); Set<Integer> tabooSentenceHashes = new LinkedHashSet<>(); // create hashes of all sentences seen at least twice (case insensitive, lower cased) { Set<Integer> hashesSeen = new LinkedHashSet<>(); for (Document d : docs) { String contents = archive.getContents(d, true); String cleanedContents = EmailUtils.cleanupEmailMessage(contents); SentenceTokenizer st = new SentenceTokenizer(cleanedContents); while (st.hasMoreSentences()) 
{ String sentence = st.nextSentence(); sentence = canonicalizeSentence(sentence); int hashCode = sentence.hashCode(); if (hashesSeen.contains(hashCode)) { tabooSentenceHashes.add(hashCode); log.info("Marking sentence as taboo: " + sentence); } else hashesSeen.add(hashCode); } } } // compute contactToLatestDate that contact has been seen on for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; // discard doc if it is not a sent mail if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue; for (Contact c : ed.getParticipatingContactsExceptOwn(archive.addressBook)) { Date currentLatestDate = contactToLatestDate.get(c); if (currentLatestDate == null || currentLatestDate.before(ed.date)) contactToLatestDate.put(c, ed.date); contactToMessages.put(c, ed); contactToThreadIds.put(c, ed.threadID); } } log.info("We are considering " + contactToLatestDate.size() + " contacts"); Date currentDate = new Date(); List<Pair<Date, Date>> intervals = computeDateIntervals(earliestDate, currentDate); // go back from current date // intervals[0] is the most recent. JSPHelper.log.info("done computing " + intervals.size() + " intervals"); for (Pair<Date, Date> p : intervals) JSPHelper.log.info( "Interval: " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst()) + " - " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond())); int cluesPerInterval = (numClues > 0 && intervals.size() > 0) ? (numClues + intervals.size() - 1) / intervals.size() : 0; JSPHelper.log.info( "Will try to generate " + Util.pluralize(cluesPerInterval, "questions") + " per interval"); Multimap<Integer, Contact> intervalToContacts = LinkedHashMultimap.create(); // nSent is the number of sentences allowed in a clue text int nSent = 2; for (Contact c : contactToLatestDate.keySet()) { Date lastSeenDate = contactToLatestDate.get(c); // which interval does this date belong to? 
we'll assign this contact in that interval in the // intervalToContacts map int interval = -1; Date intervalStart = null, intervalEnd = null; { int i = 0; for (Pair<Date, Date> p : intervals) { intervalStart = p.getFirst(); intervalEnd = p.getSecond(); if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate)) || intervalStart.equals(lastSeenDate) || intervalEnd.equals(lastSeenDate)) { interval = i; break; } i++; } } if (interval < 0 || interval == intervals.size()) { JSPHelper.log.info( "What, no interval!? for " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate)); continue; } intervalToContacts.put(interval, c); } log.info("Interval information (interval 0 is the most recent):"); for (int interval = 0; interval < intervals.size(); interval++) { Collection<Contact> contacts = intervalToContacts.get(interval); int nContactsForThisInterval = (contacts == null) ? 0 : contacts.size(); log.info( "In interval " + interval + " there are " + Util.pluralize(nContactsForThisInterval, "candidate contact") + " who were last seen in this interval"); } for (int interval = 0; interval < intervals.size(); interval++) { Date intervalStart = intervals.get(interval).getFirst(); Date intervalEnd = intervals.get(interval).getSecond(); Collection<Contact> candidateContactsForThisInterval = intervalToContacts.get(interval); if (candidateContactsForThisInterval == null) { log.info("Skipping interval " + interval + " because there are no contacts"); continue; } Map<Clue, Contact> clueToContact = new LinkedHashMap<>(); log.info("=======\nGenerating questions for interval " + interval); outer: for (Contact c : candidateContactsForThisInterval) { String name = c.pickBestName(); if (name.length() < 2) // could also check if alphanumberic only continue outer; // ignore contact if name does not contain all alphabets. Even a period is not allowed. only // space is allowed. 
for (char ch : name.toCharArray()) { if (!Character.isAlphabetic(ch) && !Character.isSpaceChar(ch)) continue outer; } Clue clue = cluer.createPersonNameClue( c, evaluators, nerModel, intervalStart, intervalEnd, nSent, archive, tabooSentenceHashes); if (clue != null) clueToContact.put(clue, c); } List<Clue> clueList = new ArrayList(clueToContact.keySet()); Collections.sort(clueList); List<Clue> selectedClues = new ArrayList<>(); for (int i = 0; i < cluesPerInterval && i < clueList.size(); i++) { selectedClues.add(clueList.get(i)); } log.info( "For interval " + interval + " selected " + selectedClues.size() + " contacts out of " + clueList.size() + " possible candidates."); // for (Clue c: clueList) // log.info ("Clue candidate for " + clueToContact.get(c).pickBestName() + " // score = " + c.clueStats.finalScore+ " clue is " + c ); // for (Clue c: selectedClues) // log.info ("Selected clue: " + clueToContact.get(c).pickBestName() + " score = " // + c.clueStats.finalScore+ " clue is " + c); for (Clue selectedClue : selectedClues) { Contact c = clueToContact.get(selectedClue); String name = c.pickBestName(); List<Integer> lengthList = Crossword.convertToWord(name).getSecond(); String lengthDescr = ""; if (lengthList.size() > 1) lengthDescr += Integer.toString(lengthList.size()) + " words: "; for (Integer i : lengthList) { lengthDescr += Util.pluralize(i, "letter") + ", "; } lengthDescr = lengthDescr.substring(0, lengthDescr.length() - 2); // subtract the extra comma. 
ClueInfo ci = new ClueInfo(); ci.lastSeenDate = contactToLatestDate.get(c); ci.nMessages = contactToThreadIds.get(c).size(); ci.nThreads = contactToThreadIds.get(c).size(); questions.add(new MemoryQuestion(this, name, selectedClue, 1, lengthDescr)); } } log.info(questions.size() + " questions generated"); log.info("Top candidates are:"); // sort q's by clue score Collections.sort(questions); // log.info("Based on clue score, top answers:"); // for (MemoryQuestion mq: questions) // log.info (mq.correctAnswer + " times= clue=" + mq.clue.clue); int count = 0; for (MemoryQuestion mq : questions) { mq.setQuestionNum(count++); } // log the questions as well, just in case we don't get to the final point due to user fatigue // or crashes logStats("questions.final", false); }
/** * Generates list of questions and stores it in the current instance of MemoryStudy We handle two * kinds of questions namely, person names tests and non-person name tests. Non-person name test * is a fill in the blank kind where the blank is to be filled with the correct non-person entity * to complete the sentence person name test is to guess the person in correspondent list based on * some distinctive sentences in the mail * * @param maxInt - max. number of questions from a interval * @throws IOException */ public void generateQuestions( Archive archive, NERModel nerModel, Collection<EmailDocument> allDocs, Lexicon lex, int maxInt, boolean personTest) throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException, ParseException { this.archive = archive; if (allDocs == null) allDocs = (Collection) archive.getAllDocs(); questions = new ArrayList<>(); ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex); Short[] itypes = new Short[] { FeatureDictionary.BUILDING, FeatureDictionary.PLACE, FeatureDictionary.RIVER, FeatureDictionary.ROAD, FeatureDictionary.UNIVERSITY, FeatureDictionary.MOUNTAIN, FeatureDictionary.AIRPORT, FeatureDictionary.ISLAND, FeatureDictionary.MUSEUM, FeatureDictionary.BRIDGE, FeatureDictionary.AIRLINE, FeatureDictionary.THEATRE, FeatureDictionary.LIBRARY, FeatureDictionary.LAWFIRM, FeatureDictionary.GOVAGENCY }; double CUTOFF = 0.001; tabooCluesSet = new LinkedHashSet<>(); archive.assignThreadIds(); List<Document> docs = archive.getAllDocs(); Map<String, Date> entityToLastDate = new LinkedHashMap<>(); Multimap<String, EmailDocument> entityToMessages = LinkedHashMultimap.create(); Multimap<String, Long> entityToThreads = LinkedHashMultimap.create(); Multimap<String, String> ceToDisplayEntity = LinkedHashMultimap.create(); int di = 0; // sort by date Collections.sort(docs); Set<String> ownerNames = archive.ownerNames; Date earliestDate = null, latestDate = null; Set<String> allEntities = new 
LinkedHashSet<>(); for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date; if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date; List<String> entities = new ArrayList<>(); if (!personTest) { entities.addAll( Arrays.asList(archive.getAllNamesInDoc(doc, true)) .stream() .filter(n -> n.typeScore > CUTOFF) .map(n -> n.text) .collect(Collectors.toList())); } else { // do not consider mailing lists if (ed.sentToMailingLists != null && ed.sentToMailingLists.length > 0) continue; // discard doc if it is not a sent mail if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue; List<Address> addrs = new ArrayList<>(); if (ed.to != null) for (Address addr : ed.to) addrs.add(addr); List<String> names = new ArrayList<>(); for (Address addr : addrs) { Contact c = archive.addressBook.lookupByAddress(addr); names.add(c.pickBestName()); } for (String name : names) { if (!ownerNames.contains(name) && !DictUtils.hasDictionaryWord(name)) { entities.add(name); } } } allEntities.addAll(entities); // get entities for (String e : entities) { if (Util.nullOrEmpty(e)) continue; e = e.replaceAll("^\\W+|\\W+$", ""); if (e.length() > 10 && e.toUpperCase().equals(e)) continue; // all upper case, more than 10 letters, you're out. 
String ce = DictUtils.canonicalize(e); // canonicalize if (ce == null) { JSPHelper.log.info("Dropping entity: " + e); continue; } ceToDisplayEntity.put(ce, e); entityToLastDate.put(ce, ed.date); entityToMessages.put(ce, ed); entityToThreads.put(ce, ed.threadID); } if ((++di) % 1000 == 0) log.info(di + " of " + docs.size() + " messages processed...<br/>"); } log.info( "Considered #" + allEntities.size() + " unique entities and #" + ceToDisplayEntity.size() + " good ones in #" + docs.size() + " docs<br>"); log.info("Owner Names: " + ownerNames); JSPHelper.log.info( "Considered #" + allEntities.size() + " unique entities and #" + ceToDisplayEntity.size() + " good ones in #" + docs.size() + "docs"); JSPHelper.log.info( "earliest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate)); JSPHelper.log.info( "latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate)); Multimap<String, String> tokenToCE = LinkedHashMultimap.create(); for (String ce : ceToDisplayEntity.keySet()) { List<String> tokens = Util.tokenize(ce); for (String t : tokens) tokenToCE.put(t, ce); } // Compute date intervals int DAYS_PER_INTERVAL = 30; List<Pair<Date, Date>> intervals = new ArrayList<Pair<Date, Date>>(); { JSPHelper.log.info("computing time intervals"); Date closingDate = latestDate; JSPHelper.log.info( "closing = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(closingDate)); while (earliestDate.before(closingDate)) { Calendar cal = new GregorianCalendar(); cal.setTime(closingDate); // this is the time of the last sighting of the term // scroll to the beginning of this month cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); Date endDate = cal.getTime(); cal.add( Calendar.DATE, (1 - DAYS_PER_INTERVAL)); // 1- because we want from 0:00 of first date to 23:59 of // last date cal.set(Calendar.HOUR_OF_DAY, 0); cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); Date 
startDate = cal.getTime(); intervals.add(new Pair<Date, Date>(startDate, endDate)); // ok we got an interval // closing date for the next interval is 1 day before endDate cal.add(Calendar.DATE, -1); closingDate = cal.getTime(); } JSPHelper.log.info("done computing intervals, #time intervals: " + intervals.size()); for (Pair<Date, Date> p : intervals) JSPHelper.log.info( "Interval: " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst()) + " - " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond())); } // initialize clueInfos to empty lists List<ClueInfo> clueInfos[] = new ArrayList[intervals.size()]; for (int i = 0; i < intervals.size(); i++) { clueInfos[i] = new ArrayList<ClueInfo>(); } Map<Integer, Integer> intervalCount = new LinkedHashMap<>(); // nSent is the number of sentences allowed in a clue text int nvalidclues = 0, nSent = 2; // generate clueInfos for each entity for (String ce : entityToLastDate.keySet()) { Date lastSeenDate = entityToLastDate.get(ce); // compute displayEntity (which has red for core words) and fullAnswer, which is a simple // string String fullAnswer = ""; { List<String> tokens = Util.tokenize(ceToDisplayEntity.get(ce).iterator().next()); for (String t : tokens) { if (EnglishDictionary.stopWords.contains(t.toLowerCase())) continue; fullAnswer += t + " "; } fullAnswer = fullAnswer.trim(); } // dont want the answer to be scored low just because it has extra non-word chars in the begin // or end fullAnswer = fullAnswer.replaceAll("^\\W+|\\W+$", ""); // which interval does this date belong to? 
int interval = -1; Date intervalStart = null, intervalEnd = null; { int i = 0; for (Pair<Date, Date> p : intervals) { intervalStart = p.getFirst(); intervalEnd = p.getSecond(); if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate)) || intervalStart.equals(lastSeenDate) || intervalEnd.equals(lastSeenDate)) { interval = i; break; } i++; } } if (interval < 0 || interval == intervals.size()) JSPHelper.log.info( "What, no interval!? for " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate)); if (!intervalCount.containsKey(interval)) intervalCount.put(interval, 0); if (intervalCount.get(interval) > maxInt) continue; intervalCount.put(interval, intervalCount.get(interval) + 1); List<Integer> lengthList = Crossword.convertToWord(fullAnswer).getSecond(); String lengthDescr = ""; if (lengthList.size() > 1) lengthDescr += Integer.toString(lengthList.size()) + " words: "; for (Integer i : lengthList) { lengthDescr += Util.pluralize(i, "letter") + ", "; } lengthDescr = lengthDescr.substring(0, lengthDescr.length() - 2); // subtract the extra comma. ClueInfo ci = new ClueInfo(); ci.link = "../browse?term=\"" + fullAnswer + "\"&sort_by=recent&searchType=original"; ci.lastSeenDate = lastSeenDate; ci.nMessages = entityToMessages.get(ce).size(); ci.nThreads = entityToThreads.get(ce).size(); // TODO: we are doing default initialisation of evaluators by setting it to null below, it is // more appropriate to consider it as an argument for this method Clue clue = cluer.createClue( fullAnswer, (personTest ? ArchiveCluer.QuestionType.GUESS_CORRESPONDENT : ArchiveCluer.QuestionType.FILL_IN_THE_BLANK), null, tabooCluesSet, null, intervalStart, intervalEnd, nSent, archive); if (clue != null) ci.clues = new Clue[] {clue}; if (ci.clues == null || ci.clues.length == 0 || clue == null) { JSPHelper.log.warn("Did not find any clue for: " + fullAnswer); } else { // is the times value of the clue important? 
questions.add(new MemoryQuestion(this, fullAnswer, clue, 1, lengthDescr)); nvalidclues++; // makes sure that the clue with the same statement is not generated again tabooCluesSet.add(clue.clue); } clueInfos[interval].add(ci); } log.info("Found valid clues for " + nvalidclues + " answers"); JSPHelper.log.info("Found valid clues for " + nvalidclues + " answers"); log.info("Top candidates are:"); for (MemoryQuestion mq : questions) log.info(mq.correctAnswer + " times=" + mq.stats.nMessagesWithAnswer); // sort q's by clue score Collections.sort(questions); // log.info("Based on clue score, top answers:"); // for (MemoryQuestion mq: questions) // log.info (mq.correctAnswer + " times= clue=" + mq.clue.clue); // now we have up to 2*N questions, sorted by cluescore. // drop ones that are prefix/suffix of another, and cap to N int prev_size = questions.size(); int new_size = questions.size(); // log.info ("#questions before prefix-suffix elim: " + prev_size + " after: " + new_size); int count = 0; for (MemoryQuestion mq : questions) { mq.setQuestionNum(count++); } // log the questions as well, just in case we don't get to the final point due to user fatigue // or crashes logStats("questions.final", false); }