/* Drops terms that contain other terms (prefixes, suffixes, etc.) and trims the questionlist to the target size.*/ private ArrayList<MemoryQuestion> dropPrefixSuffixfromSortedList( List<MemoryQuestion> questionlist, int targetSize) { ArrayList<MemoryQuestion> resultList = new ArrayList<MemoryQuestion>(); Set<Integer> badIndexSet = new LinkedHashSet<Integer>(); int terms = questionlist.size(); for (int first = 0; first < terms; first++) { MemoryQuestion mq1 = questionlist.get(first); String name1 = mq1.correctAnswer; if (badIndexSet.contains(first)) continue; resultList.add(mq1); if (resultList.size() >= targetSize) return resultList; for (int second = first + 1; second < terms; second++) { MemoryQuestion mq2 = questionlist.get(second); String name2 = mq2.correctAnswer; if (name1.contains(name2) || name2.contains(name1)) { badIndexSet.add(second); } } } return resultList; }
/** * lookup doesn't have to be synchronized * * @throws ClassNotFoundException * @throws GeneralSecurityException * @throws IOException */ public static synchronized UserStats lookup(String userCode) throws IOException, GeneralSecurityException, ClassNotFoundException { List<UserStats> users = readUsersFile(); if (codes == null) return null; for (int i = 0; i < codes.size(); i++) { if (codes.get(i).equals(userCode)) return users.get( i); // simple logic: user is providing code #1. therefore s/he must be user #i. what's // not to like?. } return null; }
/** Generates person names tests from the given archive. @throws IOException */ public void generatePersonNameQuestions( Archive archive, NERModel nerModel, Collection<EmailDocument> allDocs, Lexicon lex, int numClues) throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException, ParseException { this.archive = archive; questions = new ArrayList<>(); ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex); tabooCluesSet = new LinkedHashSet<>(); archive.assignThreadIds(); List<ClueEvaluator> evaluators = getDefaultEvals(); List<Document> docs = archive.getAllDocs(); Multimap<Contact, EmailDocument> contactToMessages = LinkedHashMultimap.create(); Multimap<Contact, Long> contactToThreadIds = LinkedHashMultimap.create(); // sort by date Collections.sort(docs); Date earliestDate = null, latestDate = null; Map<Contact, Date> contactToLatestDate = new LinkedHashMap<>(); // compute earliest and latest date across all messages in corpus for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date; if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date; } JSPHelper.log.info( "===================\nStarting to generate person names memory questions from " + docs.size() + " messages with " + numClues + " questions" + ", earliest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate) + " latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate)); Set<Integer> tabooSentenceHashes = new LinkedHashSet<>(); // create hashes of all sentences seen at least twice (case insensitive, lower cased) { Set<Integer> hashesSeen = new LinkedHashSet<>(); for (Document d : docs) { String contents = archive.getContents(d, true); String cleanedContents = EmailUtils.cleanupEmailMessage(contents); SentenceTokenizer st = new SentenceTokenizer(cleanedContents); while (st.hasMoreSentences()) { String sentence = st.nextSentence(); sentence = canonicalizeSentence(sentence); int hashCode = sentence.hashCode(); if (hashesSeen.contains(hashCode)) { tabooSentenceHashes.add(hashCode); log.info("Marking sentence as taboo: " + sentence); } else hashesSeen.add(hashCode); } } } // compute contactToLatestDate that contact has been seen on for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; // discard doc if it is not a sent mail if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue; for (Contact c : ed.getParticipatingContactsExceptOwn(archive.addressBook)) { Date currentLatestDate = contactToLatestDate.get(c); if (currentLatestDate == null || currentLatestDate.before(ed.date)) contactToLatestDate.put(c, ed.date); contactToMessages.put(c, ed); contactToThreadIds.put(c, ed.threadID); } } log.info("We are considering " + contactToLatestDate.size() + " contacts"); Date currentDate = new Date(); List<Pair<Date, Date>> intervals = computeDateIntervals(earliestDate, currentDate); // go back from current date // intervals[0] is the most recent. JSPHelper.log.info("done computing " + intervals.size() + " intervals"); for (Pair<Date, Date> p : intervals) JSPHelper.log.info( "Interval: " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst()) + " - " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond())); int cluesPerInterval = (numClues > 0 && intervals.size() > 0) ? (numClues + intervals.size() - 1) / intervals.size() : 0; JSPHelper.log.info( "Will try to generate " + Util.pluralize(cluesPerInterval, "questions") + " per interval"); Multimap<Integer, Contact> intervalToContacts = LinkedHashMultimap.create(); // nSent is the number of sentences allowed in a clue text int nSent = 2; for (Contact c : contactToLatestDate.keySet()) { Date lastSeenDate = contactToLatestDate.get(c); // which interval does this date belong to? we'll assign this contact in that interval in the // intervalToContacts map int interval = -1; Date intervalStart = null, intervalEnd = null; { int i = 0; for (Pair<Date, Date> p : intervals) { intervalStart = p.getFirst(); intervalEnd = p.getSecond(); if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate)) || intervalStart.equals(lastSeenDate) || intervalEnd.equals(lastSeenDate)) { interval = i; break; } i++; } } if (interval < 0 || interval == intervals.size()) { JSPHelper.log.info( "What, no interval!? for " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate)); continue; } intervalToContacts.put(interval, c); } log.info("Interval information (interval 0 is the most recent):"); for (int interval = 0; interval < intervals.size(); interval++) { Collection<Contact> contacts = intervalToContacts.get(interval); int nContactsForThisInterval = (contacts == null) ? 0 : contacts.size(); log.info( "In interval " + interval + " there are " + Util.pluralize(nContactsForThisInterval, "candidate contact") + " who were last seen in this interval"); } for (int interval = 0; interval < intervals.size(); interval++) { Date intervalStart = intervals.get(interval).getFirst(); Date intervalEnd = intervals.get(interval).getSecond(); Collection<Contact> candidateContactsForThisInterval = intervalToContacts.get(interval); if (candidateContactsForThisInterval == null) { log.info("Skipping interval " + interval + " because there are no contacts"); continue; } Map<Clue, Contact> clueToContact = new LinkedHashMap<>(); log.info("=======\nGenerating questions for interval " + interval); outer: for (Contact c : candidateContactsForThisInterval) { String name = c.pickBestName(); if (name.length() < 2) // could also check if alphanumberic only continue outer; // ignore contact if name does not contain all alphabets. Even a period is not allowed. only // space is allowed. for (char ch : name.toCharArray()) { if (!Character.isAlphabetic(ch) && !Character.isSpaceChar(ch)) continue outer; } Clue clue = cluer.createPersonNameClue( c, evaluators, nerModel, intervalStart, intervalEnd, nSent, archive, tabooSentenceHashes); if (clue != null) clueToContact.put(clue, c); } List<Clue> clueList = new ArrayList(clueToContact.keySet()); Collections.sort(clueList); List<Clue> selectedClues = new ArrayList<>(); for (int i = 0; i < cluesPerInterval && i < clueList.size(); i++) { selectedClues.add(clueList.get(i)); } log.info( "For interval " + interval + " selected " + selectedClues.size() + " contacts out of " + clueList.size() + " possible candidates."); // for (Clue c: clueList) // log.info ("Clue candidate for " + clueToContact.get(c).pickBestName() + " // score = " + c.clueStats.finalScore+ " clue is " + c ); // for (Clue c: selectedClues) // log.info ("Selected clue: " + clueToContact.get(c).pickBestName() + " score = " // + c.clueStats.finalScore+ " clue is " + c); for (Clue selectedClue : selectedClues) { Contact c = clueToContact.get(selectedClue); String name = c.pickBestName(); List<Integer> lengthList = Crossword.convertToWord(name).getSecond(); String lengthDescr = ""; if (lengthList.size() > 1) lengthDescr += Integer.toString(lengthList.size()) + " words: "; for (Integer i : lengthList) { lengthDescr += Util.pluralize(i, "letter") + ", "; } lengthDescr = lengthDescr.substring(0, lengthDescr.length() - 2); // subtract the extra comma. ClueInfo ci = new ClueInfo(); ci.lastSeenDate = contactToLatestDate.get(c); ci.nMessages = contactToThreadIds.get(c).size(); ci.nThreads = contactToThreadIds.get(c).size(); questions.add(new MemoryQuestion(this, name, selectedClue, 1, lengthDescr)); } } log.info(questions.size() + " questions generated"); log.info("Top candidates are:"); // sort q's by clue score Collections.sort(questions); // log.info("Based on clue score, top answers:"); // for (MemoryQuestion mq: questions) // log.info (mq.correctAnswer + " times= clue=" + mq.clue.clue); int count = 0; for (MemoryQuestion mq : questions) { mq.setQuestionNum(count++); } // log the questions as well, just in case we don't get to the final point due to user fatigue // or crashes logStats("questions.final", false); }