private void expandQueries() { captionToExpandedQuery .clear(); // clear the expanded query map first, we don't want any residue from the // previous state for (String caption : captionToRawQuery.keySet()) { String query = captionToRawQuery.get(caption); List<String> orTerms = Util.tokenize(query, "|"); String expandedQuery = ""; for (int i = 0; i < orTerms.size(); i++) { String t = orTerms.get(i).trim(); if (t.length() == 0) continue; if (t.startsWith("{") && t.endsWith("}")) { String c = t.substring(1, t.length() - 1); String exp = captionToExpandedQuery.get( c); // note: expanded map, not rawmap, to allow multi-level expansion if (exp == null) { t = captionToRawQuery.get(c); if (t == null) { log.warn("ERROR: no prev. caption: " + c + " in query " + query); continue; } } else t = exp; usedInOtherCaptions.add(c); } expandedQuery += t; // there is no point adding or(|), as the query is treated just as a text string and is // not handled specially in Indexer.lookupDocsAsId // however, adding a non-word, non-special character will enable tokenization at that // index and will be appended as many "or" terms. if (i < orTerms.size() - 1) expandedQuery += "|"; } if (caption.length() > 0 && expandedQuery.length() > 0) { // if caption already exists, just add to it String existingQuery = captionToExpandedQuery.get(caption); if (!Util.nullOrEmpty(existingQuery)) expandedQuery = existingQuery + "|" + expandedQuery; captionToExpandedQuery.put(caption, expandedQuery); } } // remove the non top-level captions for (String caption : usedInOtherCaptions) captionToExpandedQuery.remove(caption); }
/** * Generates list of questions and stores it in the current instance of MemoryStudy We handle two * kinds of questions namely, person names tests and non-person name tests. Non-person name test * is a fill in the blank kind where the blank is to be filled with the correct non-person entity * to complete the sentence person name test is to guess the person in correspondent list based on * some distinctive sentences in the mail * * @param maxInt - max. number of questions from a interval * @throws IOException */ public void generateQuestions( Archive archive, NERModel nerModel, Collection<EmailDocument> allDocs, Lexicon lex, int maxInt, boolean personTest) throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException, ParseException { this.archive = archive; if (allDocs == null) allDocs = (Collection) archive.getAllDocs(); questions = new ArrayList<>(); ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex); Short[] itypes = new Short[] { FeatureDictionary.BUILDING, FeatureDictionary.PLACE, FeatureDictionary.RIVER, FeatureDictionary.ROAD, FeatureDictionary.UNIVERSITY, FeatureDictionary.MOUNTAIN, FeatureDictionary.AIRPORT, FeatureDictionary.ISLAND, FeatureDictionary.MUSEUM, FeatureDictionary.BRIDGE, FeatureDictionary.AIRLINE, FeatureDictionary.THEATRE, FeatureDictionary.LIBRARY, FeatureDictionary.LAWFIRM, FeatureDictionary.GOVAGENCY }; double CUTOFF = 0.001; tabooCluesSet = new LinkedHashSet<>(); archive.assignThreadIds(); List<Document> docs = archive.getAllDocs(); Map<String, Date> entityToLastDate = new LinkedHashMap<>(); Multimap<String, EmailDocument> entityToMessages = LinkedHashMultimap.create(); Multimap<String, Long> entityToThreads = LinkedHashMultimap.create(); Multimap<String, String> ceToDisplayEntity = LinkedHashMultimap.create(); int di = 0; // sort by date Collections.sort(docs); Set<String> ownerNames = archive.ownerNames; Date earliestDate = null, latestDate = null; Set<String> allEntities = new LinkedHashSet<>(); for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date; if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date; List<String> entities = new ArrayList<>(); if (!personTest) { entities.addAll( Arrays.asList(archive.getAllNamesInDoc(doc, true)) .stream() .filter(n -> n.typeScore > CUTOFF) .map(n -> n.text) .collect(Collectors.toList())); } else { // do not consider mailing lists if (ed.sentToMailingLists != null && ed.sentToMailingLists.length > 0) continue; // discard doc if it is not a sent mail if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue; List<Address> addrs = new ArrayList<>(); if (ed.to != null) for (Address addr : ed.to) addrs.add(addr); List<String> names = new ArrayList<>(); for (Address addr : addrs) { Contact c = archive.addressBook.lookupByAddress(addr); names.add(c.pickBestName()); } for (String name : names) { if (!ownerNames.contains(name) && !DictUtils.hasDictionaryWord(name)) { entities.add(name); } } } allEntities.addAll(entities); // get entities for (String e : entities) { if (Util.nullOrEmpty(e)) continue; e = e.replaceAll("^\\W+|\\W+$", ""); if (e.length() > 10 && e.toUpperCase().equals(e)) continue; // all upper case, more than 10 letters, you're out. String ce = DictUtils.canonicalize(e); // canonicalize if (ce == null) { JSPHelper.log.info("Dropping entity: " + e); continue; } ceToDisplayEntity.put(ce, e); entityToLastDate.put(ce, ed.date); entityToMessages.put(ce, ed); entityToThreads.put(ce, ed.threadID); } if ((++di) % 1000 == 0) log.info(di + " of " + docs.size() + " messages processed...<br/>"); } log.info( "Considered #" + allEntities.size() + " unique entities and #" + ceToDisplayEntity.size() + " good ones in #" + docs.size() + " docs<br>"); log.info("Owner Names: " + ownerNames); JSPHelper.log.info( "Considered #" + allEntities.size() + " unique entities and #" + ceToDisplayEntity.size() + " good ones in #" + docs.size() + "docs"); JSPHelper.log.info( "earliest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate)); JSPHelper.log.info( "latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate)); Multimap<String, String> tokenToCE = LinkedHashMultimap.create(); for (String ce : ceToDisplayEntity.keySet()) { List<String> tokens = Util.tokenize(ce); for (String t : tokens) tokenToCE.put(t, ce); } // Compute date intervals int DAYS_PER_INTERVAL = 30; List<Pair<Date, Date>> intervals = new ArrayList<Pair<Date, Date>>(); { JSPHelper.log.info("computing time intervals"); Date closingDate = latestDate; JSPHelper.log.info( "closing = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(closingDate)); while (earliestDate.before(closingDate)) { Calendar cal = new GregorianCalendar(); cal.setTime(closingDate); // this is the time of the last sighting of the term // scroll to the beginning of this month cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); Date endDate = cal.getTime(); cal.add( Calendar.DATE, (1 - DAYS_PER_INTERVAL)); // 1- because we want from 0:00 of first date to 23:59 of // last date cal.set(Calendar.HOUR_OF_DAY, 0); cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); Date startDate = cal.getTime(); intervals.add(new Pair<Date, Date>(startDate, endDate)); // ok we got an interval // closing date for the next interval is 1 day before endDate cal.add(Calendar.DATE, -1); closingDate = cal.getTime(); } JSPHelper.log.info("done computing intervals, #time intervals: " + intervals.size()); for (Pair<Date, Date> p : intervals) JSPHelper.log.info( "Interval: " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst()) + " - " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond())); } // initialize clueInfos to empty lists List<ClueInfo> clueInfos[] = new ArrayList[intervals.size()]; for (int i = 0; i < intervals.size(); i++) { clueInfos[i] = new ArrayList<ClueInfo>(); } Map<Integer, Integer> intervalCount = new LinkedHashMap<>(); // nSent is the number of sentences allowed in a clue text int nvalidclues = 0, nSent = 2; // generate clueInfos for each entity for (String ce : entityToLastDate.keySet()) { Date lastSeenDate = entityToLastDate.get(ce); // compute displayEntity (which has red for core words) and fullAnswer, which is a simple // string String fullAnswer = ""; { List<String> tokens = Util.tokenize(ceToDisplayEntity.get(ce).iterator().next()); for (String t : tokens) { if (EnglishDictionary.stopWords.contains(t.toLowerCase())) continue; fullAnswer += t + " "; } fullAnswer = fullAnswer.trim(); } // dont want the answer to be scored low just because it has extra non-word chars in the begin // or end fullAnswer = fullAnswer.replaceAll("^\\W+|\\W+$", ""); // which interval does this date belong to? int interval = -1; Date intervalStart = null, intervalEnd = null; { int i = 0; for (Pair<Date, Date> p : intervals) { intervalStart = p.getFirst(); intervalEnd = p.getSecond(); if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate)) || intervalStart.equals(lastSeenDate) || intervalEnd.equals(lastSeenDate)) { interval = i; break; } i++; } } if (interval < 0 || interval == intervals.size()) JSPHelper.log.info( "What, no interval!? for " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate)); if (!intervalCount.containsKey(interval)) intervalCount.put(interval, 0); if (intervalCount.get(interval) > maxInt) continue; intervalCount.put(interval, intervalCount.get(interval) + 1); List<Integer> lengthList = Crossword.convertToWord(fullAnswer).getSecond(); String lengthDescr = ""; if (lengthList.size() > 1) lengthDescr += Integer.toString(lengthList.size()) + " words: "; for (Integer i : lengthList) { lengthDescr += Util.pluralize(i, "letter") + ", "; } lengthDescr = lengthDescr.substring(0, lengthDescr.length() - 2); // subtract the extra comma. ClueInfo ci = new ClueInfo(); ci.link = "../browse?term=\"" + fullAnswer + "\"&sort_by=recent&searchType=original"; ci.lastSeenDate = lastSeenDate; ci.nMessages = entityToMessages.get(ce).size(); ci.nThreads = entityToThreads.get(ce).size(); // TODO: we are doing default initialisation of evaluators by setting it to null below, it is // more appropriate to consider it as an argument for this method Clue clue = cluer.createClue( fullAnswer, (personTest ? ArchiveCluer.QuestionType.GUESS_CORRESPONDENT : ArchiveCluer.QuestionType.FILL_IN_THE_BLANK), null, tabooCluesSet, null, intervalStart, intervalEnd, nSent, archive); if (clue != null) ci.clues = new Clue[] {clue}; if (ci.clues == null || ci.clues.length == 0 || clue == null) { JSPHelper.log.warn("Did not find any clue for: " + fullAnswer); } else { // is the times value of the clue important? questions.add(new MemoryQuestion(this, fullAnswer, clue, 1, lengthDescr)); nvalidclues++; // makes sure that the clue with the same statement is not generated again tabooCluesSet.add(clue.clue); } clueInfos[interval].add(ci); } log.info("Found valid clues for " + nvalidclues + " answers"); JSPHelper.log.info("Found valid clues for " + nvalidclues + " answers"); log.info("Top candidates are:"); for (MemoryQuestion mq : questions) log.info(mq.correctAnswer + " times=" + mq.stats.nMessagesWithAnswer); // sort q's by clue score Collections.sort(questions); // log.info("Based on clue score, top answers:"); // for (MemoryQuestion mq: questions) // log.info (mq.correctAnswer + " times= clue=" + mq.clue.clue); // now we have up to 2*N questions, sorted by cluescore. // drop ones that are prefix/suffix of another, and cap to N int prev_size = questions.size(); int new_size = questions.size(); // log.info ("#questions before prefix-suffix elim: " + prev_size + " after: " + new_size); int count = 0; for (MemoryQuestion mq : questions) { mq.setQuestionNum(count++); } // log the questions as well, just in case we don't get to the final point due to user fatigue // or crashes logStats("questions.final", false); }