示例#1
0
文件: Lexicon.java 项目: ePADD/muse
    public Lexicon1Lang(String filename) throws IOException {
      captionToRawQuery = new LinkedHashMap<String, String>();
      captionToExpandedQuery = new LinkedHashMap<String, String>();
      List<String> lines =
          Util.getLinesFromInputStream(
              new FileInputStream(filename),
              false /* ignore comment lines = false, we'll strip comments here */);
      for (String line : lines) {
        int idx = line.indexOf('#'); // strip everything after the comment char
        if (idx >= 0) line = line.substring(0, idx);
        line = line.trim();
        if (line.length() == 0) continue; // ignore blank lines
        StringTokenizer st = new StringTokenizer(line, ":");
        if (st.countTokens() != 2) {
          log.warn("line ignored: " + line);
          continue;
        }

        String caption = st.nextToken().trim();
        String query = st.nextToken().trim();
        String existingQuery = captionToRawQuery.get(caption);
        if (!Util.nullOrEmpty(existingQuery)) query = existingQuery + "|" + query;
        captionToRawQuery.put(caption, query);
      }
      expandQueries();
    }
示例#2
0
文件: Lexicon.java 项目: ePADD/muse
    private void expandQueries() {
      captionToExpandedQuery
          .clear(); // clear the expanded query map first, we don't want any residue from the
      // previous state
      for (String caption : captionToRawQuery.keySet()) {
        String query = captionToRawQuery.get(caption);
        List<String> orTerms = Util.tokenize(query, "|");
        String expandedQuery = "";
        for (int i = 0; i < orTerms.size(); i++) {
          String t = orTerms.get(i).trim();
          if (t.length() == 0) continue;
          if (t.startsWith("{") && t.endsWith("}")) {
            String c = t.substring(1, t.length() - 1);
            String exp =
                captionToExpandedQuery.get(
                    c); // note: expanded map, not rawmap, to allow multi-level expansion
            if (exp == null) {
              t = captionToRawQuery.get(c);
              if (t == null) {
                log.warn("ERROR: no prev. caption: " + c + " in query " + query);
                continue;
              }
            } else t = exp;

            usedInOtherCaptions.add(c);
          }
          expandedQuery += t;
          // there is no point adding or(|), as the query is treated just as a text string and is
          // not handled specially in Indexer.lookupDocsAsId
          // however, adding a non-word, non-special character will enable tokenization at that
          // index and will be appended as many "or" terms.
          if (i < orTerms.size() - 1) expandedQuery += "|";
        }

        if (caption.length() > 0 && expandedQuery.length() > 0) {
          // if caption already exists, just add to it
          String existingQuery = captionToExpandedQuery.get(caption);
          if (!Util.nullOrEmpty(existingQuery)) expandedQuery = existingQuery + "|" + expandedQuery;
          captionToExpandedQuery.put(caption, expandedQuery);
        }
      }

      // remove the non top-level captions
      for (String caption : usedInOtherCaptions) captionToExpandedQuery.remove(caption);
    }
示例#3
0
文件: Lexicon.java 项目: ePADD/muse
    /**
     * main entry point: returns a category -> docs map for each (non-zero) category in the current
     * captionToQueryMap.
     *
     * @indexer must already have run
     * @docs results are restrictes to these docs. assumes all docs if docs is null or empty.
     * @captions (null/none = all)
     *     <p>vihari This is a weird name for a method that returns documents with emotions instead
     *     of emotions.
     */
    public Map<String, Collection<Document>> getEmotions(
        Indexer indexer,
        Collection<Document> docs,
        boolean originalContentOnly,
        String... captions) {
      Map<String, Collection<Document>> result = new LinkedHashMap<String, Collection<Document>>();
      Set<Document> docs_set = Util.castOrCloneAsSet(docs);
      //			for (String[] emotion: emotionsData)
      String[] selected_captions =
          captions.length > 0 ? captions : captionToExpandedQuery.keySet().toArray(new String[0]);
      for (String caption : selected_captions) {
        String query = captionToExpandedQuery.get(caption);
        if (query == null) {
          log.warn("Skipping unknown caption '" + caption + "'");
          continue;
        }

        // query is simply word1|word2|word3 etc for that sentiment
        // the -1 indicates that we want all docs in the indexer that match the query
        int threshold = 1;
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setThreshold(threshold);
        options.setQueryType(Indexer.QueryType.ORIGINAL);
        Collection<Document> docsForCaption = indexer.docsForQuery(query, options);
        /*
        log.info (docsForCaption.size() + " before");
        threshold = 2;
        docsForCaption = indexer.docsForQuery(query, -1, threshold);
        log.info (docsForCaption.size() + " after");
        */
        //				Set<Document> docs = indexer.docsWithPhraseThreshold(query, -1, 2); // in future, we
        // may have a higher threshold for sentiment matching
        // if @param docs is present, retain only those docs that match, otherwise retain all
        if (!Util.nullOrEmpty(docs_set))
          // docsForCaption.retainAll(docs_set);
          docsForCaption = Util.listIntersection(docsForCaption, docs_set);

        // put it in the result only if at least 1 doc matches
        if (docsForCaption.size() > 0) result.put(caption, docsForCaption);
      }
      return result;
    }
示例#4
0
  /**
   * Generates list of questions and stores it in the current instance of MemoryStudy We handle two
   * kinds of questions namely, person names tests and non-person name tests. Non-person name test
   * is a fill in the blank kind where the blank is to be filled with the correct non-person entity
   * to complete the sentence person name test is to guess the person in correspondent list based on
   * some distinctive sentences in the mail
   *
   * @param maxInt - max. number of questions from a interval
   * @throws IOException
   */
  public void generateQuestions(
      Archive archive,
      NERModel nerModel,
      Collection<EmailDocument> allDocs,
      Lexicon lex,
      int maxInt,
      boolean personTest)
      throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException,
          ParseException {
    this.archive = archive;
    if (allDocs == null) allDocs = (Collection) archive.getAllDocs();
    questions = new ArrayList<>();
    ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex);

    Short[] itypes =
        new Short[] {
          FeatureDictionary.BUILDING,
          FeatureDictionary.PLACE,
          FeatureDictionary.RIVER,
          FeatureDictionary.ROAD,
          FeatureDictionary.UNIVERSITY,
          FeatureDictionary.MOUNTAIN,
          FeatureDictionary.AIRPORT,
          FeatureDictionary.ISLAND,
          FeatureDictionary.MUSEUM,
          FeatureDictionary.BRIDGE,
          FeatureDictionary.AIRLINE,
          FeatureDictionary.THEATRE,
          FeatureDictionary.LIBRARY,
          FeatureDictionary.LAWFIRM,
          FeatureDictionary.GOVAGENCY
        };
    double CUTOFF = 0.001;
    tabooCluesSet = new LinkedHashSet<>();
    archive.assignThreadIds();

    List<Document> docs = archive.getAllDocs();
    Map<String, Date> entityToLastDate = new LinkedHashMap<>();
    Multimap<String, EmailDocument> entityToMessages = LinkedHashMultimap.create();
    Multimap<String, Long> entityToThreads = LinkedHashMultimap.create();
    Multimap<String, String> ceToDisplayEntity = LinkedHashMultimap.create();

    int di = 0;

    // sort by date
    Collections.sort(docs);

    Set<String> ownerNames = archive.ownerNames;
    Date earliestDate = null, latestDate = null;
    Set<String> allEntities = new LinkedHashSet<>();
    for (Document doc : docs) {
      EmailDocument ed = (EmailDocument) doc;
      if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date;
      if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date;

      List<String> entities = new ArrayList<>();
      if (!personTest) {
        entities.addAll(
            Arrays.asList(archive.getAllNamesInDoc(doc, true))
                .stream()
                .filter(n -> n.typeScore > CUTOFF)
                .map(n -> n.text)
                .collect(Collectors.toList()));
      } else {
        // do not consider mailing lists
        if (ed.sentToMailingLists != null && ed.sentToMailingLists.length > 0) continue;
        // discard doc if it is not a sent mail
        if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue;

        List<Address> addrs = new ArrayList<>();
        if (ed.to != null) for (Address addr : ed.to) addrs.add(addr);

        List<String> names = new ArrayList<>();
        for (Address addr : addrs) {
          Contact c = archive.addressBook.lookupByAddress(addr);
          names.add(c.pickBestName());
        }

        for (String name : names) {
          if (!ownerNames.contains(name) && !DictUtils.hasDictionaryWord(name)) {
            entities.add(name);
          }
        }
      }
      allEntities.addAll(entities);

      // get entities
      for (String e : entities) {
        if (Util.nullOrEmpty(e)) continue;
        e = e.replaceAll("^\\W+|\\W+$", "");
        if (e.length() > 10 && e.toUpperCase().equals(e))
          continue; // all upper case, more than 10 letters, you're out.

        String ce = DictUtils.canonicalize(e); // canonicalize
        if (ce == null) {
          JSPHelper.log.info("Dropping entity: " + e);
          continue;
        }

        ceToDisplayEntity.put(ce, e);
        entityToLastDate.put(ce, ed.date);
        entityToMessages.put(ce, ed);
        entityToThreads.put(ce, ed.threadID);
      }

      if ((++di) % 1000 == 0) log.info(di + " of " + docs.size() + " messages processed...<br/>");
    }
    log.info(
        "Considered #"
            + allEntities.size()
            + " unique entities and #"
            + ceToDisplayEntity.size()
            + " good ones in #"
            + docs.size()
            + " docs<br>");
    log.info("Owner Names: " + ownerNames);
    JSPHelper.log.info(
        "Considered #"
            + allEntities.size()
            + " unique entities and #"
            + ceToDisplayEntity.size()
            + " good ones in #"
            + docs.size()
            + "docs");

    JSPHelper.log.info(
        "earliest date = "
            + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate));
    JSPHelper.log.info(
        "latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate));

    Multimap<String, String> tokenToCE = LinkedHashMultimap.create();
    for (String ce : ceToDisplayEntity.keySet()) {
      List<String> tokens = Util.tokenize(ce);
      for (String t : tokens) tokenToCE.put(t, ce);
    }

    // Compute date intervals
    int DAYS_PER_INTERVAL = 30;
    List<Pair<Date, Date>> intervals = new ArrayList<Pair<Date, Date>>();
    {
      JSPHelper.log.info("computing time intervals");
      Date closingDate = latestDate;

      JSPHelper.log.info(
          "closing = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(closingDate));
      while (earliestDate.before(closingDate)) {
        Calendar cal = new GregorianCalendar();
        cal.setTime(closingDate); // this is the time of the last sighting of the term
        // scroll to the beginning of this month
        cal.set(Calendar.HOUR_OF_DAY, 23);
        cal.set(Calendar.MINUTE, 59);
        cal.set(Calendar.SECOND, 59);
        Date endDate = cal.getTime();

        cal.add(
            Calendar.DATE,
            (1
                - DAYS_PER_INTERVAL)); // 1- because we want from 0:00 of first date to 23:59 of
                                       // last date
        cal.set(Calendar.HOUR_OF_DAY, 0);
        cal.set(Calendar.MINUTE, 0);
        cal.set(Calendar.SECOND, 0);
        Date startDate = cal.getTime();

        intervals.add(new Pair<Date, Date>(startDate, endDate));
        // ok we got an interval

        // closing date for the next interval is 1 day before endDate
        cal.add(Calendar.DATE, -1);
        closingDate = cal.getTime();
      }
      JSPHelper.log.info("done computing intervals, #time intervals: " + intervals.size());
      for (Pair<Date, Date> p : intervals)
        JSPHelper.log.info(
            "Interval: "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst())
                + " - "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond()));
    }

    // initialize clueInfos to empty lists
    List<ClueInfo> clueInfos[] = new ArrayList[intervals.size()];
    for (int i = 0; i < intervals.size(); i++) {
      clueInfos[i] = new ArrayList<ClueInfo>();
    }

    Map<Integer, Integer> intervalCount = new LinkedHashMap<>();
    // nSent is the number of sentences allowed in a clue text
    int nvalidclues = 0, nSent = 2;
    // generate clueInfos for each entity
    for (String ce : entityToLastDate.keySet()) {
      Date lastSeenDate = entityToLastDate.get(ce);

      // compute displayEntity (which has red for core words) and fullAnswer, which is a simple
      // string
      String fullAnswer = "";
      {
        List<String> tokens = Util.tokenize(ceToDisplayEntity.get(ce).iterator().next());
        for (String t : tokens) {
          if (EnglishDictionary.stopWords.contains(t.toLowerCase())) continue;
          fullAnswer += t + " ";
        }
        fullAnswer = fullAnswer.trim();
      }
      // dont want the answer to be scored low just because it has extra non-word chars in the begin
      // or end
      fullAnswer = fullAnswer.replaceAll("^\\W+|\\W+$", "");

      // which interval does this date belong to?
      int interval = -1;
      Date intervalStart = null, intervalEnd = null;
      {
        int i = 0;
        for (Pair<Date, Date> p : intervals) {
          intervalStart = p.getFirst();
          intervalEnd = p.getSecond();

          if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate))
              || intervalStart.equals(lastSeenDate)
              || intervalEnd.equals(lastSeenDate)) {
            interval = i;
            break;
          }
          i++;
        }
      }
      if (interval < 0 || interval == intervals.size())
        JSPHelper.log.info(
            "What, no interval!? for "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate));
      if (!intervalCount.containsKey(interval)) intervalCount.put(interval, 0);
      if (intervalCount.get(interval) > maxInt) continue;
      intervalCount.put(interval, intervalCount.get(interval) + 1);

      List<Integer> lengthList = Crossword.convertToWord(fullAnswer).getSecond();
      String lengthDescr = "";
      if (lengthList.size() > 1) lengthDescr += Integer.toString(lengthList.size()) + " words: ";

      for (Integer i : lengthList) {
        lengthDescr += Util.pluralize(i, "letter") + ", ";
      }
      lengthDescr = lengthDescr.substring(0, lengthDescr.length() - 2); // subtract the extra comma.

      ClueInfo ci = new ClueInfo();
      ci.link = "../browse?term=\"" + fullAnswer + "\"&sort_by=recent&searchType=original";
      ci.lastSeenDate = lastSeenDate;
      ci.nMessages = entityToMessages.get(ce).size();
      ci.nThreads = entityToThreads.get(ce).size();

      // TODO: we are doing default initialisation of evaluators by setting it to null below, it is
      // more appropriate to consider it as an argument for this method
      Clue clue =
          cluer.createClue(
              fullAnswer,
              (personTest
                  ? ArchiveCluer.QuestionType.GUESS_CORRESPONDENT
                  : ArchiveCluer.QuestionType.FILL_IN_THE_BLANK),
              null,
              tabooCluesSet,
              null,
              intervalStart,
              intervalEnd,
              nSent,
              archive);
      if (clue != null) ci.clues = new Clue[] {clue};

      if (ci.clues == null || ci.clues.length == 0 || clue == null) {
        JSPHelper.log.warn("Did not find any clue for: " + fullAnswer);
      } else {
        // is the times value of the clue important?
        questions.add(new MemoryQuestion(this, fullAnswer, clue, 1, lengthDescr));
        nvalidclues++;
        // makes sure that the clue with the same statement is not generated again
        tabooCluesSet.add(clue.clue);
      }
      clueInfos[interval].add(ci);
    }
    log.info("Found valid clues for " + nvalidclues + " answers");
    JSPHelper.log.info("Found valid clues for " + nvalidclues + " answers");

    log.info("Top candidates are:");
    for (MemoryQuestion mq : questions)
      log.info(mq.correctAnswer + " times=" + mq.stats.nMessagesWithAnswer);

    // sort q's by clue score
    Collections.sort(questions);

    //		log.info("Based on clue score, top answers:");
    //		for (MemoryQuestion mq: questions)
    //			log.info (mq.correctAnswer + " times= clue=" + mq.clue.clue);

    // now we have up to 2*N questions, sorted by cluescore.
    // drop ones that are prefix/suffix of another, and cap to N
    int prev_size = questions.size();

    int new_size = questions.size();

    //	log.info ("#questions before prefix-suffix elim: " + prev_size + " after: " + new_size);

    int count = 0;
    for (MemoryQuestion mq : questions) {
      mq.setQuestionNum(count++);
    }

    // log the questions as well, just in case we don't get to the final point due to user fatigue
    // or crashes
    logStats("questions.final", false);
  }