Пример #1
0
  /**
   * Classifies a single lexeme.
   *
   * <ul>
   *   <li>{@code null} or empty input prints {@code "EoF"} and yields {@code null}.
   *   <li>A lexeme starting with a letter is a {@code Type.keyword} when it appears in {@code
   *       keyword}, otherwise a {@code Type.id}.
   *   <li>A lexeme starting with a digit is a {@code Type.number} when every following character
   *       is a digit, otherwise {@code Type.unkown}.
   *   <li>A lexeme starting with {@code '-'} is a {@code Type.number} only when it is followed by
   *       at least one character and all of them are digits; a lone {@code "-"} (or any other
   *       {@code '-'}-led symbol) is looked up like the remaining symbols.
   *   <li>Any other lexeme is a {@code Type.keyword} when it appears in {@code keyword}, a {@code
   *       Type.value} when it looks like a quoted character ({@code 'x'}), and {@code
   *       Type.unkown} otherwise.
   * </ul>
   *
   * @param _lexim the lexeme to classify; may be {@code null} or empty
   * @return the lexeme type, or {@code null} when there is no lexeme
   */
  Type getType(String _lexim) {
    if (_lexim == null || _lexim.length() == 0) {
      // No lexeme at all: end of input or an empty token.
      System.out.println("EoF");
      return null;
    }

    char first = _lexim.charAt(0);

    if (Character.isAlphabetic(first)) {
      return matchesKeyword(_lexim) ? Type.keyword : Type.id;
    }

    if (Character.isDigit(first)) {
      return hasOnlyDigitsFrom(_lexim, 1) ? Type.number : Type.unkown;
    }

    // A minus sign introduces a number only when digits actually follow it; a bare "-" must
    // fall through so that it can be recognised as an operator.
    if (first == '-' && _lexim.length() > 1 && hasOnlyDigitsFrom(_lexim, 1)) {
      return Type.number;
    }

    // Everything else is a symbol such as '=' or '+', or a quoted character value.
    if (matchesKeyword(_lexim)) {
      return Type.keyword;
    }
    // The length guard keeps charAt(2) in range for short lexemes such as "'" or "'a".
    if (_lexim.length() >= 3 && first == '\'' && _lexim.charAt(2) == '\'') {
      return Type.value;
    }
    return Type.unkown;
  }

  /** Returns whether {@code lexeme} is equal to one of the entries of {@code keyword}. */
  private boolean matchesKeyword(String lexeme) {
    for (String candidate : keyword) {
      if (lexeme.equals(candidate)) {
        return true;
      }
    }
    return false;
  }

  /** Returns whether every character of {@code text} from index {@code start} on is a digit. */
  private static boolean hasOnlyDigitsFrom(String text, int start) {
    for (int i = start; i < text.length(); i++) {
      if (!Character.isDigit(text.charAt(i))) {
        return false;
      }
    }
    return true;
  }
  /**
   * Parses a comma-separated attribute string into a list of attribute values.
   *
   * <p>The method works in one of two modes, selected by {@code commonValueTypes}:
   *
   * <ul>
   *   <li>{@code commonValueTypes == null}: the i-th value in the string is described by the i-th
   *       element of {@code entityAttributeInfos}.
   *   <li>{@code commonValueTypes != null}: every value shares {@code commonAttributeTypeInfo} and
   *       {@code commonValueTypes}, and {@code entityAttributeInfos.get(0)} is used as the
   *       attribute info. This mode is used by the recursive calls for bracketed lists and for
   *       typed sub-values.
   * </ul>
   *
   * <p>The wrapper is consumed while parsing: it is empty when the method returns normally.
   *
   * @param attributeStrBuilderWrapper the remaining attribute text; consumed by this method
   * @param entity the entity being parsed, used only in log and assertion messages; the recursive
   *     calls pass {@code null}
   * @param entityAttributeInfos attribute descriptions (see the two modes above)
   * @param commonAttributeTypeInfo type info shared by all values, or {@code null} in positional
   *     mode
   * @param commonValueTypes value types shared by all values, or {@code null} in positional mode
   * @return the parsed values, in the order they appear in the string
   * @throws IfcFormatException declared for malformed input (NOTE(review): presumably raised by
   *     the {@code StrBuilderWrapper} accessors - confirm)
   * @throws IfcNotFoundException if a referenced entity ({@code #<line number>}) is not known
   */
  private List<IfcValue> parseAttributeValues(
      StrBuilderWrapper attributeStrBuilderWrapper,
      IfcEntity entity,
      List<IfcAttributeInfo> entityAttributeInfos,
      IfcTypeInfo commonAttributeTypeInfo,
      EnumSet<IfcTypeEnum> commonValueTypes)
      throws IfcFormatException, IfcNotFoundException {

    logger.debug(String.format("Parsing entity '%s'", entity));

    List<IfcValue> attributeValues = new ArrayList<>();

    // One iteration per comma-separated value; stops once the wrapper has been fully consumed.
    for (int attributeIndex = 0;
        !attributeStrBuilderWrapper.trimLeft().isEmpty();
        ++attributeIndex) {

      EnumSet<IfcTypeEnum> attributeValueTypes;
      IfcAttributeInfo attributeInfo;
      IfcTypeInfo attributeTypeInfo;
      if (commonValueTypes == null) {
        // Positional mode: look the attribute up by its index.
        assert (attributeIndex < entityAttributeInfos.size())
            : String.format(
                "attributeIndex=%d, entityAttributeInfos.size=%d, attributeStrBuilderWrapper='%s'",
                attributeIndex, entityAttributeInfos.size(), attributeStrBuilderWrapper);
        attributeInfo = entityAttributeInfos.get(attributeIndex);
        attributeTypeInfo = attributeInfo.getAttributeTypeInfo();
        attributeValueTypes = attributeTypeInfo.getValueTypes();
      } else {
        // Shared-type mode: every value uses the caller-supplied type information.
        assert (commonAttributeTypeInfo != null);
        attributeInfo = entityAttributeInfos.get(0);
        attributeTypeInfo = commonAttributeTypeInfo;
        attributeValueTypes = commonValueTypes;
      }

      // For collections the individual values are typed by the collection's item type.
      if (attributeTypeInfo instanceof IfcCollectionTypeInfo) {
        attributeTypeInfo = ((IfcCollectionTypeInfo) attributeTypeInfo).getItemTypeInfo();
      }

      // The first character decides what kind of value follows.
      switch (attributeStrBuilderWrapper.charAt(0)) {
        case IfcVocabulary.StepFormat.LINE_NUMBER_SYMBOL: // Entity
          attributeStrBuilderWrapper.skip(1);
          Long remoteLineNumber = attributeStrBuilderWrapper.getLong();
          IfcEntity remoteEntity = getEntity(remoteLineNumber);
          if (remoteEntity == null) {
            throw new IfcNotFoundException("Entity not found: #" + remoteLineNumber);
          }
          attributeValues.add(remoteEntity);
          break;

        case IfcVocabulary.StepFormat.STRING_VALUE_SYMBOL:
          String s = attributeStrBuilderWrapper.getStringBetweenSingleQuotes();
          assert attributeValueTypes.size() == 1 : "Expect attributeValueTypes.size() == 1";
          // Every quoted value, including GUIDs, is stored as a STRING literal.
          attributeValues.add(new IfcLiteralValue(s, attributeTypeInfo, IfcTypeEnum.STRING));
          break;

        case IfcVocabulary.StepFormat.ENUMERATION_VALUE_SYMBOL:
          s =
              attributeStrBuilderWrapper.getStringBetweenSimilarCharacters(
                  IfcVocabulary.StepFormat.ENUMERATION_VALUE_SYMBOL);

          assert attributeValueTypes.size() == 1 : "Expect attributeValueTypes.size() == 1";
          if (!attributeValueTypes.contains(IfcTypeEnum.LOGICAL)) {
            attributeValues.add(new IfcLiteralValue(s, attributeTypeInfo, IfcTypeEnum.ENUM));
          } else {
            // Logical values are normalised to TRUE / FALSE / UNKNOWN.
            switch (s) {
              case "T":
              case "TRUE":
                attributeValues.add(
                    new IfcLiteralValue(
                        LogicalEnum.TRUE.toString(), attributeTypeInfo, IfcTypeEnum.LOGICAL));
                break;
              case "F":
              case "FALSE":
                attributeValues.add(
                    new IfcLiteralValue(
                        LogicalEnum.FALSE.toString(), attributeTypeInfo, IfcTypeEnum.LOGICAL));
                break;
              default:
                attributeValues.add(
                    new IfcLiteralValue(
                        LogicalEnum.UNKNOWN.toString(), attributeTypeInfo, IfcTypeEnum.LOGICAL));
                break;
            }
          }
          break;

        case IfcVocabulary.StepFormat.NULL_SYMBOL: // $
          attributeValues.add(IfcValue.NULL);
          attributeStrBuilderWrapper.skip(1);
          break;

        case IfcVocabulary.StepFormat.ANY_SYMBOL: // *
          attributeValues.add(IfcValue.ANY);
          attributeStrBuilderWrapper.skip(1);
          break;

        case StringUtils.OPENING_ROUND_BRACKET_CHAR: // List or Set
          String stringBetweenBrackets = attributeStrBuilderWrapper.getStringBetweenRoundBrackets();

          StrBuilderWrapper sbWrapper = new StrBuilderWrapper(stringBetweenBrackets);

          List<IfcAttributeInfo> attributeInfos = new ArrayList<>(1);
          attributeInfos.add(attributeInfo);

          // Parse the bracket contents recursively; all items share this attribute's item type.
          List<IfcValue> values =
              parseAttributeValues(
                  sbWrapper, null, attributeInfos, attributeTypeInfo, attributeValueTypes);
          attributeValues.add(new IfcTemporaryCollectionValueWrapper(values));
          break;

        default:
          if (Character.isAlphabetic(attributeStrBuilderWrapper.charAt(0))) {

            //
            // parsing sub entity: TYPENAME(value)
            //
            String subEntityTypeInfoName = attributeStrBuilderWrapper.getIdentifierName();
            IfcNonEntityTypeInfo subNonEntityTypeInfo =
                schema.getNonEntityTypeInfo(subEntityTypeInfoName);
            attributeValueTypes = subNonEntityTypeInfo.getValueTypes();
            s = attributeStrBuilderWrapper.getStringBetweenRoundBrackets();

            assert (s != null);

            attributeInfos = new ArrayList<>(1);
            attributeInfos.add(attributeInfo);

            values =
                parseAttributeValues(
                    new StrBuilderWrapper(s),
                    null,
                    attributeInfos,
                    subNonEntityTypeInfo,
                    attributeValueTypes);
            assert values.size() == 1
                : "Expect only 1 argument: " + entity + ":" + values.toString();
            // The single wrapped literal is added directly, typed by the named sub type.
            attributeValues.add((IfcLiteralValue) values.get(0));
          } else {

            //
            // parsing number or datetime
            //
            assert attributeValueTypes.size() == 1 : "Expect attributeValueTypes.size() == 1";
            IfcTypeEnum attributeValueType = attributeValueTypes.iterator().next();
            Object value;
            if (attributeValueType == IfcTypeEnum.INTEGER) {
              value = attributeStrBuilderWrapper.getLong();
            } else if (attributeValueType == IfcTypeEnum.REAL) {
              value = attributeStrBuilderWrapper.getDouble();
            } else {
              assert attributeValueType == IfcTypeEnum.DATETIME;
              // The number is scaled by 1000 to get milliseconds
              // (NOTE(review): presumably seconds since the epoch - confirm).
              long timeStamp = attributeStrBuilderWrapper.getLong();
              value = Calendar.getInstance();
              ((Calendar) value).setTimeInMillis(timeStamp * 1000);
            }

            attributeValues.add(
                new IfcLiteralValue(
                    value, (IfcNonEntityTypeInfo) attributeTypeInfo, attributeValueType));
          }

          break;
      }

      // Consume the separator (if any) before the next value.
      attributeStrBuilderWrapper.trimLeft();
      attributeStrBuilderWrapper.getFirstMatch(StrMatcher.commaMatcher());
    }

    return attributeValues;
  }
Пример #3
0
  /**
   * Generates person-name memory questions from the given archive and stores them in {@code
   * questions}.
   *
   * <p>Outline:
   *
   * <ol>
   *   <li>Find the earliest and latest message dates in the archive.
   *   <li>Mark every sentence that occurs more than once in the corpus as taboo (by hash code of
   *       its canonical form).
   *   <li>For every contact that appears in a sent message, record the latest date seen, the
   *       messages and the thread ids.
   *   <li>Split the time from the earliest message up to now into intervals and assign every
   *       contact to the interval containing its latest date.
   *   <li>For each interval, create clues for contacts whose best name consists only of letters
   *       and spaces, sort them, and keep at most {@code ceil(numClues / #intervals)} of them.
   *   <li>Sort the resulting questions, number them and log them.
   * </ol>
   *
   * @param archive the archive to generate questions from; also stored in {@code this.archive}
   * @param nerModel the NER model handed to the clue generator
   * @param allDocs not read by this method
   * @param lex the lexicon handed to the clue generator
   * @param numClues total number of questions wanted; non-positive values produce no questions
   * @throws IOException declared; raised by the archive/clue helpers (not visible here)
   * @throws GeneralSecurityException declared; raised by helpers (not visible here)
   * @throws ClassNotFoundException declared; raised by helpers (not visible here)
   * @throws ReadContentsException declared; raised by helpers (not visible here)
   * @throws ParseException declared; raised by helpers (not visible here)
   */
  public void generatePersonNameQuestions(
      Archive archive,
      NERModel nerModel,
      Collection<EmailDocument> allDocs,
      Lexicon lex,
      int numClues)
      throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException,
          ParseException {
    this.archive = archive;
    questions = new ArrayList<>();
    ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex);

    tabooCluesSet = new LinkedHashSet<>();
    archive.assignThreadIds();

    List<ClueEvaluator> evaluators = getDefaultEvals();

    List<Document> docs = archive.getAllDocs();
    Multimap<Contact, EmailDocument> contactToMessages = LinkedHashMultimap.create();
    Multimap<Contact, Long> contactToThreadIds = LinkedHashMultimap.create();

    // sort by date
    Collections.sort(docs);

    Date earliestDate = null, latestDate = null;
    Map<Contact, Date> contactToLatestDate = new LinkedHashMap<>();

    // compute earliest and latest date across all messages in corpus
    for (Document doc : docs) {
      EmailDocument ed = (EmailDocument) doc;

      if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date;
      if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date;
    }
    JSPHelper.log.info(
        "===================\nStarting to generate person names memory questions from "
            + docs.size()
            + " messages with "
            + numClues
            + " questions"
            + ", earliest date = "
            + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate)
            + " latest date = "
            + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate));

    Set<Integer> tabooSentenceHashes = new LinkedHashSet<>();

    // create hashes of all sentences seen at least twice (case insensitive, lower cased)
    {
      Set<Integer> hashesSeen = new LinkedHashSet<>();
      for (Document d : docs) {
        String contents = archive.getContents(d, true);
        String cleanedContents = EmailUtils.cleanupEmailMessage(contents);
        SentenceTokenizer st = new SentenceTokenizer(cleanedContents);
        while (st.hasMoreSentences()) {
          String sentence = st.nextSentence();
          sentence = canonicalizeSentence(sentence);
          int hashCode = sentence.hashCode();
          if (hashesSeen.contains(hashCode)) {
            tabooSentenceHashes.add(hashCode);
            log.info("Marking sentence as taboo: " + sentence);
          } else hashesSeen.add(hashCode);
        }
      }
    }

    // compute contactToLatestDate that contact has been seen on
    for (Document doc : docs) {
      EmailDocument ed = (EmailDocument) doc;
      // discard doc if it is not a sent mail
      if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue;

      for (Contact c : ed.getParticipatingContactsExceptOwn(archive.addressBook)) {
        Date currentLatestDate = contactToLatestDate.get(c);
        if (currentLatestDate == null || currentLatestDate.before(ed.date))
          contactToLatestDate.put(c, ed.date);
        contactToMessages.put(c, ed);
        contactToThreadIds.put(c, ed.threadID);
      }
    }

    log.info("We are considering " + contactToLatestDate.size() + " contacts");

    Date currentDate = new Date();
    List<Pair<Date, Date>> intervals =
        computeDateIntervals(earliestDate, currentDate); // go back from current date
    // intervals[0] is the most recent.
    JSPHelper.log.info("done computing " + intervals.size() + " intervals");
    for (Pair<Date, Date> p : intervals)
      JSPHelper.log.info(
          "Interval: "
              + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst())
              + " - "
              + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond()));

    // ceil(numClues / #intervals), or 0 when there is nothing to distribute
    int cluesPerInterval =
        (numClues > 0 && intervals.size() > 0)
            ? (numClues + intervals.size() - 1) / intervals.size()
            : 0;
    JSPHelper.log.info(
        "Will try to generate " + Util.pluralize(cluesPerInterval, "questions") + " per interval");

    Multimap<Integer, Contact> intervalToContacts = LinkedHashMultimap.create();

    // nSent is the number of sentences allowed in a clue text
    int nSent = 2;
    for (Contact c : contactToLatestDate.keySet()) {
      Date lastSeenDate = contactToLatestDate.get(c);

      // which interval does this date belong to? we'll assign this contact in that interval in the
      // intervalToContacts map. Interval bounds are inclusive; the first match wins.
      int interval = -1;
      for (int i = 0; i < intervals.size(); i++) {
        Date intervalStart = intervals.get(i).getFirst();
        Date intervalEnd = intervals.get(i).getSecond();

        if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate))
            || intervalStart.equals(lastSeenDate)
            || intervalEnd.equals(lastSeenDate)) {
          interval = i;
          break;
        }
      }

      if (interval < 0) {
        JSPHelper.log.info(
            "What, no interval!? for "
                + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate));
        continue;
      }

      intervalToContacts.put(interval, c);
    }

    log.info("Interval information (interval 0 is the most recent):");
    for (int interval = 0; interval < intervals.size(); interval++) {
      Collection<Contact> contacts = intervalToContacts.get(interval);
      int nContactsForThisInterval = (contacts == null) ? 0 : contacts.size();
      log.info(
          "In interval "
              + interval
              + " there are "
              + Util.pluralize(nContactsForThisInterval, "candidate contact")
              + " who were last seen in this interval");
    }

    for (int interval = 0; interval < intervals.size(); interval++) {
      Date intervalStart = intervals.get(interval).getFirst();
      Date intervalEnd = intervals.get(interval).getSecond();
      Collection<Contact> candidateContactsForThisInterval = intervalToContacts.get(interval);
      // A multimap returns an empty collection (not null) for a key without values, so the
      // emptiness check is what actually detects an interval without contacts.
      if (candidateContactsForThisInterval == null || candidateContactsForThisInterval.isEmpty()) {
        log.info("Skipping interval " + interval + " because there are no contacts");
        continue;
      }

      Map<Clue, Contact> clueToContact = new LinkedHashMap<>();
      log.info("=======\nGenerating questions for interval " + interval);

      outer:
      for (Contact c : candidateContactsForThisInterval) {
        String name = c.pickBestName();
        if (name.length() < 2) { // could also check if alphanumeric only
          continue outer;
        }

        // ignore contact if name does not contain all alphabets. Even a period is not allowed. only
        // space is allowed.
        for (char ch : name.toCharArray()) {
          if (!Character.isAlphabetic(ch) && !Character.isSpaceChar(ch)) continue outer;
        }

        Clue clue =
            cluer.createPersonNameClue(
                c,
                evaluators,
                nerModel,
                intervalStart,
                intervalEnd,
                nSent,
                archive,
                tabooSentenceHashes);
        if (clue != null) clueToContact.put(clue, c);
      }

      // Keep the first cluesPerInterval clues in Clue's natural order.
      List<Clue> clueList = new ArrayList<>(clueToContact.keySet());
      Collections.sort(clueList);
      List<Clue> selectedClues =
          new ArrayList<>(clueList.subList(0, Math.min(cluesPerInterval, clueList.size())));

      log.info(
          "For interval "
              + interval
              + " selected "
              + selectedClues.size()
              + " contacts out of "
              + clueList.size()
              + " possible candidates.");

      for (Clue selectedClue : selectedClues) {
        Contact c = clueToContact.get(selectedClue);
        String name = c.pickBestName();

        // Human-readable hint such as "2 words: 4 letters, 5 letters". Built with a separator
        // instead of trimming a trailing ", " so an empty length list cannot cause an
        // out-of-range substring.
        List<Integer> lengthList = Crossword.convertToWord(name).getSecond();
        StringBuilder lengthDescr = new StringBuilder();
        if (lengthList.size() > 1) {
          lengthDescr.append(Integer.toString(lengthList.size())).append(" words: ");
        }
        String separator = "";
        for (Integer wordLength : lengthList) {
          lengthDescr.append(separator).append(Util.pluralize(wordLength, "letter"));
          separator = ", ";
        }

        // NOTE(review): ci is populated but never attached to the clue or the question -
        // confirm whether it should be stored somewhere.
        ClueInfo ci = new ClueInfo();
        ci.lastSeenDate = contactToLatestDate.get(c);
        ci.nMessages = contactToMessages.get(c).size();
        ci.nThreads = contactToThreadIds.get(c).size();

        questions.add(new MemoryQuestion(this, name, selectedClue, 1, lengthDescr.toString()));
      }
    }

    log.info(questions.size() + " questions generated");

    log.info("Top candidates are:");

    // sort q's by clue score
    Collections.sort(questions);

    int count = 0;
    for (MemoryQuestion mq : questions) {
      mq.setQuestionNum(count++);
    }

    // log the questions as well, just in case we don't get to the final point due to user fatigue
    // or crashes
    logStats("questions.final", false);
  }