// check the lexim type Type getType(String _lexim) { if (_lexim == null || _lexim.length() == 0) { // checking the _lexim to see if we reached the end of file or and empty one System.out.println("EoF"); return null; // System.exit(0); } if (Character.isAlphabetic( _lexim.charAt( 0))) // if the first char is alphabetic then chech to see if the token is a keyword if // not return type is ID { for (String keyword1 : keyword) { if (_lexim.equals(keyword1)) { return Type.keyword; } } return Type.id; } else if (Character.isDigit(_lexim.charAt(0)) || _lexim.charAt(0) == '-') { // check if it is a number for (int i = 1; i < _lexim.length(); i++) { if (!Character.isDigit(_lexim.charAt(i))) return Type.unkown; } return Type.number; } else if (!Character.isAlphabetic(_lexim.charAt(0)) && !Character.isDigit(_lexim.charAt(0))) { // checking the other casese like the =,+ ... for (String tempkeyword : keyword) { if (_lexim.equals(tempkeyword)) { return Type.keyword; } } if (_lexim.charAt(0) == '\'' && _lexim.charAt(2) == '\'') { // checking the value of a char var type return Type.value; } } return Type.unkown; }
/** * Parses an entity's attribute string to get attribute values * * @param attributeStringBuilder * @param attributeValueType * @return a single attribute value or list of attribute values * @throws IfcFormatException * @throws IfcNotFoundException * @throws IfcValueTypeConflictException */ private List<IfcValue> parseAttributeValues( StrBuilderWrapper attributeStrBuilderWrapper, IfcEntity entity, List<IfcAttributeInfo> entityAttributeInfos, IfcTypeInfo commonAttributeTypeInfo, EnumSet<IfcTypeEnum> commonValueTypes) throws IfcFormatException, IfcNotFoundException { logger.debug(String.format("Parsing entity '%s'", entity)); List<IfcValue> attributeValues = new ArrayList<>(); for (int attributeIndex = 0; !attributeStrBuilderWrapper.trimLeft().isEmpty(); ++attributeIndex) { EnumSet<IfcTypeEnum> attributeValueTypes; IfcAttributeInfo attributeInfo; IfcTypeInfo attributeTypeInfo; if (commonValueTypes == null) { assert (attributeIndex < entityAttributeInfos.size()) : String.format( "attributeIndex=%d, entityAttributeInfos.size=%s, attributeStrBuilderWrapper='%s'", attributeIndex, entityAttributeInfos, attributeStrBuilderWrapper); attributeInfo = entityAttributeInfos.get(attributeIndex); attributeTypeInfo = attributeInfo.getAttributeTypeInfo(); attributeValueTypes = attributeTypeInfo.getValueTypes(); } else { assert (commonAttributeTypeInfo != null); attributeInfo = entityAttributeInfos.get(0); attributeTypeInfo = commonAttributeTypeInfo; attributeValueTypes = commonValueTypes; } if (attributeTypeInfo instanceof IfcCollectionTypeInfo) { attributeTypeInfo = ((IfcCollectionTypeInfo) attributeTypeInfo).getItemTypeInfo(); } switch (attributeStrBuilderWrapper.charAt(0)) { case IfcVocabulary.StepFormat.LINE_NUMBER_SYMBOL: // Entity attributeStrBuilderWrapper.skip(1); Long remoteLineNumber = attributeStrBuilderWrapper.getLong(); IfcEntity remoteEntity = getEntity(remoteLineNumber); if (remoteEntity == null) { throw new IfcNotFoundException("Entity not found: #" + remoteLineNumber); } attributeValues.add(remoteEntity); break; case IfcVocabulary.StepFormat.STRING_VALUE_SYMBOL: String s = attributeStrBuilderWrapper.getStringBetweenSingleQuotes(); assert attributeValueTypes.size() == 1 : "Expect attributeValueTypes.size() == 1"; // if (!attributeValueTypes.contains(IfcTypeEnum.GUID)) { attributeValues.add(new IfcLiteralValue(s, attributeTypeInfo, IfcTypeEnum.STRING)); // break; // } else { // attributeValues.add(new IfcGuidValue(s)); // break; // } break; case IfcVocabulary.StepFormat.ENUMERATION_VALUE_SYMBOL: s = attributeStrBuilderWrapper.getStringBetweenSimilarCharacters( IfcVocabulary.StepFormat.ENUMERATION_VALUE_SYMBOL); assert attributeValueTypes.size() == 1 : "Expect attributeValueTypes.size() == 1"; if (!attributeValueTypes.contains(IfcTypeEnum.LOGICAL)) { attributeValues.add(new IfcLiteralValue(s, attributeTypeInfo, IfcTypeEnum.ENUM)); } else { switch (s) { case "T": case "TRUE": attributeValues.add( new IfcLiteralValue( LogicalEnum.TRUE.toString(), attributeTypeInfo, IfcTypeEnum.LOGICAL)); break; case "F": case "FALSE": attributeValues.add( new IfcLiteralValue( LogicalEnum.FALSE.toString(), attributeTypeInfo, IfcTypeEnum.LOGICAL)); break; default: attributeValues.add( new IfcLiteralValue( LogicalEnum.UNKNOWN.toString(), attributeTypeInfo, IfcTypeEnum.LOGICAL)); break; } } break; case IfcVocabulary.StepFormat.NULL_SYMBOL: // $ attributeValues.add(IfcValue.NULL); attributeStrBuilderWrapper.skip(1); break; case IfcVocabulary.StepFormat.ANY_SYMBOL: // * attributeValues.add(IfcValue.ANY); attributeStrBuilderWrapper.skip(1); break; case StringUtils.OPENING_ROUND_BRACKET_CHAR: // List or Set String stringBetweenBrackets = attributeStrBuilderWrapper.getStringBetweenRoundBrackets(); StrBuilderWrapper sbWrapper = new StrBuilderWrapper(stringBetweenBrackets); List<IfcAttributeInfo> attributeInfos = new ArrayList<>(1); attributeInfos.add(attributeInfo); List<IfcValue> values = parseAttributeValues( sbWrapper, null, attributeInfos, attributeTypeInfo, attributeValueTypes); attributeValues.add(new IfcTemporaryCollectionValueWrapper(values)); break; default: if (Character.isAlphabetic(attributeStrBuilderWrapper.charAt(0))) { // // parsing sub entity // String subEntityTypeInfoName = attributeStrBuilderWrapper.getIdentifierName(); IfcNonEntityTypeInfo subNonEntityTypeInfo = schema.getNonEntityTypeInfo(subEntityTypeInfoName); attributeValueTypes = subNonEntityTypeInfo.getValueTypes(); s = attributeStrBuilderWrapper.getStringBetweenRoundBrackets(); assert (s != null); attributeInfos = new ArrayList<>(1); attributeInfos.add(attributeInfo); values = parseAttributeValues( new StrBuilderWrapper(s), null, attributeInfos, subNonEntityTypeInfo, attributeValueTypes); assert values.size() == 1 : "Expect only 1 argument: " + entity + ":" + values.toString(); // attributeValues.add(new IfcShortEntity(subNonEntityTypeInfo, // (IfcLiteralValue)values.get(0))); attributeValues.add((IfcLiteralValue) values.get(0)); } else { // // parsing number or datetime // assert attributeValueTypes.size() == 1 : "Expect attributeValueTypes.size() == 1"; IfcTypeEnum attributeValueType = (IfcTypeEnum) attributeValueTypes.toArray()[0]; Object value; if (attributeValueType == IfcTypeEnum.INTEGER) { value = attributeStrBuilderWrapper.getLong(); } else if (attributeValueType == IfcTypeEnum.REAL) { value = attributeStrBuilderWrapper.getDouble(); } else { assert attributeValueType == IfcTypeEnum.DATETIME; long timeStamp = attributeStrBuilderWrapper.getLong(); value = Calendar.getInstance(); ((Calendar) value).setTimeInMillis(timeStamp * 1000); } attributeValues.add( new IfcLiteralValue( value, (IfcNonEntityTypeInfo) attributeTypeInfo, attributeValueType)); } break; } attributeStrBuilderWrapper.trimLeft(); attributeStrBuilderWrapper.getFirstMatch(StrMatcher.commaMatcher()); } return attributeValues; }
/** Generates person names tests from the given archive. @throws IOException */ public void generatePersonNameQuestions( Archive archive, NERModel nerModel, Collection<EmailDocument> allDocs, Lexicon lex, int numClues) throws IOException, GeneralSecurityException, ClassNotFoundException, ReadContentsException, ParseException { this.archive = archive; questions = new ArrayList<>(); ArchiveCluer cluer = new ArchiveCluer(null, archive, nerModel, null, lex); tabooCluesSet = new LinkedHashSet<>(); archive.assignThreadIds(); List<ClueEvaluator> evaluators = getDefaultEvals(); List<Document> docs = archive.getAllDocs(); Multimap<Contact, EmailDocument> contactToMessages = LinkedHashMultimap.create(); Multimap<Contact, Long> contactToThreadIds = LinkedHashMultimap.create(); // sort by date Collections.sort(docs); Date earliestDate = null, latestDate = null; Map<Contact, Date> contactToLatestDate = new LinkedHashMap<>(); // compute earliest and latest date across all messages in corpus for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; if (earliestDate == null || ed.date.before(earliestDate)) earliestDate = ed.date; if (latestDate == null || ed.date.after(latestDate)) latestDate = ed.date; } JSPHelper.log.info( "===================\nStarting to generate person names memory questions from " + docs.size() + " messages with " + numClues + " questions" + ", earliest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(earliestDate) + " latest date = " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(latestDate)); Set<Integer> tabooSentenceHashes = new LinkedHashSet<>(); // create hashes of all sentences seen at least twice (case insensitive, lower cased) { Set<Integer> hashesSeen = new LinkedHashSet<>(); for (Document d : docs) { String contents = archive.getContents(d, true); String cleanedContents = EmailUtils.cleanupEmailMessage(contents); SentenceTokenizer st = new SentenceTokenizer(cleanedContents); while (st.hasMoreSentences()) { String sentence = st.nextSentence(); sentence = canonicalizeSentence(sentence); int hashCode = sentence.hashCode(); if (hashesSeen.contains(hashCode)) { tabooSentenceHashes.add(hashCode); log.info("Marking sentence as taboo: " + sentence); } else hashesSeen.add(hashCode); } } } // compute contactToLatestDate that contact has been seen on for (Document doc : docs) { EmailDocument ed = (EmailDocument) doc; // discard doc if it is not a sent mail if ((ed.sentOrReceived(archive.addressBook) & EmailDocument.SENT_MASK) == 0) continue; for (Contact c : ed.getParticipatingContactsExceptOwn(archive.addressBook)) { Date currentLatestDate = contactToLatestDate.get(c); if (currentLatestDate == null || currentLatestDate.before(ed.date)) contactToLatestDate.put(c, ed.date); contactToMessages.put(c, ed); contactToThreadIds.put(c, ed.threadID); } } log.info("We are considering " + contactToLatestDate.size() + " contacts"); Date currentDate = new Date(); List<Pair<Date, Date>> intervals = computeDateIntervals(earliestDate, currentDate); // go back from current date // intervals[0] is the most recent. JSPHelper.log.info("done computing " + intervals.size() + " intervals"); for (Pair<Date, Date> p : intervals) JSPHelper.log.info( "Interval: " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getFirst()) + " - " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(p.getSecond())); int cluesPerInterval = (numClues > 0 && intervals.size() > 0) ? (numClues + intervals.size() - 1) / intervals.size() : 0; JSPHelper.log.info( "Will try to generate " + Util.pluralize(cluesPerInterval, "questions") + " per interval"); Multimap<Integer, Contact> intervalToContacts = LinkedHashMultimap.create(); // nSent is the number of sentences allowed in a clue text int nSent = 2; for (Contact c : contactToLatestDate.keySet()) { Date lastSeenDate = contactToLatestDate.get(c); // which interval does this date belong to? we'll assign this contact in that interval in the // intervalToContacts map int interval = -1; Date intervalStart = null, intervalEnd = null; { int i = 0; for (Pair<Date, Date> p : intervals) { intervalStart = p.getFirst(); intervalEnd = p.getSecond(); if ((intervalStart.before(lastSeenDate) && intervalEnd.after(lastSeenDate)) || intervalStart.equals(lastSeenDate) || intervalEnd.equals(lastSeenDate)) { interval = i; break; } i++; } } if (interval < 0 || interval == intervals.size()) { JSPHelper.log.info( "What, no interval!? for " + edu.stanford.muse.email.CalendarUtil.formatDateForDisplay(lastSeenDate)); continue; } intervalToContacts.put(interval, c); } log.info("Interval information (interval 0 is the most recent):"); for (int interval = 0; interval < intervals.size(); interval++) { Collection<Contact> contacts = intervalToContacts.get(interval); int nContactsForThisInterval = (contacts == null) ? 0 : contacts.size(); log.info( "In interval " + interval + " there are " + Util.pluralize(nContactsForThisInterval, "candidate contact") + " who were last seen in this interval"); } for (int interval = 0; interval < intervals.size(); interval++) { Date intervalStart = intervals.get(interval).getFirst(); Date intervalEnd = intervals.get(interval).getSecond(); Collection<Contact> candidateContactsForThisInterval = intervalToContacts.get(interval); if (candidateContactsForThisInterval == null) { log.info("Skipping interval " + interval + " because there are no contacts"); continue; } Map<Clue, Contact> clueToContact = new LinkedHashMap<>(); log.info("=======\nGenerating questions for interval " + interval); outer: for (Contact c : candidateContactsForThisInterval) { String name = c.pickBestName(); if (name.length() < 2) // could also check if alphanumberic only continue outer; // ignore contact if name does not contain all alphabets. Even a period is not allowed. only // space is allowed. for (char ch : name.toCharArray()) { if (!Character.isAlphabetic(ch) && !Character.isSpaceChar(ch)) continue outer; } Clue clue = cluer.createPersonNameClue( c, evaluators, nerModel, intervalStart, intervalEnd, nSent, archive, tabooSentenceHashes); if (clue != null) clueToContact.put(clue, c); } List<Clue> clueList = new ArrayList(clueToContact.keySet()); Collections.sort(clueList); List<Clue> selectedClues = new ArrayList<>(); for (int i = 0; i < cluesPerInterval && i < clueList.size(); i++) { selectedClues.add(clueList.get(i)); } log.info( "For interval " + interval + " selected " + selectedClues.size() + " contacts out of " + clueList.size() + " possible candidates."); // for (Clue c: clueList) // log.info ("Clue candidate for " + clueToContact.get(c).pickBestName() + " // score = " + c.clueStats.finalScore+ " clue is " + c ); // for (Clue c: selectedClues) // log.info ("Selected clue: " + clueToContact.get(c).pickBestName() + " score = " // + c.clueStats.finalScore+ " clue is " + c); for (Clue selectedClue : selectedClues) { Contact c = clueToContact.get(selectedClue); String name = c.pickBestName(); List<Integer> lengthList = Crossword.convertToWord(name).getSecond(); String lengthDescr = ""; if (lengthList.size() > 1) lengthDescr += Integer.toString(lengthList.size()) + " words: "; for (Integer i : lengthList) { lengthDescr += Util.pluralize(i, "letter") + ", "; } lengthDescr = lengthDescr.substring(0, lengthDescr.length() - 2); // subtract the extra comma. ClueInfo ci = new ClueInfo(); ci.lastSeenDate = contactToLatestDate.get(c); ci.nMessages = contactToThreadIds.get(c).size(); ci.nThreads = contactToThreadIds.get(c).size(); questions.add(new MemoryQuestion(this, name, selectedClue, 1, lengthDescr)); } } log.info(questions.size() + " questions generated"); log.info("Top candidates are:"); // sort q's by clue score Collections.sort(questions); // log.info("Based on clue score, top answers:"); // for (MemoryQuestion mq: questions) // log.info (mq.correctAnswer + " times= clue=" + mq.clue.clue); int count = 0; for (MemoryQuestion mq : questions) { mq.setQuestionNum(count++); } // log the questions as well, just in case we don't get to the final point due to user fatigue // or crashes logStats("questions.final", false); }