/**
 * Pulls the "contents" phrase out of a description and records it on {@code ms}.
 *
 * <p>The phrase is the first span that starts with "a ", "an " or "the " (or, if no such
 * span is found, with "incomplete ") and is followed by a comma or by the word "located".
 * If the captured phrase itself contains "located", it is cut off just before that word.
 * The phrase is trimmed, a single trailing comma is dropped, and the result is passed to
 * {@code ms.setContents}.
 *
 * <p>NOTE(review): the article alternative is not anchored to a word boundary, so it can
 * also start in the middle of a word that ends in "a", "an" or "the" -- confirm whether
 * the input data makes that possible.
 *
 * @param desc the description text to scan
 * @return {@code desc} with the contents phrase removed (the delimiting comma or
 *         "located" is left in place), or {@code desc} unchanged when nothing matched
 */
private String extractContents(String desc) {
    Pattern articlePattern =
            Pattern.compile("((?:an?|the) (.*?)(?: of (.*))?)(?:,|\\s*located)");
    Pattern incompletePattern =
            Pattern.compile("((incomplete .*?)(?: of (.*))?)(?:,|\\s*located)");

    // Prefer the article form; fall back to the "incomplete ..." form.
    Matcher hit = articlePattern.matcher(desc);
    if (!hit.find()) {
        hit = incompletePattern.matcher(desc);
        if (!hit.find()) {
            return desc;
        }
    }

    String phrase = hit.group(1);
    int phraseStart = hit.start(1);

    // The greedy "of (.*)" part may swallow a "located ..." clause; stop before it.
    int locatedAt = phrase.indexOf("located");
    int phraseEnd = (locatedAt >= 0) ? phraseStart + locatedAt : hit.end(1);

    String contents = StringUtils.trimToEmpty(desc.substring(phraseStart, phraseEnd));
    if (contents.endsWith(",")) {
        contents = contents.substring(0, contents.length() - 1);
    }

    ms.setContents(contents);
    LOGGER.debug(" Contents: " + contents);

    return desc.substring(0, hit.start()) + desc.substring(phraseEnd);
}
private void parseDescription() { String desc = this.description; ms.setDescription(desc); desc = desc.replaceFirst("-+see German footnotes?;?-*\\s*", ""); if (StringUtils.isBlank(desc)) { return; } else if (desc.equals("paraphrase")) { return; } if (debug) System.out.println("Parsing: " + desc); desc = stripSeeAlso(desc); desc = extractDate(desc); desc = extractCodexName(desc); desc = extractContents(desc); desc = extractLocation(desc); desc = extractRuinedMs(desc); desc = extractClassification(desc); if (!debug && !desc.matches("\\W*")) { if (desc.matches("\\W*(from( the)?)? ?same.*")) { // TODO parse related MS info. } else { // System.out.println(" Result: " + desc + "\t:: " + this.description ); unparsedDateCt++; } } if (debug) System.out.println(" Result: " + desc); if (debug) System.out.println(); }
/** * @param desc * @return */ private String extractLocation(String desc) { Pattern locationPattern = Pattern.compile( "(?:, located, )?" + "lo?cate?d " + "(formerly|possibly)? ?" + "(?:at|on|in(?: the)?|with(?: the)?)? ?" + "((?:[^,]|(?:, [A-Z]{2}))*),? ?"); String result = desc; Matcher mat = locationPattern.matcher(desc); while (mat.find()) { String location = mat.group(2); Institution inst = institutionRepo.findOrCreate(location); // em.merge(inst); ms.setCurrentInstitution(inst); result = desc.substring(0, mat.start()) + desc.substring(mat.end()); LOGGER.debug(" Location: " + location); } Pattern oldLocationPattern = Pattern.compile("formerly " + "(?:at|in(?: the)?|with(?: the)?) " + "(.*)?"); mat = oldLocationPattern.matcher(result); while (mat.find()) { String institution = mat.group(1); Institution inst = institutionRepo.findOrCreate(institution); System.out.println(institution); // em.merge(inst); ms.addPreviousInstitution(inst); result = result.substring(0, mat.start()) + result.substring(mat.end()); LOGGER.debug("Former Location: " + institution); } return result; }
/** * @param desc * @return */ public String extractDate(String desc) { String result = desc; Matcher mat = datePattern.matcher(desc); if (mat.find() && (mat.start() == 0)) { HistoricalDate date = new HistoricalDate(mat.group(1)); em.persist(date); ms.setDate(date); result = desc.substring(mat.end()); if (date.getText().matches(UNKNOWN_DATE_RE)) { return result; } String prefix = StringUtils.trimToNull(mat.group(2)); String firstDate = StringUtils.trimToNull(mat.group(3)); String separator = StringUtils.trimToNull(mat.group(4)); String lastDate = StringUtils.trimToNull(mat.group(5)); String suffix = StringUtils.trimToNull(mat.group(6)); if (suffix != null) { if (suffix.matches("c\\.?")) { date.setPrecision(Precision.CENTURY); } else { // Doesn't seem to happen } } if (prefix != null) { if (prefix.equalsIgnoreCase("ca.")) { // mark date as approximate date.setCertainty(Certainty.APPROXIMATE); } else if (prefix.equalsIgnoreCase("possibly")) { // mark date as uncertain date.setCertainty(Certainty.POSSIBLE); } else { // doesen't occur } } setStartDate(firstDate, date); setEndDate(firstDate, lastDate, separator, date); LOGGER.debug(" Date: " + date.getText()); } else { // System.out.println(desc); } return result; }