/**
 * Checks whether the index.html file at the given location belongs to an eDoc.
 *
 * <p>The page is read completely (line breaks are dropped while concatenating) and scanned for
 * {@code <META NAME="..." CONTENT="...">} tags. The page counts as an eDoc index as soon as a
 * tag named {@code DC.Identifier} carries a content value containing {@code edoc.bbaw.de/}.
 *
 * @param uri - the URL as string to the index.html file.
 * @return true if the index.html file belongs to an eDoc; false otherwise, including the case
 *     where the reader delivers no stream.
 * @throws ApplicationException if the stream couldn't get opened.
 */
public static boolean isEDocIndex(String uri) throws ApplicationException {
  InputStream in = reader.read(uri);
  if (in == null) {
    return false;
  }

  StringBuilder builder = new StringBuilder();
  // Closing the scanner also closes the wrapped stream, so nothing leaks even if reading fails.
  try (Scanner scanner = new Scanner(in)) {
    scanner.useDelimiter("\n");
    while (scanner.hasNext()) {
      // Tokens are appended without the delimiter, yielding the page as one single line.
      builder.append(scanner.next());
    }
  }

  Pattern metaPattern = Pattern.compile("(?i)<META NAME=\"(.*?)\" CONTENT=\"(.*?)\">(?i)");
  Matcher metaMatcher = metaPattern.matcher(builder.toString());
  while (metaMatcher.find()) {
    String tag = metaMatcher.group(1);
    String value = metaMatcher.group(2);
    if (tag.equals("DC.Identifier") && value.contains("edoc.bbaw.de/")) {
      return true;
    }
  }
  return false;
}
/** * This class reads from an URL and fetches the DC tags directly (with String operations). * * <p>It's designed for the eDoc server. * * @param srcUrl - the basic Url as String * @param mdRecord - the {@link MetadataRecord} to fill * @return the complete {@link MetadataRecord} * @throws ApplicationException */ public static MetadataRecord fetchHtmlDirectly(final String srcUrl, final MetadataRecord mdRecord) throws ApplicationException { InputStream in; try { in = reader.read(srcUrl); System.out.println("Building input string..."); Scanner scanner = new Scanner(in); scanner.useDelimiter("\n"); // delimiter via line break StringBuilder builder = new StringBuilder(); while (scanner.hasNext()) { builder.append(scanner.next()); // concat to one String } String line = builder.toString(); StringBuilder creatorBuilder = new StringBuilder(); // fix: more than one // creator Pattern p = Pattern.compile("(?i)<META NAME=\"(.*?)\" CONTENT=\"(.*?)\">(?i)"); // meta // pattern for (Matcher m = p.matcher(line); m.find(); ) { String tag = m.group(1); String content = m.group(2); if (tag.equals("DC.Date.Creation_of_intellectual_content")) { // creation // date Calendar cal = new GregorianCalendar(); cal.set(Calendar.YEAR, Integer.parseInt(content)); cal.set(Calendar.DAY_OF_YEAR, 1); cal.set(Calendar.HOUR, 0); cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); cal.set(Calendar.MILLISECOND, 0); mdRecord.setCreationDate(cal.getTime()); } else if (tag.equals("DC.Title")) { mdRecord.setTitle(content); } else if (tag.equals("DC.Creator")) { if (creatorBuilder.toString().length() == 0) { creatorBuilder.append(content); } else { creatorBuilder.append(" ; " + content); } mdRecord.setCreator(creatorBuilder.toString()); } else if (tag.equals("DC.Subject")) { mdRecord.setSwd(content); // DC.Subject follows the // Schlagwortnormdatei } else if (tag.equals("DC.Description")) { mdRecord.setDescription(content); } else if (tag.equals("DC.Identifier")) { if (content.contains("http://")) { 
mdRecord.setUri(content); } else if (content.contains("urn:")) { mdRecord.setUrn(content); } } } Pattern p2 = Pattern.compile( "(?i)<TD class=\"frontdoor\" valign=\"top\"><B>(.*?)</B></TD>.*?<TD class=\"frontdoor\" valign=\"top\">(.*?)</TD><"); for (Matcher m = p2.matcher(line); m.find(); ) { String key = m.group(1); String value = m.group(2).trim(); System.out.println("key:" + key); System.out.println("value:" + value); if (key.contains("pdf-Format")) { Pattern pLink = Pattern.compile("(?i)<a href=\"(.*?)(\".*?)\">.*?</a>"); Matcher mLink = pLink.matcher(key); mLink.find(); mdRecord.setRealDocUrl(mLink.group(1)); System.out.println(mLink.group(1)); } else if (key.contains("Freie Schlagwörter")) { mdRecord.setSubject(value); } else if (key.contains("DDC-Sachgruppe")) { mdRecord.setDdc(value); } else if (key.contains("Sprache")) { mdRecord.setLanguage(value); } else if (key.contains("Dokumentart")) { mdRecord.setDocumentType(value); } else if (key.contains("Publikationsdatum")) { final int day = Integer.parseInt(value.substring(0, value.indexOf("."))); final int month = Integer.parseInt(value.substring(value.indexOf(".") + 1, value.lastIndexOf("."))); final int year = Integer.parseInt(value.substring(value.lastIndexOf(".") + 1)); Calendar cal = new GregorianCalendar(); cal.set(year, month - 1, day); // bugfixed: month is 0 based! 
mdRecord.setPublishingDate(cal.getTime()); } else if (key.contains("ISBN")) { mdRecord.setIsbn(value); } else if (key.contains("Institut")) { mdRecord.setPublisher(value); } else if (key.contains("Collection")) { Pattern pColl = Pattern.compile("(?i)<a.*?>(.*?)</a>"); Matcher mColl = pColl.matcher(value); mColl.find(); String collections = mColl.group(1); mdRecord.setCollectionNames(collections); } else if (key.contains("In:")) { Pattern pIn = Pattern.compile("(?i)<I>(.*?)</I>"); Matcher mIn = pIn.matcher(value); if (mIn.find()) { String inPubli = mIn.group(1); mdRecord.setInPublication(inPubli); } } } // Bugfix: Institut Pattern p3 = Pattern.compile( "(?i)<TD class=\"frontdoor\" valign=\"top\"><B>Institut:</B></TD>.*?<TD class=\"frontdoor\" valign=\"top\">(.*?)</TD><"); for (Matcher m = p3.matcher(line); m.find(); ) { mdRecord.setPublisher(m.group(1)); } in.close(); return mdRecord; } catch (IOException e) { throw new ApplicationException( "Problem while parsing " + srcUrl + " for DC tags " + e.getMessage()); } }