/**
 * Parses a comma-separated author string and appends one {@code Author} per entry to the
 * given reference.
 *
 * <p>If the trimmed string ends with a period, everything from the last period onwards is cut
 * off before splitting. Each entry is trimmed; entries of two characters or fewer are ignored.
 * For the remaining entries the text before the first space becomes the last name and the text
 * after it the first name. An entry without a space is used as the last name with an empty
 * first name.
 *
 * @param authors   comma-separated author names; must not be {@code null}
 * @param reference reference that receives the parsed authors
 */
private void addAuthorsToReference(String authors, Reference reference) {
    String cleaned = authors;
    if (cleaned.trim().endsWith(".")) {
        cleaned = cleaned.substring(0, cleaned.lastIndexOf(".")).trim();
    }

    for (String rawEntry : cleaned.split(",")) {
        String entry = rawEntry.trim();
        if (entry.length() <= 2) {
            // too short to be treated as a name (this also covers empty entries)
            continue;
        }

        int separator = entry.indexOf(" ");
        String surname;
        String givenName;
        if (separator < 0) {
            surname = entry;
            givenName = "";
        } else {
            surname = entry.substring(0, separator).trim();
            givenName = entry.substring(separator).trim();
        }
        reference.addAuthor(reference.new Author(surname, givenName));
    }
}
public void extractReferences(Document htmlDoc) { Elements references = htmlDoc.select(ContentXPath.REFERENCE.path); Element firstAuthorSNM, firstAuthorFNM, authors, citationAuthorsList, source, volume, fpage, lpage, date, citeComplete, medline, titleElement, publisherName, publisherLocation, referenceUnstructured; Elements citationAuthorsEntries, authorElements; Document referenceHtml, authorsHtml; Reference refInfo; String authorsList, firstPage, lastPage, completePages, title, completeCitation, citeNodeText, medlineLink, authorClass, firstName, lastName; if (references != null) { for (Element reference : references) { refInfo = new Reference(); // doi refInfo.setDoi(reference.attr(ContentXPath.REFERENCE_DOI_ATTR.path)); // parse content of reference referenceHtml = HtmlDocumentUtil.getHtmlDocumentFromString(reference.html()); // check if reference is unstructured referenceUnstructured = referenceHtml.select(ContentXPath.REFERENCE_UNSTRUCTURED.path).first(); if (referenceUnstructured != null) { // TODO: handle unstructured Data } else { // authors authorsList = ""; citationAuthorsList = referenceHtml.select(ContentXPath.AUTHORS2_ROOT.path).first(); if (citationAuthorsList != null) { // citation version 2 authorsHtml = HtmlDocumentUtil.getHtmlDocumentFromString(citationAuthorsList.html()); citationAuthorsEntries = authorsHtml.select(ContentXPath.AUTHORS2_ENTRY.path); for (Element authorEntry : citationAuthorsEntries) { authorElements = authorEntry.children(); lastName = ""; firstName = ""; for (Element authorElement : authorElements) { authorClass = authorElement.attr("class"); if (authorClass.contains(ContentXPath.AUTHORS2_SURNAME.path)) lastName = authorElement.text(); else if (authorClass.contains(ContentXPath.AUTHORS2_FIRSTNAME.path)) firstName = authorElement.text(); } refInfo.addAuthor(refInfo.new Author(lastName, firstName)); } } else { // citation version 1 firstAuthorSNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_SURNAME.path).first(); 
firstAuthorFNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_FIRSTNAME.path).first(); refInfo.addAuthor(refInfo.new Author(firstAuthorSNM.text(), firstAuthorFNM.text())); authors = referenceHtml.select(ContentXPath.AUTHORS.path).first(); authorsList = ""; if (authors != null) { authorsList = authors.text(); addAuthorsToReference(authorsList, refInfo); } } // reference source/journal source = referenceHtml.select(ContentXPath.CITE_SOURCE.path).first(); if (source != null) { refInfo.setSource(source.text()); } else { source = referenceHtml.select(ContentXPath.CITE_SOURCE_JNL.path).first(); if (source != null) { refInfo.setSource(source.text()); } } // reference volume volume = referenceHtml.select(ContentXPath.CITE_VOLUME.path).first(); if (volume != null) refInfo.setVolume(volume.text()); // reference date date = referenceHtml.select(ContentXPath.CITE_DATE.path).first(); if (date != null) refInfo.setDate(date.text()); // complete citation citeComplete = referenceHtml.select(ContentXPath.CITE_COMPLETE.path).first(); completeCitation = citeComplete.text(); refInfo.setCompleteCitation(completeCitation); // reference title titleElement = referenceHtml.select(ContentXPath.CITE_TITLE.path).first(); if (titleElement != null) { title = titleElement.text(); refInfo.setTitle(title); } else { citeNodeText = citeComplete.ownText(); title = extractTitleFromCitation(citeNodeText); refInfo.setTitle(title); } // reference first page fpage = referenceHtml.select(ContentXPath.CITE_FPAGE.path).first(); firstPage = ""; if (fpage != null) firstPage = fpage.text(); // reference last page lpage = referenceHtml.select(ContentXPath.CITE_LPAGE.path).first(); if (lpage != null) { lastPage = lpage.text(); completePages = firstPage + "-" + lastPage; refInfo.setPages(completePages); } else if (fpage != null) { completePages = extractCompletePagesFromCitation(completeCitation, firstPage); refInfo.setPages(completePages); } // pmid from medline link (if available) medline = 
referenceHtml.select(ContentXPath.MEDLINE_LINK.path).first(); if (medline != null) { medlineLink = medline.attr(ContentXPath.MEDLINE_LINK_ATTR.path); refInfo.setPmid(extractPMIDFromMedlineLink(medlineLink)); } // publisherName publisherName = referenceHtml.select(ContentXPath.PUBL_NAME.path).first(); if (publisherName != null) refInfo.setPublisherName(publisherName.text()); // publisherLocation publisherLocation = referenceHtml.select(ContentXPath.PUBL_LOC.path).first(); if (publisherLocation != null) refInfo.setPublisherLocation(publisherLocation.text()); } // System.out.print("doi: " + refInfo.getDoi() + "; authors: "); // for(Author author: refInfo.getAuthors()){ // System.out.print(author.getLastName() + ", " + author.getFirstName() + "; "); // } // System.out.print("source: " + refInfo.getSource() + "; volume: " + refInfo.getVolume() // + "; date: " + refInfo.getDate() + "; pages: " + refInfo.getPages() + "; title: " + // title); // System.out.println(); // System.out.println("pages: " + refInfo.getPages()); // System.out.println("pmid: " + refInfo.getPmid()); // System.out.println("citeNode: " + citeNodeText); // System.out.println("title: " + refInfo.getTitle()); // System.out.println("publisher: " + refInfo.getPublisherName() + "; loc: " + // refInfo.getPublisherLocation()); } } }