private static void testWrapper() { JNPhysiologyWrapper jpw = new JNPhysiologyWrapper(); jpw.extractReferences( HtmlDocumentUtil.getHtmlDocumentFromFile( "C:/Users/du/Projekte/SMAFIRA/git/data/html/fulltexts/18003881_jn.physiology.org.html")); // // jpw.extractReferences(HtmlDocumentUtil.getHtmlDocumentFromFile("C:/Users/du/Projekte/SMAFIRA/git/data/html/fulltexts/12686573_jn.physiology.org.html")); }
private static void testWrapperOnAllPhysio() { JNPhysiologyWrapper jpw = new JNPhysiologyWrapper(); String directory = "C:/Users/du/Projekte/SMAFIRA/git/data/html/fulltextsWithRefs/"; String[] fileNameList = (new File(directory)).list(); for (String fileName : fileNameList) { // System.out.println("fileName: " + fileName); if (fileName.contains("jn.physiology.org")) { System.out.println("fileName_extracted: " + fileName); jpw.extractReferences(HtmlDocumentUtil.getHtmlDocumentFromFile(directory + fileName)); } } }
public void extractReferences(Document htmlDoc) { Elements references = htmlDoc.select(ContentXPath.REFERENCE.path); Element firstAuthorSNM, firstAuthorFNM, authors, citationAuthorsList, source, volume, fpage, lpage, date, citeComplete, medline, titleElement, publisherName, publisherLocation, referenceUnstructured; Elements citationAuthorsEntries, authorElements; Document referenceHtml, authorsHtml; Reference refInfo; String authorsList, firstPage, lastPage, completePages, title, completeCitation, citeNodeText, medlineLink, authorClass, firstName, lastName; if (references != null) { for (Element reference : references) { refInfo = new Reference(); // doi refInfo.setDoi(reference.attr(ContentXPath.REFERENCE_DOI_ATTR.path)); // parse content of reference referenceHtml = HtmlDocumentUtil.getHtmlDocumentFromString(reference.html()); // check if reference is unstructured referenceUnstructured = referenceHtml.select(ContentXPath.REFERENCE_UNSTRUCTURED.path).first(); if (referenceUnstructured != null) { // TODO: handle unstructured Data } else { // authors authorsList = ""; citationAuthorsList = referenceHtml.select(ContentXPath.AUTHORS2_ROOT.path).first(); if (citationAuthorsList != null) { // citation version 2 authorsHtml = HtmlDocumentUtil.getHtmlDocumentFromString(citationAuthorsList.html()); citationAuthorsEntries = authorsHtml.select(ContentXPath.AUTHORS2_ENTRY.path); for (Element authorEntry : citationAuthorsEntries) { authorElements = authorEntry.children(); lastName = ""; firstName = ""; for (Element authorElement : authorElements) { authorClass = authorElement.attr("class"); if (authorClass.contains(ContentXPath.AUTHORS2_SURNAME.path)) lastName = authorElement.text(); else if (authorClass.contains(ContentXPath.AUTHORS2_FIRSTNAME.path)) firstName = authorElement.text(); } refInfo.addAuthor(refInfo.new Author(lastName, firstName)); } } else { // citation version 1 firstAuthorSNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_SURNAME.path).first(); firstAuthorFNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_FIRSTNAME.path).first(); refInfo.addAuthor(refInfo.new Author(firstAuthorSNM.text(), firstAuthorFNM.text())); authors = referenceHtml.select(ContentXPath.AUTHORS.path).first(); authorsList = ""; if (authors != null) { authorsList = authors.text(); addAuthorsToReference(authorsList, refInfo); } } // reference source/journal source = referenceHtml.select(ContentXPath.CITE_SOURCE.path).first(); if (source != null) { refInfo.setSource(source.text()); } else { source = referenceHtml.select(ContentXPath.CITE_SOURCE_JNL.path).first(); if (source != null) { refInfo.setSource(source.text()); } } // reference volume volume = referenceHtml.select(ContentXPath.CITE_VOLUME.path).first(); if (volume != null) refInfo.setVolume(volume.text()); // reference date date = referenceHtml.select(ContentXPath.CITE_DATE.path).first(); if (date != null) refInfo.setDate(date.text()); // complete citation citeComplete = referenceHtml.select(ContentXPath.CITE_COMPLETE.path).first(); completeCitation = citeComplete.text(); refInfo.setCompleteCitation(completeCitation); // reference title titleElement = referenceHtml.select(ContentXPath.CITE_TITLE.path).first(); if (titleElement != null) { title = titleElement.text(); refInfo.setTitle(title); } else { citeNodeText = citeComplete.ownText(); title = extractTitleFromCitation(citeNodeText); refInfo.setTitle(title); } // reference first page fpage = referenceHtml.select(ContentXPath.CITE_FPAGE.path).first(); firstPage = ""; if (fpage != null) firstPage = fpage.text(); // reference last page lpage = referenceHtml.select(ContentXPath.CITE_LPAGE.path).first(); if (lpage != null) { lastPage = lpage.text(); completePages = firstPage + "-" + lastPage; refInfo.setPages(completePages); } else if (fpage != null) { completePages = extractCompletePagesFromCitation(completeCitation, firstPage); refInfo.setPages(completePages); } // pmid from medline link (if available) medline = referenceHtml.select(ContentXPath.MEDLINE_LINK.path).first(); if (medline != null) { medlineLink = medline.attr(ContentXPath.MEDLINE_LINK_ATTR.path); refInfo.setPmid(extractPMIDFromMedlineLink(medlineLink)); } // publisherName publisherName = referenceHtml.select(ContentXPath.PUBL_NAME.path).first(); if (publisherName != null) refInfo.setPublisherName(publisherName.text()); // publisherLocation publisherLocation = referenceHtml.select(ContentXPath.PUBL_LOC.path).first(); if (publisherLocation != null) refInfo.setPublisherLocation(publisherLocation.text()); } // System.out.print("doi: " + refInfo.getDoi() + "; authors: "); // for(Author author: refInfo.getAuthors()){ // System.out.print(author.getLastName() + ", " + author.getFirstName() + "; "); // } // System.out.print("source: " + refInfo.getSource() + "; volume: " + refInfo.getVolume() // + "; date: " + refInfo.getDate() + "; pages: " + refInfo.getPages() + "; title: " + // title); // System.out.println(); // System.out.println("pages: " + refInfo.getPages()); // System.out.println("pmid: " + refInfo.getPmid()); // System.out.println("citeNode: " + citeNodeText); // System.out.println("title: " + refInfo.getTitle()); // System.out.println("publisher: " + refInfo.getPublisherName() + "; loc: " + // refInfo.getPublisherLocation()); } } }