Exemplo n.º 1
0
  /**
   * Parses a comma-separated author string and appends every author to the given reference.
   *
   * <p>If the (trimmed) string ends with a period, everything from the last period on is cut off
   * before splitting. Each entry is trimmed, and entries of two characters or fewer are skipped.
   * For an entry containing a space, the text before the first space becomes the last name and
   * the rest becomes the first name; an entry without a space is used as the last name with an
   * empty first name.
   *
   * @param authors comma-separated author names
   * @param reference reference that receives the parsed authors
   */
  private void addAuthorsToReference(String authors, Reference reference) {
    String cleaned = authors;
    if (cleaned.trim().endsWith(".")) {
      cleaned = cleaned.substring(0, cleaned.lastIndexOf('.')).trim();
    }

    for (String rawEntry : cleaned.split(",")) {
      String entry = rawEntry.trim();
      // Entries of at most two characters are ignored; this also covers empty entries.
      if (entry.length() <= 2) {
        continue;
      }

      int firstSpace = entry.indexOf(' ');
      String surname = firstSpace < 0 ? entry : entry.substring(0, firstSpace).trim();
      String givenName = firstSpace < 0 ? "" : entry.substring(firstSpace).trim();

      reference.addAuthor(reference.new Author(surname, givenName));
    }
  }
Exemplo n.º 2
0
  /**
   * Extracts bibliographic data for every reference entry found in the given HTML document.
   *
   * <p>For each element matched by {@code ContentXPath.REFERENCE}, a {@link Reference} is created
   * and filled with the DOI and, for structured entries, authors, source, volume, date, complete
   * citation, title, pages, PMID and publisher details, as far as the corresponding elements are
   * present. Entries matching {@code ContentXPath.REFERENCE_UNSTRUCTURED} only receive the DOI;
   * parsing them is not implemented yet.
   *
   * <p>Missing first-author or complete-citation elements are tolerated; the affected fields are
   * simply left unset instead of failing with a {@link NullPointerException}.
   *
   * @param htmlDoc parsed HTML document to scan for references
   */
  public void extractReferences(Document htmlDoc) {
    Elements references = htmlDoc.select(ContentXPath.REFERENCE.path);
    if (references == null) {
      return;
    }

    for (Element reference : references) {
      Reference refInfo = new Reference();
      refInfo.setDoi(reference.attr(ContentXPath.REFERENCE_DOI_ATTR.path));

      // All selectors below run against a document built from this entry's inner HTML.
      Document referenceHtml = HtmlDocumentUtil.getHtmlDocumentFromString(reference.html());

      boolean unstructured =
          referenceHtml.select(ContentXPath.REFERENCE_UNSTRUCTURED.path).first() != null;
      if (!unstructured) {
        populateStructuredReference(referenceHtml, refInfo);
      }
      // TODO: handle unstructured data

      // NOTE(review): refInfo is not stored or returned anywhere in this method, so the parsed
      // data appears to be discarded -- confirm whether it should be added to a collection.
    }
  }

  /** Fills {@code refInfo} from the structured markup of a single reference entry. */
  private void populateStructuredReference(Document referenceHtml, Reference refInfo) {
    extractReferenceAuthors(referenceHtml, refInfo);

    // Source/journal: try CITE_SOURCE first, then fall back to CITE_SOURCE_JNL.
    Element source = referenceHtml.select(ContentXPath.CITE_SOURCE.path).first();
    if (source == null) {
      source = referenceHtml.select(ContentXPath.CITE_SOURCE_JNL.path).first();
    }
    if (source != null) {
      refInfo.setSource(source.text());
    }

    Element volume = referenceHtml.select(ContentXPath.CITE_VOLUME.path).first();
    if (volume != null) {
      refInfo.setVolume(volume.text());
    }

    Element date = referenceHtml.select(ContentXPath.CITE_DATE.path).first();
    if (date != null) {
      refInfo.setDate(date.text());
    }

    // The complete-citation element may be absent; completeCitation stays null in that case.
    Element citeComplete = referenceHtml.select(ContentXPath.CITE_COMPLETE.path).first();
    String completeCitation = null;
    if (citeComplete != null) {
      completeCitation = citeComplete.text();
      refInfo.setCompleteCitation(completeCitation);
    }

    // Title: use the dedicated element, otherwise derive it from the citation's own text.
    Element titleElement = referenceHtml.select(ContentXPath.CITE_TITLE.path).first();
    if (titleElement != null) {
      refInfo.setTitle(titleElement.text());
    } else if (citeComplete != null) {
      refInfo.setTitle(extractTitleFromCitation(citeComplete.ownText()));
    }

    extractReferencePages(referenceHtml, refInfo, completeCitation);

    // PMID from the medline link, if available.
    Element medline = referenceHtml.select(ContentXPath.MEDLINE_LINK.path).first();
    if (medline != null) {
      String medlineLink = medline.attr(ContentXPath.MEDLINE_LINK_ATTR.path);
      refInfo.setPmid(extractPMIDFromMedlineLink(medlineLink));
    }

    Element publisherName = referenceHtml.select(ContentXPath.PUBL_NAME.path).first();
    if (publisherName != null) {
      refInfo.setPublisherName(publisherName.text());
    }

    Element publisherLocation = referenceHtml.select(ContentXPath.PUBL_LOC.path).first();
    if (publisherLocation != null) {
      refInfo.setPublisherLocation(publisherLocation.text());
    }
  }

  /** Adds the authors of a single reference entry to {@code refInfo}. */
  private void extractReferenceAuthors(Document referenceHtml, Reference refInfo) {
    Element citationAuthorsList = referenceHtml.select(ContentXPath.AUTHORS2_ROOT.path).first();
    if (citationAuthorsList != null) {
      // Citation version 2: each author entry carries surname/first-name children that are
      // identified by their class attribute.
      Document authorsHtml =
          HtmlDocumentUtil.getHtmlDocumentFromString(citationAuthorsList.html());
      for (Element authorEntry : authorsHtml.select(ContentXPath.AUTHORS2_ENTRY.path)) {
        String lastName = "";
        String firstName = "";
        for (Element authorElement : authorEntry.children()) {
          String authorClass = authorElement.attr("class");
          if (authorClass.contains(ContentXPath.AUTHORS2_SURNAME.path)) {
            lastName = authorElement.text();
          } else if (authorClass.contains(ContentXPath.AUTHORS2_FIRSTNAME.path)) {
            firstName = authorElement.text();
          }
        }
        refInfo.addAuthor(refInfo.new Author(lastName, firstName));
      }
      return;
    }

    // Citation version 1: the first author has dedicated elements, the remaining authors come
    // as one text list. Either first-author element may be missing, so guard both.
    Element firstAuthorSNM = referenceHtml.select(ContentXPath.FIRST_AUTHOR_SURNAME.path).first();
    Element firstAuthorFNM =
        referenceHtml.select(ContentXPath.FIRST_AUTHOR_FIRSTNAME.path).first();
    if (firstAuthorSNM != null || firstAuthorFNM != null) {
      String lastName = firstAuthorSNM != null ? firstAuthorSNM.text() : "";
      String firstName = firstAuthorFNM != null ? firstAuthorFNM.text() : "";
      refInfo.addAuthor(refInfo.new Author(lastName, firstName));
    }

    Element authors = referenceHtml.select(ContentXPath.AUTHORS.path).first();
    if (authors != null) {
      addAuthorsToReference(authors.text(), refInfo);
    }
  }

  /**
   * Sets the page range of a single reference entry on {@code refInfo}.
   *
   * @param completeCitation text of the complete citation, or {@code null} if it is unavailable
   */
  private void extractReferencePages(
      Document referenceHtml, Reference refInfo, String completeCitation) {
    Element fpage = referenceHtml.select(ContentXPath.CITE_FPAGE.path).first();
    Element lpage = referenceHtml.select(ContentXPath.CITE_LPAGE.path).first();
    String firstPage = fpage != null ? fpage.text() : "";

    if (lpage != null) {
      refInfo.setPages(firstPage + "-" + lpage.text());
    } else if (fpage != null) {
      if (completeCitation != null) {
        refInfo.setPages(extractCompletePagesFromCitation(completeCitation, firstPage));
      } else {
        // Without the complete citation only the first page is known.
        refInfo.setPages(firstPage);
      }
    }
  }