private BibtexEntry parseNextEntry(String allText, int startIndex) { BibtexEntry entry = null; int index = allText.indexOf("<div class=\"detail", piv); int endIndex = allText.indexOf("</div>", index); if (index >= 0 && endIndex > 0) { endIndex += 6; piv = endIndex; String text = allText.substring(index, endIndex); BibtexEntryType type = null; String sourceField = null; String typeName = ""; Matcher typeMatcher = typePattern.matcher(text); if (typeMatcher.find()) { typeName = typeMatcher.group(1); if (typeName.equalsIgnoreCase("IEEE Journals & Magazines") || typeName.equalsIgnoreCase("IEEE Early Access Articles") || typeName.equalsIgnoreCase("IET Journals & Magazines") || typeName.equalsIgnoreCase("AIP Journals & Magazines") || typeName.equalsIgnoreCase("AVS Journals & Magazines") || typeName.equalsIgnoreCase("IBM Journals & Magazines") || typeName.equalsIgnoreCase("TUP Journals & Magazines") || typeName.equalsIgnoreCase("BIAI Journals & Magazines")) { type = BibtexEntryType.getType("article"); sourceField = "journal"; } else if (typeName.equalsIgnoreCase("IEEE Conference Publications") || typeName.equalsIgnoreCase("IET Conference Publications") || typeName.equalsIgnoreCase("VDE Conference Publications")) { type = BibtexEntryType.getType("inproceedings"); sourceField = "booktitle"; } else if (typeName.equalsIgnoreCase("IEEE Standards") || typeName.equalsIgnoreCase("Standards")) { type = BibtexEntryType.getType("standard"); sourceField = "number"; } else if (typeName.equalsIgnoreCase("IEEE eLearning Library Courses")) { type = BibtexEntryType.getType("Electronic"); sourceField = "note"; } else if (typeName.equalsIgnoreCase("Wiley-IEEE Press eBook Chapters") || typeName.equalsIgnoreCase("MIT Press eBook Chapters") || typeName.equalsIgnoreCase("IEEE USA Books & eBooks")) { type = BibtexEntryType.getType("inCollection"); sourceField = "booktitle"; } } if (type == null) { type = BibtexEntryType.getType("misc"); sourceField = "note"; System.err.println("Type detection 
failed. Use MISC instead."); unparseable++; System.err.println(text); } entry = new BibtexEntry(IdGenerator.next(), type); if (typeName.equalsIgnoreCase("IEEE Standards")) { entry.setField("organization", "IEEE"); } if (typeName.equalsIgnoreCase("Wiley-IEEE Press eBook Chapters")) { entry.setField("publisher", "Wiley-IEEE Press"); } else if (typeName.equalsIgnoreCase("MIT Press eBook Chapters")) { entry.setField("publisher", "MIT Press"); } else if (typeName.equalsIgnoreCase("IEEE USA Books & eBooks")) { entry.setField("publisher", "IEEE USA"); } if (typeName.equalsIgnoreCase("IEEE Early Access Articles")) { entry.setField("note", "Early Access"); } Set<String> fields = fieldPatterns.keySet(); for (String field : fields) { Matcher fieldMatcher = Pattern.compile(fieldPatterns.get(field)).matcher(text); if (fieldMatcher.find()) { entry.setField(field, htmlConverter.format(fieldMatcher.group(1))); if (field.equals("title") && fieldMatcher.find()) { String sec_title = htmlConverter.format(fieldMatcher.group(1)); if (entry.getType() == BibtexEntryType.getStandardType("standard")) { sec_title = sec_title.replaceAll("IEEE Std ", ""); } entry.setField(sourceField, sec_title); } if (field.equals("pages") && fieldMatcher.groupCount() == 2) { entry.setField(field, fieldMatcher.group(1) + "-" + fieldMatcher.group(2)); } } } Matcher authorMatcher = authorPattern.matcher(text); // System.out.println(text); StringBuilder authorNames = new StringBuilder(""); int authorCount = 0; while (authorMatcher.find()) { if (authorCount >= 1) { authorNames.append(" and "); } authorNames.append(htmlConverter.format(authorMatcher.group(1))); // System.out.println(authorCount + ": " + authorMatcher.group(1)); authorCount++; } entry.setField("author", authorNames.toString()); if (entry.getField("author") == null || entry.getField("author").startsWith("a href") || entry .getField("author") .startsWith("Topic(s)")) { // Fix for some documents without authors entry.setField("author", ""); } if 
(entry.getType() == BibtexEntryType.getStandardType("inproceedings") && entry.getField("author").equals("")) { entry.setType(BibtexEntryType.getStandardType("proceedings")); } if (includeAbstract) { index = text.indexOf("id=\"abstract"); if (index >= 0) { endIndex = text.indexOf("</div>", index) + 6; text = text.substring(index, endIndex); Matcher absMatcher = absPattern.matcher(text); if (absMatcher.find()) { // Clean-up abstract String abstr = absMatcher.group(1); abstr = abstr.replaceAll("<span class='snippet'>([\\w]+)</span>", "$1"); entry.setField("abstract", htmlConverter.format(abstr)); } } } } if (entry == null) { return null; } else { return cleanup(entry); } }
public void endElement(String uri, String localName, String qName) { if (localName.equals("PubmedArticle")) { // bibitems.add( new Bibitem(null, makeBibtexString(), Globals.nextKey(),"-1" ) ); // check if year ="" then give medline date instead if (year.equals("")) { if (!MedlineDate.equals("")) { // multi-year date format // System.out.println(MedlineDate); year = MedlineDate.substring(0, 4); // Matcher m = Pattern.compile("\\b[0-9]{4}\\b").matcher(MedlineDate); // if(m.matches()) // year = m.group(); } } // Build a string from the collected keywords: StringBuffer sb = new StringBuffer(); for (Iterator<String> iterator = descriptors.iterator(); iterator.hasNext(); ) { String s = iterator.next(); sb.append(s); if (iterator.hasNext()) sb.append(KEYWORD_SEPARATOR); } keywords = sb.toString(); BibtexEntry b = new BibtexEntry( Util.createNeutralId(), // Globals.DEFAULT_BIBTEXENTRY_ID, Globals.getEntryType( "article")); // id assumes an existing database so don't create one here if (!author.equals("")) { b.setField( "author", htmlConverter.formatUnicode(ImportFormatReader.expandAuthorInitials(author))); // b.setField("author",Util.replaceSpecialCharacters(ImportFormatReader.expandAuthorInitials(author))); author = ""; } if (!title.equals("")) b.setField("title", htmlConverter.formatUnicode(title)); // if (!title.equals("")) b.setField("title",Util.replaceSpecialCharacters(title)); if (!journal.equals("")) b.setField("journal", journal); if (!year.equals("")) b.setField("year", year); // PENDING [email protected] 2005-05-27 : added call to fixPageRange if (!page.equals("")) b.setField("pages", fixPageRange(page)); if (!volume.equals("")) b.setField("volume", volume); if (!language.equals("")) b.setField("language", language); if (!pst.equals("")) b.setField("medline-pst", pst); if (!abstractText.equals("")) b.setField("abstract", abstractText.replaceAll("%", "\\\\%")); if (!keywords.equals("")) b.setField("keywords", keywords); if (!month.equals("")) b.setField("month", 
month); // if (!url.equals("")) b.setField("url",url); if (!number.equals("")) b.setField("number", number); if (!doi.equals("")) { b.setField("doi", doi); b.setField("url", "http://dx.doi.org/" + doi); } if (!pii.equals("")) b.setField("pii", pii); if (!affiliation.equals("")) { b.setField("institution", affiliation.replaceAll("#", "\\\\#")); } // PENDING [email protected] 2005-05-27 : added "pmid" bibtex field // Older references do not have doi entries, but every // medline entry has a unique pubmed ID (aka primary ID). // Add a bibtex field for the pubmed ID for future use. if (!pubmedid.equals("")) b.setField("pmid", pubmedid); bibitems.add(b); abstractText = ""; author = ""; title = ""; journal = ""; keywords = ""; doi = ""; pii = ""; year = ""; forename = ""; lastName = ""; suffix = ""; abstractText = ""; affiliation = ""; pubmedid = ""; majorTopic = ""; minorTopics = ""; month = ""; volume = ""; language = ""; pst = ""; lastname = ""; suffix = ""; initials = ""; number = ""; page = ""; medlineID = ""; url = ""; MedlineDate = ""; descriptors.clear(); } else if (localName.equals("ArticleTitle")) { inTitle = false; } else if (localName.equals("PubDate")) { inPubDate = false; } else if (localName.equals("Year")) { inYear = false; } else if (localName.equals("PMID")) { inPubMedID = false; } else if (localName.equals("MedlineDate")) { inMedlineDate = false; } else if (localName.equals("MedlineTA")) { inJournal = false; } // journal name else if (localName.equals("Month")) { inMonth = false; } else if (localName.equals("Volume")) { inVolume = false; } else if (localName.equals("Language")) { inLanguage = false; } else if (localName.equals("PublicationStatus")) { inPst = false; } else if (localName.equals("AuthorList")) { author = join(authors.toArray(), " and "); inAuthorList = false; } else if (localName.equals("Author")) { // forename sometimes has initials with " " in middle: is pattern [A-Z] [A-Z] // when above is the case replace it with initials if 
(forename.length() == 3 && forename.charAt(1) == ' ') { forename = initials; } // Put together name with last name first, and enter suffix in between if present: if (lastname.indexOf(" ") > 0) author = "{" + lastname + "}"; else author = lastname; if (suffix.length() > 0) author = author + ", " + suffix; if (forename.length() > 0) author = author + ", " + forename; // author = initials + " " + lastname; authors.add(author); inAuthor = false; forename = ""; initials = ""; lastname = ""; suffix = ""; } else if (localName.equals("DescriptorName")) inDescriptorName = false; else if (localName.equals("QualifierName")) inQualifierName = false; else if (localName.equals("MeshHeading")) { inMeshHeader = false; if (minorTopics.equals("")) descriptors.add(majorTopic); else descriptors.add(majorTopic + ", " + minorTopics); } else if (localName.equals("LastName")) { inLastName = false; } else if (localName.equals("Suffix")) { inSuffix = false; } else if (localName.equals("ForeName") || localName.equals("FirstName")) { inForename = false; } else if (localName.equals("Issue")) { inIssue = false; } else if (localName.equals("MedlinePgn")) { inMedlinePgn = false; } // pagenumber else if (localName.equals("URL")) { inUrl = false; } else if (localName.equals("Initials")) { // initials= '.' + initials + '.'; inInitials = false; } else if (localName.equals("AbstractText")) { inAbstractText = false; } else if (localName.equals("Affiliation")) { inAffiliation = false; } else if (localName.equals("ArticleId")) { if (inDoi) inDoi = false; else if (inPii) inPii = false; } }
/**
 * Converts HTML-style character sequences in the given text to normal characters by
 * delegating to {@code htmlConverter.format}.
 *
 * @param text the text to handle
 * @return the converted text, exactly as returned by the converter
 */
private String convertHTMLChars(String text) {
    final String converted = htmlConverter.format(text);
    return converted;
}