/** * Checks if the two entries represent the same publication. * * @param one BibEntry * @param two BibEntry * @return boolean */ public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) { // First check if they are of the same type - a necessary condition: if (!one.getType().equals(two.getType())) { return false; } EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode); // The check if they have the same required fields: java.util.List<String> var = type.getRequiredFieldsFlat(); String[] fields = var.toArray(new String[var.size()]); double[] req; if (fields == null) { req = new double[] {0., 0.}; } else { req = DuplicateCheck.compareFieldSet(fields, one, two); } if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) { // Far from the threshold value, so we base our decision on the req. fields only return req[0] >= DuplicateCheck.duplicateThreshold; } // Close to the threshold value, so we take a look at the optional fields, if any: java.util.List<String> optionalFields = type.getOptionalFields(); fields = optionalFields.toArray(new String[optionalFields.size()]); if (fields != null) { double[] opt = DuplicateCheck.compareFieldSet(fields, one, two); double totValue = ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]); return totValue >= DuplicateCheck.duplicateThreshold; } return req[0] >= DuplicateCheck.duplicateThreshold; }
/**
 * Parse the entries in the source, and return a List of BibEntry objects.
 *
 * <p>Input lines shorter than two characters are ignored. A line starting with "Record" begins a
 * new entry; every other line is treated as a field whose first two characters are the tag and
 * whose value starts at the sixth character. Fields too short to hold a value are skipped.
 *
 * @param stream input to read; the reader created over it is closed once all lines are read
 * @param status not used by this implementation
 * @return the parsed entries in input order
 * @throws IOException if reading from the stream fails
 */
@Override
public List<BibEntry> importEntries(InputStream stream, OutputPrinter status) throws IOException {
    List<BibEntry> bibitems = new ArrayList<>();

    // Flatten the input into one string, marking entry and field boundaries.
    StringBuilder sb = new StringBuilder();
    try (BufferedReader in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream))) {
        String str;
        while ((str = in.readLine()) != null) {
            if (str.length() < 2) {
                continue;
            }
            if (str.indexOf("Record") == 0) {
                sb.append("__::__").append(str);
            } else {
                sb.append("__NEWFIELD__").append(str);
            }
        }
    }

    String[] entries = sb.toString().split("__::__");
    // NOTE(review): type is not reset per entry, so a record without an RT field inherits the
    // type of the previous record - confirm whether that is intended.
    String type = "";
    HashMap<String, String> h = new HashMap<>();
    for (String entry : entries) {
        if (entry.indexOf("Record") != 0) {
            continue;
        }
        h.clear();

        for (String s : entry.split("__NEWFIELD__")) {
            // A field needs a two-character tag plus a value starting at index 5. Shorter lines
            // (2-4 characters pass the reader's length filter) would make substring(5) throw.
            if (s.length() < 5) {
                continue;
            }
            String tag = s.substring(0, 2);
            String frest = s.substring(5);
            switch (tag) {
                case "TI":
                    h.put("title", frest);
                    break;
                case "PY":
                    h.put("year", frest);
                    break;
                case "AU":
                    h.put("author",
                            AuthorList.fixAuthor_lastNameFirst(frest.replace(",-", ", ").replace(";", " and ")));
                    break;
                case "AB":
                    h.put("abstract", frest);
                    break;
                case "ID":
                    h.put("keywords", frest);
                    break;
                case "SO":
                    parseSourceField(frest, h);
                    break;
                case "RT":
                    frest = frest.trim();
                    if ("Journal-Paper".equals(frest)) {
                        type = "article";
                    } else if ("Conference-Paper".equals(frest)
                            || "Conference-Paper; Journal-Paper".equals(frest)) {
                        type = "inproceedings";
                    } else {
                        type = frest.replace(" ", "");
                    }
                    break;
                default:
                    // Unknown tags (including the "Record" header line itself) are ignored.
                    break;
            }
        }

        // The id assumes an existing database, so no new one is created here.
        BibEntry b = new BibEntry(DEFAULT_BIBTEXENTRY_ID, EntryTypes.getTypeOrDefault(type));
        b.setField(h);
        bibitems.add(b);
    }
    return bibitems;
}

/**
 * Splits an SO value into journal, year, pages and volume and stores what it finds in
 * {@code fields}: the journal is the text before the first '.', the year is the five characters
 * before the following ';', and volume and pages are the text on either side of the next ':'.
 */
private void parseSourceField(String source, HashMap<String, String> fields) {
    int dot = source.indexOf('.');
    if (dot < 0) {
        return;
    }
    fields.put("journal", source.substring(0, dot).replace("-", " "));

    String afterJournal = source.substring(dot);
    int semicolon = afterJournal.indexOf(';');
    if (semicolon < 5) {
        return;
    }
    fields.put("year", afterJournal.substring(semicolon - 5, semicolon));

    // afterYear starts with the ';' itself, so the volume is taken from index 1 onwards.
    String afterYear = afterJournal.substring(semicolon);
    int colon = afterYear.indexOf(':');
    if (colon >= 0) {
        fields.put("pages", afterYear.substring(colon + 1).trim());
        fields.put("volume", afterYear.substring(1, colon));
    }
}
@Override public void endElement(String uri, String localName, String qName) { if ("PubmedArticle".equals(localName)) { // bibitems.add( new Bibitem(null, makeBibtexString(), Globals.nextKey(),"-1" ) ); // check if year ="" then give medline date instead if ("".equals(year)) { if (!"".equals(MedlineDate)) { // multi-year date format // System.out.println(MedlineDate); year = MedlineDate.substring(0, 4); // Matcher m = Pattern.compile("\\b[0-9]{4}\\b").matcher(MedlineDate); // if(m.matches()) // year = m.group(); } } // Build a string from the collected keywords: StringBuilder sb = new StringBuilder(); for (Iterator<String> iterator = descriptors.iterator(); iterator.hasNext(); ) { String s = iterator.next(); sb.append(s); if (iterator.hasNext()) { sb.append(MedlineHandler.KEYWORD_SEPARATOR); } } String keywords = sb.toString(); BibEntry b = new BibEntry( IdGenerator.next(), // Globals.DEFAULT_BIBTEXENTRY_ID, EntryTypes.getTypeOrDefault( "article")); // id assumes an existing database so don't create one here if (!"".equals(author)) { b.setField( "author", MedlineHandler.htmlConverter.formatUnicode( ImportFormatReader.expandAuthorInitials(author))); // b.setField("author",Util.replaceSpecialCharacters(ImportFormatReader.expandAuthorInitials(author))); author = ""; } if (!"".equals(title)) { b.setField("title", MedlineHandler.htmlConverter.formatUnicode(title)); } // if (!title.equals("")) b.setField("title",Util.replaceSpecialCharacters(title)); if (!"".equals(journal)) { b.setField("journal", journal); } if (!"".equals(year)) { b.setField("year", year); } // PENDING [email protected] 2005-05-27 : added call to fixPageRange if (!"".equals(page)) { b.setField("pages", fixPageRange(page)); } if (!"".equals(volume)) { b.setField("volume", volume); } if (!"".equals(language)) { b.setField("language", language); } if (!"".equals(pst)) { b.setField("medline-pst", pst); } if (!"".equals(abstractText)) { b.setField("abstract", abstractText.replaceAll("%", "\\\\%")); } if 
(!"".equals(keywords)) { b.setField("keywords", keywords); } if (!"".equals(month)) { b.setField("month", month); } // if (!url.equals("")) b.setField("url",url); if (!"".equals(number)) { b.setField("number", number); } if (!"".equals(doi)) { b.setField("doi", doi); b.setField("url", "http://dx.doi.org/" + doi); } if (!"".equals(pii)) { b.setField("pii", pii); } if (!"".equals(pmc)) { b.setField("pmc", pmc); } if (!"".equals(affiliation)) { b.setField("institution", affiliation.replaceAll("#", "\\\\#")); } // PENDING [email protected] 2005-05-27 : added "pmid" bibtex field // Older references do not have doi entries, but every // medline entry has a unique pubmed ID (aka primary ID). // Add a bibtex field for the pubmed ID for future use. if (!"".equals(pubmedid)) { b.setField("pmid", pubmedid); } bibitems.add(b); abstractText = ""; author = ""; title = ""; journal = ""; keywords = ""; doi = ""; pii = ""; pmc = ""; year = ""; forename = ""; lastName = ""; suffix = ""; abstractText = ""; affiliation = ""; pubmedid = ""; majorTopic = ""; minorTopics = ""; month = ""; volume = ""; language = ""; pst = ""; lastname = ""; suffix = ""; initials = ""; number = ""; page = ""; String medlineID = ""; String url = ""; MedlineDate = ""; descriptors.clear(); } else if ("ArticleTitle".equals(localName)) { inTitle = false; } else if ("PubDate".equals(localName)) { inPubDate = false; } else if ("Year".equals(localName)) { inYear = false; } else if ("PMID".equals(localName)) { inPubMedID = false; } else if ("MedlineDate".equals(localName)) { inMedlineDate = false; } else if ("MedlineTA".equals(localName)) { inJournal = false; } // journal name else if ("Month".equals(localName)) { inMonth = false; } else if ("Volume".equals(localName)) { inVolume = false; } else if ("Language".equals(localName)) { inLanguage = false; } else if ("PublicationStatus".equals(localName)) { inPst = false; } else if ("AuthorList".equals(localName)) { author = join(authors.toArray(), " and "); 
inAuthorList = false; } else if ("Author".equals(localName)) { // forename sometimes has initials with " " in middle: is pattern [A-Z] [A-Z] // when above is the case replace it with initials if ((forename.length() == 3) && (forename.charAt(1) == ' ')) { forename = initials; } // Put together name with last name first, and enter suffix in between if present: if (lastname.indexOf(" ") > 0) { author = "{" + lastname + "}"; } else { author = lastname; } if (!suffix.isEmpty()) { author = author + ", " + suffix; } if (!forename.isEmpty()) { author = author + ", " + forename; } // author = initials + " " + lastname; authors.add(author); inAuthor = false; forename = ""; initials = ""; lastname = ""; suffix = ""; } else if ("DescriptorName".equals(localName)) { inDescriptorName = false; } else if ("QualifierName".equals(localName)) { inQualifierName = false; } else if ("MeshHeading".equals(localName)) { inMeshHeader = false; if ("".equals(minorTopics)) { descriptors.add(majorTopic); } else { descriptors.add(majorTopic + ", " + minorTopics); } } else if ("LastName".equals(localName)) { inLastName = false; } else if ("Suffix".equals(localName)) { inSuffix = false; } else if ("ForeName".equals(localName) || "FirstName".equals(localName)) { inForename = false; } else if ("Issue".equals(localName)) { inIssue = false; } else if ("MedlinePgn".equals(localName)) { inMedlinePgn = false; } // pagenumber else if ("URL".equals(localName)) { inUrl = false; } else if ("Initials".equals(localName)) { // initials= '.' + initials + '.'; inInitials = false; } else if ("AbstractText".equals(localName)) { inAbstractText = false; } else if ("Affiliation".equals(localName)) { inAffiliation = false; } else if ("ArticleId".equals(localName)) { if (inDoi) { inDoi = false; } else if (inPii) { inPii = false; } else if (inPmc) { inPmc = false; } } }
/** Parse the entries in the source, and return a List of BibEntry objects. */ @Override public List<BibEntry> importEntries(InputStream stream, OutputPrinter status) throws IOException { if (stream == null) { throw new IOException("No stream given."); } ArrayList<BibEntry> bibitems = new ArrayList<>(); StringBuilder sb = new StringBuilder(); BufferedReader in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream)); // Pattern fieldPattern = Pattern.compile("^AU |^TI |^SO |^DT |^C1 |^AB // |^ID |^BP |^PY |^SE |^PY |^VL |^IS "); String str; while ((str = in.readLine()) != null) { if (str.length() < 3) { continue; } // begining of a new item if ("PT ".equals(str.substring(0, 3))) { sb.append("::").append(str); } else { String beg = str.substring(0, 3).trim(); // I could have used the fieldPattern regular expression instead // however this seems to be // quick and dirty and it works! if (beg.length() == 2) { sb.append(" ## "); // mark the begining of each field sb.append(str); } else { sb.append("EOLEOL"); // mark the end of each line sb.append(str.trim()); // remove the initial spaces } } } String[] entries = sb.toString().split("::"); HashMap<String, String> hm = new HashMap<>(); // skip the first entry as it is either empty or has document header for (String entry : entries) { String[] fields = entry.split(" ## "); if (fields.length == 0) { fields = entry.split("\n"); } String Type = ""; String PT = ""; String pages = ""; hm.clear(); for (String field : fields) { // empty field don't do anything if (field.length() <= 2) { continue; } String beg = field.substring(0, 2); String value = field.substring(3); if (value.startsWith(" - ")) { value = value.substring(3); } value = value.trim(); if ("PT".equals(beg)) { if (value.startsWith("J")) { PT = "article"; } else { PT = value; } Type = "article"; // make all of them PT? 
} else if ("TY".equals(beg)) { if ("JOUR".equals(value)) { Type = "article"; } else if ("CONF".equals(value)) { Type = "inproceedings"; } } else if ("JO".equals(beg)) { hm.put("booktitle", value); } else if ("AU".equals(beg)) { String author = IsiImporter.isiAuthorsConvert(value.replaceAll("EOLEOL", " and ")); // if there is already someone there then append with "and" if (hm.get("author") != null) { author = hm.get("author") + " and " + author; } hm.put("author", author); } else if ("TI".equals(beg)) { hm.put("title", value.replaceAll("EOLEOL", " ")); } else if ("SO".equals(beg) || "JA".equals(beg)) { hm.put("journal", value.replaceAll("EOLEOL", " ")); } else if ("ID".equals(beg) || "KW".equals(beg)) { value = value.replaceAll("EOLEOL", " "); String existingKeywords = hm.get("keywords"); if ((existingKeywords == null) || existingKeywords.contains(value)) { existingKeywords = value; } else { existingKeywords += ", " + value; } hm.put("keywords", existingKeywords); } else if ("AB".equals(beg)) { hm.put("abstract", value.replaceAll("EOLEOL", " ")); } else if ("BP".equals(beg) || "BR".equals(beg) || "SP".equals(beg)) { pages = value; } else if ("EP".equals(beg)) { int detpos = value.indexOf(' '); // tweak for IEEE Explore if ((detpos != -1) && !value.substring(0, detpos).trim().isEmpty()) { value = value.substring(0, detpos); } pages = pages + "--" + value; } else if ("PS".equals(beg)) { pages = IsiImporter.parsePages(value); } else if ("AR".equals(beg)) { pages = value; } else if ("IS".equals(beg)) { hm.put("number", value); } else if ("PY".equals(beg)) { hm.put("year", value); } else if ("VL".equals(beg)) { hm.put("volume", value); } else if ("PU".equals(beg)) { hm.put("publisher", value); } else if ("DI".equals(beg)) { hm.put("doi", value); } else if ("PD".equals(beg)) { String month = IsiImporter.parseMonth(value); if (month != null) { hm.put("month", month); } } else if ("DT".equals(beg)) { Type = value; if ("Review".equals(Type)) { Type = "article"; // set 
"Review" in Note/Comment? } else if (Type.startsWith("Article") || Type.startsWith("Journal") || "article".equals(PT)) { Type = "article"; } else { Type = "misc"; } } else if ("CR".equals(beg)) { hm.put("CitedReferences", value.replaceAll("EOLEOL", " ; ").trim()); } else { // Preserve all other entries except if ("ER".equals(beg) || "EF".equals(beg) || "VR".equals(beg) || "FN".equals(beg)) { continue; } hm.put(beg.toLowerCase(), value); } } if (!"".equals(pages)) { hm.put("pages", pages); } // Skip empty entries if (hm.isEmpty()) { continue; } BibEntry b = new BibEntry(DEFAULT_BIBTEXENTRY_ID, EntryTypes.getTypeOrDefault(Type)); // id assumes an existing database so don't // Remove empty fields: List<Object> toRemove = new ArrayList<>(); for (Map.Entry<String, String> field : hm.entrySet()) { String content = field.getValue(); if ((content == null) || content.trim().isEmpty()) { toRemove.add(field.getKey()); } } for (Object aToRemove : toRemove) { hm.remove(aToRemove); } // Polish entries IsiImporter.processSubSup(hm); IsiImporter.processCapitalization(hm); b.setField(hm); bibitems.add(b); } return bibitems; }