Example #1
0
  /**
   * Checks if the two entries represent the same publication.
   *
   * <p>Entries of different types are never considered duplicates. Otherwise the required fields
   * of the entry type are compared first. Only when that result lies within {@code
   * DuplicateCheck.DOUBT_RANGE} of {@code DuplicateCheck.duplicateThreshold} are the optional
   * fields compared as well and combined with the required-field result, the latter scaled by
   * {@code DuplicateCheck.REQUIRED_WEIGHT}.
   *
   * @param one the first entry to compare
   * @param two the second entry to compare
   * @param bibDatabaseMode the database mode used to look up the entry type definition
   * @return {@code true} if the comparison result reaches {@code
   *     DuplicateCheck.duplicateThreshold}, {@code false} otherwise
   */
  public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) {

    // First check if they are of the same type - a necessary condition:
    if (!one.getType().equals(two.getType())) {
      return false;
    }
    EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode);

    // Then compare the required fields. List.toArray never returns null, so the result can be
    // passed on directly (an empty field list yields an empty array).
    java.util.List<String> requiredFields = type.getRequiredFieldsFlat();
    double[] req =
        DuplicateCheck.compareFieldSet(
            requiredFields.toArray(new String[requiredFields.size()]), one, two);

    if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) {
      // Far from the threshold value, so we base our decision on the req. fields only
      return req[0] >= DuplicateCheck.duplicateThreshold;
    }

    // Close to the threshold value, so we take a look at the optional fields as well:
    java.util.List<String> optionalFields = type.getOptionalFields();
    double[] opt =
        DuplicateCheck.compareFieldSet(
            optionalFields.toArray(new String[optionalFields.size()]), one, two);

    // Weighted mean of both results; index 0 is used as the score and index 1 as its weight.
    double totValue =
        ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1]))
            / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]);
    return totValue >= DuplicateCheck.duplicateThreshold;
  }
  /**
   * Parse the entries in the source, and return a List of BibEntry objects.
   *
   * <p>The input is read line by line; lines shorter than two characters are ignored. A line
   * starting with {@code Record} begins a new entry, every other line is a field of the current
   * entry. Each field line is split into a two-character tag (characters 0-1) and a value
   * (everything from character 5 on).
   *
   * @param stream the stream to read from; it is closed when reading is finished
   * @param status not used by this method
   * @return the parsed entries, in the order they appear in the input
   * @throws IOException if reading from the stream fails
   */
  @Override
  public List<BibEntry> importEntries(InputStream stream, OutputPrinter status) throws IOException {
    List<BibEntry> bibitems = new ArrayList<>();
    StringBuilder sb = new StringBuilder();
    String str;
    try (BufferedReader in =
        new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream))) {
      while ((str = in.readLine()) != null) {
        if (str.length() < 2) {
          continue;
        }
        if (str.startsWith("Record")) {
          sb.append("__::__").append(str);
        } else {
          sb.append("__NEWFIELD__").append(str);
        }
      }
    }

    String[] entries = sb.toString().split("__::__");
    HashMap<String, String> h = new HashMap<>();
    for (String entry : entries) {
      if (!entry.startsWith("Record")) {
        continue;
      }
      h.clear();
      // Reset per record so that a record without an RT field does not inherit the type of the
      // previous record.
      String type = "";

      for (String s : entry.split("__NEWFIELD__")) {
        // A field needs a two-character tag and a value starting at index 5; anything shorter
        // carries no value and would make substring(5) throw.
        if (s.length() < 5) {
          continue;
        }
        String tag = s.substring(0, 2);
        String value = s.substring(5);
        switch (tag) {
          case "TI":
            h.put("title", value);
            break;
          case "PY":
            h.put("year", value);
            break;
          case "AU":
            h.put(
                "author",
                AuthorList.fixAuthor_lastNameFirst(value.replace(",-", ", ").replace(";", " and ")));
            break;
          case "AB":
            h.put("abstract", value);
            break;
          case "ID":
            h.put("keywords", value);
            break;
          case "SO":
            // Source line: journal up to the first '.', then the five characters before the
            // following ';' as year, then volume up to ':' and pages after it.
            int m = value.indexOf('.');
            if (m >= 0) {
              h.put("journal", value.substring(0, m).replace("-", " "));
              value = value.substring(m);
              m = value.indexOf(';');
              if (m >= 5) {
                h.put("year", value.substring(m - 5, m));
                value = value.substring(m);
                m = value.indexOf(':');
                if (m >= 0) {
                  h.put("pages", value.substring(m + 1).trim());
                  h.put("volume", value.substring(1, m));
                }
              }
            }
            break;
          case "RT":
            value = value.trim();
            if ("Journal-Paper".equals(value)) {
              type = "article";
            } else if ("Conference-Paper".equals(value)
                || "Conference-Paper; Journal-Paper".equals(value)) {
              type = "inproceedings";
            } else {
              type = value.replace(" ", "");
            }
            break;
          default:
            // Unknown tags (including the leading "Record" line) are ignored.
            break;
        }
      }

      // id assumes an existing database so don't create one here
      BibEntry b = new BibEntry(DEFAULT_BIBTEXENTRY_ID, EntryTypes.getTypeOrDefault(type));
      b.setField(h);

      bibitems.add(b);
    }

    return bibitems;
  }
Example #3
0
  /**
   * Handles the end of an XML element.
   *
   * <p>When a {@code PubmedArticle} element ends, a new entry of type "article" is built from the
   * values collected so far, added to {@code bibitems}, and the collected state is reset. For
   * {@code AuthorList}, {@code Author} and {@code MeshHeading} the gathered parts are combined
   * into the author string, the author list and the descriptor list respectively. For every
   * other recognized element only the corresponding "inside element" flag is cleared.
   *
   * @param uri the namespace URI (not used)
   * @param localName the local name of the element that ended; selects the action taken
   * @param qName the qualified name (not used)
   */
  @Override
  public void endElement(String uri, String localName, String qName) {
    if ("PubmedArticle".equals(localName)) {
      // No explicit year: fall back to the start of the Medline date (multi-year date format).
      // The length is clamped so that a date shorter than four characters does not throw.
      if ("".equals(year) && !"".equals(MedlineDate)) {
        year = MedlineDate.substring(0, Math.min(4, MedlineDate.length()));
      }

      // Build a string from the collected keywords:
      StringBuilder sb = new StringBuilder();
      for (Iterator<String> iterator = descriptors.iterator(); iterator.hasNext(); ) {
        sb.append(iterator.next());
        if (iterator.hasNext()) {
          sb.append(MedlineHandler.KEYWORD_SEPARATOR);
        }
      }
      String keywords = sb.toString();

      // id assumes an existing database so don't create one here
      BibEntry b = new BibEntry(IdGenerator.next(), EntryTypes.getTypeOrDefault("article"));
      if (!"".equals(author)) {
        b.setField(
            "author",
            MedlineHandler.htmlConverter.formatUnicode(
                ImportFormatReader.expandAuthorInitials(author)));
      }
      if (!"".equals(title)) {
        b.setField("title", MedlineHandler.htmlConverter.formatUnicode(title));
      }
      if (!"".equals(journal)) {
        b.setField("journal", journal);
      }
      if (!"".equals(year)) {
        b.setField("year", year);
      }
      // The page value is passed through fixPageRange before it is stored
      if (!"".equals(page)) {
        b.setField("pages", fixPageRange(page));
      }
      if (!"".equals(volume)) {
        b.setField("volume", volume);
      }
      if (!"".equals(language)) {
        b.setField("language", language);
      }
      if (!"".equals(pst)) {
        b.setField("medline-pst", pst);
      }
      if (!"".equals(abstractText)) {
        // Escape percent signs with a backslash
        b.setField("abstract", abstractText.replaceAll("%", "\\\\%"));
      }
      if (!"".equals(keywords)) {
        b.setField("keywords", keywords);
      }
      if (!"".equals(month)) {
        b.setField("month", month);
      }
      if (!"".equals(number)) {
        b.setField("number", number);
      }

      if (!"".equals(doi)) {
        b.setField("doi", doi);
        b.setField("url", "http://dx.doi.org/" + doi);
      }
      if (!"".equals(pii)) {
        b.setField("pii", pii);
      }
      if (!"".equals(pmc)) {
        b.setField("pmc", pmc);
      }
      if (!"".equals(affiliation)) {
        // Escape hash signs with a backslash
        b.setField("institution", affiliation.replaceAll("#", "\\\\#"));
      }

      // Older references do not have doi entries, but every
      // medline entry has a unique pubmed ID (aka primary ID).
      // Add a bibtex field for the pubmed ID for future use.
      if (!"".equals(pubmedid)) {
        b.setField("pmid", pubmedid);
      }

      bibitems.add(b);

      // Reset the collected state for the next article
      abstractText = "";
      author = "";
      title = "";
      journal = "";
      doi = "";
      pii = "";
      pmc = "";
      year = "";
      forename = "";
      lastName = "";
      lastname = "";
      suffix = "";
      initials = "";
      affiliation = "";
      pubmedid = "";
      majorTopic = "";
      minorTopics = "";
      month = "";
      volume = "";
      language = "";
      pst = "";
      number = "";
      page = "";
      MedlineDate = "";
      descriptors.clear();
    } else if ("ArticleTitle".equals(localName)) {
      inTitle = false;
    } else if ("PubDate".equals(localName)) {
      inPubDate = false;
    } else if ("Year".equals(localName)) {
      inYear = false;
    } else if ("PMID".equals(localName)) {
      inPubMedID = false;
    } else if ("MedlineDate".equals(localName)) {
      inMedlineDate = false;
    } else if ("MedlineTA".equals(localName)) {
      // journal name
      inJournal = false;
    } else if ("Month".equals(localName)) {
      inMonth = false;
    } else if ("Volume".equals(localName)) {
      inVolume = false;
    } else if ("Language".equals(localName)) {
      inLanguage = false;
    } else if ("PublicationStatus".equals(localName)) {
      inPst = false;
    } else if ("AuthorList".equals(localName)) {
      author = join(authors.toArray(), " and ");
      inAuthorList = false;
    } else if ("Author".equals(localName)) {
      // forename sometimes has initials with " " in middle: is pattern [A-Z] [A-Z]
      // when above is the case replace it with initials
      if ((forename.length() == 3) && (forename.charAt(1) == ' ')) {
        forename = initials;
      }

      // Put together name with last name first, and enter suffix in between if present.
      // A last name containing a space is wrapped in braces.
      if (lastname.indexOf(" ") > 0) {
        author = "{" + lastname + "}";
      } else {
        author = lastname;
      }

      if (!suffix.isEmpty()) {
        author = author + ", " + suffix;
      }
      if (!forename.isEmpty()) {
        author = author + ", " + forename;
      }

      authors.add(author);
      inAuthor = false;
      forename = "";
      initials = "";
      lastname = "";
      suffix = "";
    } else if ("DescriptorName".equals(localName)) {
      inDescriptorName = false;
    } else if ("QualifierName".equals(localName)) {
      inQualifierName = false;
    } else if ("MeshHeading".equals(localName)) {
      inMeshHeader = false;
      if ("".equals(minorTopics)) {
        descriptors.add(majorTopic);
      } else {
        descriptors.add(majorTopic + ", " + minorTopics);
      }
    } else if ("LastName".equals(localName)) {
      inLastName = false;
    } else if ("Suffix".equals(localName)) {
      inSuffix = false;
    } else if ("ForeName".equals(localName) || "FirstName".equals(localName)) {
      inForename = false;
    } else if ("Issue".equals(localName)) {
      inIssue = false;
    } else if ("MedlinePgn".equals(localName)) {
      // page number
      inMedlinePgn = false;
    } else if ("URL".equals(localName)) {
      inUrl = false;
    } else if ("Initials".equals(localName)) {
      inInitials = false;
    } else if ("AbstractText".equals(localName)) {
      inAbstractText = false;
    } else if ("Affiliation".equals(localName)) {
      inAffiliation = false;
    } else if ("ArticleId".equals(localName)) {
      // Only one of the id flags is cleared per ArticleId element
      if (inDoi) {
        inDoi = false;
      } else if (inPii) {
        inPii = false;
      } else if (inPmc) {
        inPmc = false;
      }
    }
  }
Example #4
0
  /**
   * Parse the entries in the source, and return a List of BibEntry objects.
   *
   * <p>Lines shorter than three characters are ignored. A line starting with {@code "PT "} begins
   * a new entry; a line whose first three characters trim to a two-character tag begins a new
   * field; any other line is treated as a continuation of the current field and joined with the
   * marker {@code EOLEOL}. Records that end up without any non-blank field are skipped.
   *
   * @param stream the stream to read from; it is closed when reading is finished
   * @param status not used by this method
   * @return the parsed entries, in the order they appear in the input
   * @throws IOException if {@code stream} is {@code null} or reading from it fails
   */
  @Override
  public List<BibEntry> importEntries(InputStream stream, OutputPrinter status) throws IOException {
    if (stream == null) {
      throw new IOException("No stream given.");
    }

    List<BibEntry> bibitems = new ArrayList<>();
    StringBuilder sb = new StringBuilder();

    // try-with-resources so the reader is closed even if reading fails
    try (BufferedReader in =
        new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream))) {
      String str;
      while ((str = in.readLine()) != null) {
        if (str.length() < 3) {
          continue;
        }

        if (str.startsWith("PT ")) {
          // beginning of a new item
          sb.append("::").append(str);
        } else if (str.substring(0, 3).trim().length() == 2) {
          // mark the beginning of each field
          sb.append(" ## ").append(str);
        } else {
          // continuation line: mark the end of the previous line and drop the indentation
          sb.append("EOLEOL").append(str.trim());
        }
      }
    }

    String[] entries = sb.toString().split("::");

    HashMap<String, String> hm = new HashMap<>();

    // The chunk before the first "PT " line is either empty or a document header; it yields no
    // fields and is dropped by the empty-entry check below.
    for (String entry : entries) {
      String[] fields = entry.split(" ## ");

      if (fields.length == 0) {
        fields = entry.split("\n");
      }

      String type = "";
      String pt = "";
      String pages = "";
      hm.clear();

      for (String field : fields) {
        // empty field don't do anything
        if (field.length() <= 2) {
          continue;
        }

        String beg = field.substring(0, 2);
        String value = field.substring(3);
        if (value.startsWith(" - ")) {
          value = value.substring(3);
        }
        value = value.trim();

        switch (beg) {
          case "PT":
            pt = value.startsWith("J") ? "article" : value;
            type = "article"; // make all of them PT?
            break;
          case "TY":
            if ("JOUR".equals(value)) {
              type = "article";
            } else if ("CONF".equals(value)) {
              type = "inproceedings";
            }
            break;
          case "JO":
            hm.put("booktitle", value);
            break;
          case "AU":
            String author = IsiImporter.isiAuthorsConvert(value.replace("EOLEOL", " and "));
            // if there is already someone there then append with "and"
            if (hm.get("author") != null) {
              author = hm.get("author") + " and " + author;
            }
            hm.put("author", author);
            break;
          case "TI":
            hm.put("title", value.replace("EOLEOL", " "));
            break;
          case "SO":
          case "JA":
            hm.put("journal", value.replace("EOLEOL", " "));
            break;
          case "ID":
          case "KW":
            value = value.replace("EOLEOL", " ");
            String existingKeywords = hm.get("keywords");
            if (existingKeywords == null) {
              hm.put("keywords", value);
            } else if (!existingKeywords.contains(value)) {
              hm.put("keywords", existingKeywords + ", " + value);
            }
            // otherwise the value is already contained; keep the collected keywords as they are
            break;
          case "AB":
            hm.put("abstract", value.replace("EOLEOL", " "));
            break;
          case "BP":
          case "BR":
          case "SP":
          case "AR":
            pages = value;
            break;
          case "EP":
            int detpos = value.indexOf(' ');
            // tweak for IEEE Explore
            if ((detpos != -1) && !value.substring(0, detpos).trim().isEmpty()) {
              value = value.substring(0, detpos);
            }
            pages = pages + "--" + value;
            break;
          case "PS":
            pages = IsiImporter.parsePages(value);
            break;
          case "IS":
            hm.put("number", value);
            break;
          case "PY":
            hm.put("year", value);
            break;
          case "VL":
            hm.put("volume", value);
            break;
          case "PU":
            hm.put("publisher", value);
            break;
          case "DI":
            hm.put("doi", value);
            break;
          case "PD":
            String month = IsiImporter.parseMonth(value);
            if (month != null) {
              hm.put("month", month);
            }
            break;
          case "DT":
            // set "Review" in Note/Comment?
            if ("Review".equals(value)
                || value.startsWith("Article")
                || value.startsWith("Journal")
                || "article".equals(pt)) {
              type = "article";
            } else {
              type = "misc";
            }
            break;
          case "CR":
            hm.put("CitedReferences", value.replace("EOLEOL", " ; ").trim());
            break;
          case "ER":
          case "EF":
          case "VR":
          case "FN":
            // structural tags, not stored
            break;
          default:
            // Preserve all other entries; locale-independent lower-casing keeps the key stable
            hm.put(beg.toLowerCase(java.util.Locale.ROOT), value);
            break;
        }
      }

      if (!"".equals(pages)) {
        hm.put("pages", pages);
      }

      // Remove empty fields:
      List<String> emptyKeys = new ArrayList<>();
      for (Map.Entry<String, String> field : hm.entrySet()) {
        String content = field.getValue();
        if ((content == null) || content.trim().isEmpty()) {
          emptyKeys.add(field.getKey());
        }
      }
      for (String key : emptyKeys) {
        hm.remove(key);
      }

      // Skip empty entries (checked after blank fields are gone, so a record consisting only of
      // blank fields does not produce an entry)
      if (hm.isEmpty()) {
        continue;
      }

      // id assumes an existing database so don't create one here
      BibEntry b = new BibEntry(DEFAULT_BIBTEXENTRY_ID, EntryTypes.getTypeOrDefault(type));

      // Polish entries
      IsiImporter.processSubSup(hm);
      IsiImporter.processCapitalization(hm);

      b.setField(hm);

      bibitems.add(b);
    }

    return bibitems;
  }