コード例 #1
0
 /**
  * Wraps the incoming HTML stream in an {@code HtmlFilterInputStream} whose transform is
  * {@code HtmlNodeFilterTransform.exclude} applied to an {@code OrFilter} over the node filters
  * listed below.
  *
  * @param au The archival unit (not used by this implementation)
  * @param in Incoming input stream
  * @param encoding The encoding handed to the filter stream
  * @return the filtering stream built around {@code in}
  */
 public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) {
   NodeFilter[] excludedNodes = {
     // script tags
     new TagNameFilter("script"),
     // left column: menu plus related articles/other issues
     HtmlNodeFilters.tagWithAttribute("div", "class", "A_Left_Column"),
     // footer menu (appears to be blank at the moment)
     HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Menu"),
     // copyright footer
     HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Copy"),
     // layout-fixing spacer divs that are sprinkled all over the page
     HtmlNodeFilters.tagWithAttribute("div", "style", "clear:both;"),
     // search box whose layout is broken; the publisher may fix it later
     HtmlNodeFilters.tagWithAttribute("div", "id", "search"),
     // one more blank navigation div
     HtmlNodeFilters.tagWithAttribute("div", "id", "Sub_Top_Nav"),
   };

   // A single filter built from all of the filters above.
   NodeFilter anyExcluded = new OrFilter(excludedNodes);
   return new HtmlFilterInputStream(in, encoding, HtmlNodeFilterTransform.exclude(anyExcluded));
 }
コード例 #2
0
/**
 * BaseAtyponHtmlCrawlFilterFactory: the basic Atypon HTML crawl filter factory.
 *
 * <p>Child plugins can extend this class and add publisher-specific crawl filters, if necessary.
 * Common crawl filters can be easily added to {@link #baseAtyponFilters} and become available to
 * children. Otherwise, this class can be used directly by child plugins if no other crawl filters
 * are needed.
 */
public class BaseAtyponHtmlCrawlFilterFactory implements FilterFactory {
  /** Case-insensitive pattern for link text that points at a correction/original article. */
  protected static final Pattern corrections =
      Pattern.compile(
          "Original Article|Corrigendum|Correction|Errata|Erratum", Pattern.CASE_INSENSITIVE);

  /** Filters applied by every stream this factory creates. */
  protected static NodeFilter[] baseAtyponFilters =
      new NodeFilter[] {
        HtmlNodeFilters.tagWithAttribute("div", "class", "citedBySection"),

        // Since overcrawling is a constant problem for Atypon, put common
        // next article-previous article link for safety;
        // AIAA, AMetSoc, ASCE, Ammons, APHA, SEG,Siam,
        HtmlNodeFilters.tagWithAttribute("a", "class", "articleToolsNav"),
        // BIR, Maney, Endocrine - also handles next/prev issue - also for issues
        HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavRightTd"),
        HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavLeftTd"),
        // BQ, BioOne, Edinburgh, futurescience, nrc
        //        all handle next/prev article link in plugin
        // T&F doesn't have prev/next article links

        // breadcrumb or other link back to TOC from article page
        // AMetSoc, Ammons, APHA, NRC,
        HtmlNodeFilters.tagWithAttribute("div", "id", "breadcrumbs"),
        // ASCE, BiR, Maney, SEG, SIAM, Endocrine
        HtmlNodeFilters.tagWithAttributeRegex("ul", "class", "^(linkList )?breadcrumbs$"),

        // on TOC next-prev issue
        // AIAA, AMetSoc, Ammons, APHA,
        HtmlNodeFilters.tagWithAttribute("div", "id", "nextprev"),
        // ASCE, SEG, SIAM
        HtmlNodeFilters.tagWithAttribute("div", "id", "prevNextNav"),

        // on TOC left column with listing of all volumes/issues
        HtmlNodeFilters.tagWithAttribute("ul", "class", "volumeIssues"),

        // have started finding cases of direct in-publication links within references
        // there are a variety of ways these blocks are identified, but
        // these are unlikely to be used anywhere else so put in parent
        // emerald, AIAA
        HtmlNodeFilters.tagWithAttribute("div", "class", "references"),
        // ASCE
        HtmlNodeFilters.tagWithAttribute("li", "class", "reference"),
        // maney, future-science (also in child...will remove later)
        HtmlNodeFilters.tagWithAttribute("table", "class", "references"),

        // Not all Atypon plugins necessarily need this but MANY do and it is
        // an insidious source of over crawling.
        // Accepts any link tag whose plain text contains one of the "corrections" words.
        new NodeFilter() {
          @Override
          public boolean accept(Node node) {
            if (!(node instanceof LinkTag)) {
              return false;
            }
            String allText = ((CompositeTag) node).toPlainTextString();
            return corrections.matcher(allText).find();
          }
        },
      };

  /**
   * Create a new array of NodeFilters holding the baseAtyponFilters followed by the given array.
   *
   * @param nodes The array of NodeFilters to add; {@code null} is treated as an empty array
   * @return a freshly allocated array; neither input array is modified
   */
  private NodeFilter[] addTo(NodeFilter[] nodes) {
    // Read the (non-final) static once so length and contents come from the same array.
    NodeFilter[] base = baseAtyponFilters;
    int extraCount = (nodes == null) ? 0 : nodes.length;
    NodeFilter[] result = Arrays.copyOf(base, base.length + extraCount);
    if (extraCount > 0) {
      System.arraycopy(nodes, 0, result, base.length, extraCount);
    }
    return result;
  }

  /**
   * Create a FilteredInputStream that excludes the baseAtyponFilters.
   *
   * @param au The archival unit (not used by this implementation)
   * @param in Incoming input stream
   * @param encoding The encoding
   * @return the filtering stream wrapped around {@code in}
   * @throws PluginException declared for the benefit of callers and overriding subclasses; this
   *     implementation does not throw it explicitly
   */
  public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding)
      throws PluginException {

    return new HtmlFilterInputStream(
        in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(baseAtyponFilters)));
  }

  /**
   * Create a FilteredInputStream that excludes the baseAtyponFilters and moreNodes.
   *
   * @param au The archival unit (not used by this implementation)
   * @param in Incoming input stream
   * @param encoding The encoding
   * @param moreNodes An array of NodeFilters to be excluded along with the baseAtyponFilters;
   *     {@code null} or an empty array means only the baseAtyponFilters are used
   * @return the filtering stream wrapped around {@code in}
   * @throws PluginException declared for the benefit of callers and overriding subclasses; this
   *     implementation does not throw it explicitly
   */
  public InputStream createFilteredInputStream(
      ArchivalUnit au, InputStream in, String encoding, NodeFilter[] moreNodes)
      throws PluginException {
    NodeFilter[] bothFilters = addTo(moreNodes);
    return new HtmlFilterInputStream(
        in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(bothFilters)));
  }
}