/**
 * Wraps the incoming HTML stream so that anchors opened in a new window are removed.
 *
 * @param au the archival unit the content belongs to (not consulted by this body)
 * @param in the raw HTML input stream
 * @param encoding the character encoding passed through to the filter stream
 * @return a stream from which every {@code <a target="_blank">} element is excluded
 * @throws PluginException declared by the signature; nothing in this body throws it directly
 */
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding)
    throws PluginException {
  // Excludes <a target="_blank">...</a>
  HtmlTransform dropNewWindowLinks =
      HtmlNodeFilterTransform.exclude(HtmlNodeFilters.tagWithAttribute("a", "target", "_blank"));

  // Kept as a one-element compound transform, as in the original arrangement.
  HtmlTransform[] pipeline = new HtmlTransform[] {dropNewWindowLinks};
  return new HtmlFilterInputStream(in, encoding, new HtmlCompoundTransform(pipeline));
}
 public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) {
   NodeFilter[] filters =
       new NodeFilter[] {
         // filter out script
         new TagNameFilter("script"),
         // Menu and related articles/other issues
         HtmlNodeFilters.tagWithAttribute("div", "class", "A_Left_Column"),
         // Footer menu that seems to currently be blank
         HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Menu"),
         // Copyright
         HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Copy"),
         // Lazy HTML these are inserted everywhere in an attempt to get the layout to work
         HtmlNodeFilters.tagWithAttribute("div", "style", "clear:both;"),
         // search box with broken layout. they may try to fix this
         HtmlNodeFilters.tagWithAttribute("div", "id", "search"),
         // another blank navigation div
         HtmlNodeFilters.tagWithAttribute("div", "id", "Sub_Top_Nav"),
       };
   return new HtmlFilterInputStream(
       in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(filters)));
 }
  @Override
  public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) {
    NodeFilter[] includeNodes =
        new NodeFilter[] {
          // manifest pages
          // <ul> and <li> without attributes (unlike TOC/full/abs/ref breadcrumbs)
          new NodeFilter() {
            @Override
            public boolean accept(Node node) {
              if (HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/toc/").accept(node)) {
                Node liParent = node.getParent();
                if (liParent instanceof Bullet) {
                  Bullet li = (Bullet) liParent;
                  Vector liAttr = li.getAttributesEx();
                  if (liAttr != null && liAttr.size() == 1) {
                    Node ulParent = li.getParent();
                    if (ulParent instanceof BulletList) {
                      BulletList ul = (BulletList) ulParent;
                      Vector ulAttr = ul.getAttributesEx();
                      return ulAttr != null && ulAttr.size() == 1;
                    }
                  }
                }
              } else if (HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/doi/book/")
                  .accept(node)) {
                // book manifest page has single doi/book ref whose parent is just the <body>
                // element
                // http://emeraldinsight.com/clockss/eisbn/9780080549910
                Node liParent = node.getParent();
                if (liParent instanceof BodyTag) {
                  return true;
                }
              }
              return false;
            }
          },
          // book - landing page main contents chapter list and synopsis)
          // http://emeraldinsight.com/doi/book/10.1108/9780080549910
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumBookDetailsWidget"),
          // toc - contents only
          // http://www.emeraldinsight.com/toc/aaaj/26/8
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumTocWidget"),
          // abs, full, ref - contents only
          // http://www.emeraldinsight.com/doi/full/10.1108/AAAJ-05-2013-1360
          HtmlNodeFilters.tagWithAttributeRegex(
              "div", "class", "literatumPublicationContentWidget"),
          // showCitFormats
          // http://www.emeraldinsight.com/action/
          //                      showCitFormats?doi=10.1108%2F09513571311285621
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "downloadCitationsWidget"),
          // showPopup - generated by BaseAtyponHtmlLinkExtractorFactory
          // http://www.emeraldinsight.com/action/showPopup?citid=citart1
          //                          &id=FN_fn1&doi=10.1108%2FAAAJ-02-2012-00947
          HtmlNodeFilters.tagWithAttributeRegex("body", "class", "popupBody")
        };

    // handled by parent: script, sfxlink, stylesheet, pdfplus file sise
    // <head> tag, <li> item has the text "Cited by", accessIcon,
    NodeFilter[] excludeNodes =
        new NodeFilter[] {
          // toc, abs, full, ref - Reprints and Permissions
          HtmlNodeFilters.tagWithAttributeRegex("a", "class", "rightsLink"),
          // toc - above the first toc entry with Track Citations
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "toc-actions"),
          // abs, full, ref - downloads count
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "downloadsCount"),
          // full - section choose pulldown appeared in multiple sections
          // http://www.emeraldinsight.com/doi/full/10.1108/AAAJ-02-2013-1228
          HtmlNodeFilters.tagWithAttribute("div", "class", "sectionJumpTo"),
          // abs, full, ref - Article Options and Tools
          HtmlNodeFilters.allExceptSubtree(
              HtmlNodeFilters.tagWithAttributeRegex("div", "class", "options"),
              HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/action/showCitFormats\\?")),
          // abs, full, ref - random html - potential problem
          HtmlNodeFilters.tagWithAttribute("span", "class", "Z3988"),
          // full, ref - references section - Crossref/ISI/Abstract/Infotrieve
          // separated by a comma. Not easy to remove the comma, so hash out
          // class citation
          HtmlNodeFilters.tagWithAttribute("div", "class", "citation"),
          // TOC - in case icon options change
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "icon-key"),
          // on the full/abs/ref pages there are little style definitions that
          HtmlNodeFilters.tagWithAttributeRegex("style", "type", "text/css"),
        };
    return super.createFilteredInputStream(au, in, encoding, includeNodes, excludeNodes);
  }
  /**
   * Wraps the incoming HTML stream so that footer, advertising, cart, recommendation and
   * exit-link elements are excluded.
   *
   * @param au the archival unit the content belongs to (not consulted by this body)
   * @param in the raw HTML input stream
   * @param encoding the character encoding passed through to the filter stream
   * @return a stream from which every node matching any of the filters below is excluded
   * @throws PluginException declared by the signature; nothing in this body throws it directly
   */
  public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding)
      throws PluginException {
    // A node is removed when at least one of the following filters accepts it.
    NodeFilter anyExcludedNode =
        new OrFilter(
            new NodeFilter[] {
              // <div id="footer">
              HtmlNodeFilters.tagWithAttribute("div", "id", "footer"),
              // <div id="top-ad-alignment">
              HtmlNodeFilters.tagWithAttribute("div", "id", "top-ad-alignment"),
              // <div id="top-ad">
              HtmlNodeFilters.tagWithAttribute("div", "id", "top-ad"),
              // <div id="ident">
              HtmlNodeFilters.tagWithAttribute("div", "id", "ident"),
              // <div id="ad">
              HtmlNodeFilters.tagWithAttribute("div", "id", "ad"),
              // <div id="vertical-ad">
              HtmlNodeFilters.tagWithAttribute("div", "id", "vertical-ad"),
              // <div class="right-col-download">
              HtmlNodeFilters.tagWithAttribute("div", "class", "right-col-download"),
              // <div id="cart-navbar">
              HtmlNodeFilters.tagWithAttribute("div", "id", "cart-navbar"),
              // Not filtered: <div class="heading-macfix article-access-options"> (a filter for
              // it existed here only as commented-out code).
              // <div id="baynote-recommendations">
              HtmlNodeFilters.tagWithAttribute("div", "id", "baynote-recommendations"),
              // <div id="bookmarks-container">
              HtmlNodeFilters.tagWithAttribute("div", "id", "bookmarks-container"),
              // <div id="llb">
              HtmlNodeFilters.tagWithAttribute("div", "id", "llb"),
              // <a href="..."> whose href carries an "exitTargetId" query parameter
              HtmlNodeFilters.tagWithAttributeRegex("a", "href", "[\\?&]exitTargetId="),
              // <input name="exitTargetId">
              HtmlNodeFilters.tagWithAttribute("input", "name", "exitTargetId"),
            });

    return new HtmlFilterInputStream(in, encoding, HtmlNodeFilterTransform.exclude(anyExcludedNode));
  }
/**
 * Basic crawl filter factory for Atypon-hosted content.
 *
 * <p>Child plugins can extend this class and add publisher-specific crawl filters through the
 * four-argument {@link #createFilteredInputStream(ArchivalUnit, InputStream, String,
 * NodeFilter[])} overload. Crawl filters that are common to many publishers live in
 * {@link #baseAtyponFilters} so that every child gets them. A plugin that needs no extra filters
 * can use this class directly.
 */
public class BaseAtyponHtmlCrawlFilterFactory implements FilterFactory {
  /** Case-insensitive match for correction-related words appearing in a link's text. */
  protected static final Pattern corrections =
      Pattern.compile(
          "Original Article|Corrigendum|Correction|Errata|Erratum", Pattern.CASE_INSENSITIVE);

  /** Nodes excluded for every plugin that uses or extends this factory. */
  protected static NodeFilter[] baseAtyponFilters =
      new NodeFilter[] {
        HtmlNodeFilters.tagWithAttribute("div", "class", "citedBySection"),

        // Since overcrawling is a constant problem for Atypon, put common
        // next article-previous article link for safety;
        // AIAA, AMetSoc, ASCE, Ammons, APHA, SEG,Siam,
        HtmlNodeFilters.tagWithAttribute("a", "class", "articleToolsNav"),
        // BIR, Maney, Endocrine - also handles next/prev issue - also for issues
        HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavRightTd"),
        HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavLeftTd"),
        // BQ, BioOne, Edinburgh, futurescience, nrc
        //        all handle next/prev article link in plugin
        // T&F doesn't have prev/next article links

        // breadcrumb or other link back to TOC from article page
        // AMetSoc, Ammons, APHA, NRC,
        HtmlNodeFilters.tagWithAttribute("div", "id", "breadcrumbs"),
        // ASCE, BiR, Maney, SEG, SIAM, Endocrine
        HtmlNodeFilters.tagWithAttributeRegex("ul", "class", "^(linkList )?breadcrumbs$"),

        // on TOC next-prev issue
        // AIAA, AMetSoc, Ammons, APHA,
        HtmlNodeFilters.tagWithAttribute("div", "id", "nextprev"),
        // ASCE, SEG, SIAM
        HtmlNodeFilters.tagWithAttribute("div", "id", "prevNextNav"),

        // on TOC left column with listing of all volumes/issues
        HtmlNodeFilters.tagWithAttribute("ul", "class", "volumeIssues"),

        // have started finding cases of direct in-publication links within references
        // there are a variety of ways these blocks are identified, but
        // these are unlikely to be used anywhere else so put in parent
        // emerald, AIAA
        HtmlNodeFilters.tagWithAttribute("div", "class", "references"),
        // ASCE
        HtmlNodeFilters.tagWithAttribute("li", "class", "reference"),
        // maney, future-science (also in child...will remove later)
        HtmlNodeFilters.tagWithAttribute("table", "class", "references"),

        // Not all Atypon plugins necessarily need this but MANY do and it is
        // an insidious source of over crawling:
        // any link whose plain text contains one of the "corrections" words
        new NodeFilter() {
          @Override
          public boolean accept(Node node) {
            if (!(node instanceof LinkTag)) {
              return false;
            }
            String allText = ((CompositeTag) node).toPlainTextString();
            return corrections.matcher(allText).find();
          }
        },
      };

  /**
   * Creates a new array holding {@link #baseAtyponFilters} followed by the given filters.
   *
   * @param nodes the filters to append; {@code null} is treated as "no additional filters"
   * @return a fresh array; neither input array is modified
   */
  private NodeFilter[] addTo(NodeFilter[] nodes) {
    int baseCount = baseAtyponFilters.length;
    if (nodes == null) {
      // Nothing to append: hand back a copy of the base filters instead of failing with an NPE.
      return Arrays.copyOf(baseAtyponFilters, baseCount);
    }
    NodeFilter[] result = Arrays.copyOf(baseAtyponFilters, baseCount + nodes.length);
    System.arraycopy(nodes, 0, result, baseCount, nodes.length);
    return result;
  }

  /**
   * Creates a filtered input stream that excludes every node matched by
   * {@link #baseAtyponFilters}.
   *
   * @param au The archival unit
   * @param in Incoming input stream
   * @param encoding The encoding
   * @return the filtered stream
   * @throws PluginException declared by the signature; nothing in this body throws it directly
   */
  public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding)
      throws PluginException {

    return new HtmlFilterInputStream(
        in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(baseAtyponFilters)));
  }

  /**
   * Creates a filtered input stream that excludes every node matched by
   * {@link #baseAtyponFilters} or by {@code moreNodes}.
   *
   * @param au The archival unit
   * @param in Incoming input stream
   * @param encoding The encoding
   * @param moreNodes filters to exclude in addition to {@link #baseAtyponFilters}; may be
   *     {@code null}, in which case only the base filters apply
   * @return the filtered stream
   * @throws PluginException declared by the signature; nothing in this body throws it directly
   */
  public InputStream createFilteredInputStream(
      ArchivalUnit au, InputStream in, String encoding, NodeFilter[] moreNodes)
      throws PluginException {
    NodeFilter[] bothFilters = addTo(moreNodes);
    return new HtmlFilterInputStream(
        in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(bothFilters)));
  }
}
  /**
   * Collects the publisher-specific exclusion filters and delegates stream creation to the
   * parent's four-argument overload.
   *
   * @param au the archival unit the content belongs to
   * @param in the raw HTML input stream
   * @param encoding the character encoding of {@code in}
   * @return whatever the parent's {@code createFilteredInputStream(au, in, encoding, filters)}
   *     returns
   */
  @Override
  public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) {
    // Already handled by the parent: script, sfxlink, stylesheet.
    NodeFilter[] birFilters = {
      HtmlNodeFilters.tag("noscript"),

      // toc - advertisement block at the very top
      // http://www.birpublications.org/toc/bjr/87/1044
      HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumAd"),
      // page header - login, register, etc., plus the journal menu (subscribe, alerts, ...)
      HtmlNodeFilters.tagWithAttributeRegex("div", "id", "pageHeader"),
      // page footer
      HtmlNodeFilters.tagWithAttributeRegex("div", "id", "pageFooter"),
      // toc - BJR logo image directly beneath pageHeader
      HtmlNodeFilters.tagWithAttributeRegex("div", "class", "^widget general-image"),
      // toc, abs, full, ref - menu sitting above the breadcrumbs
      HtmlNodeFilters.tagWithAttributeRegex("div", "class", "menuXml"),
      // toc - free.gif image attached to an abs entry
      HtmlNodeFilters.tagWithAttributeRegex("img", "src", "free.gif"),
      // toc - container for the access icon
      HtmlNodeFilters.tagWithAttribute("td", "class", "accessIconContainer"),
      // toc - sections pulldown (citedby may be added later)
      HtmlNodeFilters.tagWithAttribute("div", "class", "publicationTooldropdownContainer"),
      // toc - right column, current issue
      HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumBookIssueNavigation"),
      // toc, abs - social media sharing
      HtmlNodeFilters.tagWithAttributeRegex("div", "class", "general-bookmark-share"),
      // toc - right column impact factor block; it has no unique name to key on
      HtmlNodeFilters.tagWithAttributeRegex(
          "div",
          "class",
          "widget\\s+layout-one-column\\s+none\\s+widget-regular\\s+widget-border-toggle"),
      // ref - appears unused for now but could be switched on
      // http://www.birpublications.org/doi/ref/10.1259/bjr.20130571
      HtmlNodeFilters.tagWithAttribute("div", "id", "MathJax_Message"),
      // full - section chooser pulldown, repeated in several sections
      // http://www.birpublications.org/doi/full/10.1259/dmfr.20120050
      HtmlNodeFilters.tagWithAttribute("div", "class", "sectionJumpTo"),
      // toc, abs, full, text and ref - right column "most read"
      // http://www.birpublications.org/toc/bjr/88/1052
      HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumMostReadWidget"),
      // abs - right column: everything in literatumArticleToolsWidget
      // apart from the Download Citation link
      // http://www.birpublications.org/doi/abs/10.1259/bjr.20140472
      HtmlNodeFilters.allExceptSubtree(
          HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumArticleToolsWidget"),
          HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/action/showCitFormats\\?")),
    };

    // The parent overload is expected to merge these filters with its own base filters and
    // return a stream that excludes the combined set.
    return super.createFilteredInputStream(au, in, encoding, birFilters);
  }