/**
 * Creates a filtered input stream that excludes every node matched by any filter in the
 * array returned from {@code addTo(moreNodes)}.
 *
 * <p>NOTE(review): {@code addTo} is defined outside this block; presumably it merges
 * {@code moreNodes} with the base Atypon filters -- confirm against its definition.
 *
 * @param au The archival unit (not referenced by this method)
 * @param in Incoming input stream
 * @param encoding The encoding handed to the HTML filter stream
 * @param moreNodes Additional NodeFilters passed to {@code addTo}
 * @return an {@code HtmlFilterInputStream} wrapping {@code in}
 * @throws PluginException declared by the signature
 */
public InputStream createFilteredInputStream(
    ArchivalUnit au, InputStream in, String encoding, NodeFilter[] moreNodes)
    throws PluginException {
  // A node is dropped as soon as any one of the combined filters accepts it.
  OrFilter anyExcluded = new OrFilter(addTo(moreNodes));
  return new HtmlFilterInputStream(
      in, encoding, HtmlNodeFilterTransform.exclude(anyExcluded));
}
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) throws PluginException { HtmlTransform[] transforms = new HtmlTransform[] { // Filter out <a target="_blank">...</a> HtmlNodeFilterTransform.exclude( HtmlNodeFilters.tagWithAttribute("a", "target", "_blank")), }; return new HtmlFilterInputStream(in, encoding, new HtmlCompoundTransform(transforms)); }
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) { NodeFilter[] filters = new NodeFilter[] { // filter out script new TagNameFilter("script"), // Menu and related articles/other issues HtmlNodeFilters.tagWithAttribute("div", "class", "A_Left_Column"), // Footer menu that seems to currently be blank HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Menu"), // Copyright HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Copy"), // Lazy HTML these are inserted everywhere in an attempt to get the layout to work HtmlNodeFilters.tagWithAttribute("div", "style", "clear:both;"), // search box with broken layout. they may try to fix this HtmlNodeFilters.tagWithAttribute("div", "id", "search"), // another blank navigation div HtmlNodeFilters.tagWithAttribute("div", "id", "Sub_Top_Nav"), }; return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(filters))); }
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) throws PluginException { NodeFilter[] filters = new NodeFilter[] { // Filter out <div id="footer">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "footer"), // Filter out <div id="top-ad-alignment">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "top-ad-alignment"), // Filter out <div id="top-ad">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "top-ad"), // Filter out <div id="ident">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "ident"), // Filter out <div id="ad">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "ad"), // Filter out <div id="vertical-ad">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "vertical-ad"), // Filter out <div class="right-col-download">...</div> HtmlNodeFilters.tagWithAttribute("div", "class", "right-col-download"), // Filter out <div id="cart-navbar">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "cart-navbar"), // // Filter out <div class="heading-macfix article-access-options">...</div> // HtmlNodeFilters.tagWithAttribute("div", "class", "heading-macfix // article-access-options"), // Filter out <div id="baynote-recommendations">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "baynote-recommendations"), // Filter out <div id="bookmarks-container">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "bookmarks-container"), // Filter out <div id="llb">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "llb"), // Filter out <a href="...">...</a> where the href value includes "exitTargetId" as a // parameter HtmlNodeFilters.tagWithAttributeRegex("a", "href", "[\\?&]exitTargetId="), // Filter out <input name="exitTargetId"> HtmlNodeFilters.tagWithAttribute("input", "name", "exitTargetId"), }; return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(filters))); }
/**
 * Creates a filtered input stream that excludes every node matched by any filter in
 * {@code baseAtyponFilters}.
 *
 * @param au The archival unit (not referenced by this method)
 * @param in Incoming input stream
 * @param encoding The encoding handed to the HTML filter stream
 * @return an {@code HtmlFilterInputStream} wrapping {@code in}
 * @throws PluginException declared by the signature
 */
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding)
    throws PluginException {
  // A node is dropped as soon as any one of the base filters accepts it.
  OrFilter anyBaseFilter = new OrFilter(baseAtyponFilters);
  return new HtmlFilterInputStream(
      in, encoding, HtmlNodeFilterTransform.exclude(anyBaseFilter));
}