public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) throws PluginException { HtmlTransform[] transforms = new HtmlTransform[] { // Filter out <a target="_blank">...</a> HtmlNodeFilterTransform.exclude( HtmlNodeFilters.tagWithAttribute("a", "target", "_blank")), }; return new HtmlFilterInputStream(in, encoding, new HtmlCompoundTransform(transforms)); }
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) { NodeFilter[] filters = new NodeFilter[] { // filter out script new TagNameFilter("script"), // Menu and related articles/other issues HtmlNodeFilters.tagWithAttribute("div", "class", "A_Left_Column"), // Footer menu that seems to currently be blank HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Menu"), // Copyright HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Copy"), // Lazy HTML these are inserted everywhere in an attempt to get the layout to work HtmlNodeFilters.tagWithAttribute("div", "style", "clear:both;"), // search box with broken layout. they may try to fix this HtmlNodeFilters.tagWithAttribute("div", "id", "search"), // another blank navigation div HtmlNodeFilters.tagWithAttribute("div", "id", "Sub_Top_Nav"), }; return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(filters))); }
@Override public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) { NodeFilter[] includeNodes = new NodeFilter[] { // manifest pages // <ul> and <li> without attributes (unlike TOC/full/abs/ref breadcrumbs) new NodeFilter() { @Override public boolean accept(Node node) { if (HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/toc/").accept(node)) { Node liParent = node.getParent(); if (liParent instanceof Bullet) { Bullet li = (Bullet) liParent; Vector liAttr = li.getAttributesEx(); if (liAttr != null && liAttr.size() == 1) { Node ulParent = li.getParent(); if (ulParent instanceof BulletList) { BulletList ul = (BulletList) ulParent; Vector ulAttr = ul.getAttributesEx(); return ulAttr != null && ulAttr.size() == 1; } } } } else if (HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/doi/book/") .accept(node)) { // book manifest page has single doi/book ref whose parent is just the <body> // element // http://emeraldinsight.com/clockss/eisbn/9780080549910 Node liParent = node.getParent(); if (liParent instanceof BodyTag) { return true; } } return false; } }, // book - landing page main contents chapter list and synopsis) // http://emeraldinsight.com/doi/book/10.1108/9780080549910 HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumBookDetailsWidget"), // toc - contents only // http://www.emeraldinsight.com/toc/aaaj/26/8 HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumTocWidget"), // abs, full, ref - contents only // http://www.emeraldinsight.com/doi/full/10.1108/AAAJ-05-2013-1360 HtmlNodeFilters.tagWithAttributeRegex( "div", "class", "literatumPublicationContentWidget"), // showCitFormats // http://www.emeraldinsight.com/action/ // showCitFormats?doi=10.1108%2F09513571311285621 HtmlNodeFilters.tagWithAttributeRegex("div", "class", "downloadCitationsWidget"), // showPopup - generated by BaseAtyponHtmlLinkExtractorFactory // http://www.emeraldinsight.com/action/showPopup?citid=citart1 // &id=FN_fn1&doi=10.1108%2FAAAJ-02-2012-00947 HtmlNodeFilters.tagWithAttributeRegex("body", "class", "popupBody") }; // handled by parent: script, sfxlink, stylesheet, pdfplus file sise // <head> tag, <li> item has the text "Cited by", accessIcon, NodeFilter[] excludeNodes = new NodeFilter[] { // toc, abs, full, ref - Reprints and Permissions HtmlNodeFilters.tagWithAttributeRegex("a", "class", "rightsLink"), // toc - above the first toc entry with Track Citations HtmlNodeFilters.tagWithAttributeRegex("div", "class", "toc-actions"), // abs, full, ref - downloads count HtmlNodeFilters.tagWithAttributeRegex("div", "class", "downloadsCount"), // full - section choose pulldown appeared in multiple sections // http://www.emeraldinsight.com/doi/full/10.1108/AAAJ-02-2013-1228 HtmlNodeFilters.tagWithAttribute("div", "class", "sectionJumpTo"), // abs, full, ref - Article Options and Tools HtmlNodeFilters.allExceptSubtree( HtmlNodeFilters.tagWithAttributeRegex("div", "class", "options"), HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/action/showCitFormats\\?")), // abs, full, ref - random html - potential problem HtmlNodeFilters.tagWithAttribute("span", "class", "Z3988"), // full, ref - references section - Crossref/ISI/Abstract/Infotrieve // separated by a comma. Not easy to remove the comma, so hash out // class citation HtmlNodeFilters.tagWithAttribute("div", "class", "citation"), // TOC - in case icon options change HtmlNodeFilters.tagWithAttributeRegex("div", "class", "icon-key"), // on the full/abs/ref pages there are little style definitions that HtmlNodeFilters.tagWithAttributeRegex("style", "type", "text/css"), }; return super.createFilteredInputStream(au, in, encoding, includeNodes, excludeNodes); }
public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) throws PluginException { NodeFilter[] filters = new NodeFilter[] { // Filter out <div id="footer">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "footer"), // Filter out <div id="top-ad-alignment">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "top-ad-alignment"), // Filter out <div id="top-ad">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "top-ad"), // Filter out <div id="ident">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "ident"), // Filter out <div id="ad">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "ad"), // Filter out <div id="vertical-ad">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "vertical-ad"), // Filter out <div class="right-col-download">...</div> HtmlNodeFilters.tagWithAttribute("div", "class", "right-col-download"), // Filter out <div id="cart-navbar">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "cart-navbar"), // // Filter out <div class="heading-macfix article-access-options">...</div> // HtmlNodeFilters.tagWithAttribute("div", "class", "heading-macfix // article-access-options"), // Filter out <div id="baynote-recommendations">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "baynote-recommendations"), // Filter out <div id="bookmarks-container">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "bookmarks-container"), // Filter out <div id="llb">...</div> HtmlNodeFilters.tagWithAttribute("div", "id", "llb"), // Filter out <a href="...">...</a> where the href value includes "exitTargetId" as a // parameter HtmlNodeFilters.tagWithAttributeRegex("a", "href", "[\\?&]exitTargetId="), // Filter out <input name="exitTargetId"> HtmlNodeFilters.tagWithAttribute("input", "name", "exitTargetId"), }; return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(filters))); }
/** * BaseAtyponHtmlCrawlFilterFactory The basic AtyponHtmlCrawlFilterFactory Child plugins can extend * this class and add publisher specific crawl filters, if necessary. Common crawl filters can be * easily added and be available to children. Otherwise, this can be used by child plugins if no * other crawl filters are needed. */ public class BaseAtyponHtmlCrawlFilterFactory implements FilterFactory { protected static final Pattern corrections = Pattern.compile( "Original Article|Corrigendum|Correction|Errata|Erratum", Pattern.CASE_INSENSITIVE); protected static NodeFilter[] baseAtyponFilters = new NodeFilter[] { HtmlNodeFilters.tagWithAttribute("div", "class", "citedBySection"), // Since overcrawling is a constant problem for Atypon, put common // next article-previous article link for safety; // AIAA, AMetSoc, ASCE, Ammons, APHA, SEG,Siam, HtmlNodeFilters.tagWithAttribute("a", "class", "articleToolsNav"), // BIR, Maney, Endocrine - also handles next/prev issue - also for issues HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavRightTd"), HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavLeftTd"), // BQ, BioOne, Edinburgh, futurescience, nrc // all handle next/prev article link in plugin // T&F doesn't have prev/next article links // breadcrumb or other link back to TOC from article page // AMetSoc, Ammons, APHA, NRC, HtmlNodeFilters.tagWithAttribute("div", "id", "breadcrumbs"), // ASCE, BiR, Maney, SEG, SIAM, Endocrine HtmlNodeFilters.tagWithAttributeRegex("ul", "class", "^(linkList )?breadcrumbs$"), // on TOC next-prev issue // AIAA, AMetSoc, Ammons, APHA, HtmlNodeFilters.tagWithAttribute("div", "id", "nextprev"), // ASCE, SEG, SIAM HtmlNodeFilters.tagWithAttribute("div", "id", "prevNextNav"), // on TOC left column with listing of all volumes/issues HtmlNodeFilters.tagWithAttribute("ul", "class", "volumeIssues"), // have started finding cases of direct in-publication links within references // there are a variety of ways these blocks are identified, but // these are unlikely to be used anywhere else so put in parent // emerald, AIAA HtmlNodeFilters.tagWithAttribute("div", "class", "references"), // ASCE HtmlNodeFilters.tagWithAttribute("li", "class", "reference"), // maney, future-science (also in child...will remove later) HtmlNodeFilters.tagWithAttribute("table", "class", "references"), // Not all Atypon plugins necessarily need this but MANY do and it is // an insidious source of over crawling new NodeFilter() { @Override public boolean accept(Node node) { if (!(node instanceof LinkTag)) return false; String allText = ((CompositeTag) node).toPlainTextString(); return corrections.matcher(allText).find(); } }, }; /** * Create an array of NodeFilters that combines the atyponBaseFilters with the given array * * @param nodes The array of NodeFilters to add */ private NodeFilter[] addTo(NodeFilter[] nodes) { NodeFilter[] result = Arrays.copyOf(baseAtyponFilters, baseAtyponFilters.length + nodes.length); System.arraycopy(nodes, 0, result, baseAtyponFilters.length, nodes.length); return result; } /** * Create a FilteredInputStream that excludes the the atyponBaseFilters * * @param au The archival unit * @param in Incoming input stream * @param encoding The encoding */ public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) throws PluginException { return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(baseAtyponFilters))); } /** * Create a FilteredInputStream that excludes the the atyponBaseFilters and moreNodes * * @param au The archival unit * @param in Incoming input stream * @param encoding The encoding * @param moreNodes An array of NodeFilters to be excluded with atyponBaseFilters */ public InputStream createFilteredInputStream( ArchivalUnit au, InputStream in, String encoding, NodeFilter[] moreNodes) throws PluginException { NodeFilter[] bothFilters = addTo(moreNodes); return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(bothFilters))); } }
@Override public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) { NodeFilter[] filters = new NodeFilter[] { // handled by parent: script, sfxlink, stylesheet HtmlNodeFilters.tag("noscript"), // toc - first top block ad // http://www.birpublications.org/toc/bjr/87/1044 HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumAd"), // page header: login, register, etc., and journal menu such as // subscribe, alerts, ... HtmlNodeFilters.tagWithAttributeRegex("div", "id", "pageHeader"), // page footer HtmlNodeFilters.tagWithAttributeRegex("div", "id", "pageFooter"), // toc - BJR logo image right below pageHeader HtmlNodeFilters.tagWithAttributeRegex("div", "class", "^widget general-image"), // toc, abs, full, ref - menu above breadcrumbs HtmlNodeFilters.tagWithAttributeRegex("div", "class", "menuXml"), // toc - free.gif image tied to an abs HtmlNodeFilters.tagWithAttributeRegex("img", "src", "free.gif"), // toc - access icon container HtmlNodeFilters.tagWithAttribute("td", "class", "accessIconContainer"), // toc - pulldown with sections - may add citedby later HtmlNodeFilters.tagWithAttribute("div", "class", "publicationTooldropdownContainer"), // toc - right column, current issue HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumBookIssueNavigation"), // toc, abs - share social media HtmlNodeFilters.tagWithAttributeRegex("div", "class", "general-bookmark-share"), // toc - right column impact factor block - no unique name found HtmlNodeFilters.tagWithAttributeRegex( "div", "class", "widget\\s+layout-one-column\\s+none\\s+widget-regular\\s+widget-border-toggle"), // ref - this seems unused but may get turned on // http://www.birpublications.org/doi/ref/10.1259/bjr.20130571 HtmlNodeFilters.tagWithAttribute("div", "id", "MathJax_Message"), // full - section choose pulldown appeared in multiple sections // http://www.birpublications.org/doi/full/10.1259/dmfr.20120050 HtmlNodeFilters.tagWithAttribute("div", "class", "sectionJumpTo"), // toc, abs, full, text and ref right column - most read // http://www.birpublications.org/toc/bjr/88/1052 HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumMostReadWidget"), // abs - right column all literatumArticleToolsWidget // except Download Citation // http://www.birpublications.org/doi/abs/10.1259/bjr.20140472 HtmlNodeFilters.allExceptSubtree( HtmlNodeFilters.tagWithAttributeRegex("div", "class", "literatumArticleToolsWidget"), HtmlNodeFilters.tagWithAttributeRegex("a", "href", "/action/showCitFormats\\?")), }; // super.createFilteredInputStream adds bir filter to the baseAtyponFilters // and returns the filtered input stream using an array of NodeFilters that // combine the two arrays of NodeFilters. return super.createFilteredInputStream(au, in, encoding, filters); }