public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) { NodeFilter[] filters = new NodeFilter[] { // filter out script new TagNameFilter("script"), // Menu and related articles/other issues HtmlNodeFilters.tagWithAttribute("div", "class", "A_Left_Column"), // Footer menu that seems to currently be blank HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Menu"), // Copyright HtmlNodeFilters.tagWithAttribute("div", "class", "A_Foot_Copy"), // Lazy HTML these are inserted everywhere in an attempt to get the layout to work HtmlNodeFilters.tagWithAttribute("div", "style", "clear:both;"), // search box with broken layout. they may try to fix this HtmlNodeFilters.tagWithAttribute("div", "id", "search"), // another blank navigation div HtmlNodeFilters.tagWithAttribute("div", "id", "Sub_Top_Nav"), }; return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(filters))); }
/** * BaseAtyponHtmlCrawlFilterFactory The basic AtyponHtmlCrawlFilterFactory Child plugins can extend * this class and add publisher specific crawl filters, if necessary. Common crawl filters can be * easily added and be available to children. Otherwise, this can be used by child plugins if no * other crawl filters are needed. */ public class BaseAtyponHtmlCrawlFilterFactory implements FilterFactory { protected static final Pattern corrections = Pattern.compile( "Original Article|Corrigendum|Correction|Errata|Erratum", Pattern.CASE_INSENSITIVE); protected static NodeFilter[] baseAtyponFilters = new NodeFilter[] { HtmlNodeFilters.tagWithAttribute("div", "class", "citedBySection"), // Since overcrawling is a constant problem for Atypon, put common // next article-previous article link for safety; // AIAA, AMetSoc, ASCE, Ammons, APHA, SEG,Siam, HtmlNodeFilters.tagWithAttribute("a", "class", "articleToolsNav"), // BIR, Maney, Endocrine - also handles next/prev issue - also for issues HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavRightTd"), HtmlNodeFilters.tagWithAttributeRegex("td", "class", "journalNavLeftTd"), // BQ, BioOne, Edinburgh, futurescience, nrc // all handle next/prev article link in plugin // T&F doesn't have prev/next article links // breadcrumb or other link back to TOC from article page // AMetSoc, Ammons, APHA, NRC, HtmlNodeFilters.tagWithAttribute("div", "id", "breadcrumbs"), // ASCE, BiR, Maney, SEG, SIAM, Endocrine HtmlNodeFilters.tagWithAttributeRegex("ul", "class", "^(linkList )?breadcrumbs$"), // on TOC next-prev issue // AIAA, AMetSoc, Ammons, APHA, HtmlNodeFilters.tagWithAttribute("div", "id", "nextprev"), // ASCE, SEG, SIAM HtmlNodeFilters.tagWithAttribute("div", "id", "prevNextNav"), // on TOC left column with listing of all volumes/issues HtmlNodeFilters.tagWithAttribute("ul", "class", "volumeIssues"), // have started finding cases of direct in-publication links within references // there are a variety of ways these blocks are identified, but // these are unlikely to be used anywhere else so put in parent // emerald, AIAA HtmlNodeFilters.tagWithAttribute("div", "class", "references"), // ASCE HtmlNodeFilters.tagWithAttribute("li", "class", "reference"), // maney, future-science (also in child...will remove later) HtmlNodeFilters.tagWithAttribute("table", "class", "references"), // Not all Atypon plugins necessarily need this but MANY do and it is // an insidious source of over crawling new NodeFilter() { @Override public boolean accept(Node node) { if (!(node instanceof LinkTag)) return false; String allText = ((CompositeTag) node).toPlainTextString(); return corrections.matcher(allText).find(); } }, }; /** * Create an array of NodeFilters that combines the atyponBaseFilters with the given array * * @param nodes The array of NodeFilters to add */ private NodeFilter[] addTo(NodeFilter[] nodes) { NodeFilter[] result = Arrays.copyOf(baseAtyponFilters, baseAtyponFilters.length + nodes.length); System.arraycopy(nodes, 0, result, baseAtyponFilters.length, nodes.length); return result; } /** * Create a FilteredInputStream that excludes the the atyponBaseFilters * * @param au The archival unit * @param in Incoming input stream * @param encoding The encoding */ public InputStream createFilteredInputStream(ArchivalUnit au, InputStream in, String encoding) throws PluginException { return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(baseAtyponFilters))); } /** * Create a FilteredInputStream that excludes the the atyponBaseFilters and moreNodes * * @param au The archival unit * @param in Incoming input stream * @param encoding The encoding * @param moreNodes An array of NodeFilters to be excluded with atyponBaseFilters */ public InputStream createFilteredInputStream( ArchivalUnit au, InputStream in, String encoding, NodeFilter[] moreNodes) throws PluginException { NodeFilter[] bothFilters = addTo(moreNodes); return new HtmlFilterInputStream( in, encoding, HtmlNodeFilterTransform.exclude(new OrFilter(bothFilters))); } }