/**
 * Rewrites one attribute of a tag in place, but only when the tag actually carries it.
 *
 * @param context the ReplayParseContext handed through to the transformer
 * @param node the TagNode whose attribute may be rewritten
 * @param attr name of the attribute to look up and overwrite
 * @param transformer the StringTransformer that computes the replacement from the current value
 * @return true when the attribute was present and has been replaced, false when it was absent
 */
private boolean transformAttr(
    ReplayParseContext context, TagNode node, String attr, StringTransformer transformer) {
  String current = node.getAttribute(attr);
  if (current == null) {
    // Nothing to rewrite: the tag does not have this attribute.
    return false;
  }
  String rewritten = transformer.transform(context, current);
  node.setAttribute(attr, rewritten);
  return true;
}
private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception { Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>(); String currentBusiness = ""; Lexer lexer = new Lexer(page); while (true) { Node node = lexer.nextNode(); if (node == null) { break; } if (node instanceof TagNode) { TagNode tagNode = (TagNode) node; if (tagNode.getTagName().equals("A")) { String href = tagNode.getAttribute("href"); if (href != null) { String absUrl = AbsUrlConstructor.construct(docUrl, href); Crawler.dispatchUrl(absUrl); Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+"); if (pBusiness.matcher(absUrl).matches()) { currentBusiness = extractBusinessName(href); if (!businessTable.containsKey(currentBusiness)) { businessTable.put(currentBusiness, -1); } // System.out.println("currentBusiness = "+currentBusiness); // rating = "4"; // UpdateDatabase(linkID, business, rating, userID); // System.out.println(business + " added."); } } } else if (tagNode.getTagName().equals("IMG")) { String c1ass2 = tagNode.getAttribute("class"); if (c1ass2 != null) { String rating = ""; String[] rate = c1ass2.split("_"); int num = rate.length - 1; if (!rate[num].equals("loader")) { rating = rate[num].trim(); if (businessTable.get(currentBusiness) == -1) { businessTable.put(currentBusiness, Integer.parseInt(rating)); } } // System.out.println(linkID + " " + business + " " + rating + " " + userID; } } } } return businessTable; }
/**
 * Parses the given text to create the tag contents.
 *
 * <p>The first node the lexer produces from the text supplies this tag's page, start and end
 * positions and attributes.
 *
 * @param text A string of the form {@code <TAGNAME xx="yy">}.
 * @throws IllegalArgumentException if the lexer fails on the text (the ParserException is kept
 *     as the cause), or if the first node it produces is missing or is not a tag
 */
public void setText(String text) {
  Lexer lexer = new Lexer(text);
  try {
    // Held as Object so that a missing or non-tag first node is reported as bad input
    // rather than surfacing as a NullPointerException or ClassCastException.
    Object first = lexer.nextNode();
    if (!(first instanceof TagNode)) {
      throw new IllegalArgumentException("text does not begin with a tag: " + text);
    }
    TagNode output = (TagNode) first;
    mPage = output.getPage();
    nodeBegin = output.getStartPosition();
    nodeEnd = output.getEndPosition();
    mAttributes = output.getAttributesEx();
  } catch (ParserException pe) {
    // Keep the original exception as the cause so the stack trace is not lost.
    throw new IllegalArgumentException(pe.getMessage(), pe);
  }
}
private void handleJSIncludeNode(ReplayParseContext context, TagNode tagNode) throws IOException { String file = tagNode.getAttribute("SRC"); if (file != null) { // TODO: This is hacky.. fix it // This is used to check if the file should be skipped... // from a custom rule.. String result = jsBlockTrans.transform(context, file); // The rewriting is done by the js_ rewriter if ((result != null) && !result.isEmpty()) { tagNode.setAttribute("SRC", jsUrlTrans.transform(context, file)); } else { file = ""; tagNode.setAttribute("SRC", jsUrlTrans.transform(context, file)); } } emit(context, null, tagNode, null); }
/**
 * Tells whether a tag is one that is not accepted as HEAD content.
 *
 * @param node the tag to classify
 * @return false when the tag name starts with "!" or equals one of {@code okHeadTags};
 *     true otherwise
 */
private boolean isNotTagAppearingInHead(TagNode node) {
  String name = node.getTagName();
  boolean acceptedInHead = name.startsWith("!");
  if (!acceptedInHead) {
    for (String headTag : okHeadTags) {
      if (name.equals(headTag)) {
        acceptedInHead = true;
        break;
      }
    }
  }
  return !acceptedInHead;
}
// TODO: This should all be refactored up into an abstract base class with // default no-op methods, allowing a subclass to only override the ones they // want... public void handleNode(ParseContext pContext, Node node) throws IOException { ReplayParseContext context = (ReplayParseContext) pContext; if (NodeUtils.isRemarkNode(node)) { RemarkNode remarkNode = (RemarkNode) node; remarkNode.setText(jsBlockTrans.transform(context, remarkNode.getText())); emit(context, null, node, null); } else if (NodeUtils.isTextNode(node)) { TextNode textNode = (TextNode) node; if (context.isInCSS()) { handleCSSTextNode(context, textNode); } else if (context.isInScriptText()) { handleJSTextNode(context, textNode); } else { emit(context, null, textNode, null); // handleContentTextNode(context,textNode); } } else if (NodeUtils.isTagNode(node)) { TagNode tagNode = (TagNode) node; if (NodeUtils.isOpenTagNodeNamed(tagNode, NodeUtils.SCRIPT_TAG_NAME)) { handleJSIncludeNode(context, tagNode); } else if (tagNode.isEndTag()) { if (tagNode.getTagName().equals("HEAD")) { context.putData(FERRET_IN_HEAD, null); } if (checkAllowTag(pContext, tagNode)) { emit(context, null, tagNode, null); } // handleCloseTagNode(context,tagNode); } else { // assume start, possibly empty: handleOpenTagNode(context, tagNode); } } else { throw new IllegalArgumentException("Unknown node type.."); } }
/**
 * Retrieves the value of a table cell by appending the text of the cell's child nodes. For
 * composite tags like span or div the inner text ({@code getStringText()}) is appended; for any
 * other child its {@code getText()} is appended.
 *
 * <p>A cell without children yields the same result as a cell whose children contribute no text.
 *
 * @param cell the table cell to read
 * @return the concatenated text, trimmed, with every match of the pattern literal below removed
 */
public static String getValue(TagNode cell) {
  StringBuilder value = new StringBuilder(EMPTY);

  // The children list is null (not empty) for a tag that has no children, so it must be
  // checked before iterating.
  if (cell.getChildren() != null) {
    for (Node child : cell.getChildren().toNodeArray()) {
      if (child instanceof CompositeTag) {
        value.append(((CompositeTag) child).getStringText());
      } else {
        value.append(child.getText());
      }
    }
  }

  // NOTE(review): the pattern literal is reproduced unchanged from the original; confirm
  // whether it is meant to be a plain space or a non-breaking space.
  return value.toString().trim().replaceAll(" ", EMPTY);
}
/**
 * Transform a particular attribute on a TagNode, if that TagNode has a previous value for the
 * updated attribute, AND if that TagNode contains another named attribute with a specific value.
 *
 * <p>The comparison upper-cases the tag's attribute value with {@code Locale.ROOT}, so the
 * outcome does not depend on the JVM's default locale (under a Turkish default locale a plain
 * {@code toUpperCase()} maps "i" to a dotted capital and would never match e.g. "SHORTCUT ICON").
 *
 * @param context the ReplayParseContext
 * @param node the TagNode to be updated
 * @param attrName update only occurs if the TagNode has an attribute with this name.
 * @param attrVal update only occurs if the attribute attrName has this value, case insensitive.
 *     As an optimization, it is ASSUMED that this argument is already UPPER-CASED
 * @param modAttr the name of the attribute to update
 * @param transformer the StringTransformer responsible for creating the new value based on the
 *     old one.
 * @return true if the attribute was updated.
 */
private boolean transformAttrWhere(
    ReplayParseContext context,
    TagNode node,
    String attrName,
    String attrVal,
    String modAttr,
    StringTransformer transformer) {
  String val = node.getAttribute(attrName);
  if (val == null) {
    return false;
  }
  if (!val.toUpperCase(java.util.Locale.ROOT).equals(attrVal)) {
    return false;
  }
  return transformAttr(context, node, modAttr, transformer);
}
protected boolean checkAllowTag(ParseContext context, TagNode tagNode) { String tagName = tagNode.getTagName(); // Check the NOSCRIPT tag, if force-noscript is set, // then skip the NOSCRIPT tags and include contents explicitly if (tagName.equals("NOSCRIPT")) { String allPolicies = context.getOraclePolicy(); if ((allPolicies != null) && allPolicies.contains("force-noscript")) { return false; } } return true; }
public void handleOpenTagNode(ParseContext pContext, TagNode node) throws IOException { ReplayParseContext context = (ReplayParseContext) pContext; if (context.getData(FERRET_DONE_KEY) == null) { // we haven't emitted yet: // are we running in post-emit? if (context.getPhase() == ReplayParseEventDelegator.PHASE_POST_OUTPUT) { // emit if it is a body tag: if (node.getTagName().equals("BODY")) { emit((ReplayParseContext) context, node); } } else { // must be PHASE_PRE_MODIFY: if it's a body tag, emit now: if (isNotTagAppearingInHead(node)) { if (node.getTagName().equals(FRAMESET_TAG)) { // don't put content in pages with a FRAMESET: context.putData(FERRET_DONE_KEY, "1"); } else { // and this is a tag that shouldn't be in the HEAD. Emit: emit((ReplayParseContext) context, node); } } } } }
/**
 * Create a tag like the one provided.
 *
 * <p>Delegates to the four-argument constructor with the given tag's page, tag begin, tag end
 * and attribute list, then installs the supplied scanner via {@code setThisScanner}.
 *
 * @param tag The tag to emulate.
 * @param scanner The scanner for this tag.
 */
public TagNode(TagNode tag, TagScanner scanner) {
  this(tag.getPage(), tag.getTagBegin(), tag.getTagEnd(), tag.getAttributesEx());
  setThisScanner(scanner);
}
/** * Creates a list of Grids based on the given HTML string. This works only for table-based HTML * documents. * * @param html the HTML string. * @return a list of Grids. */ public static List<Grid> fromHtml(String html) throws Exception { if (html == null || html.trim().isEmpty()) { return null; } List<Grid> grids = new ArrayList<>(); Parser parser = Parser.createParser(html, "UTF-8"); Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray(); for (Node t : tables) { Grid grid = new ListGrid(); TableTag table = (TableTag) t; TableRow[] rows = table.getRows(); Integer firstColumnCount = null; for (TableRow row : rows) { if (getColumnCount(row) == 0) // Ignore if no cells { log.warn("Ignoring row with no columns"); continue; } Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray(); if (firstColumnCount == null) // First row becomes header { firstColumnCount = getColumnCount(row); for (Node c : cells) { TagNode cell = (TagNode) c; grid.addHeader(new GridHeader(getValue(cell), false, false)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyHeaders((colSpan - 1)); } } } else // Rest becomes rows { if (firstColumnCount != getColumnCount(row)) // Ignore { log.warn( "Ignoring row which has " + row.getColumnCount() + " columns since table has " + firstColumnCount + " columns"); continue; } grid.addRow(); for (Node c : cells) { // TODO row span TagNode cell = (TagNode) c; grid.addValue(getValue(cell)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyValues((colSpan - 1)); } } } } grids.add(grid); } return grids; }
/**
 * Processes one page: fetches its posts and, for each post not already known, collects the
 * pictures found in its photo remarks and in its photoset iframes before handing the post to
 * {@code Main.handlePost}. Posts that are already known are passed to
 * {@code handleNonDownloadPost} instead.
 *
 * <p>Any exception raised while processing is caught here, reported via {@code Main.status} and
 * ends processing of the remaining posts on this page.
 *
 * @param address the address of the page to process
 * @return false when the page yields no posts; true otherwise, including when an error was
 *     caught
 */
private static boolean handleURL(String address) {
  Main.status(String.format("Processing page \"%s\".", address));
  try {
    Node[] postNodes = getPosts(address).toNodeArray();
    if (postNodes.length == 0) {
      return false;
    }

    for (Node postNode : postNodes) {
      if (!(postNode instanceof TagNode)) {
        continue;
      }
      TagNode post = (TagNode) postNode;
      // The numeric post id follows a five-character prefix in the "id" attribute.
      Post newPost = new Post(Long.parseLong(post.getAttribute("id").substring(5)));

      if (Main.post_post_hash.containsKey(newPost)) {
        Post knownPost = post_post_hash.get(newPost);
        handleNonDownloadPost(knownPost);
        continue;
      }

      NodeList photoPosts = getPhotoPosts(post.getChildren());
      for (Node remark : getRemarks(photoPosts).toNodeArray()) {
        Matcher matcher = lores.matcher(remark.getText());
        if (!matcher.find()) {
          // This remark carries no media URL. Skip it instead of building a picture from
          // an empty string, which would throw and abort the whole page.
          continue;
        }
        String matched = matcher.group();
        addPictureWithThumbnail(newPost, matched.substring(17, matched.length() - 1));
      }

      NodeList photosetPosts = getPhotosetPosts(post.getChildren());
      for (Node frame : getIFrames(photosetPosts).toNodeArray()) {
        if (!(frame instanceof TagNode)) {
          continue;
        }
        String iframeUrl = ((TagNode) frame).getAttribute("src");
        Parser frameParser = new Parser(iframeUrl);
        NodeList anchors = frameParser.extractAllNodesThatMatch(new TagNameFilter("a"));
        Node[] anchorArray = anchors.toNodeArray();
        Node[] images =
            anchors.extractAllNodesThatMatch(new TagNameFilter("img"), true).toNodeArray();

        // The images are indexed by anchor position; bound the loop by both arrays so
        // an anchor without an image cannot run past the end of the image array.
        int count = Math.min(anchorArray.length, images.length);
        for (int i = 0; i < count; i++) {
          addPictureWithThumbnail(newPost, ((TagNode) images[i]).getAttribute("src"));
        }
      }

      Main.handlePost(newPost);
    }
  } catch (Exception ex) {
    ex.printStackTrace();
    Main.status("Error handling post.");
  }
  return true;
}

/**
 * Adds the picture at the given media URL to the post, paired with a thumbnail URL obtained by
 * replacing the segment between the last "_" and the last "." of the media URL with "_75sq".
 */
private static void addPictureWithThumbnail(Post post, String mediaUrl)
    throws java.net.MalformedURLException {
  String sizeSegment =
      mediaUrl.substring(mediaUrl.lastIndexOf("_"), mediaUrl.lastIndexOf("."));
  URL thumbUrl = new URL(mediaUrl.replace(sizeSegment, "_75sq"));
  post.pictures.add(new Picture(new URL(mediaUrl), thumbUrl));
}
/** * 生成预览内容 * * @param html * @param max_count * @return */ public static String preview(String html, int max_count) { if (html.length() <= max_count * 1.1) return html; Parser parser = new Parser(); StringBuffer prvContent = new StringBuffer(); try { parser.setEncoding(Globals.ENC_8859_1); parser.setInputHTML(html); parser.setNodeFactory(factory); NodeList nodes = parser.extractAllNodesThatMatch(nfilter); Node node = null; for (int i = 0; i < nodes.size(); i++) { if (prvContent.length() >= max_count) { if (node instanceof TagNode) { TagNode tmp_node = (TagNode) node; boolean isEnd = tmp_node.isEndTag(); if (!isEnd) { prvContent.setLength(prvContent.length() - tmp_node.getText().length() - 2); } } // 补齐所有未关闭的标签 Node parent = node; // System.out.println("current node is . "+parent.getText()); do { // System.out.println(parent.getClass().getName()+":"+parent.getText()); parent = parent.getParent(); // System.out.println("parent = "+parent); if (parent == null) break; if (!(parent instanceof TagNode)) continue; // System.out.println("Parent node is no ended. "+parent.getText()); prvContent.append(((TagNode) parent).getEndTag().toHtml()); } while (true); break; } node = nodes.elementAt(i); if (node instanceof TagNode) { TagNode tag = (TagNode) node; prvContent.append('<'); prvContent.append(tag.getText()); prvContent.append('>'); // System.out.println("TAG: " + '<'+tag.getText()+'>'); } else if (node instanceof TextNode) { int space = max_count - prvContent.length(); if (space > 10) { TextNode text = (TextNode) node; if (text.getText().length() < 10) prvContent.append(text.getText()); else prvContent.append( StringUtils.abbreviate(text.getText(), max_count - prvContent.length())); // System.out.println("TEXT: " + text.getText()); } } } return prvContent.toString(); } catch (ParserException e) { e.printStackTrace(); } finally { parser = null; } return html; }
private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode) throws IOException { boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null; String preEmit = null; String postEmit = null; String tagName = tagNode.getTagName(); boolean alreadyInsertedHead = (context.getData(FERRET_HEAD_INSERTED) != null); if (!alreadyInsertedHead) { // If we're at the beginning of a <head> tag, and haven't inserted yet, // insert right AFTER head tag if (tagName.equals("HEAD")) { emitHeadInsert(context, tagNode, true); context.putData(FERRET_IN_HEAD, FERRET_IN_HEAD); return; } // If we're at the beginning of any tag, other than <html>, // (including <body>) and haven't inserted yet, // insert right BEFORE the next tag, also continue other default processing // of the tag if (!tagName.equals("HTML") && !tagName.equals("!DOCTYPE")) { emitHeadInsert(context, null, false); // Don't return continue to further processing } } boolean inHead = (context.getData(FERRET_IN_HEAD) != null); // Time to insert the JSP header? // IK added check to avoid inserting inside css or script if (!insertedJsp && !context.isInCSS() && !context.isInScriptText() && !inHead) { if (!okHeadTagMap.containsKey(tagName)) { if (tagName.equals(FRAMESET_TAG)) { // don't put the insert in framsets: } else { if (jspInsertPath != null && !context.getJspExec().getUiResults().getWbRequest().isIFrameWrapperContext()) { String tmp = null; try { tmp = context.getJspExec().jspToString(jspInsertPath); } catch (ServletException e) { e.printStackTrace(); } if (tagName.equals(BODY_TAG)) { // insert it now, *after* the current Tag: postEmit = tmp; } else { // hrm... we are seeing a node that should be in // the body.. lets emit the jsp now, *before* // the current Tag: preEmit = tmp; } } } context.putData(FERRET_DONE_KEY, ""); } } // now do all the usual attribute rewriting: // this could be slightly optimized by moving tags more likely to occur // to the front of the if/else if/else if routing... 
if (tagName.equals("A")) { transformAttr(context, tagNode, "HREF", anchorUrlTrans); } else if (tagName.equals("APPLET")) { transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans); transformAttr(context, tagNode, "ARCHIVE", objectEmbedUrlTrans); } else if (tagName.equals("AREA")) { transformAttr(context, tagNode, "HREF", anchorUrlTrans); } else if (tagName.equals("BASE")) { String orig = tagNode.getAttribute("HREF"); if (orig != null) { try { context.setBaseUrl(new URL(orig)); transformAttr(context, tagNode, "HREF", anchorUrlTrans); } catch (MalformedURLException e) { e.printStackTrace(); } } } else if (tagName.equals("EMBED")) { transformAttr(context, tagNode, "SRC", objectEmbedUrlTrans); } else if (tagName.equals("IFRAME")) { transformAttr(context, tagNode, "SRC", iframeUrlTrans); } else if (tagName.equals("IMG")) { transformAttr(context, tagNode, "SRC", imageUrlTrans); } else if (tagName.equals("INPUT")) { transformAttr(context, tagNode, "SRC", imageUrlTrans); } else if (tagName.equals("FORM")) { transformAttr(context, tagNode, "ACTION", anchorUrlTrans); } else if (tagName.equals("FRAME")) { transformAttr(context, tagNode, "SRC", framesetUrlTrans); } else if (tagName.equals("LINK")) { if (transformAttrWhere(context, tagNode, "REL", "STYLESHEET", "HREF", cssUrlTrans)) { // no-op } else if (transformAttrWhere( context, tagNode, "REL", "SHORTCUT ICON", "HREF", imageUrlTrans)) { // no-op } else { transformAttr(context, tagNode, "HREF", anchorUrlTrans); } } else if (tagName.equals("META")) { transformAttrWhere(context, tagNode, "HTTP-EQUIV", "REFRESH", "CONTENT", metaRefreshTrans); transformAttr(context, tagNode, "URL", anchorUrlTrans); } else if (tagName.equals("OBJECT")) { transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans); transformAttr(context, tagNode, "CDATA", objectEmbedUrlTrans); } else if (tagName.equals("SCRIPT")) { transformAttr(context, tagNode, "SRC", jsUrlTrans); } else if (tagName.equals("DIV") || tagName.equals("LI")) { // HTML5 
-- can have data-src or data-uri attributes in any tag! // Can really be in any tag but for now using most common use cases // Experimental transformAttr(context, tagNode, "data-src", objectEmbedUrlTrans); transformAttr(context, tagNode, "data-uri", objectEmbedUrlTrans); } else { if (!checkAllowTag(context, tagNode)) { return; } } // now, for *all* tags... transformAttr(context, tagNode, "BACKGROUND", imageUrlTrans); transformAttr(context, tagNode, "STYLE", cssInlineTrans); transformAttr(context, tagNode, "onclick", jsBlockTrans); transformAttr(context, tagNode, "onload", jsBlockTrans); emit(context, preEmit, tagNode, postEmit); }