private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception { Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>(); String currentBusiness = ""; Lexer lexer = new Lexer(page); while (true) { Node node = lexer.nextNode(); if (node == null) { break; } if (node instanceof TagNode) { TagNode tagNode = (TagNode) node; if (tagNode.getTagName().equals("A")) { String href = tagNode.getAttribute("href"); if (href != null) { String absUrl = AbsUrlConstructor.construct(docUrl, href); Crawler.dispatchUrl(absUrl); Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+"); if (pBusiness.matcher(absUrl).matches()) { currentBusiness = extractBusinessName(href); if (!businessTable.containsKey(currentBusiness)) { businessTable.put(currentBusiness, -1); } // System.out.println("currentBusiness = "+currentBusiness); // rating = "4"; // UpdateDatabase(linkID, business, rating, userID); // System.out.println(business + " added."); } } } else if (tagNode.getTagName().equals("IMG")) { String c1ass2 = tagNode.getAttribute("class"); if (c1ass2 != null) { String rating = ""; String[] rate = c1ass2.split("_"); int num = rate.length - 1; if (!rate[num].equals("loader")) { rating = rate[num].trim(); if (businessTable.get(currentBusiness) == -1) { businessTable.put(currentBusiness, Integer.parseInt(rating)); } } // System.out.println(linkID + " " + business + " " + rating + " " + userID; } } } } return businessTable; }
/**
 * Tells whether the tag is neither a {@code !}-prefixed tag nor one of the names
 * listed in {@code okHeadTags}.
 *
 * @param node tag to inspect
 * @return {@code false} when the tag name starts with {@code "!"} or equals an entry
 *         of {@code okHeadTags}; {@code true} otherwise
 */
private boolean isNotTagAppearingInHead(TagNode node) {
    final String name = node.getTagName();
    boolean allowedInHead = name.startsWith("!");
    if (!allowedInHead) {
        for (String candidate : okHeadTags) {
            if (name.equals(candidate)) {
                allowedInHead = true;
                break;
            }
        }
    }
    return !allowedInHead;
}
protected boolean checkAllowTag(ParseContext context, TagNode tagNode) { String tagName = tagNode.getTagName(); // Check the NOSCRIPT tag, if force-noscript is set, // then skip the NOSCRIPT tags and include contents explicitly if (tagName.equals("NOSCRIPT")) { String allPolicies = context.getOraclePolicy(); if ((allPolicies != null) && allPolicies.contains("force-noscript")) { return false; } } return true; }
public void handleOpenTagNode(ParseContext pContext, TagNode node) throws IOException { ReplayParseContext context = (ReplayParseContext) pContext; if (context.getData(FERRET_DONE_KEY) == null) { // we haven't emitted yet: // are we running in post-emit? if (context.getPhase() == ReplayParseEventDelegator.PHASE_POST_OUTPUT) { // emit if it is a body tag: if (node.getTagName().equals("BODY")) { emit((ReplayParseContext) context, node); } } else { // must be PHASE_PRE_MODIFY: if it's a body tag, emit now: if (isNotTagAppearingInHead(node)) { if (node.getTagName().equals(FRAMESET_TAG)) { // don't put content in pages with a FRAMESET: context.putData(FERRET_DONE_KEY, "1"); } else { // and this is a tag that shouldn't be in the HEAD. Emit: emit((ReplayParseContext) context, node); } } } } }
// TODO: This should all be refactored up into an abstract base class with // default no-op methods, allowing a subclass to only override the ones they // want... public void handleNode(ParseContext pContext, Node node) throws IOException { ReplayParseContext context = (ReplayParseContext) pContext; if (NodeUtils.isRemarkNode(node)) { RemarkNode remarkNode = (RemarkNode) node; remarkNode.setText(jsBlockTrans.transform(context, remarkNode.getText())); emit(context, null, node, null); } else if (NodeUtils.isTextNode(node)) { TextNode textNode = (TextNode) node; if (context.isInCSS()) { handleCSSTextNode(context, textNode); } else if (context.isInScriptText()) { handleJSTextNode(context, textNode); } else { emit(context, null, textNode, null); // handleContentTextNode(context,textNode); } } else if (NodeUtils.isTagNode(node)) { TagNode tagNode = (TagNode) node; if (NodeUtils.isOpenTagNodeNamed(tagNode, NodeUtils.SCRIPT_TAG_NAME)) { handleJSIncludeNode(context, tagNode); } else if (tagNode.isEndTag()) { if (tagNode.getTagName().equals("HEAD")) { context.putData(FERRET_IN_HEAD, null); } if (checkAllowTag(pContext, tagNode)) { emit(context, null, tagNode, null); } // handleCloseTagNode(context,tagNode); } else { // assume start, possibly empty: handleOpenTagNode(context, tagNode); } } else { throw new IllegalArgumentException("Unknown node type.."); } }
private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode) throws IOException { boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null; String preEmit = null; String postEmit = null; String tagName = tagNode.getTagName(); boolean alreadyInsertedHead = (context.getData(FERRET_HEAD_INSERTED) != null); if (!alreadyInsertedHead) { // If we're at the beginning of a <head> tag, and haven't inserted yet, // insert right AFTER head tag if (tagName.equals("HEAD")) { emitHeadInsert(context, tagNode, true); context.putData(FERRET_IN_HEAD, FERRET_IN_HEAD); return; } // If we're at the beginning of any tag, other than <html>, // (including <body>) and haven't inserted yet, // insert right BEFORE the next tag, also continue other default processing // of the tag if (!tagName.equals("HTML") && !tagName.equals("!DOCTYPE")) { emitHeadInsert(context, null, false); // Don't return continue to further processing } } boolean inHead = (context.getData(FERRET_IN_HEAD) != null); // Time to insert the JSP header? // IK added check to avoid inserting inside css or script if (!insertedJsp && !context.isInCSS() && !context.isInScriptText() && !inHead) { if (!okHeadTagMap.containsKey(tagName)) { if (tagName.equals(FRAMESET_TAG)) { // don't put the insert in framsets: } else { if (jspInsertPath != null && !context.getJspExec().getUiResults().getWbRequest().isIFrameWrapperContext()) { String tmp = null; try { tmp = context.getJspExec().jspToString(jspInsertPath); } catch (ServletException e) { e.printStackTrace(); } if (tagName.equals(BODY_TAG)) { // insert it now, *after* the current Tag: postEmit = tmp; } else { // hrm... we are seeing a node that should be in // the body.. lets emit the jsp now, *before* // the current Tag: preEmit = tmp; } } } context.putData(FERRET_DONE_KEY, ""); } } // now do all the usual attribute rewriting: // this could be slightly optimized by moving tags more likely to occur // to the front of the if/else if/else if routing... 
if (tagName.equals("A")) { transformAttr(context, tagNode, "HREF", anchorUrlTrans); } else if (tagName.equals("APPLET")) { transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans); transformAttr(context, tagNode, "ARCHIVE", objectEmbedUrlTrans); } else if (tagName.equals("AREA")) { transformAttr(context, tagNode, "HREF", anchorUrlTrans); } else if (tagName.equals("BASE")) { String orig = tagNode.getAttribute("HREF"); if (orig != null) { try { context.setBaseUrl(new URL(orig)); transformAttr(context, tagNode, "HREF", anchorUrlTrans); } catch (MalformedURLException e) { e.printStackTrace(); } } } else if (tagName.equals("EMBED")) { transformAttr(context, tagNode, "SRC", objectEmbedUrlTrans); } else if (tagName.equals("IFRAME")) { transformAttr(context, tagNode, "SRC", iframeUrlTrans); } else if (tagName.equals("IMG")) { transformAttr(context, tagNode, "SRC", imageUrlTrans); } else if (tagName.equals("INPUT")) { transformAttr(context, tagNode, "SRC", imageUrlTrans); } else if (tagName.equals("FORM")) { transformAttr(context, tagNode, "ACTION", anchorUrlTrans); } else if (tagName.equals("FRAME")) { transformAttr(context, tagNode, "SRC", framesetUrlTrans); } else if (tagName.equals("LINK")) { if (transformAttrWhere(context, tagNode, "REL", "STYLESHEET", "HREF", cssUrlTrans)) { // no-op } else if (transformAttrWhere( context, tagNode, "REL", "SHORTCUT ICON", "HREF", imageUrlTrans)) { // no-op } else { transformAttr(context, tagNode, "HREF", anchorUrlTrans); } } else if (tagName.equals("META")) { transformAttrWhere(context, tagNode, "HTTP-EQUIV", "REFRESH", "CONTENT", metaRefreshTrans); transformAttr(context, tagNode, "URL", anchorUrlTrans); } else if (tagName.equals("OBJECT")) { transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans); transformAttr(context, tagNode, "CDATA", objectEmbedUrlTrans); } else if (tagName.equals("SCRIPT")) { transformAttr(context, tagNode, "SRC", jsUrlTrans); } else if (tagName.equals("DIV") || tagName.equals("LI")) { // HTML5 
-- can have data-src or data-uri attributes in any tag! // Can really be in any tag but for now using most common use cases // Experimental transformAttr(context, tagNode, "data-src", objectEmbedUrlTrans); transformAttr(context, tagNode, "data-uri", objectEmbedUrlTrans); } else { if (!checkAllowTag(context, tagNode)) { return; } } // now, for *all* tags... transformAttr(context, tagNode, "BACKGROUND", imageUrlTrans); transformAttr(context, tagNode, "STYLE", cssInlineTrans); transformAttr(context, tagNode, "onclick", jsBlockTrans); transformAttr(context, tagNode, "onload", jsBlockTrans); emit(context, preEmit, tagNode, postEmit); }