Ejemplo n.º 1
0
  /**
   * Walks every node the lexer produces for {@code page}, dispatching each anchor URL to the
   * crawler and collecting one rating per Yelp business link.
   *
   * <p>An {@code A} tag whose absolute URL matches the business pattern becomes the "current"
   * business and is registered with the placeholder rating {@code -1}. A later {@code IMG} tag
   * supplies the rating: the last underscore-separated token of its {@code class} attribute is
   * parsed as an integer and stored, but only while the current business still holds the
   * placeholder.
   *
   * <p>Images that cannot carry a rating are skipped instead of aborting the scan: images seen
   * before any business link, class values with no tokens, the {@code loader} image, and class
   * suffixes that are not integers.
   *
   * @param docUrl URL of the document, used as the base when resolving {@code href} values
   * @param page page to tokenize
   * @return table mapping business name to its rating, or {@code -1} when no rating was found
   * @throws Exception if lexing, URL construction, dispatching or name extraction fails
   */
  private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception {
    Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>();
    // Loop-invariant: compile once per call rather than once per anchor tag.
    Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+");
    String currentBusiness = "";
    Lexer lexer = new Lexer(page);

    Node node;
    while ((node = lexer.nextNode()) != null) {
      if (!(node instanceof TagNode)) {
        continue;
      }
      TagNode tagNode = (TagNode) node;
      String tagName = tagNode.getTagName();

      if (tagName.equals("A")) {
        String href = tagNode.getAttribute("href");
        if (href == null) {
          continue;
        }
        String absUrl = AbsUrlConstructor.construct(docUrl, href);

        // Every resolved link is handed to the crawler, business link or not.
        Crawler.dispatchUrl(absUrl);

        if (pBusiness.matcher(absUrl).matches()) {
          currentBusiness = extractBusinessName(href);
          if (!businessTable.containsKey(currentBusiness)) {
            businessTable.put(currentBusiness, -1);
          }
        }
      } else if (tagName.equals("IMG")) {
        String cssClass = tagNode.getAttribute("class");
        if (cssClass == null) {
          continue;
        }

        // split() drops trailing empty tokens, so a class such as "_" yields an empty array.
        String[] parts = cssClass.split("_");
        if (parts.length == 0) {
          continue;
        }
        String lastPart = parts[parts.length - 1];
        if (lastPart.equals("loader")) {
          continue;
        }

        // get() returns null when no business link has been seen yet; comparing that directly
        // against -1 would unbox null and throw a NullPointerException.
        Integer storedRating = businessTable.get(currentBusiness);
        if (storedRating == null || storedRating.intValue() != -1) {
          continue;
        }

        try {
          businessTable.put(currentBusiness, Integer.parseInt(lastPart.trim()));
        } catch (NumberFormatException ignored) {
          // Not a rating image (class suffix is not a number); keep waiting for one.
        }
      }
    }

    return businessTable;
  }
 /**
  * Reports whether the tag is one that is not expected inside the document head.
  *
  * <p>A tag counts as a head tag when its name starts with {@code "!"} or equals one of the
  * entries in {@code okHeadTags}.
  *
  * @param node tag to classify
  * @return {@code false} for head tags as defined above, {@code true} for every other tag
  */
 private boolean isNotTagAppearingInHead(TagNode node) {
   final String name = node.getTagName();
   boolean headTag = name.startsWith("!");
   if (!headTag) {
     for (String candidate : okHeadTags) {
       if (name.equals(candidate)) {
         headTag = true;
         break;
       }
     }
   }
   return !headTag;
 }
  /**
   * Decides whether a tag may be emitted.
   *
   * <p>Only {@code NOSCRIPT} tags can be rejected: when the context's oracle policy string
   * contains {@code "force-noscript"}, the NOSCRIPT tag itself is dropped so that its contents
   * are included explicitly.
   *
   * @param context parse context supplying the oracle policy
   * @param tagNode tag being considered
   * @return {@code false} for a NOSCRIPT tag under a force-noscript policy, otherwise
   *     {@code true}
   */
  protected boolean checkAllowTag(ParseContext context, TagNode tagNode) {
    if (!tagNode.getTagName().equals("NOSCRIPT")) {
      return true;
    }

    final String policies = context.getOraclePolicy();
    return policies == null || !policies.contains("force-noscript");
  }
 /**
  * Decides, for an opening tag, whether {@code emit} should be called for it.
  *
  * <p>Nothing happens once {@code FERRET_DONE_KEY} holds a value in the context. Otherwise:
  *
  * <ul>
  *   <li>in the post-output phase, {@code emit} is called only for a {@code BODY} tag;
  *   <li>in any other phase (the original comment assumes PHASE_PRE_MODIFY), the first tag that
  *       is not a head tag triggers {@code emit}, unless it is a FRAMESET, in which case the
  *       done key is set to {@code "1"} and nothing is emitted.
  * </ul>
  *
  * @param pContext parse context; must be a {@code ReplayParseContext}
  * @param node opening tag being processed
  * @throws IOException declared for the {@code emit} call
  */
 public void handleOpenTagNode(ParseContext pContext, TagNode node) throws IOException {
   final ReplayParseContext replayContext = (ReplayParseContext) pContext;

   // Already handled for this document: nothing left to do.
   if (replayContext.getData(FERRET_DONE_KEY) != null) {
     return;
   }

   if (replayContext.getPhase() == ReplayParseEventDelegator.PHASE_POST_OUTPUT) {
     // Post-output phase: only the BODY tag triggers the emit.
     if (node.getTagName().equals("BODY")) {
       emit(replayContext, node);
     }
     return;
   }

   // Any other phase: wait for the first tag that does not belong in the HEAD.
   if (!isNotTagAppearingInHead(node)) {
     return;
   }

   if (node.getTagName().equals(FRAMESET_TAG)) {
     // Pages with a FRAMESET get no inserted content; just mark as done.
     replayContext.putData(FERRET_DONE_KEY, "1");
   } else {
     emit(replayContext, node);
   }
 }
  // TODO: This should all be refactored up into an abstract base class with
  // default no-op methods, allowing a subclass to only override the ones they
  // want...
  /**
   * Routes a parsed node to the handler matching its kind.
   *
   * <ul>
   *   <li>Remark nodes: text is passed through {@code jsBlockTrans}, then the node is emitted.
   *   <li>Text nodes: handled as CSS or JS text when the context says so, otherwise emitted
   *       unchanged.
   *   <li>Tag nodes: open SCRIPT tags go to {@code handleJSIncludeNode}; end tags are emitted
   *       when {@code checkAllowTag} permits (a closing HEAD also clears {@code FERRET_IN_HEAD});
   *       every other tag goes to {@code handleOpenTagNode}.
   * </ul>
   *
   * @param pContext parse context; must be a {@code ReplayParseContext}
   * @param node node to process
   * @throws IOException declared for the emit and handler calls
   * @throws IllegalArgumentException if the node is not a remark, text or tag node
   */
  public void handleNode(ParseContext pContext, Node node) throws IOException {
    ReplayParseContext context = (ReplayParseContext) pContext;

    if (NodeUtils.isRemarkNode(node)) {
      RemarkNode remarkNode = (RemarkNode) node;
      remarkNode.setText(jsBlockTrans.transform(context, remarkNode.getText()));
      emit(context, null, node, null);
      return;
    }

    if (NodeUtils.isTextNode(node)) {
      TextNode textNode = (TextNode) node;
      if (context.isInCSS()) {
        handleCSSTextNode(context, textNode);
      } else if (context.isInScriptText()) {
        handleJSTextNode(context, textNode);
      } else {
        // Plain content text passes through untouched.
        emit(context, null, textNode, null);
      }
      return;
    }

    if (!NodeUtils.isTagNode(node)) {
      throw new IllegalArgumentException("Unknown node type..");
    }

    TagNode tagNode = (TagNode) node;

    if (NodeUtils.isOpenTagNodeNamed(tagNode, NodeUtils.SCRIPT_TAG_NAME)) {
      handleJSIncludeNode(context, tagNode);
      return;
    }

    if (!tagNode.isEndTag()) {
      // Assume a start tag, possibly an empty one.
      handleOpenTagNode(context, tagNode);
      return;
    }

    // End tag: leaving HEAD clears the in-head marker before the tag is emitted.
    if (tagNode.getTagName().equals("HEAD")) {
      context.putData(FERRET_IN_HEAD, null);
    }

    if (checkAllowTag(pContext, tagNode)) {
      emit(context, null, tagNode, null);
    }
  }
  /**
   * Processes an opening tag in three steps: head insert, JSP insert, attribute rewriting.
   *
   * <ol>
   *   <li><b>Head insert.</b> While {@code FERRET_HEAD_INSERTED} is unset, a {@code HEAD} tag
   *       triggers {@code emitHeadInsert(context, tagNode, true)}, marks the context as being in
   *       the head and returns immediately. Any other tag except {@code HTML} and
   *       {@code !DOCTYPE} triggers {@code emitHeadInsert(context, null, false)} and processing
   *       continues.
   *   <li><b>JSP insert.</b> While {@code FERRET_DONE_KEY} is unset and the context is not in
   *       CSS, script text or the head, the first tag not present in {@code okHeadTagMap} marks
   *       the insert as done. Unless that tag is a FRAMESET, and provided {@code jspInsertPath}
   *       is set and the request is not an iframe-wrapper context, the JSP is rendered to a
   *       string. It is emitted after the tag for BODY and before the tag otherwise.
   *   <li><b>Attribute rewriting.</b> URL-bearing attributes are passed through the transformer
   *       matching the tag, then BACKGROUND, STYLE, onclick and onload are rewritten for all
   *       tags and the tag is emitted with any pre/post insert text.
   * </ol>
   *
   * <p>Tags without a dedicated rewrite branch are checked with {@code checkAllowTag}; when
   * that returns {@code false} the method returns without emitting anything.
   *
   * @param context replay parse context holding per-document state
   * @param tagNode opening tag to process
   * @throws IOException declared for the emit calls
   */
  private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode) throws IOException {

    // Snapshot taken before this call possibly sets FERRET_DONE_KEY further down.
    boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null;

    // Text to emit immediately before / after the tag; stays null when there is no insert.
    String preEmit = null;
    String postEmit = null;

    String tagName = tagNode.getTagName();

    // NOTE(review): FERRET_HEAD_INSERTED is never set in this method; presumably
    // emitHeadInsert sets it -- confirm.
    boolean alreadyInsertedHead = (context.getData(FERRET_HEAD_INSERTED) != null);

    if (!alreadyInsertedHead) {
      // If we're at the beginning of a <head> tag, and haven't inserted yet,
      // insert right AFTER head tag
      if (tagName.equals("HEAD")) {
        // NOTE(review): returning here skips attribute rewriting and the final emit for the
        // HEAD tag; presumably emitHeadInsert emits the tag itself -- confirm.
        emitHeadInsert(context, tagNode, true);
        context.putData(FERRET_IN_HEAD, FERRET_IN_HEAD);
        return;
      }

      // If we're at the beginning of any tag, other than <html>,
      // (including <body>) and haven't inserted yet,
      // insert right BEFORE the next tag, also continue other default processing
      // of the tag
      if (!tagName.equals("HTML") && !tagName.equals("!DOCTYPE")) {
        emitHeadInsert(context, null, false);
        // Don't return continue to further processing
      }
    }

    boolean inHead = (context.getData(FERRET_IN_HEAD) != null);

    // Time to insert the JSP header?
    // IK added check to avoid inserting inside css or script
    if (!insertedJsp && !context.isInCSS() && !context.isInScriptText() && !inHead) {
      if (!okHeadTagMap.containsKey(tagName)) {
        if (tagName.equals(FRAMESET_TAG)) {
          // don't put the insert in framesets:
        } else {
          if (jspInsertPath != null
              && !context.getJspExec().getUiResults().getWbRequest().isIFrameWrapperContext()) {
            String tmp = null;
            try {
              tmp = context.getJspExec().jspToString(jspInsertPath);
            } catch (ServletException e) {
              // Rendering failure is only printed; tmp stays null, so no insert text is emitted.
              e.printStackTrace();
            }
            if (tagName.equals(BODY_TAG)) {
              // insert it now, *after* the current Tag:
              postEmit = tmp;
            } else {
              // hrm... we are seeing a node that should be in
              // the body.. lets emit the jsp now, *before*
              // the current Tag:
              preEmit = tmp;
            }
          }
        }
        // Marked done for every non-head tag, including FRAMESET and the no-JSP cases, so the
        // insert is attempted at most once.
        context.putData(FERRET_DONE_KEY, "");
      }
    }

    // now do all the usual attribute rewriting:
    // this could be slightly optimized by moving tags more likely to occur
    // to the front of the if/else if/else if routing...

    if (tagName.equals("A")) {
      transformAttr(context, tagNode, "HREF", anchorUrlTrans);

    } else if (tagName.equals("APPLET")) {
      transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans);
      transformAttr(context, tagNode, "ARCHIVE", objectEmbedUrlTrans);

    } else if (tagName.equals("AREA")) {
      transformAttr(context, tagNode, "HREF", anchorUrlTrans);

    } else if (tagName.equals("BASE")) {
      String orig = tagNode.getAttribute("HREF");
      if (orig != null) {
        try {
          // Record the original base URL in the context before the attribute is rewritten.
          context.setBaseUrl(new URL(orig));
          transformAttr(context, tagNode, "HREF", anchorUrlTrans);

        } catch (MalformedURLException e) {
          // A malformed base is only printed; the HREF is then left unrewritten.
          e.printStackTrace();
        }
      }

    } else if (tagName.equals("EMBED")) {
      transformAttr(context, tagNode, "SRC", objectEmbedUrlTrans);

    } else if (tagName.equals("IFRAME")) {
      transformAttr(context, tagNode, "SRC", iframeUrlTrans);

    } else if (tagName.equals("IMG")) {
      transformAttr(context, tagNode, "SRC", imageUrlTrans);

    } else if (tagName.equals("INPUT")) {
      transformAttr(context, tagNode, "SRC", imageUrlTrans);

    } else if (tagName.equals("FORM")) {
      transformAttr(context, tagNode, "ACTION", anchorUrlTrans);

    } else if (tagName.equals("FRAME")) {
      transformAttr(context, tagNode, "SRC", framesetUrlTrans);

    } else if (tagName.equals("LINK")) {
      // First matching REL value wins: stylesheet, then shortcut icon, then a plain link.
      if (transformAttrWhere(context, tagNode, "REL", "STYLESHEET", "HREF", cssUrlTrans)) {
        // no-op
      } else if (transformAttrWhere(
          context, tagNode, "REL", "SHORTCUT ICON", "HREF", imageUrlTrans)) {
        // no-op
      } else {
        transformAttr(context, tagNode, "HREF", anchorUrlTrans);
      }

    } else if (tagName.equals("META")) {
      transformAttrWhere(context, tagNode, "HTTP-EQUIV", "REFRESH", "CONTENT", metaRefreshTrans);
      transformAttr(context, tagNode, "URL", anchorUrlTrans);

    } else if (tagName.equals("OBJECT")) {
      transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans);
      // NOTE(review): the HTML OBJECT element carries its resource URL in "data"; "CDATA" may
      // be a typo for "DATA" -- confirm before changing.
      transformAttr(context, tagNode, "CDATA", objectEmbedUrlTrans);

    } else if (tagName.equals("SCRIPT")) {
      transformAttr(context, tagNode, "SRC", jsUrlTrans);
    } else if (tagName.equals("DIV") || tagName.equals("LI")) {
      // HTML5 -- can have data-src or data-uri attributes in any tag!
      // Can really be in any tag but for now using most common use cases
      // Experimental
      transformAttr(context, tagNode, "data-src", objectEmbedUrlTrans);
      transformAttr(context, tagNode, "data-uri", objectEmbedUrlTrans);
    } else {
      // NOTE(review): returning here also discards any preEmit/postEmit computed above, even
      // though FERRET_DONE_KEY may already have been set -- confirm this is intended.
      if (!checkAllowTag(context, tagNode)) {
        return;
      }
    }
    // now, for *all* tags...
    transformAttr(context, tagNode, "BACKGROUND", imageUrlTrans);
    transformAttr(context, tagNode, "STYLE", cssInlineTrans);
    transformAttr(context, tagNode, "onclick", jsBlockTrans);
    transformAttr(context, tagNode, "onload", jsBlockTrans);

    emit(context, preEmit, tagNode, postEmit);
  }