/**
  * Rewrites a single attribute of a TagNode in place, provided the node carries that attribute.
  *
  * @param context The ReplayParseContext passed through to the transformer
  * @param node the TagNode whose attribute is rewritten
  * @param attr the name of the attribute to rewrite
  * @param transformer the StringTransformer that computes the new value from the current one
  * @return true if the attribute was present and has been replaced, false if it was absent
  */
 private boolean transformAttr(
     ReplayParseContext context, TagNode node, String attr, StringTransformer transformer) {
   final String currentValue = node.getAttribute(attr);
   if (currentValue == null) {
     // Attribute is absent: leave the node untouched.
     return false;
   }
   final String rewritten = transformer.transform(context, currentValue);
   node.setAttribute(attr, rewritten);
   return true;
 }
Пример #2
0
  /**
   * Scans a page for Yelp business links and the rating image that follows each of them.
   *
   * <p>Every anchor {@code href} is resolved against {@code docUrl} and passed to {@code
   * Crawler.dispatchUrl}. When the resolved URL matches the business pattern, that business
   * becomes the "current" one and is registered with the placeholder rating {@code -1}. A later
   * {@code IMG} whose {@code class}, split on underscores, does not end in the segment {@code
   * loader} supplies the rating: its last segment is parsed as an integer and stored, but only
   * while the current business still holds the placeholder.
   *
   * @param docUrl URL of the document, used to resolve the links found in it
   * @param page the page to tokenize
   * @return business name mapped to its rating, or to {@code -1} when no rating was recorded
   * @throws Exception whatever the lexer or the helper calls raise; this includes a {@code
   *     NumberFormatException} when the last class segment of a rating image is not an integer
   */
  private Hashtable<String, Integer> extractData(String docUrl, Page page) throws Exception {
    Hashtable<String, Integer> businessTable = new Hashtable<String, Integer>();
    // Compiled once up front; the pattern is the same for every anchor on the page.
    Pattern pBusiness = Pattern.compile("^(http://www.yelp.com/biz/)(\\S)+");
    String currentBusiness = "";
    Lexer lexer = new Lexer(page);

    for (Node node = lexer.nextNode(); node != null; node = lexer.nextNode()) {
      if (!(node instanceof TagNode)) {
        continue;
      }
      TagNode tagNode = (TagNode) node;
      String tagName = tagNode.getTagName();

      if (tagName.equals("A")) {
        String href = tagNode.getAttribute("href");
        if (href != null) {
          String absUrl = AbsUrlConstructor.construct(docUrl, href);

          Crawler.dispatchUrl(absUrl);

          if (pBusiness.matcher(absUrl).matches()) {
            currentBusiness = extractBusinessName(href);
            if (!businessTable.containsKey(currentBusiness)) {
              businessTable.put(currentBusiness, -1);
            }
          }
        }
      } else if (tagName.equals("IMG")) {
        String cssClass = tagNode.getAttribute("class");
        if (cssClass != null) {
          String[] segments = cssClass.split("_");
          // split() drops trailing empty strings, so a class made only of underscores yields an
          // empty array; there is no last segment to read in that case.
          if (segments.length > 0) {
            String lastSegment = segments[segments.length - 1];
            if (!lastSegment.equals("loader")) {
              // get() returns null while no business link has been registered under the current
              // name; comparing that null with -1 would unbox it and throw.
              Integer known = businessTable.get(currentBusiness);
              if (known != null && known.intValue() == -1) {
                businessTable.put(currentBusiness, Integer.parseInt(lastSegment.trim()));
              }
            }
          }
        }
      }
    }

    return businessTable;
  }
Пример #3
0
  /**
   * Parses the given text to create the tag contents.
   *
   * <p>The first node the lexer produces from {@code text} is cast to a tag, and its page, start
   * and end positions and attribute list are copied onto this node.
   *
   * @param text A string of the form &lt;TAGNAME xx="yy"&gt;.
   * @throws IllegalArgumentException if the lexer raises a {@code ParserException}; that
   *     exception is attached as the cause
   */
  public void setText(String text) {
    Lexer lexer = new Lexer(text);
    try {
      TagNode output = (TagNode) lexer.nextNode();
      mPage = output.getPage();
      nodeBegin = output.getStartPosition();
      nodeEnd = output.getEndPosition();
      mAttributes = output.getAttributesEx();
    } catch (ParserException pe) {
      // Chain the parser failure so its stack trace is not lost to callers.
      throw new IllegalArgumentException(pe.getMessage(), pe);
    }
  }
  /**
   * Rewrites the SRC attribute of a script include, if it has one, and emits the tag.
   *
   * @param context the ReplayParseContext passed to the transformers and to emit
   * @param tagNode the script tag being handled
   * @throws IOException propagated from emit
   */
  private void handleJSIncludeNode(ReplayParseContext context, TagNode tagNode) throws IOException {
    final String src = tagNode.getAttribute("SRC");
    if (src != null) {
      // TODO: This is hacky.. fix it
      // The block transformer is consulted only to learn whether a custom rule wants this
      // file skipped: a null or empty result means "skip".
      final String blockCheck = jsBlockTrans.transform(context, src);
      final boolean keepFile = (blockCheck != null) && !blockCheck.isEmpty();
      // The rewriting itself is done by the js_ rewriter; a skipped file is rewritten from "".
      tagNode.setAttribute("SRC", jsUrlTrans.transform(context, keepFile ? src : ""));
    }

    emit(context, null, tagNode, null);
  }
 /**
  * Tells whether a tag is one that is not expected inside the HEAD.
  *
  * @param node the tag to classify
  * @return false if the tag name starts with "!" or equals one of the okHeadTags entries, true
  *     otherwise
  */
 private boolean isNotTagAppearingInHead(TagNode node) {
   final String name = node.getTagName();
   boolean allowedInHead = name.startsWith("!");
   if (!allowedInHead) {
     for (String headTag : okHeadTags) {
       if (name.equals(headTag)) {
         allowedInHead = true;
         break;
       }
     }
   }
   return !allowedInHead;
 }
  // TODO: This should all be refactored up into an abstract base class with
  // default no-op methods, allowing a subclass to only override the ones they
  // want...
  /**
   * Dispatches a parsed node to the handler for its kind: remark, text or tag.
   *
   * @param pContext the parse context; cast to ReplayParseContext
   * @param node the node to handle
   * @throws IOException propagated from the handlers and from emit
   * @throws IllegalArgumentException if the node is not a remark, text or tag node
   */
  public void handleNode(ParseContext pContext, Node node) throws IOException {
    final ReplayParseContext context = (ReplayParseContext) pContext;

    // Remarks: run the text through the JS block transformer, then emit.
    if (NodeUtils.isRemarkNode(node)) {
      final RemarkNode remark = (RemarkNode) node;
      final String rewritten = jsBlockTrans.transform(context, remark.getText());
      remark.setText(rewritten);
      emit(context, null, node, null);
      return;
    }

    // Text: CSS and script bodies get their own handlers, anything else is emitted as is.
    if (NodeUtils.isTextNode(node)) {
      final TextNode text = (TextNode) node;
      if (context.isInCSS()) {
        handleCSSTextNode(context, text);
      } else if (context.isInScriptText()) {
        handleJSTextNode(context, text);
      } else {
        emit(context, null, text, null);
      }
      return;
    }

    if (!NodeUtils.isTagNode(node)) {
      throw new IllegalArgumentException("Unknown node type..");
    }

    final TagNode tag = (TagNode) node;
    if (NodeUtils.isOpenTagNodeNamed(tag, NodeUtils.SCRIPT_TAG_NAME)) {
      handleJSIncludeNode(context, tag);
    } else if (!tag.isEndTag()) {
      // assume start, possibly empty:
      handleOpenTagNode(context, tag);
    } else {
      // Leaving the HEAD clears the in-head marker.
      if (tag.getTagName().equals("HEAD")) {
        context.putData(FERRET_IN_HEAD, null);
      }
      if (checkAllowTag(pContext, tag)) {
        emit(context, null, tag, null);
      }
    }
  }
Пример #7
0
  /**
   * Retrieves the value of a table cell. Appends the text of child nodes of the cell. In case of
   * composite tags like span or div the inner text is appended.
   *
   * <p>A cell without a child list contributes no text. The accumulated text is trimmed and then
   * every {@code &nbsp;} entity is replaced with {@code EMPTY}.
   *
   * @param cell the table cell to read
   * @return the concatenated, trimmed text of the cell's children
   */
  public static String getValue(TagNode cell) {
    StringBuilder value = new StringBuilder(EMPTY);

    // getChildren() is documented to return null when the node has no children.
    if (cell.getChildren() != null) {
      for (Node child : cell.getChildren().toNodeArray()) {
        if (child instanceof CompositeTag) {
          value.append(((CompositeTag) child).getStringText());
        } else {
          value.append(child.getText());
        }
      }
    }

    return value.toString().trim().replaceAll("&nbsp;", EMPTY);
  }
 /**
  * Transform a particular attribute on a TagNode, if that TagNode has a previous value for the
  * updated attribute, AND if that TagNode contains another named attribute with a specific value.
  *
  * @param context the ReplayParseContext
  * @param node the TagNode to be updated
  * @param attrName update only occurs if the TagNode has an attribute with this name.
  * @param attrVal update only occurs if the attribute attrName has this value, compared case
  *     insensitively and independently of the default locale. Callers conventionally pass it
  *     UPPER-CASED; that is still accepted.
  * @param modAttr the name of the attribute to update
  * @param transformer the StringTransformer responsible for creating the new value based on the
  *     old one.
  * @return true if the attribute was updated.
  */
 private boolean transformAttrWhere(
     ReplayParseContext context,
     TagNode node,
     String attrName,
     String attrVal,
     String modAttr,
     StringTransformer transformer) {
   String val = node.getAttribute(attrName);
   // toUpperCase() without a locale follows the JVM default; under a Turkish locale "i" becomes
   // a dotted capital, so "stylesheet" would never equal "STYLESHEET". equalsIgnoreCase
   // compares per character without consulting the locale.
   if (val == null || !val.equalsIgnoreCase(attrVal)) {
     return false;
   }
   return transformAttr(context, node, modAttr, transformer);
 }
  /**
   * Decides whether a tag may be emitted.
   *
   * <p>Only NOSCRIPT tags are ever rejected, and only when the context's oracle policy contains
   * "force-noscript": in that case the NOSCRIPT tags are skipped so their contents are included
   * explicitly.
   *
   * @param context the parse context supplying the oracle policy
   * @param tagNode the tag to check
   * @return false for a NOSCRIPT tag under a "force-noscript" policy, true otherwise
   */
  protected boolean checkAllowTag(ParseContext context, TagNode tagNode) {
    if (!tagNode.getTagName().equals("NOSCRIPT")) {
      return true;
    }

    final String policies = context.getOraclePolicy();
    return (policies == null) || !policies.contains("force-noscript");
  }
 /**
  * Emits for an open tag, unless the FERRET_DONE_KEY marker is already set on the context.
  *
  * <p>In the post-output phase only a BODY tag triggers the emit. In any other phase a tag that
  * is not one appearing in the HEAD triggers it, except FRAMESET, which sets the done marker
  * instead so nothing is emitted.
  *
  * @param pContext the parse context; cast to ReplayParseContext
  * @param node the open tag being visited
  * @throws IOException propagated from emit
  */
 public void handleOpenTagNode(ParseContext pContext, TagNode node) throws IOException {
   final ReplayParseContext context = (ReplayParseContext) pContext;

   // Nothing to do once the done marker is present.
   if (context.getData(FERRET_DONE_KEY) != null) {
     return;
   }

   // Running in post-emit: emit only if it is a body tag.
   if (context.getPhase() == ReplayParseEventDelegator.PHASE_POST_OUTPUT) {
     if (node.getTagName().equals("BODY")) {
       emit(context, node);
     }
     return;
   }

   // Otherwise must be PHASE_PRE_MODIFY: tags that belong in the HEAD are passed over.
   if (!isNotTagAppearingInHead(node)) {
     return;
   }

   if (node.getTagName().equals(FRAMESET_TAG)) {
     // don't put content in pages with a FRAMESET:
     context.putData(FERRET_DONE_KEY, "1");
   } else {
     // a tag that shouldn't be in the HEAD. Emit:
     emit(context, node);
   }
 }
Пример #11
0
 /**
  * Create a tag like the one provided.
  *
  * <p>Delegates to the four-argument constructor with the page, tag begin, tag end and
  * attribute list read from {@code tag}, then registers {@code scanner} through {@code
  * setThisScanner}.
  *
  * @param tag The tag to emulate.
  * @param scanner The scanner for this tag.
  */
 public TagNode(TagNode tag, TagScanner scanner) {
   this(tag.getPage(), tag.getTagBegin(), tag.getTagEnd(), tag.getAttributesEx());
   setThisScanner(scanner);
 }
Пример #12
0
  /**
   * Creates a list of Grids based on the given HTML string. This works only for table-based HTML
   * documents.
   *
   * <p>Each {@code table} element becomes one Grid. Rows without columns are skipped. The first
   * remaining row supplies the headers; each later row is added as values only when its column
   * count equals that of the header row, otherwise it is logged and skipped. A {@code colspan}
   * greater than one pads the grid with empty headers or values.
   *
   * @param html the HTML string.
   * @return a list of Grids, or {@code null} if {@code html} is null or blank.
   * @throws Exception whatever the HTML parser or the grid helpers raise
   */
  public static List<Grid> fromHtml(String html) throws Exception {
    if (html == null || html.trim().isEmpty()) {
      return null;
    }

    List<Grid> grids = new ArrayList<>();

    Parser parser = Parser.createParser(html, "UTF-8");

    Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray();

    for (Node t : tables) {
      Grid grid = new ListGrid();

      TableTag table = (TableTag) t;

      Integer firstColumnCount = null;

      for (TableRow row : table.getRows()) {
        // Computed once so the empty check, the comparison with the header row and the log
        // message below all refer to the same number.
        int columnCount = getColumnCount(row);

        if (columnCount == 0) // Ignore if no cells
        {
          log.warn("Ignoring row with no columns");
          continue;
        }

        Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();

        if (firstColumnCount == null) // First row becomes header
        {
          firstColumnCount = columnCount;

          for (Node c : cells) {
            TagNode cell = (TagNode) c;

            grid.addHeader(new GridHeader(getValue(cell), false, false));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyHeaders((colSpan - 1));
            }
          }
        } else // Rest becomes rows
        {
          if (firstColumnCount.intValue() != columnCount) // Ignore
          {
            log.warn(
                "Ignoring row which has "
                    + columnCount
                    + " columns since table has "
                    + firstColumnCount
                    + " columns");
            continue;
          }

          grid.addRow();

          for (Node c : cells) {
            // TODO row span

            TagNode cell = (TagNode) c;

            grid.addValue(getValue(cell));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyValues((colSpan - 1));
            }
          }
        }
      }

      grids.add(grid);
    }

    return grids;
  }
Пример #13
0
 /**
  * Processes one page of posts.
  *
  * <p>For every tag node returned by getPosts, a Post is built from the numeric part of the
  * node's {@code id} attribute (the text after its first five characters). A post already
  * present in the post hash is passed to handleNonDownloadPost. Otherwise its pictures are
  * collected from two sources and the post is passed to {@code Main.handlePost}:
  *
  * <ul>
  *   <li>photo posts: each remark node is searched with the {@code lores} pattern; the match,
  *       minus its first 17 characters and its last character, is the media URL. Remarks
  *       without a match are skipped.
  *   <li>photoset posts: each iframe's {@code src} is opened with a new Parser and the {@code
  *       src} of the images found inside its anchors are the media URLs.
  * </ul>
  *
  * <p>Any exception is printed, reported through {@code Main.status} and ends the processing
  * of the page.
  *
  * @param address the address of the page to process
  * @return false if the page contains no posts; true otherwise, including when an error cut
  *     the processing short
  */
 private static boolean handleURL(String address) {
   Main.status(String.format("Processing page \"%s\".", address));
   try {
     Node[] post_nodes = getPosts(address).toNodeArray();
     if (post_nodes.length == 0) {
       return false;
     }
     for (Node post_node : post_nodes) {
       if (!(post_node instanceof TagNode)) {
         continue;
       }
       TagNode post = (TagNode) post_node;
       Post new_post = new Post(Long.parseLong(post.getAttribute("id").substring(5)));
       if (Main.post_post_hash.containsKey(new_post)) {
         new_post = post_post_hash.get(new_post);
         handleNonDownloadPost(new_post);
         continue;
       }

       NodeList photo_posts = getPhotoPosts(post.getChildren());
       NodeList remarks = getRemarks(photo_posts);
       for (Node node : remarks.toNodeArray()) {
         Matcher matcher = lores.matcher(node.getText());
         if (!matcher.find()) {
           // No media URL in this remark. Deriving a thumbnail from an empty string would
           // throw and abandon every remaining post on the page, so just move on.
           continue;
         }
         String media_url = matcher.group();
         media_url = media_url.substring(17, media_url.length() - 1);
         URL thumb_url = new URL(thumbnailFor(media_url));
         new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
       }

       NodeList photoset_posts = getPhotosetPosts(post.getChildren());
       NodeList iframes = getIFrames(photoset_posts);
       for (Node node : iframes.toNodeArray()) {
         if (!(node instanceof TagNode)) {
           continue;
         }
         String iframe_url = ((TagNode) node).getAttribute("src");
         Parser parser2 = new Parser(iframe_url);
         NodeList a_list = parser2.extractAllNodesThatMatch(new TagNameFilter("a"));
         Node[] a_array = a_list.toNodeArray();
         Node[] img_array =
             a_list.extractAllNodesThatMatch(new TagNameFilter("img"), true).toNodeArray();
         // The loop counts anchors but reads images; bound it by both arrays so that an
         // anchor without an image cannot run past the end of img_array.
         int picture_count = Math.min(a_array.length, img_array.length);
         for (int i = 0; i < picture_count; i++) {
           String media_url = ((TagNode) img_array[i]).getAttribute("src");
           URL thumb_url = new URL(thumbnailFor(media_url));
           new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
         }
       }
       Main.handlePost(new_post);
     }
   } catch (Exception ex) {
     ex.printStackTrace();
     Main.status("Error handling post.");
   }
   return true;
 }

 /**
  * Derives the thumbnail URL of a media URL by replacing the segment that runs from the last
  * "_" up to (not including) the last "." with "_75sq".
  */
 private static String thumbnailFor(String media_url) {
   String size_suffix =
       media_url.substring(media_url.lastIndexOf("_"), media_url.lastIndexOf("."));
   return media_url.replace(size_suffix, "_75sq");
 }
Пример #14
0
  /**
   * Generates preview content: a shortened copy of an HTML fragment.
   *
   * <p>The nodes selected by {@code nfilter} are visited in order. Tags are copied as {@code
   * <text>}; a text node is copied only while more than 10 characters of room remain, whole if
   * it is shorter than 10 characters and abbreviated to the remaining room otherwise. Once the
   * preview has reached {@code max_count} characters, a tag that was just opened is removed
   * again and the end tags of all enclosing tags are appended.
   *
   * @param html the HTML to shorten
   * @param max_count the length, in characters, at which the preview is cut off
   * @return the preview; {@code html} itself when it is null, when it is no longer than 1.1
   *     times {@code max_count}, or when parsing fails
   */
  public static String preview(String html, int max_count) {
    // Content only slightly over the limit (10% slack) is returned untouched.
    if (html == null || html.length() <= max_count * 1.1) return html;
    Parser parser = new Parser();
    StringBuilder prvContent = new StringBuilder();
    try {
      parser.setEncoding(Globals.ENC_8859_1);
      parser.setInputHTML(html);

      parser.setNodeFactory(factory);

      NodeList nodes = parser.extractAllNodesThatMatch(nfilter);
      Node node = null;
      for (int i = 0; i < nodes.size(); i++) {
        if (prvContent.length() >= max_count) {
          // node is still null when the limit is reached before anything was read, which
          // happens for max_count <= 0; there is then nothing to trim or close.
          if (node != null) {
            if (node instanceof TagNode && !((TagNode) node).isEndTag()) {
              // The last thing appended was this opening tag ('<' + text + '>'): drop it.
              TagNode opened = (TagNode) node;
              prvContent.setLength(prvContent.length() - opened.getText().length() - 2);
            }
            // Close all tags that are still open.
            for (Node parent = node.getParent(); parent != null; parent = parent.getParent()) {
              if (parent instanceof TagNode) {
                prvContent.append(((TagNode) parent).getEndTag().toHtml());
              }
            }
          }
          break;
        }
        node = nodes.elementAt(i);
        if (node instanceof TagNode) {
          TagNode tag = (TagNode) node;
          prvContent.append('<');
          prvContent.append(tag.getText());
          prvContent.append('>');
        } else if (node instanceof TextNode) {
          int space = max_count - prvContent.length();
          if (space > 10) {
            String content = ((TextNode) node).getText();
            if (content.length() < 10) {
              prvContent.append(content);
            } else {
              prvContent.append(StringUtils.abbreviate(content, space));
            }
          }
        }
      }
      return prvContent.toString();
    } catch (ParserException e) {
      // Best effort: report the failure and fall back to the unshortened input.
      e.printStackTrace();
    }
    return html;
  }
  /**
   * Handles an open (possibly empty) tag in three steps: the head insert, the JSP insert, and
   * the rewriting of the tag's URL-bearing attributes, after which the tag is emitted.
   *
   * <p>The method returns early, without reaching the final emit, in two cases: a HEAD tag seen
   * while FERRET_HEAD_INSERTED is not yet set (handed to emitHeadInsert instead), and a tag in
   * the catch-all branch that checkAllowTag rejects.
   *
   * @param context the ReplayParseContext holding the per-document markers and passed to the
   *     transformers
   * @param tagNode the open tag to process; its attributes are rewritten in place
   * @throws IOException declared for the emit calls
   */
  private void handleOpenTagNode(ReplayParseContext context, TagNode tagNode) throws IOException {

    // Whether the FERRET_DONE_KEY marker is already set, i.e. the JSP insert step is over.
    boolean insertedJsp = context.getData(FERRET_DONE_KEY) != null;

    // Text handed to emit together with the tag; both stay null unless the JSP insert
    // below produces output.
    String preEmit = null;
    String postEmit = null;

    String tagName = tagNode.getTagName();

    boolean alreadyInsertedHead = (context.getData(FERRET_HEAD_INSERTED) != null);

    if (!alreadyInsertedHead) {
      // If we're at the beginning of a <head> tag, and haven't inserted yet,
      // insert right AFTER head tag
      if (tagName.equals("HEAD")) {
        emitHeadInsert(context, tagNode, true);
        // Remember that we are inside the HEAD; handleNode clears this on </head>.
        context.putData(FERRET_IN_HEAD, FERRET_IN_HEAD);
        return;
      }

      // If we're at the beginning of any tag, other than <html>,
      // (including <body>) and haven't inserted yet,
      // insert right BEFORE the next tag, also continue other default processing
      // of the tag
      if (!tagName.equals("HTML") && !tagName.equals("!DOCTYPE")) {
        emitHeadInsert(context, null, false);
        // Don't return continue to further processing
      }
    }

    boolean inHead = (context.getData(FERRET_IN_HEAD) != null);

    // Time to insert the JSP header?
    // IK added check to avoid inserting inside css or script
    if (!insertedJsp && !context.isInCSS() && !context.isInScriptText() && !inHead) {
      if (!okHeadTagMap.containsKey(tagName)) {
        if (tagName.equals(FRAMESET_TAG)) {
          // don't put the insert in framsets:
        } else {
          if (jspInsertPath != null
              && !context.getJspExec().getUiResults().getWbRequest().isIFrameWrapperContext()) {
            // tmp stays null if rendering the JSP fails; the failure is only printed, and
            // the tag is then emitted without any insert.
            String tmp = null;
            try {
              tmp = context.getJspExec().jspToString(jspInsertPath);
            } catch (ServletException e) {
              e.printStackTrace();
            }
            if (tagName.equals(BODY_TAG)) {
              // insert it now, *after* the current Tag:
              postEmit = tmp;
            } else {
              // hrm... we are seeing a node that should be in
              // the body.. lets emit the jsp now, *before*
              // the current Tag:
              preEmit = tmp;
            }
          }
        }
        // Set for every tag not in okHeadTagMap, FRAMESET included, so this whole step
        // runs at most once per document.
        context.putData(FERRET_DONE_KEY, "");
      }
    }

    // now do all the usual attribute rewriting:
    // this could be slightly optimized by moving tags more likely to occur
    // to the front of the if/else if/else if routing...

    if (tagName.equals("A")) {
      transformAttr(context, tagNode, "HREF", anchorUrlTrans);

    } else if (tagName.equals("APPLET")) {
      transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans);
      transformAttr(context, tagNode, "ARCHIVE", objectEmbedUrlTrans);

    } else if (tagName.equals("AREA")) {
      transformAttr(context, tagNode, "HREF", anchorUrlTrans);

    } else if (tagName.equals("BASE")) {
      // The original HREF becomes the context's base URL before the attribute is rewritten;
      // an HREF that is not a valid URL is printed and left unrewritten.
      String orig = tagNode.getAttribute("HREF");
      if (orig != null) {
        try {
          context.setBaseUrl(new URL(orig));
          transformAttr(context, tagNode, "HREF", anchorUrlTrans);

        } catch (MalformedURLException e) {
          e.printStackTrace();
        }
      }

    } else if (tagName.equals("EMBED")) {
      transformAttr(context, tagNode, "SRC", objectEmbedUrlTrans);

    } else if (tagName.equals("IFRAME")) {
      transformAttr(context, tagNode, "SRC", iframeUrlTrans);

    } else if (tagName.equals("IMG")) {
      transformAttr(context, tagNode, "SRC", imageUrlTrans);

    } else if (tagName.equals("INPUT")) {
      transformAttr(context, tagNode, "SRC", imageUrlTrans);

    } else if (tagName.equals("FORM")) {
      transformAttr(context, tagNode, "ACTION", anchorUrlTrans);

    } else if (tagName.equals("FRAME")) {
      transformAttr(context, tagNode, "SRC", framesetUrlTrans);

    } else if (tagName.equals("LINK")) {
      // The first matching REL value decides which transformer rewrites HREF; the anchor
      // transformer is the fallback.
      if (transformAttrWhere(context, tagNode, "REL", "STYLESHEET", "HREF", cssUrlTrans)) {
        // no-op
      } else if (transformAttrWhere(
          context, tagNode, "REL", "SHORTCUT ICON", "HREF", imageUrlTrans)) {
        // no-op
      } else {
        transformAttr(context, tagNode, "HREF", anchorUrlTrans);
      }

    } else if (tagName.equals("META")) {
      transformAttrWhere(context, tagNode, "HTTP-EQUIV", "REFRESH", "CONTENT", metaRefreshTrans);
      transformAttr(context, tagNode, "URL", anchorUrlTrans);

    } else if (tagName.equals("OBJECT")) {
      transformAttr(context, tagNode, "CODEBASE", objectEmbedUrlTrans);
      // NOTE(review): the HTML OBJECT element defines a DATA attribute; confirm that "CDATA"
      // is really the attribute name intended here.
      transformAttr(context, tagNode, "CDATA", objectEmbedUrlTrans);

    } else if (tagName.equals("SCRIPT")) {
      transformAttr(context, tagNode, "SRC", jsUrlTrans);
    } else if (tagName.equals("DIV") || tagName.equals("LI")) {
      // HTML5 -- can have data-src or data-uri attributes in any tag!
      // Can really be in any tag but for now using most common use cases
      // Experimental
      transformAttr(context, tagNode, "data-src", objectEmbedUrlTrans);
      transformAttr(context, tagNode, "data-uri", objectEmbedUrlTrans);
    } else {
      // Any other tag: a tag rejected by checkAllowTag is dropped without being emitted.
      if (!checkAllowTag(context, tagNode)) {
        return;
      }
    }
    // now, for *all* tags...
    transformAttr(context, tagNode, "BACKGROUND", imageUrlTrans);
    transformAttr(context, tagNode, "STYLE", cssInlineTrans);
    transformAttr(context, tagNode, "onclick", jsBlockTrans);
    transformAttr(context, tagNode, "onload", jsBlockTrans);

    // Emit the tag, with the JSP insert (if any) placed before or after it.
    emit(context, preEmit, tagNode, postEmit);
  }