Пример #1
0
  /**
   * Extracts the paragraph text of every {@code <div class="blkContainerSblkCon">} element in the
   * given HTML and, when that text is not blank, saves it as a {@code Page} for {@code preUrl}.
   *
   * <p>Only {@code ParagraphTag} nodes that are direct children of a matching div contribute; their
   * inner text is concatenated in iteration order without any separator. Nothing is saved or
   * logged when the concatenated text is empty after trimming.
   *
   * <p>Add By Ethan Lam At 2011-11-23
   *
   * @param htmlPageContent HTML markup to parse
   * @param preUrl URL stored on the saved page and used as the prefix of the log message
   * @throws ParserException if the HTML parser fails while parsing or iterating
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser htmlParser = new Parser();
    htmlParser.setInputHTML(htmlPageContent);
    NodeFilter containerFilter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList containers = htmlParser.parse(containerFilter);

    StringBuffer collected = new StringBuffer();
    for (NodeIterator outer = containers.elements(); outer.hasMoreNodes(); ) {
      Div container = (Div) outer.nextNode();
      NodeList children = container.getChildren();
      if (children != null) {
        for (NodeIterator inner = children.elements(); inner.hasMoreNodes(); ) {
          Node child = inner.nextNode();
          if (child instanceof ParagraphTag) {
            collected.append(((ParagraphTag) child).getStringText());
          }
        }
      }
    }

    String content = collected.toString();
    if (content.trim().length() == 0) {
      return;
    }

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(content);
    LoggerUtil.info(preUrl + "获取到的页面内容:", content);
    pageSer.save(page);
  }
Пример #2
0
 /**
  * Collects every issue comment found under {@code nodeList} into {@code icList}.
  *
  * <p>A node counts as a comment when its tag text matches {@code div id="issuecomment-..."}. Its
  * author, body and {@code datetime} attribute are copied into a new {@code IssueCommentEvent}.
  * Nodes that do not match are searched recursively through their children.
  *
  * <p>Each child lookup is treated as optional: a missing author or body becomes the empty string
  * and a missing or empty timestamp is left unset, so one incomplete comment no longer aborts the
  * whole scan with an exception.
  *
  * @param nodeList nodes to scan
  * @param icList list the extracted events are appended to
  * @return {@code icList}
  */
 private List<IssueCommentEvent> processComment(
     NodeList nodeList, List<IssueCommentEvent> icList) {
   SimpleNodeIterator sni = nodeList.elements();
   while (sni.hasMoreNodes()) {
     Node node = sni.nextNode();
     if (node.getText().matches("div id=\"issuecomment-.*\".*+")) {
       IssueCommentEvent i = new IssueCommentEvent();

       Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
       i.setActor(actorNode == null ? "" : actorNode.toPlainTextString());

       Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body");
       i.setCommentBody(contentNode == null ? "" : contentNode.toPlainTextString());

       Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
       if (timeNode != null) {
         Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
         if (matcher.find()) {
           // The attribute value is the text between the first pair of quotes; an empty
           // attribute yields a single-element array, hence the length check.
           String[] quoted = matcher.group().split("\"");
           if (quoted.length > 1) {
             String time = quoted[1];
             i.setCreatedAt(time);
             System.out.println(time);
           }
         }
       }
       icList.add(i);
     } else {
       // Not a comment container: descend into its children, if it has any.
       NodeList childList = node.getChildren();
       if (childList != null) {
         processComment(childList, icList);
       }
     }
   }
   return icList;
 }
Пример #3
0
  /**
   * Collects the "open" event of a pull request from every node whose tag text contains {@code div
   * id="issue-}.
   *
   * <p>The comment body, author and {@code datetime} attribute found beneath such a node are copied
   * into a new {@code PullRequestEvent} with action {@code "open"}. Nodes that do not match are
   * searched recursively through their children.
   *
   * <p>Each child lookup is treated as optional: a missing body or author becomes the empty string
   * and a missing or empty timestamp is left unset instead of raising an exception.
   *
   * @param nodeList nodes to scan
   * @param pList list the extracted events are appended to
   * @return {@code pList}
   */
  public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("div id=\"issue-")) {
        PullRequestEvent pullRequestEvent = new PullRequestEvent();
        pullRequestEvent.setAction("open");

        Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body");
        pullRequestEvent.setBody(commentNode == null ? "" : commentNode.toPlainTextString());

        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author");
        pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString());

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // The attribute value is the text between the first pair of quotes; an empty
            // attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              pullRequestEvent.setCreatedAt(quoted[1]);
            }
          }
        }
        pList.add(pullRequestEvent);
      } else {
        // Not an issue container: descend into its children, if it has any.
        NodeList childList = node.getChildren();
        if (childList != null) {
          processOpenPull(childList, pList);
        }
      }
    }
    return pList;
  }
Пример #4
0
  /**
   * Collects branch-deletion events from every node whose tag text contains {@code
   * discussion-item-head_ref_deleted}.
   *
   * <p>The deleted ref is taken from the first quoted value of the {@code span title="..."} child,
   * the actor from the {@code class="author"} child and the time from the {@code datetime}
   * attribute. Nodes that do not match are searched recursively through their children.
   *
   * <p>Each child lookup is treated as optional: a missing span or timestamp leaves the
   * corresponding field unset and a missing author becomes the empty string, instead of raising an
   * exception.
   *
   * @param nodeList nodes to scan
   * @param dList list the extracted events are appended to
   * @return {@code dList}
   */
  public List<DeleteEvent> processDelete(NodeList nodeList, List<DeleteEvent> dList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("discussion-item-head_ref_deleted")) {
        DeleteEvent d = new DeleteEvent();

        Node deleteNode = DownloadUtil.getSomeChild(node, "span title=\"");
        if (deleteNode != null) {
          // The ref name is the first quoted value of the span tag.
          String[] quotedRef = deleteNode.getText().split("\"");
          if (quotedRef.length > 1) {
            d.setRef(quotedRef[1]);
            System.out.println(quotedRef[1]);
          }
        }

        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        String actor = actorNode == null ? "" : actorNode.toPlainTextString();
        d.setActor(actor);
        System.out.println(actor);

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // An empty attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              d.setDeleteAt(quoted[1]);
            }
          }
        }
        dList.add(d);
      } else {
        // Not a deletion item: descend into its children, if it has any.
        NodeList childList = node.getChildren();
        if (childList != null) {
          processDelete(childList, dList);
        }
      }
    }

    return dList;
  }
Пример #5
0
  /**
   * Collects the review comments that belong to one discussion.
   *
   * <p>Every node whose tag text contains {@code div id="discussion_r} produces a {@code
   * PullRequestReviewCommentEvent} tagged with {@code discussionId}; its author, body and {@code
   * datetime} attribute are read from child nodes. Nodes that do not match are searched
   * recursively through their children.
   *
   * <p>Each child lookup is treated as optional: a missing author or body becomes the empty string
   * and a missing or empty timestamp is left unset instead of raising an exception.
   *
   * @param nodeList nodes to scan
   * @param prList list the extracted events are appended to
   * @param discussionId identifier stored on every event created by this call
   * @return {@code prList}
   */
  public List<PullRequestReviewCommentEvent> processSubPullRequestReviewComment(
      NodeList nodeList, List<PullRequestReviewCommentEvent> prList, String discussionId) {
    SimpleNodeIterator sni2 = nodeList.elements();
    while (sni2.hasMoreNodes()) {
      Node node2 = sni2.nextNode();
      if (node2.getText().contains("div id=\"discussion_r")) {
        PullRequestReviewCommentEvent p = new PullRequestReviewCommentEvent();
        p.setDiscussionId(discussionId);

        Node actorNode = DownloadUtil.getSomeChild(node2, "class=\"author\"");
        String actor = actorNode == null ? "" : actorNode.toPlainTextString();
        p.setActor(actor);
        System.out.println(actor);

        Node contentNode = DownloadUtil.getSomeChild(node2, "div class=\"comment-body");
        String body = contentNode == null ? "" : contentNode.toPlainTextString();
        p.setCommentBody(body);
        System.out.println(body.trim());

        Node timeNode = DownloadUtil.getSomeChild(node2, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // The attribute value is the text between the first pair of quotes; an empty
            // attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              p.setCreatedAt(quoted[1]);
            }
          }
        }
        prList.add(p);
      } else {
        // Not a review comment: descend into its children, if it has any.
        NodeList childList = node2.getChildren();
        if (childList != null) {
          processSubPullRequestReviewComment(childList, prList, discussionId);
        }
      }
    }
    return prList;
  }
Пример #6
0
  /**
   * Recursively collects the link and text nodes found beneath {@code nodeP}.
   *
   * <p>Direct children are handled by type: {@code LinkTag} nodes and {@code TextNode} nodes with
   * non-empty text are collected; {@code ScriptTag}, {@code StyleTag} and {@code SelectTag} nodes
   * are skipped; every other node is descended into and whatever the recursive call returns is
   * appended. When anything was collected, the {@code tableNumber} field is incremented and, unless
   * {@code type} equals "1" (case-insensitively), the {@code href} of each collected link is
   * replaced with the configured search base URL followed by {@code SearchHelper.encrypt} of the
   * previous value.
   *
   * <p>NOTE(review): links returned by a nested call have already had their {@code href} rewritten
   * and are rewritten again by each enclosing call, so deeply nested links appear to be encrypted
   * once per level — confirm this is intended.
   *
   * @param nodeP node whose children are examined
   * @param type mode flag; the value "1" disables the href rewriting
   * @return the collected nodes, or {@code null} when {@code nodeP} has no children, nothing was
   *     collected, or an exception occurred during the traversal loop
   * @throws Exception declared by the signature; exceptions raised inside the traversal loop are
   *     caught and turned into a {@code null} result
   */
  @SuppressWarnings("unchecked")
  protected List<Node> extractHtml(Node nodeP, String type) throws Exception {
    NodeList nodeList = nodeP.getChildren();
    if ((nodeList == null) || (nodeList.size() == 0)) {
      return null;
    }
    ArrayList tableList = new ArrayList();
    try {
      for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) {
        Node node = (Node) e.nextNode();
        if (node instanceof LinkTag) {
          tableList.add(node);
        } else if (node instanceof ScriptTag
            || node instanceof StyleTag
            || node instanceof SelectTag) {
          // Deliberately empty: script, style and select content is ignored.
        } else if (node instanceof TextNode) {
          if (node.getText().length() > 0) {
            tableList.add(node);
          }
        } else {
          // Any other node: flatten the nodes collected from its subtree into this list.
          List tempList = extractHtml(node, type);
          if ((tempList != null) && (tempList.size() > 0)) {
            Iterator ti = tempList.iterator();
            while (ti.hasNext()) {
              tableList.add(ti.next());
            }
          }
        }
      }
    } catch (Exception e) {
      // Any failure while walking the children discards everything collected so far.
      return null;
    }
    // tableList is never null here; only the size check is effective.
    if ((tableList != null) && (tableList.size() > 0)) {
      // NOTE(review): tc is filled below but is neither returned nor stored in this method —
      // confirm whether it is still needed.
      TableContext tc = new TableContext();
      tc.setLinkList(new ArrayList());
      tc.setTextBuffer(new StringBuffer());
      tableNumber++;
      tc.setTableRow(tableNumber);
      Iterator ti = tableList.iterator();

      // Read the configured base URL for search links.
      String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL);

      while (ti.hasNext()) {
        Node node = (Node) ti.next();
        if (node instanceof LinkTag) {
          LinkTag linkTag = (LinkTag) node;
          if (!"1".equalsIgnoreCase(type)) {
            // Mutates the link node in place: href becomes baseUrl + encrypt(old href).
            linkTag.setAttribute(
                "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href")));
          }
          tc.getLinkList().add(linkTag);
        } else {
          tc.getTextBuffer().append(node.getText());
        }
      }
      return tableList;
    }
    return null;
  }
Пример #7
0
  /**
   * Parses every node selected by {@code ROOT}, reduces its children to those matching {@code
   * tdFilter}, converts them with {@code parseRow} and stores the non-null results under {@code
   * "lc_odds"}.
   *
   * <p>A warning is logged when no row produced a result; the empty list is still handed to {@code
   * storeData}.
   *
   * @param parser parser positioned on the page to crawl
   * @throws ParserException if parsing or iterating the page fails
   */
  @Override
  public void crawl(Parser parser) throws ParserException {
    NodeList rows = parser.parse(new CssSelectorNodeFilter(ROOT));
    List<LCOdds> parsed = new ArrayList<LCOdds>();

    NodeIterator rowIt = rows.elements();
    while (rowIt.hasMoreNodes()) {
      NodeList tds = rowIt.nextNode().getChildren();
      tds.keepAllNodesThatMatch(tdFilter);

      LCOdds odds = parseRow(tds);
      if (odds == null) {
        continue;
      }
      parsed.add(odds);
    }

    if (parsed.isEmpty()) {
      log.warn(" -- [ 06_LC_2 ] data is empty !");
    }
    // Persist whatever was collected, even when that is nothing.
    storeData("lc_odds", parsed);
  }
Пример #8
0
 /**
  * Collects pull-request review comments; works together with {@code
  * processSubPullRequestReviewComment}.
  *
  * <p>Every node whose tag text contains {@code div id="diff-for-comment-} starts a discussion:
  * the first quoted value of the tag text is used as the discussion id and the node's children
  * are handed to {@code processSubPullRequestReviewComment}. Nodes that do not match are searched
  * recursively through their children.
  *
  * <p>A matching node without children is skipped; previously its {@code null} child list was
  * passed on and dereferenced by the sub-parser.
  *
  * @param nodeList nodes to scan
  * @param prList list the extracted events are appended to
  * @return the list returned by the last sub-parser call, or {@code prList} if there was none
  */
 public List<PullRequestReviewCommentEvent> processPullRequestReviewComment(
     NodeList nodeList, List<PullRequestReviewCommentEvent> prList) {
   SimpleNodeIterator sni = nodeList.elements();
   while (sni.hasMoreNodes()) {
     Node node = sni.nextNode();
     if (node.getText().contains("div id=\"diff-for-comment-")) {
       // The matched text contains a quote followed by more characters, so index 1 exists.
       String discussionId = node.getText().split("\"")[1];
       System.out.println(discussionId);
       NodeList subNodeList = node.getChildren();
       if (subNodeList != null) {
         prList = processSubPullRequestReviewComment(subNodeList, prList, discussionId);
       }
     } else {
       // Not a discussion container: descend into its children, if it has any.
       NodeList childList = node.getChildren();
       if (childList != null) {
         processPullRequestReviewComment(childList, prList);
       }
     }
   }
   return prList;
 }
Пример #9
0
 /**
  * Walks {@code list} depth-first and appends the trimmed plain text of every node that has no
  * child list to {@code valueList}; nodes whose trimmed text is empty are skipped.
  *
  * @param list nodes to walk
  * @param valueList receives the non-empty text values in visiting order
  */
 private void processNodeList(NodeList list, List<String> valueList) {
   for (SimpleNodeIterator nodes = list.elements(); nodes.hasMoreNodes(); ) {
     Node current = nodes.nextNode();
     NodeList children = current.getChildren();
     if (children != null) {
       // Inner node: its values come from its descendants.
       processNodeList(children, valueList);
       continue;
     }
     // No child list: treat the node as a value node.
     String text = current.toPlainTextString().trim();
     if (text.length() != 0) {
       valueList.add(text);
     }
   }
 }
Пример #10
0
  /**
   * Collects "ref" events, i.e. places where another artifact references the current pull request.
   *
   * <p>Every node whose tag text contains {@code div class="discussion-item discussion-item-ref"}
   * produces a {@code PullRequestEvent} with action {@code "ref"}. The body is the text of the
   * {@code class="title-link"} child, the base ref is the first {@code a/b/c/d}-shaped path found
   * in that child's tag text, and actor and time come from the author and {@code datetime}
   * children. Nodes that do not match are searched recursively through their children.
   *
   * <p>All child lookups are optional: missing text becomes the empty string, and a missing or
   * empty timestamp is left unset (the timestamp lookup used to be the only unchecked one).
   *
   * @param nodeList nodes to scan
   * @param pList list the extracted events are appended to
   * @return {@code pList}
   */
  public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) {
        PullRequestEvent pullRequestEvent = new PullRequestEvent();
        pullRequestEvent.setAction("ref");

        Node anotherAtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\"");
        pullRequestEvent.setBody(
            anotherAtifactNode == null ? "" : anotherAtifactNode.toPlainTextString());
        Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+");
        Matcher artifactMatcher =
            artifactPattern.matcher(anotherAtifactNode == null ? "" : anotherAtifactNode.getText());
        if (artifactMatcher.find()) {
          String anotherAtifact = artifactMatcher.group();
          pullRequestEvent.setPullrequestBaseRef(anotherAtifact);
          System.out.println(anotherAtifact);
        }

        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString());

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // The attribute value is the text between the first pair of quotes; an empty
            // attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              pullRequestEvent.setCreatedAt(quoted[1]);
            }
          }
        }
        pList.add(pullRequestEvent);
      } else {
        // Not a reference item: descend into its children, if it has any.
        NodeList childList = node.getChildren();
        if (childList != null) {
          processReference(childList, pList);
        }
      }
    }
    return pList;
  }
Пример #11
0
  /**
   * Collects "labeled" events.
   *
   * <p>Every node whose tag text contains {@code class="discussion-item
   * discussion-item-labeled"} produces a {@code PullRequestEvent} with action {@code "labeled"}.
   * The label names are the plain texts of the nodes returned by {@code
   * DownloadUtil.getLableList(node, "style=\"color:", ...)}, joined with commas; actor and time
   * come from the author and {@code datetime} children. Nodes that do not match are searched
   * recursively through their children.
   *
   * <p>The author and timestamp lookups are optional: a missing author becomes the empty string
   * and a missing or empty timestamp is left unset instead of raising an exception.
   *
   * @param nodeList nodes to scan
   * @param pList list the extracted events are appended to
   * @return {@code pList}
   */
  public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("class=\"discussion-item discussion-item-labeled\"")) {
        PullRequestEvent pullRequestEvent = new PullRequestEvent();
        pullRequestEvent.setAction("labeled");

        List<Node> lableList =
            DownloadUtil.getLableList(node, "style=\"color:", new ArrayList<Node>());
        // Join the label texts with commas without re-allocating a String per label.
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < lableList.size(); i++) {
          if (i > 0) {
            joined.append(",");
          }
          joined.append(lableList.get(i).toPlainTextString());
        }
        String lables = joined.toString();
        System.out.println(lables);
        pullRequestEvent.setPullrequestBaseLabels(lables);

        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString());

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // The attribute value is the text between the first pair of quotes; an empty
            // attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              pullRequestEvent.setCreatedAt(quoted[1]);
            }
          }
        }
        pList.add(pullRequestEvent);
      } else {
        // Not a labeled item: descend into its children, if it has any.
        NodeList childList = node.getChildren();
        if (childList != null) {
          processLabled(childList, pList);
        }
      }
    }
    return pList;
  }
Пример #12
0
  /**
   * Collects un-assignment events.
   *
   * <p>Every node whose tag text contains {@code class="discussion-item
   * discussion-item-unassigned"} produces a {@code PullRequestEvent}. The assignee is the text of
   * the {@code class="author"} child; the actor is the text of the {@code
   * class="discussion-item-entity"} child when present and otherwise the same as the assignee. The
   * time comes from the {@code datetime} child. Nodes that do not match are searched recursively
   * through their children.
   *
   * <p>All child lookups are optional. In particular the debug print no longer dereferences the
   * entity node when it is absent, which used to throw right after the fallback had been applied.
   *
   * @param nodeList nodes to scan
   * @param pList list the extracted events are appended to
   * @return {@code pList}
   */
  private List<PullRequestEvent> processUnassigned(
      NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) {
        PullRequestEvent pEvent = new PullRequestEvent();
        // NOTE(review): this handler matches "unassigned" items but records the action as
        // "assigned" — kept as-is because consumers may depend on it; confirm the intended value.
        pEvent.setAction("assigned");

        Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        String assignee = assignedNode == null ? "" : assignedNode.toPlainTextString();
        pEvent.setPullrequestAssgnee(assignee);

        // Fall back to the assignee when no separate acting entity is present.
        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\"");
        String actor = actorNode != null ? actorNode.toPlainTextString() : assignee;
        pEvent.setActor(actor);
        System.out.println(actor);

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // The attribute value is the text between the first pair of quotes; an empty
            // attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              pEvent.setCreatedAt(quoted[1]);
            }
          }
        }
        pList.add(pEvent);
      } else {
        // Not an unassigned item: descend into its children, if it has any.
        NodeList childList = node.getChildren();
        if (childList != null) {
          processUnassigned(childList, pList);
        }
      }
    }
    return pList;
  }
Пример #13
0
  /**
   * Collects milestone-removal events.
   *
   * <p>Every node whose tag text contains {@code div class="discussion-item
   * discussion-item-demilestoned"} produces a {@code PullRequestEvent} with action {@code
   * "removeMilestone"}. The body is the first {@code a/b/c/...} path found in the tag text of the
   * {@code class="discussion-item-entity"} child, cut at the first double quote; actor and time
   * come from the author and {@code datetime} children. Nodes that do not match are searched
   * recursively through their children.
   *
   * <p>All child lookups are optional: a missing milestone or timestamp leaves the corresponding
   * field unset and a missing author becomes the empty string, instead of raising an exception.
   *
   * @param nodeList nodes to scan
   * @param pList list the extracted events are appended to
   * @return {@code pList}
   */
  public List<PullRequestEvent> processRemoveMileStone(
      NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) {
        PullRequestEvent p = new PullRequestEvent();
        p.setAction("removeMilestone");

        Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\"");
        if (milestoneNode != null) {
          Matcher milestoneMatcher =
              Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+").matcher(milestoneNode.getText());
          if (milestoneMatcher.find()) {
            // The match starts with letters, so the part before the first quote always exists.
            String milestone = milestoneMatcher.group().split("\"")[0];
            p.setBody(milestone);
          }
        }

        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        p.setActor(actorNode == null ? "" : actorNode.toPlainTextString());

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Matcher matcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
          if (matcher.find()) {
            // The attribute value is the text between the first pair of quotes; an empty
            // attribute yields a single-element array, hence the length check.
            String[] quoted = matcher.group().split("\"");
            if (quoted.length > 1) {
              p.setCreatedAt(quoted[1]);
            }
          }
        }
        pList.add(p);
      } else {
        // Not a demilestoned item: descend into its children, if it has any.
        NodeList childList = node.getChildren();
        if (childList != null) {
          processRemoveMileStone(childList, pList);
        }
      }
    }
    return pList;
  }
Пример #14
0
  /**
   * Finds every {@code <a target="_blank">} link in the given HTML and enqueues the ones that
   * {@code filterHandler.isLinkTagFilter} does not reject as new crawl nodes.
   *
   * <p>This method is best-effort and never throws: any failure while parsing or enqueueing stops
   * the processing of this page and is reported through {@code LoggerUtil} instead of being
   * silently discarded.
   *
   * @param htmlPageContent HTML markup to scan for links
   * @param preUrl URL of the page the markup came from; passed along with every enqueued link
   */
  public void dealLinkNodes(String htmlPageContent, String preUrl) {
    try {
      Parser parser = new Parser();
      parser.setInputHTML(htmlPageContent);
      NodeFilter filter =
          new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
      NodeList nodeList = parser.parse(filter);
      LoggerUtil.info("ParserHandler", "爬虫得到新的节点个数:" + (nodeList != null ? nodeList.size() : 0));
      if (nodeList == null) {
        return;
      }
      NodeIterator it = nodeList.elements();
      while (it.hasMoreNodes()) {
        Node node = it.nextNode();
        if (!(node instanceof LinkTag)) {
          continue;
        }
        LinkTag link = (LinkTag) node;
        if (!filterHandler.isLinkTagFilter(link)) {
          LoggerUtil.debug("ParserHandler  ", link.getLink(), link.getLinkText());
          CrawlQueue.getQueueManager().newNode(link.getLinkText(), link.getLink(), preUrl);
        }
      }
    } catch (Exception e) {
      // Deliberately broad: a bad page must not stop the crawler, but the failure is recorded.
      LoggerUtil.info("ParserHandler", "dealLinkNodes failed for " + preUrl + ": " + e);
    }
  }
 /**
  * Searches the conversations that precede {@code samlId}, newest first, for an HTML response
  * containing a form whose {@code action} URL equals the request URL of {@code samlId}.
  *
  * <p>Responses that do not parse to an HTML node list are skipped. Forms without an {@code
  * action} attribute are skipped, and a form whose action is not a valid URL is logged and
  * skipped on its own, so the remaining forms of the same page are still examined.
  *
  * @param samlId conversation whose originating HTML form is wanted
  * @return the id of the closest preceding conversation with a matching form, or {@code null} if
  *     there is none
  */
 public ConversationID findCorrespondingHTMLFormConversation(ConversationID samlId) {
   ConversationModel conversationModel = this.model.getConversationModel();
   HttpUrl samlHttpUrl = conversationModel.getRequestUrl(samlId);
   int samlConversationIndex = conversationModel.getIndexOfConversation(samlId);
   for (int conversationIndex = samlConversationIndex - 1;
       conversationIndex >= 0;
       conversationIndex--) {
     ConversationID id = conversationModel.getConversationAt(conversationIndex);
     Response response = conversationModel.getResponse(id);
     HttpUrl httpUrl = conversationModel.getRequestUrl(id);
     Object parsedContent = Parser.parse(httpUrl, response);
     // instanceof is false for null, so this also covers unparsable responses.
     if (!(parsedContent instanceof org.htmlparser.util.NodeList)) {
       continue;
     }
     org.htmlparser.util.NodeList htmlNodeList = (org.htmlparser.util.NodeList) parsedContent;
     org.htmlparser.util.NodeList forms = htmlNodeList.searchFor(FormTag.class);
     try {
       for (NodeIterator ni = forms.elements(); ni.hasMoreNodes(); ) {
         FormTag form = (FormTag) ni.nextNode();
         String formAction = form.getAttribute("action");
         if (formAction == null) {
           continue;
         }
         try {
           if (samlHttpUrl.equals(new HttpUrl(formAction))) {
             return id;
           }
         } catch (MalformedURLException ex) {
           // Handled per form so that one bad action does not hide later forms on the page.
           this._logger.log(Level.WARNING, "Malformed action url: {0}", ex.getMessage());
         }
       }
     } catch (ParserException ex) {
       // A Throwable argument is not substituted into {0} placeholders, so the exception text
       // is put into the message directly while the throwable is still attached to the record.
       this._logger.log(Level.WARNING, "Looking for forms, got '" + ex + "'", ex);
     }
   }
   return null;
 }
Пример #16
0
  /**
   * Downloads the page at {@code this.url}, walks all of its composite tags and fills this
   * object's product fields from them.
   *
   * <p>What is read, by tag type:
   *
   * <ul>
   *   <li>links: {@code category}, {@code subCategory}, {@code brand}, {@code freight} and {@code
   *       pname}, selected by the prefix (or value) of the {@code class} attribute;
   *   <li>labels whose id starts with "label" and whose class starts with "d": an entry in {@code
   *       dimNames} keyed by the class, with the label text cut before the first '(';
   *   <li>the span with id "sku": {@code pid}, the text after the first '#';
   *   <li>bullets starting with "Weight": {@code weight}, the text after the first ':';
   *   <li>the div with class "description": {@code intro}, the concatenated bullet texts of its
   *       first child;
   *   <li>the first script tag seen while {@code items} is still null: {@code items}, via {@code
   *       readScript}.
   * </ul>
   *
   * Select tags are ignored.
   *
   * @throws IOException if the URL is malformed or the connection cannot be opened
   * @throws ParserException if the HTML parser fails
   * @throws ParseException declared by the signature; presumably raised by {@code readScript} —
   *     TODO confirm
   */
  private void scanPage() throws IOException, ParserException, ParseException {
    HttpURLConnection connection = (HttpURLConnection) new URL(this.url).openConnection();
    Parser pageParser = new Parser(connection);
    // NOTE(review): these JVM-wide properties are set after the connection object was created
    // and use a very large value (30000000); confirm they have the intended effect.
    System.setProperty("sun.net.client.defaultConnectTimeout", "30000000");
    System.setProperty("sun.net.client.defaultReadTimeout", "30000000");
    pageParser.setEncoding("UTF-8");
    NodeList composites =
        pageParser.extractAllNodesThatMatch(new NodeClassFilter(CompositeTag.class));

    for (SimpleNodeIterator nodes = composites.elements(); nodes.hasMoreNodes(); ) {
      CompositeTag tag = (CompositeTag) nodes.nextNode();
      String id = tag.getAttribute("id");
      String cls = tag.getAttribute("class");

      if (tag instanceof LinkTag) {
        if (cls != null) {
          LinkTag link = (LinkTag) tag;
          // First matching rule wins.
          if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Category")) {
            this.category = link.getStringText();
          } else if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Sub-Category")) {
            this.subCategory = link.getStringText();
          } else if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Brand")) {
            this.brand = link.getStringText();
          } else if (cls.startsWith("gae-click*Product-Page*PrForm*Free-Shipping")) {
            this.freight = "Free Shipping!";
          } else if (cls.equalsIgnoreCase("link fn")) {
            this.pname = link.getStringText();
          }
        }
      } else if (tag instanceof LabelTag) {
        boolean isDimensionLabel =
            id != null && id.startsWith("label") && cls != null && cls.startsWith("d");
        if (isDimensionLabel) {
          String label = ((LabelTag) tag).getLabel().replace("\n", "");
          int paren = label.indexOf('(');
          if (paren > 0) {
            label = label.substring(0, paren);
          }
          this.dimNames.put(cls, label);
        }
      } else if (tag instanceof SelectTag) {
        // Select tags carry nothing of interest.
      } else if (tag instanceof Span) {
        if (id != null && id.equalsIgnoreCase("sku")) {
          String sku = tag.getStringText();
          this.pid = sku.substring(sku.indexOf('#') + 1);
        }
      } else if (tag instanceof Bullet) {
        String bulletText = ((Bullet) tag).getStringText().trim();
        if (bulletText.startsWith("Weight")) {
          this.weight = bulletText.substring(bulletText.indexOf(":") + 1).trim();
        }
      } else if (tag instanceof Div) {
        if (cls != null && cls.equalsIgnoreCase("description")) {
          StringBuilder description = new StringBuilder();
          BulletList bulletList = (BulletList) ((Div) tag).getChild(0);
          for (SimpleNodeIterator items = bulletList.elements(); items.hasMoreNodes(); ) {
            Node item = items.nextNode();
            if (item instanceof Bullet) {
              description.append(((Bullet) item).getStringText());
            }
          }
          this.intro = description.toString();
        }
      } else if (this.items == null && tag instanceof ScriptTag) {
        this.items = readScript((ScriptTag) tag);
      }
    }
  }