/**
  * Regression test for badly nested markup, reduced from
  * "http://scores.nba.com/games/20031029/scoreboard.html". That page contains many table columns
  * whose DIV is never closed because the intended closing DIV is missing its slash, which used to
  * end in a java.lang.StackOverflowError on Windows. Exercises the non-recursive
  * CompositeTagScanner and its walk back through the parse stack. Related: Bug #750117
  * StackOverFlow while Node-Iteration, among others.
  */
 public void testInvalidNesting() throws ParserException {
   createParser(
       "<table cellspacing=\"2\" cellpadding=\"0\" border=\"0\" width=\"600\">\n"
           + "<tr>\n"
           + "<td><div class=\"ScoreBoardSec\">&nbsp;<a  target=\"_parent\" class=\"ScoreBoardSec\" href=\"http://www.nba.com/heat/\">Heat</a><div></td>\n"
           + "</tr>\n"
           + "</table>");
   parseAndAssertNodeCount(1);

   // Walk down the tree one level at a time: table -> row -> column -> div.
   assertType("table", TableTag.class, node[0]);
   TableTag tableTag = (TableTag) node[0];
   assertTrue("table should have 3 nodes", tableTag.getChildCount() == 3);

   assertType("row", TableRow.class, tableTag.childAt(1));
   TableRow tableRow = (TableRow) tableTag.childAt(1);
   assertTrue("row should have 3 nodes", tableRow.getChildCount() == 3);

   assertType("column", TableColumn.class, tableRow.childAt(1));
   TableColumn tableColumn = (TableColumn) tableRow.childAt(1);
   assertTrue("column should have 1 node", tableColumn.getChildCount() == 1);

   assertType("element", Div.class, tableColumn.childAt(0));
   Div outerDiv = (Div) tableColumn.childAt(0);
   assertTrue("div should have 3 nodes", outerDiv.getChildCount() == 3);

   assertType("link", LinkTag.class, outerDiv.childAt(1));
   LinkTag anchor = (LinkTag) outerDiv.childAt(1);
   assertTrue("link contents", anchor.getLink().equals("http://www.nba.com/heat/"));

   // The slash-less "closing" DIV is expected to show up as an empty nested DIV.
   assertType("bogus div", Div.class, outerDiv.childAt(2));
   Div bogusDiv = (Div) outerDiv.childAt(2);
   assertTrue("bogus div should have no children", bogusDiv.getChildCount() == 0);
 }
Example #2
0
 /**
  * Builds a minimal {@link HtmlTable} for tests.
  *
  * @param tableIdentifier text placed in the table's only cell
  * @return an HtmlTable wrapping a table tag with one row, one column and one text node
  */
 private HtmlTable makeMockTable(String tableIdentifier) {
   // Assemble the tree from the inside out: text -> column -> row -> table.
   TableColumn cell = new TableColumn();
   cell.setChildren(new NodeList(new TextNode(tableIdentifier)));

   TableRow row = new TableRow();
   row.setChildren(new NodeList(cell));

   TableTag table = new TableTag();
   table.setChildren(new NodeList(row));

   return new HtmlTable(table);
 }
Example #3
0
  /**
   * Returns the number of columns occupied by the cells of the given row.
   *
   * <p>Every child node of the row that matches {@code HTML_ROW_FILTER} counts as one cell. A cell
   * contributes the value of its {@code colspan} attribute, or 1 when {@code MathUtils.parseInt}
   * yields {@code null} for that attribute.
   *
   * @param row the table row to inspect.
   * @return the summed column span of the row's cells; 0 when the row has no child nodes.
   */
  private static int getColumnCount(TableRow row) {
    // A row without any child nodes (e.g. "<tr></tr>") has a null child list
    if (row.getChildren() == null) {
      return 0;
    }

    Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();

    int cols = 0;

    for (Node cell : cells) {
      Integer colSpan = MathUtils.parseInt(((TagNode) cell).getAttribute("colspan"));

      cols += colSpan != null ? colSpan : 1;
    }

    return cols;
  }
Example #4
0
  /**
   * Extracts the course records from the tables of the given HTML page.
   *
   * <p>Every table row is scanned column by column. A cell containing the label "学年学期:" updates
   * the current school year and semester, which are then applied to all courses found from that
   * point on. A row counts as a course row when its column 1 contains "["; columns 2 to 8 of such a
   * row are copied into the {@code Courses} object (credit, type, lean type, check type, get type,
   * score, remark).
   *
   * @param html the HTML source of the page
   * @return the parsed courses; empty when the page cannot be parsed or holds no course rows
   * @throws NumberFormatException if the credit column of a course row is not a number
   * @throws StringIndexOutOfBoundsException if a term label or course cell is shorter than the
   *     fixed offsets used below
   */
  public List<Courses> parseCourses(String html) {
    List<Courses> list = new ArrayList<Courses>();

    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    NodeList nodeList;
    try {
      nodeList = parser.extractAllNodesThatMatch(new NodeClassFilter(TableTag.class));
    } catch (ParserException e) {
      e.printStackTrace();
      // Nothing could be extracted; previously this fell through to a NullPointerException.
      return list;
    }
    if (nodeList == null) {
      return list;
    }

    final String termLabel = "学年学期:";
    final int yearLength = "2012-2013".length();

    // School year / semester persist across rows and tables until the next label is seen.
    String schoolyear = "";
    String semester = "";

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableRow[] rows = ((TableTag) nodeList.elementAt(i)).getRows();
      for (int j = 0; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        Courses courses = null; // non-null once column 1 identified this row as a course
        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();

          int start = info.indexOf(termLabel);
          if (start != -1) {
            start += termLabel.length();
            schoolyear = info.substring(start, start + yearLength);
            // The semester character sits three characters after the year; it is
            // normalised to a digit.
            semester = info.substring(start + yearLength + 3, start + yearLength + 4);
            if ("一".equals(semester)) {
              semester = "1";
            } else if ("二".equals(semester)) {
              semester = "2";
            }
          }

          if (k == 1) {
            if (info.indexOf("[") != -1) {
              // Characters 1..8 are taken as the course code, everything from index 10
              // on as the course name.
              String courseCode = info.substring(1, 9);
              String coursesname = info.substring(10);
              courses = new Courses();
              courses.setCourseCode(courseCode);
              courses.setCoursesname(coursesname);
            }
            continue;
          }
          if (courses == null) {
            continue;
          }

          switch (k) {
            case 2:
              courses.setCredit(Double.parseDouble(info));
              break;
            case 3:
              courses.setType(info);
              break;
            case 4:
              courses.setLeanType(info);
              break;
            case 5:
              courses.setCheckType(info);
              break;
            case 6:
              courses.setGetType(info);
              break;
            case 7:
              // The score is kept as text rather than parsed as a number.
              courses.setScore(info);
              break;
            case 8:
              courses.setRemark(info);
              break;
            default:
              break;
          }
        } // end for k
        if (courses != null) {
          courses.setSchoolYear(schoolyear);
          courses.setSemester(semester);
          list.add(courses);
        }
      } // end for j
    }
    return list;
  }
Example #5
0
  /**
   * Extracts timetable entries from the tables of the given HTML page.
   *
   * <p>Only tables whose tag text contains "[课程号]" are considered, and their first row is
   * skipped. For each remaining row, column 1 is expected to hold "[code]name"; columns 3, 4 and 5
   * supply credit, type and teacher, and column 8 is handed to {@code praseStr}. One
   * {@code TimeTable} copy is added to the result for every element {@code praseStr} returns.
   *
   * <p>Rows whose column 1 has no closing bracket after the opening one are skipped.
   *
   * @param html the HTML source of the page
   * @return the parsed entries; empty when the page cannot be parsed or holds no matching rows
   * @throws NumberFormatException if the credit column of a row is not a number
   */
  public List<TimeTable> parseTimeTables(String html) {
    List<TimeTable> list = new ArrayList<TimeTable>();

    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    NodeList nodeList;
    try {
      nodeList = parser.extractAllNodesThatMatch(new NodeClassFilter(TableTag.class));
    } catch (ParserException e) {
      e.printStackTrace();
      // Nothing could be extracted; previously this fell through to a NullPointerException.
      return list;
    }
    if (nodeList == null) {
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableTag tag = (TableTag) nodeList.elementAt(i);
      if (tag.getText().indexOf("[课程号]") == -1) {
        continue;
      }
      TableRow[] rows = tag.getRows();
      for (int j = 1; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        TimeTable timeTable = null;
        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();

          if (k == 1) {
            int start = info.indexOf("[");
            int end = info.indexOf("]");
            if (end <= start) {
              // No "[code]" in this cell: the substring below would throw, so the
              // whole row is skipped.
              break;
            }
            timeTable = new TimeTable();
            timeTable.setCourseCode(info.substring(start + 1, end));
            timeTable.setCourseName(info.substring(end + 1));
            continue;
          }
          if (timeTable == null) {
            continue; // column 0 carries nothing that is stored
          }

          switch (k) {
            case 3:
              timeTable.setCredit(Double.parseDouble(info));
              break;
            case 4:
              timeTable.setType(info);
              break;
            case 5:
              // Teacher name is whatever follows the first "]"; the whole text when absent.
              int t_start = info.indexOf("]");
              timeTable.setTeacher(info.substring(t_start + 1));
              break;
            case 8:
              // One result entry per time/address element; the row's TimeTable is reused
              // as a template and cloned for each.
              List<TimeAndAdress> ta_list = praseStr(info);
              for (TimeAndAdress ta : ta_list) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                list.add(timeTable.clone());
              }
              break;
            default:
              break;
          }
        } // end for k
      } // end for j
    }
    return list;
  }
Example #6
0
  /**
   * Creates a list of Grids based on the given HTML string. This works only for table-based HTML
   * documents.
   *
   * <p>One Grid is produced per {@code table} element. The first row that has at least one cell
   * becomes the grid headers; every later row is added as a grid row provided its column count
   * (colspan-aware, see {@code getColumnCount}) equals that of the header row. Rows without cells
   * and rows with a differing column count are logged and skipped. Cells spanning several columns
   * are padded with empty headers / values. Row spans are not handled.
   *
   * @param html the HTML string.
   * @return a list of Grids, or null if the HTML string is null or blank.
   * @throws Exception if the HTML cannot be parsed.
   */
  public static List<Grid> fromHtml(String html) throws Exception {
    if (html == null || html.trim().isEmpty()) {
      return null;
    }

    List<Grid> grids = new ArrayList<>();

    Parser parser = Parser.createParser(html, "UTF-8");

    Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray();

    for (Node t : tables) {
      Grid grid = new ListGrid();

      TableTag table = (TableTag) t;

      TableRow[] rows = table.getRows();

      Integer firstColumnCount = null;

      for (TableRow row : rows) {
        // A row without any child nodes has a null child list; treat it as having no cells.
        // The count is computed once per row and reused below.
        int columnCount = row.getChildren() == null ? 0 : getColumnCount(row);

        if (columnCount == 0) {
          log.warn("Ignoring row with no columns");
          continue;
        }

        Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();

        if (firstColumnCount == null) {
          // First row becomes header
          firstColumnCount = columnCount;

          for (Node c : cells) {
            TagNode cell = (TagNode) c;

            grid.addHeader(new GridHeader(getValue(cell), false, false));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyHeaders((colSpan - 1));
            }
          }
        } else {
          // Rest becomes rows
          if (firstColumnCount.intValue() != columnCount) {
            // Report the same colspan-aware count that was used for the comparison
            log.warn(
                "Ignoring row which has "
                    + columnCount
                    + " columns since table has "
                    + firstColumnCount
                    + " columns");
            continue;
          }

          grid.addRow();

          for (Node c : cells) {
            // TODO row span

            TagNode cell = (TagNode) c;

            grid.addValue(getValue(cell));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyValues((colSpan - 1));
            }
          }
        }
      }

      grids.add(grid);
    }

    return grids;
  }
Example #7
0
  /**
   * Extracts timetable entries from the table with {@code id="reportArea"} in the given HTML page.
   *
   * <p>A row counts as a course row when its column 1 contains "[". For such rows columns 2 to 6
   * supply credit, type, teacher, class id and class number, and column 11 is handed to
   * {@code praseStr}. One {@code TimeTable} copy is added to the result for every element
   * {@code praseStr} returns.
   *
   * @param html the HTML source of the page
   * @return the parsed entries; empty when the page cannot be parsed or holds no course rows
   * @throws NumberFormatException if the credit column of a course row is not a number
   * @throws StringIndexOutOfBoundsException if a course cell is shorter than the fixed offsets
   *     used below
   */
  public List<TimeTable> parseTimeTables(String html) {
    List<TimeTable> list = new ArrayList<TimeTable>();

    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    NodeFilter tagfilter = new NodeClassFilter(TableTag.class);
    NodeFilter idFilter = new HasAttributeFilter("id", "reportArea");
    NodeFilter filter = new AndFilter(tagfilter, idFilter);

    NodeList nodeList;
    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
    } catch (ParserException e) {
      e.printStackTrace();
      // Nothing could be extracted; previously this fell through to a NullPointerException.
      return list;
    }
    if (nodeList == null) {
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableRow[] rows = ((TableTag) nodeList.elementAt(i)).getRows();
      for (int j = 0; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        TimeTable timeTable = null; // non-null once column 1 identified this row as a course

        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();

          if (k == 1) {
            if (info.indexOf("[") != -1) {
              // Characters 1..8 are taken as the course code, everything from index 10
              // on as the course name.
              String courseCode = info.substring(1, 9);
              String coursesname = info.substring(10);
              timeTable = new TimeTable();
              timeTable.setCourseName(coursesname);
              timeTable.setCourseCode(courseCode);
            }
            continue;
          }
          if (timeTable == null) {
            continue;
          }

          switch (k) {
            case 2:
              timeTable.setCredit(Double.parseDouble(info));
              break;
            case 3:
              timeTable.setType(info);
              break;
            case 4:
              timeTable.setTeacher(info);
              break;
            case 5:
              timeTable.setClassId(info);
              break;
            case 6:
              timeTable.setClassNum(info);
              break;
            case 11:
              // One result entry per time/address element; the row's TimeTable is reused
              // as a template and cloned for each.
              List<TimeAndAdress> ta_list = praseStr(info);
              for (TimeAndAdress ta : ta_list) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                list.add(timeTable.clone());
              }
              break;
            default:
              break;
          }
        } // end for k
      } // end for j
    }
    return list;
  }