/**
 * Verifies that improperly nested tags are parsed without blowing the stack.
 *
 * <p>The markup is taken from "http://scores.nba.com/games/20031029/scoreboard.html", which has
 * many table columns with unclosed DIV tags because the closing DIV lacks its slash. This used to
 * cause java.lang.StackOverflowError on Windows. The test exercises the non-recursive
 * CompositeTagScanner and its walk back through the parse stack. See also Bug #750117
 * "StackOverFlow while Node-Iteration" and others.
 *
 * @throws ParserException if the parser fails on the markup.
 */
public void testInvalidNesting() throws ParserException {
  String markup =
      "<table cellspacing=\"2\" cellpadding=\"0\" border=\"0\" width=\"600\">\n"
          + "<tr>\n"
          + "<td><div class=\"ScoreBoardSec\"> <a target=\"_parent\" class=\"ScoreBoardSec\" href=\"http://www.nba.com/heat/\">Heat</a><div></td>\n"
          + "</tr>\n"
          + "</table>";
  createParser(markup);
  parseAndAssertNodeCount(1);

  // Top level: a single table node.
  assertType("table", TableTag.class, node[0]);
  TableTag tableTag = (TableTag) node[0];
  assertTrue("table should have 3 nodes", tableTag.getChildCount() == 3);

  // The row sits at index 1 of the table's children.
  assertType("row", TableRow.class, tableTag.childAt(1));
  TableRow tableRow = (TableRow) tableTag.childAt(1);
  assertTrue("row should have 3 nodes", tableRow.getChildCount() == 3);

  // The column sits at index 1 of the row's children.
  assertType("column", TableColumn.class, tableRow.childAt(1));
  TableColumn cell = (TableColumn) tableRow.childAt(1);
  assertTrue("column should have 1 node", cell.getChildCount() == 1);

  // The properly opened DIV is the column's only child.
  assertType("element", Div.class, cell.childAt(0));
  Div outerDiv = (Div) cell.childAt(0);
  assertTrue("div should have 3 nodes", outerDiv.getChildCount() == 3);

  assertType("link", LinkTag.class, outerDiv.childAt(1));
  LinkTag anchor = (LinkTag) outerDiv.childAt(1);
  assertTrue("link contents", anchor.getLink().equals("http://www.nba.com/heat/"));

  // The slash-less "closing" DIV shows up as an empty nested DIV.
  assertType("bogus div", Div.class, outerDiv.childAt(2));
  Div strayDiv = (Div) outerDiv.childAt(2);
  assertTrue("bogus div should have no children", strayDiv.getChildCount() == 0);
}
private HtmlTable makeMockTable(String tableIdentifier) { // Create just enough "table" to test if TableTag tableTag = new TableTag(); TableRow tableRow = new TableRow(); TableColumn tableColumn = new TableColumn(); tableColumn.setChildren(new NodeList(new TextNode(tableIdentifier))); tableRow.setChildren(new NodeList(tableColumn)); tableTag.setChildren(new NodeList(tableRow)); return new HtmlTable(tableTag); }
/**
 * Returns the number of columns the given row occupies, taking column spans into account.
 *
 * <p>Each child matching {@code HTML_ROW_FILTER} contributes its {@code colspan} value; a cell for
 * which {@code MathUtils.parseInt} yields {@code null} contributes one column.
 *
 * @param row the table row to measure.
 * @return the total column count, or 0 when the row has no child nodes.
 */
private static int getColumnCount(TableRow row) {
  if (row.getChildren() == null) {
    // htmlparser may leave the child list null for a tag without children (e.g. "<tr></tr>");
    // treat that as a row with no cells instead of failing with a NullPointerException.
    return 0;
  }

  int cols = 0;
  for (Node cell : row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray()) {
    Integer colSpan = MathUtils.parseInt(((TagNode) cell).getAttribute("colspan"));
    cols += colSpan != null ? colSpan : 1;
  }
  return cols;
}
/** 从课表处,分课表 */ public List<Courses> parseCourses(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { e.printStackTrace(); } NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { e.printStackTrace(); } List<Courses> list = new ArrayList<Courses>(); String schoolyear = ""; String semester = ""; for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); Courses courses = null; boolean isCourse = false; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); String temp = "学年学期:"; int start = info.indexOf(temp); int len = "2012-2013".length(); if (start != -1) { start = start + temp.length(); schoolyear = info.substring(start, start + len); // semester = info.substring(start+len+2); // 网络正常时候测试学期改为数字 semester = info.substring(start + len + 3, start + len + 4); if ("一".equals(semester)) { semester = "1"; } else if ("二".equals(semester)) { semester = "2"; } } if (k == 1 && info.indexOf("[") != -1) { courses = new Courses(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); courses.setCourseCode(courseCode); courses.setCoursesname(coursesname); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); courses.setCredit(credit); } if (k == 3 && isCourse) { courses.setType(info); } if (k == 4 && isCourse) { courses.setLeanType(info); } if (k == 5 && isCourse) { courses.setCheckType(info); } if (k == 6 && isCourse) { courses.setGetType(info); } if (k == 7 && isCourse) { // double score=Double.parseDouble(info); 
courses.setScore(info); } if (k == 8 && isCourse) { courses.setRemark(info); } } // end for k if (courses != null) { courses.setSchoolYear(schoolyear); courses.setSemester(semester); list.add(courses); } } // end for j } } return list; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); if (tag.getText().indexOf("[课程号]") == -1) { continue; } TableRow[] rows = tag.getRows(); for (int j = 1; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); System.out.println(info + "===" + k); switch (k) { case 1: int start = info.indexOf("["); int end = info.indexOf("]"); timeTable = new TimeTable(); timeTable.setCourseCode(info.substring(start + 1, end)); timeTable.setCourseName(info.substring(end + 1)); break; case 3: timeTable.setCredit(Double.parseDouble(info)); break; case 4: timeTable.setType(info); break; case 5: int t_start = info.indexOf("]"); timeTable.setTeacher(info.substring(t_start + 1)); break; case 8: List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } break; default: break; } } } // end for j } } return list; }
/** * Creates a list of Grids based on the given HTML string. This works only for table-based HTML * documents. * * @param html the HTML string. * @return a list of Grids. */ public static List<Grid> fromHtml(String html) throws Exception { if (html == null || html.trim().isEmpty()) { return null; } List<Grid> grids = new ArrayList<>(); Parser parser = Parser.createParser(html, "UTF-8"); Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray(); for (Node t : tables) { Grid grid = new ListGrid(); TableTag table = (TableTag) t; TableRow[] rows = table.getRows(); Integer firstColumnCount = null; for (TableRow row : rows) { if (getColumnCount(row) == 0) // Ignore if no cells { log.warn("Ignoring row with no columns"); continue; } Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray(); if (firstColumnCount == null) // First row becomes header { firstColumnCount = getColumnCount(row); for (Node c : cells) { TagNode cell = (TagNode) c; grid.addHeader(new GridHeader(getValue(cell), false, false)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyHeaders((colSpan - 1)); } } } else // Rest becomes rows { if (firstColumnCount != getColumnCount(row)) // Ignore { log.warn( "Ignoring row which has " + row.getColumnCount() + " columns since table has " + firstColumnCount + " columns"); continue; } grid.addRow(); for (Node c : cells) { // TODO row span TagNode cell = (TagNode) c; grid.addValue(getValue(cell)); Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan")); if (colSpan != null && colSpan > 1) { grid.addEmptyValues((colSpan - 1)); } } } } grids.add(grid); } return grids; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter tagfilter = new NodeClassFilter(TableTag.class); NodeFilter idFilter = new HasAttributeFilter("id", "reportArea"); NodeFilter filter = new AndFilter(tagfilter, idFilter); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); // System.out.println(info+"=="+k); if (k == 1 && info.indexOf("[") != -1) { timeTable = new TimeTable(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); timeTable.setCourseName(coursesname); timeTable.setCourseCode(courseCode); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); timeTable.setCredit(credit); } if (k == 3 && isCourse) { timeTable.setType(info); } if (k == 4 && isCourse) { timeTable.setTeacher(info); } if (k == 5 && isCourse) { timeTable.setClassId(info); } if (k == 6 && isCourse) { timeTable.setClassNum(info); } if (k == 11 && isCourse) { List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); 
timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } } } // end for k } // end for j } } return list; }