/**
 * Locate the closing tag that balances a given opening tag.
 *
 * <p>The page's data units are scanned forward from {@code index}. Only tags
 * whose name equals {@code tag}'s name (ignoring case) are considered. Every
 * matching BEGIN tag met during the scan opens one more nesting level, and
 * every matching END tag closes one; the first matching END tag seen while no
 * nested level is open is the result.
 *
 * <p>Because a matching BEGIN tag located at {@code index} itself is counted
 * as a nested level, the scan is expected to start after the opening tag
 * whose partner is being sought.
 *
 * @param index The data unit index at which the scan starts.
 * @param tag The opening tag whose closing partner is wanted.
 * @return The data unit index of the balancing END tag, or -1 if the end of
 *         the page is reached without finding one.
 */
public final int findEndTag(final int index, final Tag tag) {
  // Number of same-named BEGIN tags seen so far that are still unclosed.
  int nesting = 0;

  for (int position = index; position < this.page.getDataSize(); position++) {
    final DataUnit unit = this.page.getDataUnit(position);
    if (!(unit instanceof TagDataUnit)) {
      continue;
    }

    final Tag candidate = ((TagDataUnit) unit).getTag();
    if (!tag.getName().equalsIgnoreCase(candidate.getName())) {
      continue;
    }

    if (candidate.getType() == Tag.Type.BEGIN) {
      nesting++;
    } else if (candidate.getType() == Tag.Type.END) {
      if (nesting == 0) {
        return position;
      }
      // This END tag closes a nested element, not the one we are matching.
      nesting--;
    }
  }

  return -1;
}
/**
 * Build the web page contents from the data units, which are expected to have
 * been loaded already.
 *
 * <p>Each tag data unit is dispatched by tag name (compared ignoring case):
 * <ul>
 *   <li>{@code a}, {@code title}, {@code form} and {@code input} tags of any
 *       type other than END are handed to their respective loaders;</li>
 *   <li>BEGIN {@code div} and {@code span} tags are handed to their
 *       loaders;</li>
 *   <li>END {@code div} and {@code span} tags move the current hierarchy
 *       element, if there is one, back up to its parent.</li>
 * </ul>
 * Data units that are not tags are skipped.
 */
protected final void loadContents() {
  for (int position = 0; position < this.page.getDataSize(); position++) {
    final DataUnit unit = this.page.getDataUnit(position);
    if (!(unit instanceof TagDataUnit)) {
      continue;
    }

    final Tag current = ((TagDataUnit) unit).getTag();
    final String name = current.getName();
    final boolean closing = current.getType() == Tag.Type.END;
    final boolean opening = current.getType() == Tag.Type.BEGIN;

    if (closing) {
      // Leaving a div/span: step back up to the enclosing hierarchy element.
      final boolean container =
          name.equalsIgnoreCase("div") || name.equalsIgnoreCase("span");
      if (container && this.lastHierarchyElement != null) {
        this.lastHierarchyElement = this.lastHierarchyElement.getParent();
      }
      continue;
    }

    // The tag names below are mutually exclusive, so one chain is enough.
    if (name.equalsIgnoreCase("a")) {
      loadLink(position, current);
    } else if (name.equalsIgnoreCase("title")) {
      loadTitle(position, current);
    } else if (name.equalsIgnoreCase("form")) {
      loadForm(position, current);
    } else if (name.equalsIgnoreCase("input")) {
      loadInput(position, current);
    } else if (opening) {
      // Containers are only loaded for an explicit BEGIN tag.
      if (name.equalsIgnoreCase("div")) {
        loadDiv(position, current);
      } else if (name.equalsIgnoreCase("span")) {
        loadSpan(position, current);
      }
    }
  }
}