/**
 * Removes every tag with the given name from {@code tags} and returns that same list.
 *
 * <p>The list is filtered in place; no copy is made, so the list instance handed in by the
 * caller is the one that shrinks.
 *
 * @param tagNameToRemove name of the tags to drop; compared by value against the result of
 *     {@code Tag.getName()}
 * @param tags list to filter; must support removal by index if any tag matches
 * @return the {@code tags} instance that was passed in, with all matching tags removed
 */
private List<Tag> tagsWithSpecificTagRemoved(String tagNameToRemove, List<Tag> tags) {
  log.debug("Tag name to remove = {}, tags to operate on = {}", tagNameToRemove, tags);

  int i = 0;
  while (i < tags.size()) {
    Tag currentTag = tags.get(i);
    String currentName = currentTag.getName();
    log.debug("Current tag name = {}", currentName);

    // Compare by value, not by reference: two equal names are not guaranteed to be the same
    // String instance. The null branch keeps the old "null matches null" outcome.
    boolean matches =
        (currentName == null) ? (tagNameToRemove == null) : currentName.equals(tagNameToRemove);

    if (matches) {
      // Remove by position so exactly this entry goes away; the next element slides into
      // slot i, so the index must not advance.
      tags.remove(i);
    } else {
      i++;
    }
  }

  log.debug("returning tags = {}", tags);
  return tags;
}
/**
 * Scrapes name/value fields out of the {@code <li>} elements found in the given HTML.
 *
 * <p>Each list item is handled in one of two ways:
 *
 * <ol>
 *   <li>If the item's extracted text splits on {@code ':'} into exactly two parts, those parts
 *       become the field's two constructor arguments as-is (no trimming).
 *   <li>Otherwise, if the item has exactly four tags once {@code <br>} tags are discarded, the
 *       element belonging to the item's second tag (index 1 of the unfiltered tag list) is
 *       rendered and used as the label. When the rendered text of the whole item starts with
 *       that label, the remainder after skipping one character becomes the value (empty if
 *       nothing follows). A single trailing {@code ':'} is stripped from both rendered texts
 *       before they are compared.
 * </ol>
 *
 * <p>Items matching neither shape are ignored.
 *
 * @param html markup to parse
 * @return the fields found, in document order; empty if none matched
 */
private List<Field> extractFieldsFromUL(String html) {
  List<Field> result = new ArrayList<Field>();

  Source document = new Source(html);
  document.fullSequentialParse();

  for (Element item : document.getAllElements(HTMLElementName.LI)) {
    String itemText = item.getTextExtractor().toString();
    log.debug("looking at li: {} w/text: {}", item, itemText);

    // Shape 1: plain "name:value" text.
    String[] pieces = itemText.split(":");
    if (pieces.length == 2) {
      Field simpleField = new ScrapedField(pieces[0], pieces[1]);
      result.add(simpleField);
      log.debug("found <li> to process: {}, added field: {}", item, simpleField);
      continue;
    }

    // Shape 2 requires exactly four tags once <br> tags are ignored.
    List<Tag> tagsWithoutBreaks =
        tagsWithSpecificTagRemoved(HTMLElementName.BR, item.getAllTags());
    if (tagsWithoutBreaks.size() != 4) {
      continue;
    }

    // The label tag is looked up in a freshly requested, unfiltered tag list.
    // NOTE(review): index 1 is presumably the first tag nested inside the <li> - confirm.
    Tag labelTag = item.getAllTags().get(1);
    log.debug("enclosing tag: {}", labelTag);

    Element labelElement = labelTag.getElement();
    log.debug(
        "first element of enclosing tag: {}", labelElement.getTextExtractor().toString());

    // Render without line wrapping, then drop surrounding whitespace and one trailing colon.
    String renderedLabel =
        labelElement.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
    String label = renderedLabel.trim().replaceAll(":$", "");

    String renderedItem = item.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
    String wholeText = renderedItem.trim().replaceAll(":$", "");

    log.debug("enclosing tag text starts at: {}", wholeText.indexOf(label));
    log.debug(
        "tagText (length = {}): {} alltext (length = {}): {}",
        new Object[] {label.length(), label, wholeText.length(), wholeText});

    if (!wholeText.startsWith(label)) {
      continue;
    }

    // Skip the single character that follows the label; nothing after it means no value.
    String value = "";
    if (wholeText.length() > label.length()) {
      value = wholeText.substring(label.length() + 1);
    }

    result.add(new ScrapedField(label, value));
    log.debug("extracted fields = {}", result);
  }

  return result;
}