Пример #1
0
 /**
  * Removes every tag whose name equals {@code tagNameToRemove} from {@code tags}.
  *
  * <p>The list is modified in place and the very same list instance is returned, so the list
  * passed in must support element removal.
  *
  * @param tagNameToRemove name of the tags to drop
  * @param tags list of tags to filter; mutated by this call
  * @return {@code tags} itself, with all matching tags removed
  */
 private List<Tag> tagsWithSpecificTagRemoved(String tagNameToRemove, List<Tag> tags) {
   log.debug("Tag name to remove = {}, tags to operate on = {}", tagNameToRemove, tags);
   int index = 0;
   while (index < tags.size()) {
     String currentName = tags.get(index).getName();
     log.debug("Current tag name = {}", currentName);
     // Compare by value: reference equality (==) only matches when both strings happen to be
     // the same instance.
     boolean matches =
         (currentName == null) ? tagNameToRemove == null : currentName.equals(tagNameToRemove);
     if (matches) {
       // Remove by position so exactly the inspected element is dropped. The following element
       // shifts into this slot, so the index is deliberately not advanced.
       tags.remove(index);
     } else {
       index++;
     }
   }
   log.debug("returning tags = {}", tags);
   return tags;
 }
Пример #2
0
 /**
  * Parses {@code html} and turns its {@code <li>} elements into fields.
  *
  * <p>Each {@code <li>} is handled in one of two ways:
  *
  * <ul>
  *   <li>If its extracted text splits on {@code ":"} into exactly two parts, those parts become
  *       the two constructor arguments of a {@link ScrapedField}.
  *   <li>Otherwise, if the element has exactly four tags once {@code <br>} tags are removed, the
  *       rendered text of the element belonging to its second tag is used as the first argument
  *       and the remainder of the rendered {@code <li>} text as the second.
  * </ul>
  *
  * <p>Items matching neither shape are ignored.
  *
  * @param html markup to scan
  * @return the fields extracted, in document order; empty if none were found
  */
 private List<Field> extractFieldsFromUL(String html) {
   List<Field> result = new ArrayList<Field>();
   Source parsedHtml = new Source(html);
   parsedHtml.fullSequentialParse();

   for (Element item : parsedHtml.getAllElements(HTMLElementName.LI)) {
     String itemText = item.getTextExtractor().toString();
     log.debug("looking at li: {} w/text: {}", item, itemText);

     // Simple case: the text is a single "name:value" pair.
     String[] pieces = itemText.split(":");
     if (pieces.length == 2) {
       Field simpleField = new ScrapedField(pieces[0], pieces[1]);
       result.add(simpleField);
       log.debug("found <li> to process: {}, added field: {}", item, simpleField);
       continue;
     }

     // Fallback case only applies to items with exactly four non-<br> tags.
     if (tagsWithSpecificTagRemoved(HTMLElementName.BR, item.getAllTags()).size() != 4) {
       continue;
     }

     Tag labelTag = item.getAllTags().get(1);
     log.debug("enclosing tag: {}", labelTag);
     Element labelElement = labelTag.getElement();
     log.debug(
         "first element of enclosing tag: {}", labelElement.getTextExtractor().toString());

     String label = renderOnOneLineWithoutTrailingColon(labelElement);
     String wholeText = renderOnOneLineWithoutTrailingColon(item);
     log.debug("enclosing tag text starts at: {}", wholeText.indexOf(label));
     log.debug(
         "tagText (length = {}): {} alltext (length = {}): {}",
         new Object[] {label.length(), label, wholeText.length(), wholeText});

     if (!wholeText.startsWith(label)) {
       continue;
     }

     String value;
     if (wholeText.length() > label.length()) {
       // Skips one character after the label (presumably the separator -- confirm).
       value = wholeText.substring(label.length() + 1);
     } else {
       value = "";
     }
     result.add(new ScrapedField(label, value));
     log.debug("extracted fields = {}", result);
   }
   return result;
 }

 /** Renders {@code element} without line wrapping, trims it and strips one trailing colon. */
 private String renderOnOneLineWithoutTrailingColon(Element element) {
   String rendered = element.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
   return rendered.trim().replaceAll(":$", "");
 }