/**
 * Extracts label/value fields from the table markup in {@code html}.
 *
 * <p>Two layouts are handled:
 *
 * <ul>
 *   <li>When the markup holds exactly twice as many {@code td} cells as {@code tr} rows, the
 *       cells are consumed in document order as (label, value) pairs. A trailing {@code ':'} is
 *       stripped from the trimmed label text. A pair whose label is empty adds its value to the
 *       most recently created field instead of starting a new one.
 *   <li>Otherwise every row is processed on its own. Text of {@code th} cells is collected as
 *       header labels and kept until the next row that contains {@code td} cells, which consumes
 *       them; cells without a matching header are labelled {@code "col" + index}. The header
 *       list is reset after each row that contained {@code td} cells.
 * </ul>
 *
 * @param html markup to parse
 * @return the extracted fields in document order; empty when the markup has no {@code td} cells
 */
private List<Field> extractFieldsFromTable(String html) {
  List<Field> result = new ArrayList<Field>();
  Source source = new Source(html);
  source.fullSequentialParse();
  List<Element> allCells = source.getAllElements(HTMLElementName.TD);
  List<Element> allRows = source.getAllElements(HTMLElementName.TR);
  int cellCount = allCells.size();
  int rowCount = allRows.size();
  log.debug("found {} cells in {} rows", cellCount, rowCount);

  if (cellCount != (rowCount * 2)) {
    // Row-oriented layout: labels come from <th> cells (or are synthesised).
    List<String> pendingHeaders = new ArrayList<String>();
    for (Element row : allRows) {
      List<Element> headerCells = row.getAllElements(HTMLElementName.TH);
      if (!headerCells.isEmpty()) {
        // A row with its own headers replaces whatever was collected before.
        pendingHeaders.clear();
        for (Element headerCell : headerCells) {
          String headerText = headerCell.getTextExtractor().toString();
          pendingHeaders.add(headerText);
          log.debug("header text: {}", headerText);
        }
      }

      List<Element> rowCells = row.getAllElements(HTMLElementName.TD);
      if (rowCells.isEmpty()) {
        continue;
      }
      // Pad the header list so that every cell has a label.
      while (pendingHeaders.size() < rowCells.size()) {
        pendingHeaders.add("col" + pendingHeaders.size());
      }
      for (int column = 0; column < rowCells.size(); column++) {
        String label = pendingHeaders.get(column);
        String value = getValueFieldText(rowCells.get(column));
        result.add(new ScrapedField(label, value));
      }
      pendingHeaders.clear();
    }
    return result;
  }

  // Pair layout: cells alternate label, value, label, value, ...
  log.debug("cells.size: {}", cellCount);
  Field previous = null;
  for (int i = 0; i < cellCount; i += 2) {
    Element labelCell = allCells.get(i);
    Element valueCell = allCells.get(i + 1);
    String labelText = labelCell.getTextExtractor().toString().trim();
    String label = labelText.replaceAll(":$", "");
    String value = getValueFieldText(valueCell);
    log.debug("found field: {}={}", label, value);
    if (previous == null || !StringUtils.isEmpty(label)) {
      previous = new ScrapedField(label, value);
      result.add(previous);
    } else {
      // An unlabelled pair is a continuation of the field above it.
      previous.addValue(value);
    }
  }
  return result;
}
 /**
  * Extracts label/value fields from the definition-list markup in {@code html}.
  *
  * <p>All {@code dt} elements and all {@code dd} elements are collected separately and paired by
  * position; when the two counts differ, the surplus elements of the longer list are ignored.
  * Each label is the trimmed text of the {@code dt} with one trailing {@code ':'} removed.
  *
  * @param html markup to parse
  * @return one field per paired {@code dt}/{@code dd}, in document order
  */
 private List<Field> extractFieldsFromDL(String html) {
   Source source = new Source(html);
   source.fullSequentialParse();
   List<Element> terms = source.getAllElements(HTMLElementName.DT);
   List<Element> definitions = source.getAllElements(HTMLElementName.DD);

   List<Field> result = new ArrayList<Field>();
   int pairCount = Math.min(terms.size(), definitions.size());
   for (int position = 0; position < pairCount; position++) {
     Element term = terms.get(position);
     String termText = term.getTextExtractor().toString().trim();
     String label = termText.replaceAll(":$", "");

     Element definition = definitions.get(position);
     log.debug("looking at value element: {}", definition);
     result.add(new ScrapedField(label, getValueFieldText(definition)));
   }
   return result;
 }
 /**
  * Extracts label/value fields from the {@code li} elements found in {@code html}.
  *
  * <p>Each item is handled in one of two ways:
  *
  * <ul>
  *   <li>If the item's extracted text splits on {@code ':'} into exactly two parts, those parts
  *       become the label and the value as-is (they are not trimmed).
  *   <li>Otherwise, if {@code tagsWithSpecificTagRemoved} returns exactly four tags for the item
  *       (presumably the item's tags with {@code br} tags left out - confirm against that
  *       helper), the element of the item's second tag supplies the label. When the rendered
  *       text of the whole item starts with the rendered label text, whatever follows the label
  *       and one further character becomes the value; an item consisting of the label only
  *       yields an empty value.
  * </ul>
  *
  * Items matching neither case are skipped.
  *
  * @param html markup to parse
  * @return the extracted fields in document order
  */
 private List<Field> extractFieldsFromUL(String html) {
   List<Field> result = new ArrayList<Field>();
   Source source = new Source(html);
   source.fullSequentialParse();

   for (Element item : source.getAllElements(HTMLElementName.LI)) {
     String itemText = item.getTextExtractor().toString();
     log.debug("looking at li: {} w/text: {}", item, itemText);

     // Simple case: "label:value" with exactly one separator.
     String[] pieces = itemText.split(":");
     if (pieces.length == 2) {
       Field field = new ScrapedField(pieces[0], pieces[1]);
       result.add(field);
       log.debug("found <li> to process: {}, added field: {}", item, field);
       continue;
     }

     if (tagsWithSpecificTagRemoved(HTMLElementName.BR, item.getAllTags()).size() != 4) {
       continue;
     }

     // Index 1 is the first tag after the item's own start tag.
     Tag enclosingTag = item.getAllTags().get(1);
     Element labelElement = enclosingTag.getElement();
     log.debug("enclosing tag: {}", enclosingTag);
     log.debug(
         "first element of enclosing tag: {}", labelElement.getTextExtractor().toString());

     String renderedLabel =
         labelElement.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
     String tagText = renderedLabel.trim().replaceAll(":$", "");
     String renderedItem = item.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
     String allText = renderedItem.trim().replaceAll(":$", "");

     log.debug("enclosing tag text starts at: {}", allText.indexOf(tagText));
     log.debug(
         "tagText (length = {}): {} alltext (length = {}): {}",
         new Object[] {tagText.length(), tagText, allText.length(), allText});

     if (!allText.startsWith(tagText)) {
       continue;
     }
     String valueText = "";
     if (allText.length() > tagText.length()) {
       // Skip the label plus the single character that follows it.
       valueText = allText.substring(tagText.length() + 1);
     }
     result.add(new ScrapedField(tagText, valueText));
     log.debug("extracted fields = {}", result);
   }
   return result;
 }
// Example no. 4
// 0
  /**
   * Dumps the text content of every entry in the {@code asd\Web Pages\HousingAndEnvironment}
   * directory to a {@code .txt} file.
   *
   * <p>Each entry is opened through a URL (prefixed with {@code file:} when its path contains no
   * {@code ':'}), parsed as a Jericho {@link Source}, and the output of its text extractor (with
   * attribute inclusion enabled) is written to {@code <entry name>.txt}. Nothing happens when the
   * directory does not exist or cannot be listed.
   *
   * @param args ignored
   * @throws Exception if an entry cannot be opened or parsed, or an output file cannot be created
   */
  public static void main(String[] args) throws Exception {
    File directory = new File("asd\\Web Pages\\HousingAndEnvironment");
    // listFiles() returns null when the path is not a directory or an I/O error occurs, so this
    // single check covers both the missing-directory case and the failed-listing case.
    File[] entries = directory.listFiles();
    if (entries == null) {
      return;
    }

    for (File entry : entries) {
      // Use the entry's own path rather than gluing directory and name with a fixed separator.
      String sourceUrlString = entry.getPath();
      if (sourceUrlString.indexOf(':') == -1) {
        sourceUrlString = "file:" + sourceUrlString;
      }

      // NOTE(review): there is no separator between the directory name and the file name, so the
      // output lands next to the input directory (as "HousingAndEnvironment<name>.txt") rather
      // than inside it. Kept as-is; confirm whether that is intended.
      PrintWriter writer =
          new PrintWriter("asd\\Web Pages\\HousingAndEnvironment" + entry.getName() + ".txt");
      try {
        Source source = new Source(new URL(sourceUrlString));
        // Call fullSequentialParse manually as most of the source will be parsed.
        source.fullSequentialParse();
        writer.print(source.getTextExtractor().setIncludeAttributes(true).toString());
      } finally {
        // close() also flushes; doing it here releases the file even when parsing fails.
        writer.close();
      }
    }
  }
 /**
  * Reports whether the markup in {@code fieldValue} contains more than one {@code br} element.
  *
  * @param fieldValue markup to inspect
  * @return {@code true} when at least two {@code br} elements are present
  */
 private boolean fieldHasMultipleValues(String fieldValue) {
   Source parsedValue = new Source(fieldValue);
   parsedValue.fullSequentialParse();
   int lineBreakCount = parsedValue.getAllElements(HTMLElementName.BR).size();
   return lineBreakCount >= 2;
 }