private List<Field> extractFieldsFromTable(String html) { // log.debug("extracting fields from table: {}", html); List<Field> extractedFields = new ArrayList<Field>(); Source source = new Source(html); source.fullSequentialParse(); int cellCount = source.getAllElements(HTMLElementName.TD).size(); int rowCount = source.getAllElements(HTMLElementName.TR).size(); log.debug("found {} cells in {} rows", cellCount, rowCount); if (cellCount == (rowCount * 2)) { Field lastField = null; log.debug("cells.size: {}", cellCount); List<Element> cells = source.getAllElements(HTMLElementName.TD); for (int i = 0; i < cellCount; i++) { Element labelElement = cells.get(i); Element valueElement = cells.get(++i); String label = labelElement.getTextExtractor().toString().trim().replaceAll(":$", ""); String value = getValueFieldText(valueElement); log.debug("found field: {}={}", label, value); if (StringUtils.isEmpty(label) && lastField != null) { lastField.addValue(value); } else { lastField = new ScrapedField(label, value); extractedFields.add(lastField); } } } else { List<String> headers = new ArrayList<String>(); List<Element> rows = source.getAllElements(HTMLElementName.TR); for (Element row : rows) { List<Element> headerElements = row.getAllElements(HTMLElementName.TH); if (headerElements.size() > 0) { headers.clear(); } for (Element headerElement : headerElements) { String header = headerElement.getTextExtractor().toString(); headers.add(header); log.debug("header text: {}", header); } List<Element> cells = row.getAllElements(HTMLElementName.TD); if (cells.size() > 0) { for (int n = headers.size(); n < cells.size(); n++) { headers.add("col" + n); } int index = 0; for (Element cell : cells) { String label = headers.get(index++); String value = getValueFieldText(cell); extractedFields.add(new ScrapedField(label, value)); } headers.clear(); } } } return extractedFields; }
/**
 * Extracts fields from an HTML fragment containing definition-list markup.
 *
 * <p>The n-th {@code DT} element supplies the label (trimmed, with one trailing colon removed)
 * and the n-th {@code DD} element supplies the value. When the two counts differ, the surplus
 * elements of the longer list are ignored.
 *
 * @param html the HTML fragment to parse
 * @return one field per DT/DD pair, in document order
 */
private List<Field> extractFieldsFromDL(String html) {
  Source source = new Source(html);
  source.fullSequentialParse();

  List<Element> terms = source.getAllElements(HTMLElementName.DT);
  List<Element> definitions = source.getAllElements(HTMLElementName.DD);
  int pairCount = terms.size() < definitions.size() ? terms.size() : definitions.size();

  List<Field> extractedFields = new ArrayList<Field>();
  int pair = 0;
  while (pair < pairCount) {
    Element term = terms.get(pair);
    String label = term.getTextExtractor().toString().trim().replaceAll(":$", "");
    Element definition = definitions.get(pair);
    log.debug("looking at value element: {}", definition);
    String value = getValueFieldText(definition);
    extractedFields.add(new ScrapedField(label, value));
    pair++;
  }
  return extractedFields;
}
/**
 * Extracts fields from an HTML fragment containing list-item markup.
 *
 * <p>Each {@code LI} element is handled in one of two ways:
 *
 * <ul>
 *   <li>If its extracted text splits on {@code ':'} into exactly two parts, those parts become
 *       the label and the value (neither is trimmed here).
 *   <li>Otherwise, if {@code tagsWithSpecificTagRemoved} (defined elsewhere) yields exactly four
 *       tags for the item when called with {@code BR}, the rendered text of the element
 *       belonging to the item's second tag is used as the label, and whatever follows that
 *       label in the item's rendered text is used as the value. This only applies when the
 *       rendered item text starts with the label.
 * </ul>
 *
 * Items matching neither case are skipped.
 *
 * @param html the HTML fragment to parse
 * @return the fields found, in document order
 */
private List<Field> extractFieldsFromUL(String html) {
  Source source = new Source(html);
  source.fullSequentialParse();

  List<Field> extractedFields = new ArrayList<Field>();
  for (Element li : source.getAllElements(HTMLElementName.LI)) {
    String liText = li.getTextExtractor().toString();
    log.debug("looking at li: {} w/text: {}", li, liText);

    String[] parts = liText.split(":");
    if (parts.length == 2) {
      Field field = new ScrapedField(parts[0], parts[1]);
      extractedFields.add(field);
      log.debug("found <li> to process: {}, added field: {}", li, field);
      continue;
    }

    if (tagsWithSpecificTagRemoved(HTMLElementName.BR, li.getAllTags()).size() != 4) {
      continue;
    }

    // The tag list is fetched again on purpose, independently of the list handed to the helper.
    Tag enclosingTag = li.getAllTags().get(1);
    Element enclosingElement = enclosingTag.getElement();
    log.debug("enclosing tag: {}", enclosingTag);
    log.debug(
        "first element of enclosing tag: {}", enclosingElement.getTextExtractor().toString());

    String tagText = renderedTextWithoutTrailingColon(enclosingElement);
    String allText = renderedTextWithoutTrailingColon(li);
    log.debug("enclosing tag text starts at: {}", allText.indexOf(tagText));
    log.debug(
        "tagText (length = {}): {} alltext (length = {}): {}",
        new Object[] {tagText.length(), tagText, allText.length(), allText});

    if (!allText.startsWith(tagText)) {
      continue;
    }
    String valueText = "";
    if (allText.length() > tagText.length()) {
      // Skip the single character that separates the label from the value.
      valueText = allText.substring(tagText.length() + 1);
    }
    extractedFields.add(new ScrapedField(tagText, valueText));
    log.debug("extracted fields = {}", extractedFields);
  }
  return extractedFields;
}

/** Renders the element without line wrapping, trims it and drops one trailing colon. */
private String renderedTextWithoutTrailingColon(Element element) {
  String rendered = element.getRenderer().setMaxLineLength(Integer.MAX_VALUE).toString();
  return rendered.trim().replaceAll(":$", "");
}
public static void main(String[] args) throws Exception { String sourceUrlString = null; PrintWriter writer = null; File Directory = new File("asd\\Web Pages\\HousingAndEnvironment"); if (Directory.isDirectory()) { for (File f : Directory.listFiles()) { sourceUrlString = Directory.getPath() + "\\" + f.getName(); if (sourceUrlString.indexOf(':') == -1) sourceUrlString = "file:" + sourceUrlString; // reader = new BufferedReader(new FileReader(sourceUrlString)); writer = new PrintWriter("asd\\Web Pages\\HousingAndEnvironment" + f.getName() + ".txt"); Source source = new Source(new URL(sourceUrlString)); source.fullSequentialParse(); writer.print(source.getTextExtractor().setIncludeAttributes(true).toString()); writer.flush(); writer.close(); // MicrosoftConditionalCommentTagTypes.register(); // PHPTagTypes.register(); // PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this example otherwise // they override processing instructions // MasonTagTypes.register(); // Call fullSequentialParse manually as most of the source will be parsed. // System.out.println("Document title:"); // String title=getTitle(source); // System.out.println(title==null ? "(none)" : title); // System.out.println("\nDocument description:"); // String description=getMetaValue(source,"description"); // System.out.println(description==null ? "(none)" : description); // // System.out.println("\nDocument keywords:"); // String keywords=getMetaValue(source,"keywords"); // System.out.println(keywords==null ? 
"(none)" : keywords); // // System.out.println("\nLinks to other documents:"); // List<Element> linkElements=source.getAllElements(HTMLElementName.A); // for (Element linkElement : linkElements) { // String href=linkElement.getAttributeValue("href"); // if (href==null) continue; // // A element can contain other tags so need to extract the text from it: // String label=linkElement.getContent().getTextExtractor().toString(); // System.out.println(label+" <"+href+'>'); // } // System.out.println("\nAll text from file (exluding content inside SCRIPT and STYLE // elements):\n"); // System.out.println(source.getTextExtractor().setIncludeAttributes(true).toString()); // System.out.println("\nSame again but this time extend the TextExtractor class to also // exclude text from P elements and any elements with class=\"control\":\n"); // TextExtractor textExtractor=new TextExtractor(source) { // public boolean excludeElement(StartTag startTag) { // return startTag.getName()==HTMLElementName.P || // "control".equalsIgnoreCase(startTag.getAttributeValue("class")); // } // }; // System.out.println(textExtractor.setIncludeAttributes(true).toString()); } } }
/**
 * Tells whether an HTML field value contains more than one {@code BR} element.
 *
 * @param fieldValue the HTML fragment to inspect
 * @return {@code true} when two or more {@code BR} elements are present, {@code false} otherwise
 */
private boolean fieldHasMultipleValues(String fieldValue) {
  Source parsed = new Source(fieldValue);
  parsed.fullSequentialParse();
  int lineBreakCount = parsed.getAllElements(HTMLElementName.BR).size();
  return lineBreakCount >= 2;
}