/** * VIPS Rule Nine * * <p>If the child of the node with maximum size are small than a threshold (relative size), do * not divide this node. <br> * Set the DoC based on the html tag and size of this node. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleNine(ElementBox node) { // System.err.println("Applying rule Nine on " + node.getNode().getNodeName() + " node"); if (node.getSubBoxList().isEmpty()) return false; int maxSize = 0; for (Box childNode : node.getSubBoxList()) { int childSize = childNode.getWidth() * childNode.getHeight(); if (maxSize < childSize) { maxSize = childSize; } } if (maxSize > _sizeTresholdWidth * _sizeTresholdHeight) return true; // TODO set DOC _currentVipsBlock.setIsVisualBlock(true); _currentVipsBlock.setIsDividable(false); if (node.getNode().getNodeName().equals("Xdiv")) _currentVipsBlock.setDoC(7); if (node.getNode().getNodeName().equals("a")) _currentVipsBlock.setDoC(11); else _currentVipsBlock.setDoC(8); return true; }
/**
 * Counts the visual blocks in the given visual structure.
 *
 * <p>Increments {@code _visualBlocksCount} once for every block flagged as a visual block, then
 * recurses into each child whose box is not a {@code TextBox}.
 *
 * @param vipsBlock Visual structure
 */
private void getVisualBlocksCount(VipsBlock vipsBlock) {
  if (vipsBlock.isVisualBlock()) {
    _visualBlocksCount++;
  }

  for (VipsBlock child : vipsBlock.getChildren()) {
    if (child.getBox() instanceof TextBox) {
      continue;
    }
    getVisualBlocksCount(child);
  }
}
/**
 * Finds the VIPS block that belongs to the given (previous sibling) node.
 *
 * <p>Walks the block tree depth-first. When a block whose box node equals {@code node} is found,
 * it is stored in {@code _tempVipsBlock} and that subtree is not searched further; nothing is
 * stored when no block matches.
 *
 * @param node Node to look for
 * @param vipsBlock VIPS block at which the search (re)starts
 */
private void findPreviousSiblingNodeVipsBlock(Node node, VipsBlock vipsBlock) {
  boolean isMatch = vipsBlock.getBox().getNode().equals(node);
  if (isMatch) {
    _tempVipsBlock = vipsBlock;
    return;
  }

  for (VipsBlock child : vipsBlock.getChildren()) {
    findPreviousSiblingNodeVipsBlock(node, child);
  }
}
/**
 * Constructs the VIPS block tree from the viewport.
 *
 * <p>Starts from the {@code <body>} element. The given block is bound to {@code element}; for
 * every sub-box of a non-text element a new child block is appended and populated recursively.
 *
 * @param element Box that represents element
 * @param node Visual structure tree node
 */
private void constructVipsBlockTree(Box element, VipsBlock node) {
  node.setBox(element);

  // Text boxes are leaves: they have no sub-boxes to descend into.
  if (element instanceof TextBox) {
    return;
  }

  ElementBox elementBox = (ElementBox) element;
  for (Box subBox : elementBox.getSubBoxList()) {
    node.addChild(new VipsBlock());
    int lastIndex = node.getChildren().size() - 1;
    constructVipsBlockTree(subBox, node.getChildren().get(lastIndex));
  }
}
/** * VIPS Rule Three * * <p>If the DOM node is the root node of the sub-DOM tree (corresponding to the block), and there * is only one sub DOM tree corresponding to this block, divide this node. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleThree(ElementBox node) { // System.err.println("Applying rule Three on " + node.getNode().getNodeName() + " node"); if (!node.isRootElement()) return false; boolean result = true; int cnt = 0; for (VipsBlock vipsBlock : _vipsBlocks.getChildren()) { if (vipsBlock.getBox().getNode().getNodeName().equals(node.getNode().getNodeName())) { result = true; isOnlyOneDomSubTree(node.getNode(), vipsBlock.getBox().getNode(), result); if (result) cnt++; } } return (cnt == 1) ? true : false; }
/*
 * Checks if node has valid children nodes.
 *
 * "img" and "input" nodes are judged by their own content size: with a positive content width
 * and height the current VIPS block is marked as a visual block with DoC 8 and true is returned,
 * otherwise false. Any other node is valid when checkValidChildrenNodes leaves _cnt above zero
 * after being run over all sub-boxes (presumably it counts valid children in _cnt).
 */
private boolean hasValidChildrenNodes(ElementBox node) {
  String nodeName = node.getNode().getNodeName();
  boolean isImageOrInput = nodeName.equals("img") || nodeName.equals("input");

  if (isImageOrInput) {
    boolean hasContentArea = node.getContentWidth() > 0 && node.getContentHeight() > 0;
    if (!hasContentArea) {
      return false;
    }
    _currentVipsBlock.setIsVisualBlock(true);
    _currentVipsBlock.setDoC(8);
    return true;
  }

  if (node.getSubBoxList().isEmpty()) {
    return false;
  }

  _cnt = 0;
  for (Box subBox : node.getSubBoxList()) {
    checkValidChildrenNodes(subBox);
  }

  return _cnt > 0;
}
/** * VIPS Rule One * * <p>If the DOM node is not a text node and it has no valid children, then this node cannot be * divided and will be cut. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleOne(ElementBox node) { // System.err.println("Applying rule One on " + node.getNode().getNodeName() + " node"); if (!isTextNode(node)) { if (!hasValidChildrenNodes(node)) { _currentVipsBlock.setIsDividable(false); return true; } } return false; }
/** * VIPS Rule Ten * * <p>If previous sibling node has not been divided, do not divide this node * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleTen(ElementBox node) { // System.err.println("Applying rule Ten on " + node.getNode().getNodeName() + " node"); // VipsBlock previousSiblingVipsBlock = null; // findPreviousSiblingNodeVipsBlock(node.getNode().getPreviousSibling(), _vipsBlocks, // previousSiblingVipsBlock); _tempVipsBlock = null; findPreviousSiblingNodeVipsBlock(node.getNode().getPreviousSibling(), _vipsBlocks); if (_tempVipsBlock == null) return false; if (_tempVipsBlock.isAlreadyDivided()) return true; return false; }
/** * VIPS Rule Twelve * * <p>Do not divide this node <br> * Set the DoC value based on the html tag and size of this node. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleTwelve(ElementBox node) { // System.err.println("Applying rule Twelve on " + node.getNode().getNodeName() + " node"); _currentVipsBlock.setIsDividable(false); _currentVipsBlock.setIsVisualBlock(true); if (node.getNode().getNodeName().equals("Xdiv")) _currentVipsBlock.setDoC(7); else if (node.getNode().getNodeName().equals("li")) _currentVipsBlock.setDoC(8); else if (node.getNode().getNodeName().equals("span")) _currentVipsBlock.setDoC(8); else if (node.getNode().getNodeName().equals("sup")) _currentVipsBlock.setDoC(8); else if (node.getNode().getNodeName().equals("img")) _currentVipsBlock.setDoC(8); else _currentVipsBlock.setDoC(333); // TODO DoC Part return true; }
/** * VIPS Rule Seven * * <p>If the background color of this node is different from one of its children’s, divide this * node and at the same time, the child node with different background color will not be divided * in this round. Set the DoC value (6-8) for the child node based on the <html> tag of the * child node and the size of the child node. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleSeven(ElementBox node) { // System.err.println("Applying rule Seven on " + node.getNode().getNodeName() + " node"); if (node.getSubBoxList().isEmpty()) return false; if (isTextNode(node)) return false; // String nodeBgColor = node.getStylePropertyValue("background-color"); String nodeBgColor = _currentVipsBlock.getBgColor(); for (VipsBlock vipsStructureChild : _currentVipsBlock.getChildren()) { if (!(vipsStructureChild.getBgColor().equals(nodeBgColor))) { vipsStructureChild.setIsDividable(false); vipsStructureChild.setIsVisualBlock(true); // TODO DoC values vipsStructureChild.setDoC(7); return true; } } return false; }
/** * VIPS Rule Eight * * <p>If the node has at least one text node child or at least one virtual text node child, and * the node's relative size is smaller than a threshold, then the node cannot be divided. Set the * DoC value (from 5-8) based on the html tag of the node. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleEight(ElementBox node) { // System.err.println("Applying rule Eight on " + node.getNode().getNodeName() + " node"); if (node.getSubBoxList().isEmpty()) return false; List<Box> children = new ArrayList<Box>(); findTextChildrenNodes(node, children); int cnt = children.size(); if (cnt == 0) return false; if (node.getWidth() == 0 || node.getHeight() == 0) { children.clear(); getAllChildren(node, children); for (Box child : children) { if (child.getWidth() != 0 && child.getHeight() != 0) return true; } } if (node.getWidth() * node.getHeight() > _sizeTresholdHeight * _sizeTresholdWidth) return false; if (node.getNode().getNodeName().equals("ul")) { return true; } _currentVipsBlock.setIsVisualBlock(true); _currentVipsBlock.setIsDividable(false); if (node.getNode().getNodeName().equals("Xdiv")) _currentVipsBlock.setDoC(7); else if (node.getNode().getNodeName().equals("code")) _currentVipsBlock.setDoC(7); else if (node.getNode().getNodeName().equals("div")) _currentVipsBlock.setDoC(5); else _currentVipsBlock.setDoC(8); return true; }
/** * Append node from given visual structure to parent node * * @param parentNode Given visual structure * @param visualStructure Parent node */ private void writeVisualBlocks(Element parentNode, VisualStructure visualStructure) { Element layoutNode = doc.createElement("LayoutNode"); layoutNode.setAttribute( "FrameSourceIndex", String.valueOf(visualStructure.getFrameSourceIndex())); layoutNode.setAttribute("SourceIndex", visualStructure.getSourceIndex()); layoutNode.setAttribute("DoC", String.valueOf(visualStructure.getDoC())); layoutNode.setAttribute("ContainImg", String.valueOf(visualStructure.containImg())); layoutNode.setAttribute("IsImg", String.valueOf(visualStructure.isImg())); layoutNode.setAttribute("ContainTable", String.valueOf(visualStructure.containTable())); layoutNode.setAttribute("ContainP", String.valueOf(visualStructure.containP())); layoutNode.setAttribute("TextLen", String.valueOf(visualStructure.getTextLength())); layoutNode.setAttribute("LinkTextLen", String.valueOf(visualStructure.getLinkTextLength())); Box parentBox = visualStructure.getNestedBlocks().get(0).getBox().getParent(); layoutNode.setAttribute( "DOMCldNum", String.valueOf(parentBox.getNode().getChildNodes().getLength())); layoutNode.setAttribute("FontSize", String.valueOf(visualStructure.getFontSize())); layoutNode.setAttribute("FontWeight", String.valueOf(visualStructure.getFontWeight())); layoutNode.setAttribute("BgColor", visualStructure.getBgColor()); layoutNode.setAttribute("ObjectRectLeft", String.valueOf(visualStructure.getX())); layoutNode.setAttribute("ObjectRectTop", String.valueOf(visualStructure.getY())); layoutNode.setAttribute("ObjectRectWidth", String.valueOf(visualStructure.getWidth())); layoutNode.setAttribute("ObjectRectHeight", String.valueOf(visualStructure.getHeight())); layoutNode.setAttribute("ID", visualStructure.getId()); layoutNode.setAttribute("order", String.valueOf(_order)); _order++; if (_pDoC >= visualStructure.getDoC()) { // continue segmenting if 
(visualStructure.getChildrenVisualStructures().size() == 0) { if (visualStructure.getNestedBlocks().size() > 0) { String src = ""; String content = ""; for (VipsBlock block : visualStructure.getNestedBlocks()) { ElementBox elementBox = block.getElementBox(); if (elementBox == null) continue; if (!elementBox.getNode().getNodeName().equals("Xdiv") && !elementBox.getNode().getNodeName().equals("Xspan")) src += getSource(elementBox.getElement()); else src += elementBox.getText(); content += elementBox.getText() + " "; } layoutNode.setAttribute("SRC", src); layoutNode.setAttribute("Content", content); } } parentNode.appendChild(layoutNode); for (VisualStructure child : visualStructure.getChildrenVisualStructures()) writeVisualBlocks(layoutNode, child); } else { // "stop" segmentation if (visualStructure.getNestedBlocks().size() > 0) { String src = ""; String content = ""; for (VipsBlock block : visualStructure.getNestedBlocks()) { ElementBox elementBox = block.getElementBox(); if (elementBox == null) continue; if (!elementBox.getNode().getNodeName().equals("Xdiv") && !elementBox.getNode().getNodeName().equals("Xspan")) src += getSource(elementBox.getElement()); else src += elementBox.getText(); content += elementBox.getText() + " "; } layoutNode.setAttribute("SRC", src); layoutNode.setAttribute("Content", content); } parentNode.appendChild(layoutNode); } }
/**
 * Collects all visual blocks of the given VIPS block tree.
 *
 * <p>The tree is traversed depth-first in pre-order; every block flagged as a visual block is
 * appended to {@code list}, the block itself before its descendants.
 *
 * @param vipsBlock Root of the (sub)tree to search
 * @param list Output list the visual blocks are added to
 */
private void findVisualBlocks(VipsBlock vipsBlock, List<VipsBlock> list) {
  if (vipsBlock.isVisualBlock()) {
    list.add(vipsBlock);
  }

  for (VipsBlock child : vipsBlock.getChildren()) {
    findVisualBlocks(child, list);
  }
}
/** * VIPS Rule Four * * <p>If all of the child nodes of the DOM node are text nodes or virtual text nodes, do not * divide the node. <br> * If the font size and font weight of all these child nodes are same, set the DoC of the * extracted block to 10. Otherwise, set the DoC of this extracted block to 9. * * @param node Input node * @return True, if rule is applied, otherwise false. */ private boolean ruleFour(ElementBox node) { // System.err.println("Applying rule Four on " + node.getNode().getNodeName() + " node"); if (node.getSubBoxList().isEmpty()) return false; for (Box box : node.getSubBoxList()) { if (box instanceof TextBox) continue; if (!isTextNode((ElementBox) box) || !isVirtualTextNode((ElementBox) box)) return false; } _currentVipsBlock.setIsVisualBlock(true); _currentVipsBlock.setIsDividable(false); if (node.getSubBoxList().size() == 1) { /* if (node.getSubBox(0) instanceof TextBox) { _currentVipsBlock.setIsVisualBlock(false); _currentVipsBlock.setIsDividable(true); _currentVipsBlock.getChildren().get(0).setIsVisualBlock(true); _currentVipsBlock.getChildren().get(0).setIsDividable(false); _currentVipsBlock.getChildren().get(0).setDoC(11); } */ if (node.getSubBox(0).getNode().getNodeName().equals("em")) _currentVipsBlock.setDoC(11); else _currentVipsBlock.setDoC(10); return true; } String fontWeight = ""; int fontSize = 0; for (Box childNode : node.getSubBoxList()) { int childFontSize = childNode.getVisualContext().getFont().getSize(); if (childNode instanceof TextBox) { if (fontSize > 0) { if (fontSize != childFontSize) { _currentVipsBlock.setDoC(9); break; } else _currentVipsBlock.setDoC(10); } else fontSize = childFontSize; continue; } ElementBox child = (ElementBox) childNode; if (child.getStylePropertyValue("font-weight") == null) return false; if (fontSize > 0) { if (child.getStylePropertyValue("font-weight").toString().equals(fontWeight) && childFontSize == fontSize) { _currentVipsBlock.setDoC(10); } else { _currentVipsBlock.setDoC(9); break; } } 
else { fontWeight = child.getStylePropertyValue("font-weight").toString(); fontSize = childFontSize; } } return true; }
/** * Tries to divide DOM elements and finds visual blocks. * * @param vipsBlock Visual structure */ private void divideVipsBlockTree(VipsBlock vipsBlock) { _currentVipsBlock = vipsBlock; ElementBox elementBox = (ElementBox) vipsBlock.getBox(); // System.err.println(elementBox.getNode().getNodeName()); // System.out.println(elementBox.getText()); if (elementBox.getElement().getAttribute("id").equals("logosLine")) { System.out.println(); } // With VIPS rules it tries to determine if element is dividable if (applyVipsRules(elementBox) && vipsBlock.isDividable() && !vipsBlock.isVisualBlock()) { // if element is dividable, let's divide it _currentVipsBlock.setAlreadyDivided(true); for (VipsBlock vipsBlockChild : vipsBlock.getChildren()) { if (!(vipsBlockChild.getBox() instanceof TextBox)) divideVipsBlockTree(vipsBlockChild); } } else { if (vipsBlock.isDividable()) { // System.err.println("Element " + elementBox.getNode().getNodeName() + " is visual block"); vipsBlock.setIsVisualBlock(true); vipsBlock.setDoC(11); } if (!verifyValidity(elementBox)) { _currentVipsBlock.setIsVisualBlock(false); } /* if (vipsBlock.isVisualBlock()) //System.err.println("Element " + elementBox.getNode().getNodeName() + " is visual block"); else System.err.println("Element " + elementBox.getNode().getNodeName() + " is not visual block");*/ } }