@Test public void testHtmlWithTags() throws Exception { final String htmlText = "<html><head><title>Title</title></head>" + "<body><p>this is a test</p></body></html>"; // Create FetchedDatum using data String url = "http://domain.com/page.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(htmlText.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse SimpleParser parser = new SimpleParser(new ParserPolicy(), true); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Now take the resulting HTML, process it using Dom4J SAXReader reader = new SAXReader(new Parser()); reader.setEncoding("UTF-8"); String htmlWithMarkup = parsedDatum.getParsedText(); Document doc = reader.read(new StringInputStream(htmlWithMarkup)); // We have to do helicopter stunts since HTML has a global namespace on it, set // at the <html> element level. XPath xpath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p"); Map<String, String> namespaceUris = new HashMap<String, String>(); namespaceUris.put("xhtml", "http://www.w3.org/1999/xhtml"); xpath.setNamespaceURIs(namespaceUris); Node paragraphNode = xpath.selectSingleNode(doc); Assert.assertNotNull(paragraphNode); Assert.assertEquals("this is a test", paragraphNode.getText()); }
/**
 * Evaluates an XPath expression against {@code node} and returns the text of the first
 * matching node.
 *
 * <p>The expression is obtained through {@code getXpath} and has {@code prefixNamespaceMap}
 * applied as its namespace bindings before it is evaluated.
 *
 * @param xpathString the XPath expression to evaluate
 * @param node the context node the expression is evaluated against
 * @return the matched node's text, or {@code null} when nothing matches
 */
private String findValue(String xpathString, Node node) {
    XPath query = getXpath(xpathString);
    query.setNamespaceURIs(prefixNamespaceMap);

    Node match = query.selectSingleNode(node);
    return (match == null) ? null : match.getText();
}
/**
 * Copies the node selected by the shared {@code path} expression out of a SOAP envelope
 * into a freshly created document.
 *
 * <p>NOTE(review): when {@code path} matches nothing the selected node is {@code null} and
 * the {@code clone()} call below fails with a {@code NullPointerException}; no statement in
 * this body throws the declared {@code InvalidInputFormatException}. Confirm whether a
 * missing payload should be reported through that exception instead.
 *
 * @param soapEnvelope the envelope document to unwrap
 * @return a new document containing a clone of the selected node
 * @throws InvalidInputFormatException declared by the signature
 */
public static Document unwrapMessage(Document soapEnvelope) throws InvalidInputFormatException {
    if (_log.isDebugEnabled()) {
        _log.debug(soapEnvelope.asXML());
    }

    Document unwrapped = DocumentHelper.createDocument();

    // Evaluation of the shared expression is serialized on the expression object itself.
    final Node payload;
    synchronized (path) {
        payload = path.selectSingleNode(soapEnvelope);
    }

    // A clone is added so the node stays attached to the original envelope as well.
    Node payloadCopy = (Node) payload.clone();
    unwrapped.add(payloadCopy);
    return unwrapped;
}
public String getNamespaceNodeText( Document document, String namespaceKey, String namespaceValue, String xPath) { HashMap<String, String> xmlMap = new HashMap<String, String>(); // xmlMap.put("tns","http://www.99bill.com/schema/fo/settlement"); xmlMap.put(namespaceKey, namespaceValue); // XPath xpath=document.createXPath("//tns:status"); //要获取哪个节点,改这里就可以了 XPath xpath = document.createXPath(xPath); // 要获取哪个节点,改这里就可以了 xpath.setNamespaceURIs(xmlMap); Element element = (Element) xpath.selectSingleNode(document); return element.getText(); }
public IdMap handle(Element element) throws IOException { XPath idXPath = element.createXPath(idXpath); if (namespaces != null) idXPath.setNamespaceURIs(namespaces); Node id = idXPath.selectSingleNode(element); if (id == null && idXpathSecondary != null) { idXPath = element.createXPath(idXpathSecondary); if (namespaces != null) idXPath.setNamespaceURIs(namespaces); id = idXPath.selectSingleNode(element); } else if (id == null && idXpathSecondary == null) { logger.error("Doc without id field: " + element); return null; } Map<String, String> textFields = new HashMap<String, String>(); Map<String, String> storedFields = new HashMap<String, String>(); Map<String, Field> uniqueFields = new HashMap<String, Field>(); Collection<IdMap.TextField> isolatedFields = new HashSet<IdMap.TextField>(); for (XmlFieldHandler handler : fieldHandlers) { XPath xPath = element.createXPath(handler.getFieldXpath()); if (namespaces != null) xPath.setNamespaceURIs(namespaces); List<Node> nodes = xPath.selectNodes(element); for (Node node : nodes) { FilteredFields fields = handler.getFields(node); // TextFields if (fields != null && fields.getTextFields() != null) { for (Map.Entry<String, String> entry : fields.getTextFields().entrySet()) { String old = textFields.get(entry.getKey()); String newOne = entry.getValue(); if (old == null) textFields.put(entry.getKey(), newOne); else textFields.put(entry.getKey(), old + ' ' + newOne); } } // StoredFields if (fields != null && fields.getStoredTextFields() != null) { for (Map.Entry<String, String> entry : fields.getStoredTextFields().entrySet()) { String old = storedFields.get(entry.getKey()); if (old == null) storedFields.put(entry.getKey(), entry.getValue()); else storedFields.put(entry.getKey(), old + ' ' + entry.getValue()); } } // IsolatedFields if (fields != null && fields.getIsolatedTextFields() != null) { for (FilteredFields.TextField t : fields.getIsolatedTextFields()) { isolatedFields.add(new IdMap.TextField(t.name, t.value)); } } // 
PreparedFields if (fields != null && fields.getPreparedFields() != null) { for (Field f : fields.getPreparedFields()) { Field old = uniqueFields.get(f.name()); if (old == null) uniqueFields.put(f.name(), f); else logger.error("Not UNIQUE Prepared field " + f.name() + " in doc: " + id.getText()); } } } } if (id instanceof Element) return new IdMap( ((Element) id).getTextTrim(), textFields, storedFields, new ArrayList<Field>(uniqueFields.values()), isolatedFields); else return new IdMap( id.getText().trim(), textFields, storedFields, new ArrayList<Field>(uniqueFields.values()), isolatedFields); }