@Test
  public void testHtmlWithTags() throws Exception {
    // Feeds a small HTML page through SimpleParser and checks that the text it returns is
    // markup in which the original <p> element can still be located and read back.
    final String pageUrl = "http://domain.com/page.html";
    final String mimeType = "text/html; charset=utf-8";
    final String htmlText =
        "<html><head><title>Title</title></head>" + "<body><p>this is a test</p></body></html>";

    // Wrap the raw bytes and response headers into the datum the parser consumes.
    HttpHeaders responseHeaders = new HttpHeaders();
    responseHeaders.add(HttpHeaderNames.CONTENT_TYPE, mimeType);
    ContentBytes body = new ContentBytes(htmlText.getBytes("utf-8"));
    FetchedDatum fetchedDatum =
        new FetchedDatum(
            pageUrl, pageUrl, System.currentTimeMillis(), responseHeaders, body, mimeType, 0);

    // Run the parser under test.
    ParsedDatum parsedDatum = new SimpleParser(new ParserPolicy(), true).parse(fetchedDatum);
    String parsedMarkup = parsedDatum.getParsedText();

    // Re-read the parser's output as a Dom4J document.
    SAXReader saxReader = new SAXReader(new Parser());
    saxReader.setEncoding("UTF-8");
    Document parsedDoc = saxReader.read(new StringInputStream(parsedMarkup));

    // The <html> element declares a document-wide namespace, so the XPath cannot use bare
    // element names; bind a prefix to the XHTML namespace URI and use it on every step.
    Map<String, String> prefixToUri = new HashMap<String, String>();
    prefixToUri.put("xhtml", "http://www.w3.org/1999/xhtml");
    XPath paragraphPath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p");
    paragraphPath.setNamespaceURIs(prefixToUri);

    Node paragraph = paragraphPath.selectSingleNode(parsedDoc);
    Assert.assertNotNull(paragraph);
    Assert.assertEquals("this is a test", paragraph.getText());
  }
Esempio n. 2
0
 /**
  * Evaluates an XPath expression against {@code node} and returns the text of the first match.
  *
  * <p>The expression is obtained through {@code getXpath} and is bound to the prefixes held in
  * {@code prefixNamespaceMap} before evaluation.
  *
  * @param xpathString the XPath expression to evaluate
  * @param node the context node the expression is evaluated against
  * @return the text of the selected node, or {@code null} when nothing is selected
  */
 private String findValue(String xpathString, Node node) {
   XPath expression = getXpath(xpathString);
   expression.setNamespaceURIs(prefixNamespaceMap);
   Node match = expression.selectSingleNode(node);
   return (match == null) ? null : match.getText();
 }
Esempio n. 3
0
 /**
  * Builds a new document containing a copy of the node that the shared {@code path} expression
  * selects from the given envelope.
  *
  * <p>The envelope itself is not modified: the selected node is cloned before being added to
  * the result document.
  *
  * @param soapEnvelope the document to extract the payload node from
  * @return a freshly created document holding a clone of the selected node
  * @throws InvalidInputFormatException if {@code path} selects nothing in {@code soapEnvelope}
  */
 public static Document unwrapMessage(Document soapEnvelope) throws InvalidInputFormatException {
   if (_log.isDebugEnabled()) {
     _log.debug(soapEnvelope.asXML());
   }

   Node node;
   // Evaluation is serialized on the shared XPath instance, as in the original code.
   synchronized (path) {
     node = path.selectSingleNode(soapEnvelope);
   }

   // selectSingleNode yields null when nothing matches; report that as malformed input
   // instead of failing with a NullPointerException on clone().
   if (node == null) {
     throw new InvalidInputFormatException(
         "SOAP envelope does not contain the expected message node");
   }

   Document result = DocumentHelper.createDocument();
   result.add((Node) node.clone());
   return result;
 }
Esempio n. 4
0
  /**
   * Returns the text of the first element selected by a namespace-qualified XPath expression.
   *
   * <p>Example: with {@code namespaceKey = "tns"}, {@code namespaceValue =
   * "http://www.99bill.com/schema/fo/settlement"} and {@code xPath = "//tns:status"}, the text
   * of the first {@code status} element in that namespace is returned.
   *
   * @param document the document to search
   * @param namespaceKey the prefix used inside {@code xPath}
   * @param namespaceValue the namespace URI that {@code namespaceKey} is bound to
   * @param xPath the XPath expression; change this to choose which node is fetched
   * @return the text of the selected element, or {@code null} when the expression selects
   *     nothing
   */
  public String getNamespaceNodeText(
      Document document, String namespaceKey, String namespaceValue, String xPath) {

    // Single prefix -> URI binding used to resolve the prefix inside the expression.
    HashMap<String, String> xmlMap = new HashMap<String, String>();
    xmlMap.put(namespaceKey, namespaceValue);

    XPath xpath = document.createXPath(xPath);
    xpath.setNamespaceURIs(xmlMap);
    Element element = (Element) xpath.selectSingleNode(document);

    // No match: return null rather than throwing a NullPointerException.
    if (element == null) {
      return null;
    }
    return element.getText();
  }
Esempio n. 5
0
  /**
   * Extracts the id and the indexable fields of one document element.
   *
   * <p>The id node is looked up with {@code idXpath}; if that selects nothing and
   * {@code idXpathSecondary} is configured, the secondary expression is tried. Every configured
   * {@link XmlFieldHandler} is then applied to the nodes its own XPath selects, and the results
   * are merged:
   *
   * <ul>
   *   <li>text fields and stored text fields with the same name are joined with a single space;
   *   <li>isolated text fields are collected individually;
   *   <li>prepared fields must be unique by name: the first one is kept and later duplicates
   *       are logged as errors.
   * </ul>
   *
   * @param element the document element to process
   * @return the collected id and fields, or {@code null} (after logging an error) when no id
   *     node can be found with either id expression
   * @throws IOException declared by the signature; nothing in this method body throws it
   *     directly
   */
  public IdMap handle(Element element) throws IOException {
    XPath idXPath = element.createXPath(idXpath);
    if (namespaces != null) {
      idXPath.setNamespaceURIs(namespaces);
    }
    Node id = idXPath.selectSingleNode(element);

    // Fall back to the secondary id expression when the primary one finds nothing.
    if (id == null && idXpathSecondary != null) {
      idXPath = element.createXPath(idXpathSecondary);
      if (namespaces != null) {
        idXPath.setNamespaceURIs(namespaces);
      }
      id = idXPath.selectSingleNode(element);
    }

    // Checked after BOTH lookups: previously a failed secondary lookup left id null and the
    // method crashed with a NullPointerException further down.
    if (id == null) {
      logger.error("Doc without id field: " + element);
      return null;
    }

    Map<String, String> textFields = new HashMap<String, String>();
    Map<String, String> storedFields = new HashMap<String, String>();
    Map<String, Field> uniqueFields = new HashMap<String, Field>();
    Collection<IdMap.TextField> isolatedFields = new HashSet<IdMap.TextField>();

    for (XmlFieldHandler handler : fieldHandlers) {
      XPath xPath = element.createXPath(handler.getFieldXpath());
      if (namespaces != null) {
        xPath.setNamespaceURIs(namespaces);
      }
      List<Node> nodes = xPath.selectNodes(element);
      for (Node node : nodes) {
        FilteredFields fields = handler.getFields(node);
        if (fields == null) {
          continue; // the handler produced nothing for this node
        }

        // Text fields: values sharing a name are concatenated, separated by one space.
        if (fields.getTextFields() != null) {
          for (Map.Entry<String, String> entry : fields.getTextFields().entrySet()) {
            String old = textFields.get(entry.getKey());
            String newOne = entry.getValue();
            if (old == null) {
              textFields.put(entry.getKey(), newOne);
            } else {
              textFields.put(entry.getKey(), old + ' ' + newOne);
            }
          }
        }

        // Stored fields: same merge rule as text fields.
        if (fields.getStoredTextFields() != null) {
          for (Map.Entry<String, String> entry : fields.getStoredTextFields().entrySet()) {
            String old = storedFields.get(entry.getKey());
            if (old == null) {
              storedFields.put(entry.getKey(), entry.getValue());
            } else {
              storedFields.put(entry.getKey(), old + ' ' + entry.getValue());
            }
          }
        }

        // Isolated fields: each one is kept as its own entry, never merged by name here.
        if (fields.getIsolatedTextFields() != null) {
          for (FilteredFields.TextField t : fields.getIsolatedTextFields()) {
            isolatedFields.add(new IdMap.TextField(t.name, t.value));
          }
        }

        // Prepared fields: first occurrence of a name wins; duplicates are only reported.
        if (fields.getPreparedFields() != null) {
          for (Field f : fields.getPreparedFields()) {
            Field old = uniqueFields.get(f.name());
            if (old == null) {
              uniqueFields.put(f.name(), f);
            } else {
              logger.error("Not UNIQUE Prepared field " + f.name() + " in doc: " + id.getText());
            }
          }
        }
      }
    }

    // Elements use getTextTrim(); any other node type falls back to a plain trim of its text.
    String idText;
    if (id instanceof Element) {
      idText = ((Element) id).getTextTrim();
    } else {
      idText = id.getText().trim();
    }

    return new IdMap(
        idText,
        textFields,
        storedFields,
        new ArrayList<Field>(uniqueFields.values()),
        isolatedFields);
  }