/**
   * Returns the text of the reader's current event and, where the event type allows it,
   * cross-checks that the basic StAX text accessors ({@code getText()}, {@code getTextLength()},
   * {@code getTextCharacters()} and {@code getTextStart()}) all describe the same text.
   *
   * @param sr stream reader positioned on an event that carries text
   * @return the value of {@code sr.getText()} for the current event
   * @throws XMLStreamException if the reader fails while exposing the text
   */
  protected static String getAndVerifyText(XMLStreamReader sr) throws XMLStreamException {
    final int eventType = sr.getEventType();

    // Per the original 05-Apr-2006 note: getText() works for DTD and ENTITY_REFERENCE, but the
    // other getTextXxx() accessors do not, so no cross-checking is possible for those two.
    final boolean onlyGetTextUsable = (eventType == ENTITY_REFERENCE) || (eventType == DTD);
    if (onlyGetTextUsable) {
      return sr.getText();
    }

    final int reportedLength = sr.getTextLength();

    // Only CHARACTERS is required to be non-empty here; other types (e.g. an empty CDATA block)
    // may legitimately be empty.
    // Open question carried over from 01-Sep-2004: in coalescing mode an empty CDATA section
    // could theoretically surface as an empty CHARACTERS event, in which case this check may
    // need to take the coalescing setting into account.
    if (eventType == CHARACTERS) {
      assertTrue("Stream reader should never return empty Strings.", reportedLength > 0);
    }

    final String viaGetText = sr.getText();
    assertNotNull("getText() should never return null.", viaGetText);

    final int actualLength = viaGetText.length();
    assertEquals(
        "Expected text length of " + reportedLength + ", got " + actualLength,
        reportedLength,
        actualLength);

    // The (buffer, offset, length) view of the text must produce the very same String.
    final char[] buffer = sr.getTextCharacters();
    final int offset = sr.getTextStart();
    final String viaBuffer = String.valueOf(buffer, offset, reportedLength);
    assertEquals(viaGetText, viaBuffer);

    return viaGetText;
  }
Beispiel #2
0
 /**
  * Recursively converts the element the reader is positioned on into an {@code Element}.
  *
  * <p>The reader must point at a START_ELEMENT event. The element's attributes are copied, then
  * events are consumed up to and including the matching END_ELEMENT. Nested START_ELEMENT events
  * are parsed recursively and added as children. Character data from both CHARACTERS and CDATA
  * events is concatenated (untrimmed) and set as the element's text, unless the text is blank
  * and the element has children. An element without children always gets its text set, even
  * when it is empty.
  *
  * @param xsr reader positioned on the START_ELEMENT event of the element to parse
  * @return the populated element; on return the reader is positioned on its END_ELEMENT event
  * @throws XMLStreamException if the parser fails, or if the document ends before the element
  *     is closed
  */
 private static Element parseElement(XMLStreamReader xsr) throws XMLStreamException {
   Element element = new Element(xsr.getLocalName());
   int attributeCount = xsr.getAttributeCount();
   for (int i = 0; i < attributeCount; i++) {
     element.putAttribute(xsr.getAttributeLocalName(i), xsr.getAttributeValue(i));
   }
   // Text can be delivered in several events (and interleaved with children), so it is buffered
   // here and attached to the element only once the closing tag is seen.
   StringBuilder elementText = new StringBuilder();
   while (xsr.hasNext()) {
     switch (xsr.next()) {
       case XMLStreamConstants.END_ELEMENT: {
         // Element is closed. Blank text is not stored on elements that have children; stored
         // text is never trimmed. Childless elements always get their (possibly empty) text.
         String text = elementText.toString();
         if (!text.trim().isEmpty() || !element.hasChildren()) {
           element.setText(text);
         }
         return element;
       }
       case XMLStreamConstants.CHARACTERS:
       case XMLStreamConstants.CDATA:
         // Character data of the current element. Non-coalescing parsers may report CDATA
         // sections as their own event type, so both are collected.
         elementText.append(xsr.getText());
         break;
       case XMLStreamConstants.START_ELEMENT:
         // A child element begins: parse it recursively and attach it.
         element.addChild(parseElement(xsr));
         break;
       default:
         // Comments, processing instructions, etc. carry no element content.
         break;
     }
   }
   // We reached the end of the document without seeing the end tag -> parsing error.
   throw new XMLStreamException(
       "End of the document unexpectedly reached. Element " + element.getName() + " not closed");
 }
Beispiel #3
0
    /**
     * Parses one XML record and emits a single key/value pair derived from it.
     *
     * <p>The key is the trimmed text of the {@code uid} element. The value is the trimmed text of
     * the {@code location} element followed by {@code ","} and the trimmed text of the
     * {@code age} element, appended in document order. Element names are matched
     * case-insensitively.
     *
     * <p>The record is parsed from its characters rather than from platform-charset bytes, with
     * external entity resolution disabled. Only text that sits inside an element is attributed
     * to it, so whitespace following a closing tag does not add anything to the output.
     *
     * @param key input key of the record (not used)
     * @param value the XML record
     * @param context sink for the emitted key/value pair
     * @throws IOException if the record is not well-formed XML, or if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper.Context context)
        throws IOException, InterruptedException {
      String document = value.toString();
      System.out.println("'" + document + "'");

      StringBuilder propertyName = new StringBuilder();
      StringBuilder propertyValue = new StringBuilder();
      XMLStreamReader reader = null;
      try {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Records are external input: never resolve external entities (XXE).
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        // Deliver adjacent character data as one event so the "," separator is added once per
        // age element rather than once per text chunk.
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        // Parse the characters directly; going through getBytes() would depend on the platform
        // default charset and could corrupt non-ASCII content.
        reader = factory.createXMLStreamReader(new java.io.StringReader(document));

        String currentElement = "";
        while (reader.hasNext()) {
          switch (reader.next()) {
            case XMLStreamConstants.START_ELEMENT:
              currentElement = reader.getLocalName();
              break;
            case XMLStreamConstants.END_ELEMENT:
              // Text after a closing tag does not belong to the element that just ended.
              currentElement = "";
              break;
            case XMLStreamConstants.CHARACTERS:
              if (currentElement.equalsIgnoreCase("uid")) {
                propertyName.append(reader.getText().trim());
                System.out.println(propertyName);
              } else if (currentElement.equalsIgnoreCase("location")) {
                propertyValue.append(reader.getText().trim());
                System.out.println(propertyValue);
              } else if (currentElement.equalsIgnoreCase("age")) {
                propertyValue.append(',').append(reader.getText().trim());
                System.out.println(propertyValue);
              }
              break;
            default:
              break;
          }
        }
      } catch (javax.xml.stream.XMLStreamException e) {
        throw new IOException(e);
      } finally {
        if (reader != null) {
          try {
            reader.close();
          } catch (javax.xml.stream.XMLStreamException ignored) {
            // Best effort: the record has been fully read (or parsing already failed), and a
            // failure to release parser resources must not mask that outcome.
          }
        }
      }

      // Written outside the try block so that IOException / InterruptedException raised by the
      // framework propagate unchanged instead of being re-wrapped.
      context.write(
          new Text(propertyName.toString().trim()), new Text(propertyValue.toString().trim()));
    }
Beispiel #4
0
 /**
  * Concatenates the text of the current event and of every directly following textual event
  * (CHARACTERS, SPACE or CDATA).
  *
  * <p>Note: calling this method advances the stream to the next non-textual event. If the
  * current event is not textual, the reader is not moved and an empty string is returned.
  *
  * @param sr stream reader to collect text from
  * @return the combined text of the consecutive textual events, possibly empty
  * @throws XMLStreamException if advancing the reader fails
  */
 protected String collectAllText(XMLStreamReader sr) throws XMLStreamException {
   final StringBuilder collected = new StringBuilder(100);
   int event = sr.getEventType();
   while (event == CHARACTERS || event == SPACE || event == CDATA) {
     collected.append(sr.getText());
     sr.next();
     event = sr.getEventType();
   }
   return collected.toString();
 }