/**
 * Advances the shared XML stream until the stack of open element names matches the expected
 * elements, then returns the chunk produced for that element.
 *
 * <p>The element stack held by {@code xmlChunkerState} is kept in step with the stream: every
 * start tag pushes its local name and every end tag pops one entry.
 *
 * @return the next chunk, or {@code null} when the end of the document is reached or the reader
 *     has no further events
 * @throws KettleException if anything fails while reading the stream or producing the chunk
 */
@Override
public String pullNextXmlChunk() throws KettleException {
  final Stack<String> openElements = xmlChunkerState.getElementStack();
  final XMLStreamReader reader = xmlChunkerState.getXmlStreamReader();
  try {
    while (reader.hasNext()) {
      final int event = reader.next();
      if (event == XMLStreamConstants.END_DOCUMENT) {
        return null;
      }
      if (event == XMLStreamConstants.END_ELEMENT) {
        openElements.pop();
      } else if (event == XMLStreamConstants.START_ELEMENT) {
        openElements.push(reader.getLocalName());
        if (actualElementStackHasExpectedElements(xmlChunkerState)) {
          return pullNextXmlChunkFromTopElementOnStack(xmlChunkerState);
        }
      }
      // every other event type is irrelevant while searching for the next chunk
    }
    return null;
  } catch (Exception e) {
    throw new KettleException("a problem has arisen reading the xero xml stream", e);
  }
}
private static Element parseElement(XMLStreamReader xsr) throws XMLStreamException { // xsr points to a START_ELEMENT event. Create the element and read all its attributes // Then read all its children events Element element = new Element(xsr.getLocalName()); // text that will be added to the element. Text can come in different events, so we add it here // and add it to the element at the end StringBuilder elementText = new StringBuilder(); int attributeCount = xsr.getAttributeCount(); for (int i = 0; i < attributeCount; i++) { element.putAttribute(xsr.getAttributeLocalName(i), xsr.getAttributeValue(i)); } while (xsr.hasNext()) { xsr.next(); if (xsr.getEventType() == XMLStreamConstants.END_ELEMENT) { // element is closed. Move the cursor and return it // check if there is some text to add before (empty text is not added, but added text is not // trimmed) // we set empty text also if the element has no children if (!elementText.toString().trim().isEmpty() || !element.hasChildren()) { element.setText(elementText.toString()); } // xsr.next(); return element; } else if (xsr.getEventType() == XMLStreamConstants.CHARACTERS) { // an attribute of the current element elementText.append(xsr.getText()); } else if (xsr.getEventType() == XMLStreamConstants.START_ELEMENT) { // new element begins -> read it recursively and add it to the current element element.addChild(parseElement(xsr)); } } // we reached the end of the document without the tag end -> error parsing throw new XMLStreamException( "End of the document unexpectedly reached. Element " + element.getName() + " not closed"); }
private String pullNextXmlChunkFromTopElementOnStack(XMLChunkerState data) throws KettleException { Stack<String> elementStack = data.getElementStack(); XMLStreamReader xmlStreamReader = data.getXmlStreamReader(); int elementStackDepthOnEntry = elementStack.size(); StringWriter stringWriter = new StringWriter(); try { XMLStreamWriter xmlStreamWriter = data.getXmlOutputFactory().createXMLStreamWriter(stringWriter); xmlStreamWriter.writeStartDocument(CharEncoding.UTF_8, "1.0"); // put the current element on because presumably it's the open element for the one // that is being looked for. XmlReaderToWriter.write(xmlStreamReader, xmlStreamWriter); while (xmlStreamReader.hasNext() & elementStack.size() >= elementStackDepthOnEntry) { switch (xmlStreamReader.next()) { case XMLStreamConstants.END_DOCUMENT: break; // handled below explicitly. case XMLStreamConstants.END_ELEMENT: elementStack.pop(); XmlReaderToWriter.write(xmlStreamReader, xmlStreamWriter); break; case XMLStreamConstants.START_ELEMENT: elementStack.push(xmlStreamReader.getLocalName()); XmlReaderToWriter.write(xmlStreamReader, xmlStreamWriter); break; default: XmlReaderToWriter.write(xmlStreamReader, xmlStreamWriter); break; } } xmlStreamWriter.writeEndDocument(); xmlStreamWriter.close(); } catch (Exception e) { throw new KettleException("unable to process a chunk of the xero xml stream", e); } return stringWriter.toString(); }
/**
 * Walks through every remaining event of the given stream reader, touching the text and name of
 * each event along the way so that lazily loaded data is actually read. Typically used to
 * provoke an exception for invalid content.
 *
 * @param sr reader to exhaust
 * @return a dummy value derived from the event types, texts and names that were seen; it exists
 *     only so that the accesses cannot be eliminated as dead code
 * @throws XMLStreamException if the reader reports a problem while iterating
 */
protected int streamThrough(XMLStreamReader sr) throws XMLStreamException {
  int checksum = 0;
  for (; sr.hasNext(); ) {
    checksum += sr.next();
    if (sr.hasText()) {
      // getAndVerifyText is expected to also check that the text accessors agree with each other
      String text = getAndVerifyText(sr);
      checksum += text.hashCode();
    }
    if (sr.hasName()) {
      checksum += sr.getName().hashCode();
    }
  }
  return checksum;
}
/**
 * Prints the {@code href} attribute of every {@code a} element found in the XML document at the
 * given URL.
 *
 * @param args optional single argument: the URL to read; {@code http://www.w3c.org} is used when
 *     no argument is given
 * @throws Exception if the URL is malformed, cannot be opened, or its content cannot be parsed
 */
public static void main(String[] args) throws Exception {
  final String urlString;
  if (args.length == 0) {
    urlString = "http://www.w3c.org";
    System.out.println("Using " + urlString);
  } else {
    urlString = args[0];
  }

  URL url = new URL(urlString);
  // The stream is closed automatically, also when parsing fails half way through.
  try (InputStream in = url.openStream()) {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    XMLStreamReader parser = factory.createXMLStreamReader(in);
    try {
      while (parser.hasNext()) {
        int event = parser.next();
        if (event == XMLStreamConstants.START_ELEMENT && "a".equals(parser.getLocalName())) {
          String href = parser.getAttributeValue(null, "href");
          if (href != null) {
            System.out.println(href);
          }
        }
      }
    } finally {
      // XMLStreamReader is not AutoCloseable and does not close the underlying stream,
      // so it is released separately here.
      parser.close();
    }
  }
}
@Override protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { String document = value.toString(); System.out.println("'" + document + "'"); try { XMLStreamReader reader = XMLInputFactory.newInstance() .createXMLStreamReader(new ByteArrayInputStream(document.getBytes())); String propertyName = ""; String propertyValue = ""; String currentElement = ""; while (reader.hasNext()) { int code = reader.next(); switch (code) { case XMLStreamConstants.START_ELEMENT: // START_ELEMENT: currentElement = reader.getLocalName(); break; case XMLStreamConstants.CHARACTERS: // CHARACTERS: if (currentElement.equalsIgnoreCase("uid")) { propertyName += reader.getText().trim(); System.out.println(propertyName); } else if (currentElement.equalsIgnoreCase("location")) { propertyValue += reader.getText().trim(); System.out.println(propertyValue); } else if (currentElement.equalsIgnoreCase("age")) { propertyValue += ("," + reader.getText().trim()); System.out.println(propertyValue); } break; } } reader.close(); context.write(new Text(propertyName.trim()), new Text(propertyValue.trim())); } catch (Exception e) { throw new IOException(e); } }