Beispiel #1
0
  /**
   * Reads an XML document from an input source and copies its values into the specified object
   *
   * @param ob The object to receive the values
   * @param source The location of the XML document
   * @throws IOException If there is an error reading the document
   */
  public void readObject(Object ob, InputSource source) throws IOException {
    try {
      // Create a document builder to read the document
      DocumentBuilder builder = factory.newDocumentBuilder();

      // Read the document
      Document doc = builder.parse(source);

      // Get the root element
      Element element = doc.getDocumentElement();

      // Copy the root element into the bean
      readObject(ob, element);
    } catch (SAXException exc) {
      throw new IOException("Error parsing XML document: " + exc.toString());
    } catch (ParserConfigurationException exc) {
      throw new IOException("Error parsing XML document: " + exc.toString());
    }
  }
  public static void main(String[] args) {
    try {
      DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
      DocumentBuilder builder = db.newDocumentBuilder();
      Document dom = builder.parse("data/geographic-area-data.html");
      XPathFactory xpath = XPathFactory.newInstance();
      XPath path = xpath.newXPath();
      XPathExpression table =
          path.compile("//div[@id='mw-content-text']/table[contains(@class,'wikitable')]/tr");
      NodeList wikiData = (NodeList) table.evaluate(dom, XPathConstants.NODESET);

      NodeList children;
      String currentData, cleanData;

      /* Open output stream */
      FileWriter fstream = new FileWriter("data/parsed.yaml");
      BufferedWriter out = new BufferedWriter(fstream);

      for (int i = 0; i < wikiData.getLength(); i++) {
        if (i == 0) {
          continue;
        }
        out.write(new Integer(i).toString() + ":\n");
        children = wikiData.item(i).getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
          currentData = (String) children.item(j).getTextContent();
          switch (j) {
            case 0:
              /* Current Data is empty */
              break;
            case 1:
              cleanData = decompose(currentData).trim().replaceAll("[^a-zA-Z\\s]+", "");
              out.write("\t\"Geographic entity\": \"" + cleanData + "\"\n");
              break;
            case 2:
              /* Current Data is empty */
              break;
            case 3:
              cleanData = decompose(currentData).trim().replaceAll(",", "");
              out.write("\t\"Area\": \"" + cleanData + "\"\n");
              break;
            case 4:
              /* Current Data is empty */
              break;
            case 5:
              cleanData = decompose(currentData).trim();
              out.write("\t\"Notes\": \"" + cleanData + "\"\n");
              break;
            case 6:
              /* Current Data is empty */
              break;
            default:
              /* System.out.println("[" + j + "] Hit default case statement. Current Data is: " + currentData); */
              break;
          }
        }
      }
      /* Close output stream */
      out.close();
    } catch (Exception e) {
      System.out.println(e);
    }
  }