public ArrayList<String> parseXML() throws Exception {
    ArrayList<String> ret = new ArrayList<String>();

    handshake();

    URL url =
        new URL(
            "http://mangaonweb.com/page.do?cdn="
                + cdn
                + "&cpn=book.xml&crcod="
                + crcod
                + "&rid="
                + (int) (Math.random() * 10000));
    String page = DownloaderUtils.getPage(url.toString(), "UTF-8", cookies);

    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    DocumentBuilder builder = factory.newDocumentBuilder();
    InputSource is = new InputSource(new StringReader(page));
    Document d = builder.parse(is);
    Element doc = d.getDocumentElement();

    NodeList pages = doc.getElementsByTagName("page");
    total = pages.getLength();
    for (int i = 0; i < pages.getLength(); i++) {
      Element e = (Element) pages.item(i);
      ret.add(e.getAttribute("path"));
    }

    return (ret);
  }
Beispiel #2
0
  static {
    try {
      dbf.setNamespaceAware(true);
      db = dbf.newDocumentBuilder();
      xpath.setNamespaceContext(
          new NamespaceContext() {

            @Override
            public Iterator<String> getPrefixes(String namespaceURI) {
              return Arrays.asList("md").iterator();
            }

            @Override
            public String getPrefix(String namespaceURI) {
              return "md";
            }

            @Override
            public String getNamespaceURI(String prefix) {
              return "http://www.osgi.org/xmlns/metatype/v1.1.0";
            }
          });
    } catch (ParserConfigurationException e) {
      e.printStackTrace();
      throw new ExceptionInInitializerError(e);
    }
  }
 private void createDOM() throws ParserConfigurationException {
   if (dbf == null) {
     dbf = DocumentBuilderFactory.newInstance();
     db = dbf.newDocumentBuilder();
     dil = (DOMImplementationLS) db.getDOMImplementation();
     dw = dil.createDOMWriter();
   }
 }
 public WikipediaMinerAnnotator(String configFile)
     throws ParserConfigurationException, FileNotFoundException, SAXException, IOException,
         XPathExpressionException {
   DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
   DocumentBuilder builder = factory.newDocumentBuilder();
   Document doc = builder.parse(new FileInputStream(configFile));
   url = getConfigValue("access", "url", doc);
   if (url.equals(""))
     throw new AnnotationException(
         "Configuration file " + configFile + " has missing value 'url'.");
 }
Beispiel #5
0
  static {
    try {
      URL url = SpellCheckActivator.bundleContext.getBundle().getResource(RESOURCE_LOC);

      InputStream stream = url.openStream();

      if (stream == null) throw new IOException();

      // strict parsing options
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

      factory.setValidating(false);
      factory.setIgnoringComments(true);
      factory.setIgnoringElementContentWhitespace(true);

      // parses configuration xml
      /*-
       * Warning: Felix is unable to import the com.sun.rowset.internal
       * package, meaning this can't use the XmlErrorHandler. This causes
       * a warning and a default handler to be attached. Otherwise this
       * should have: builder.setErrorHandler(new XmlErrorHandler());
       */
      DocumentBuilder builder = factory.newDocumentBuilder();
      Document doc = builder.parse(stream);

      // iterates over nodes, parsing contents
      Node root = doc.getChildNodes().item(1);

      NodeList categories = root.getChildNodes();

      for (int i = 0; i < categories.getLength(); ++i) {
        Node node = categories.item(i);
        if (node.getNodeName().equals(NODE_DEFAULTS)) {
          parseDefaults(node.getChildNodes());
        } else if (node.getNodeName().equals(NODE_LOCALES)) {
          parseLocales(node.getChildNodes());
        } else {
          logger.warn("Unrecognized category: " + node.getNodeName());
        }
      }
    } catch (IOException exc) {
      logger.error("Unable to load spell checker parameters", exc);
    } catch (SAXException exc) {
      logger.error("Unable to parse spell checker parameters", exc);
    } catch (ParserConfigurationException exc) {
      logger.error("Unable to parse spell checker parameters", exc);
    }
  }
  @Override
  public HashSet<ScoredAnnotation> solveSa2W(String text) throws AnnotationException {
    HashSet<ScoredAnnotation> res;
    try {
      res = new HashSet<ScoredAnnotation>();
      lastTime = Calendar.getInstance().getTimeInMillis();

      URL wikiApi = new URL(url);
      String parameters =
          "references=true&repeatMode=all&minProbability=0.0&source="
              + URLEncoder.encode(text, "UTF-8");
      HttpURLConnection slConnection = (HttpURLConnection) wikiApi.openConnection();
      slConnection.setRequestProperty("accept", "text/xml");
      slConnection.setDoOutput(true);
      slConnection.setDoInput(true);
      slConnection.setRequestMethod("POST");
      slConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
      slConnection.setRequestProperty("charset", "utf-8");
      slConnection.setRequestProperty(
          "Content-Length", "" + Integer.toString(parameters.getBytes().length));
      slConnection.setUseCaches(false);

      DataOutputStream wr = new DataOutputStream(slConnection.getOutputStream());
      wr.writeBytes(parameters);
      wr.flush();
      wr.close();
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      DocumentBuilder builder = factory.newDocumentBuilder();
      Document doc = builder.parse(slConnection.getInputStream());

      /*			URL wikiApi = new URL(url+"?references=true&repeatMode=all&minProbability=0.0&source="+URLEncoder.encode(text, "UTF-8"));
      			URLConnection wikiConnection = wikiApi.openConnection();
      			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      			DocumentBuilder builder = factory.newDocumentBuilder();
      			Document doc = builder.parse(wikiConnection.getInputStream());
      */

      lastTime = Calendar.getInstance().getTimeInMillis() - lastTime;

      XPathFactory xPathfactory = XPathFactory.newInstance();
      XPath xpath = xPathfactory.newXPath();
      XPathExpression idExpr = xpath.compile("//detectedTopic/@id");
      XPathExpression weightExpr = xpath.compile("//detectedTopic/@weight");
      XPathExpression referenceExpr = xpath.compile("//detectedTopic/references");

      NodeList ids = (NodeList) idExpr.evaluate(doc, XPathConstants.NODESET);
      NodeList weights = (NodeList) weightExpr.evaluate(doc, XPathConstants.NODESET);
      NodeList references = (NodeList) referenceExpr.evaluate(doc, XPathConstants.NODESET);

      for (int i = 0; i < weights.getLength(); i++) {
        if (weights.item(i).getNodeType() != Node.TEXT_NODE) {
          int id = Integer.parseInt(ids.item(i).getNodeValue());
          float weight = Float.parseFloat(weights.item(i).getNodeValue());
          //					System.out.println("ID="+ids.item(i).getNodeValue()+" weight="+weight);
          XPathExpression startExpr =
              xpath.compile("//detectedTopic[@id=" + id + "]/references/reference/@start");
          XPathExpression endExpr =
              xpath.compile("//detectedTopic[@id=" + id + "]/references/reference/@end");
          NodeList starts =
              (NodeList) startExpr.evaluate(references.item(i), XPathConstants.NODESET);
          NodeList ends = (NodeList) endExpr.evaluate(references.item(i), XPathConstants.NODESET);
          for (int j = 0; j < starts.getLength(); j++) {
            int start = Integer.parseInt(starts.item(j).getNodeValue());
            int end = Integer.parseInt(ends.item(j).getNodeValue());
            int len = end - start;
            res.add(new ScoredAnnotation(start, len, id, weight));
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      throw new AnnotationException(
          "An error occurred while querying Wikipedia Miner API. Message: " + e.getMessage());
    }
    return res;
  }