private String getConfigValue(String setting, String name, Document doc)
     throws XPathExpressionException {
   XPathFactory xPathfactory = XPathFactory.newInstance();
   XPath xpath = xPathfactory.newXPath();
   XPathExpression userExpr =
       xpath.compile(
           "wikipediaminer/setting[@name=\""
               + setting
               + "\"]/param[@name=\""
               + name
               + "\"]/@value");
   return userExpr.evaluate(doc);
 }
Esempio n. 2
0
 private NodeList findNodes(Document doc, Node operation) {
   List<Node> xpaths = getChildNodes(operation, "xpath");
   if (xpaths.isEmpty()) {
     return null;
   }
   String xpathExpression = xpaths.get(0).getTextContent();
   if (xpathExpression == null) {
     return null;
   }
   XPathFactory xPathfactory = XPathFactory.newInstance();
   XPath xpath = xPathfactory.newXPath();
   NodeList nl = null;
   try {
     XPathExpression expr = xpath.compile(xpathExpression);
     nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
   } catch (XPathExpressionException ex) {
     Utils.onError(new Error.WrongXpathExpression(xpathExpression));
   }
   return nl;
 }
  public static void main(String[] args) {
    try {
      DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
      DocumentBuilder builder = db.newDocumentBuilder();
      Document dom = builder.parse("data/geographic-area-data.html");
      XPathFactory xpath = XPathFactory.newInstance();
      XPath path = xpath.newXPath();
      XPathExpression table =
          path.compile("//div[@id='mw-content-text']/table[contains(@class,'wikitable')]/tr");
      NodeList wikiData = (NodeList) table.evaluate(dom, XPathConstants.NODESET);

      NodeList children;
      String currentData, cleanData;

      /* Open output stream */
      FileWriter fstream = new FileWriter("data/parsed.yaml");
      BufferedWriter out = new BufferedWriter(fstream);

      for (int i = 0; i < wikiData.getLength(); i++) {
        if (i == 0) {
          continue;
        }
        out.write(new Integer(i).toString() + ":\n");
        children = wikiData.item(i).getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
          currentData = (String) children.item(j).getTextContent();
          switch (j) {
            case 0:
              /* Current Data is empty */
              break;
            case 1:
              cleanData = decompose(currentData).trim().replaceAll("[^a-zA-Z\\s]+", "");
              out.write("\t\"Geographic entity\": \"" + cleanData + "\"\n");
              break;
            case 2:
              /* Current Data is empty */
              break;
            case 3:
              cleanData = decompose(currentData).trim().replaceAll(",", "");
              out.write("\t\"Area\": \"" + cleanData + "\"\n");
              break;
            case 4:
              /* Current Data is empty */
              break;
            case 5:
              cleanData = decompose(currentData).trim();
              out.write("\t\"Notes\": \"" + cleanData + "\"\n");
              break;
            case 6:
              /* Current Data is empty */
              break;
            default:
              /* System.out.println("[" + j + "] Hit default case statement. Current Data is: " + currentData); */
              break;
          }
        }
      }
      /* Close output stream */
      out.close();
    } catch (Exception e) {
      System.out.println(e);
    }
  }
Esempio n. 4
0
  protected Document parse(InputStream inputStream) throws LibraryException {
    Map<Integer, SongBean> songMap = new HashMap<Integer, SongBean>();
    Map<Integer, AlbumBean> albumMap = new HashMap<Integer, AlbumBean>();
    Map<Integer, AlbumBean> secondaryAlbumMap = new HashMap<Integer, AlbumBean>();
    warningList.clear();

    try {
      // Create a builder factory
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      factory.setValidating(false);
      factory.setNamespaceAware(true);
      factory.setIgnoringElementContentWhitespace(true);
      factory.setIgnoringComments(true);

      // Create the builder and parse the file
      DocumentBuilder builder = factory.newDocumentBuilder();

      // Set an error listener and parse the document
      builder.setErrorHandler(new iTradeTunesLibraryErrorHandler());
      builder.setEntityResolver(new iTradeTunesLibraryResolver());
      Document document = builder.parse(inputStream);

      synchronized (libraryList) {
        // Query the library document and build the library list
        XPath xPath = XPathFactory.newInstance().newXPath();
        XPathExpression xPathExpression = xPath.compile(XPATH_LIBRARY_LIST);
        NodeList nodelist = (NodeList) xPathExpression.evaluate(document, XPathConstants.NODESET);

        // Process the elements in the nodelist
        SongBean song = null;

        for (int i = 0; i < nodelist.getLength(); i++) {
          boolean isTrackID = false;

          // Get element and child nodes
          Element elem = (Element) nodelist.item(i);
          NodeList list = elem.getChildNodes();

          // Get node value
          Node childKey = list.item(0);
          String key = childKey.getNodeValue();

          // Check if we have to create a new bean
          if (SongBean.NAME_TRACK_ID.equals(key)) {
            isTrackID = true;
            SongBean previousSong = song;
            song = new SongBean();
            if (previousSong != null
                && !("AAC audio file".equals(previousSong.getKind())
                    || "MPEG audio file".equals(previousSong.getKind()))) {
              songMap.remove(previousSong.getTrack_ID());
            } else {
              // Add an album bean
              addOrUpdateAlbum(albumMap, previousSong, false);
            }
          }

          // The first parameter is the key
          String prop = childKey.getNodeValue().replace(' ', '_');

          // The second parameter is the value
          i++;

          // Get element and child nodes
          elem = (Element) nodelist.item(i);

          // Check for boolean properties
          Object value = null;
          // Get node value
          list = elem.getChildNodes();
          childKey = list.item(0);
          value = (childKey == null) ? elem.getNodeName() : childKey.getNodeValue();

          if (isTrackID) {
            isTrackID = false;
          }

          // Set the property of the song bean
          Statement stmt = new Statement(song, "set" + prop, new Object[] {value});
          try {
            stmt.execute();

          } catch (Exception e) {
            // Ignore that field, we do not have it in our bean
          }

          // If the property is the track ID, add the song to the hash
          // map
          if (SongBean.NAME_TRACK_ID.equals(key)) {
            int trackID = Integer.valueOf((String) value);
            songMap.put(trackID, song);
          }
        }

        // Update album for last song
        addOrUpdateAlbum(albumMap, song, false);

        // Check the album map for inconsistencies
        Iterator<AlbumBean> albums = albumMap.values().iterator();
        while (albums.hasNext()) {
          AlbumBean album = albums.next();
          if (album.checkConsistency()) {
            libraryList.add(album);
            album.setHashCode();
          } else {
            // Add an inconsistent album only using the album title
            SongBean[] songs = album.getSongs();
            for (int i = 0; i < songs.length; i++) {
              addOrUpdateAlbum(secondaryAlbumMap, songs[i], true);
            }
          }
        }

        // Check secondary album map for consistency
        albums = secondaryAlbumMap.values().iterator();
        while (albums.hasNext()) {
          AlbumBean album = albums.next();
          if (album.checkConsistency()) {
            libraryList.add(album);
            album.setHashCode();
          } else {
            // This album cannot be matched
            // TODO: Add to warning message
          }
        }

        setChanged();
      }

      return document;
    } catch (IOException ioe) {
      // Log an expected connect exception
      throw new LibraryException(ioe);
    } catch (SAXException se) {
      // Catch all other exceptions
      throw new LibraryException(se);
    } catch (ParserConfigurationException pce) {
      // Catch all other exceptions
      Utils.logSevere(pce);
      throw new LibraryException(pce);
    } catch (XPathExpressionException xpe) {
      // Catch all other exceptions
      Utils.logSevere(xpe);
      throw new LibraryException(xpe);
    } catch (NumberFormatException nfe) {
      // Catch all other exceptions
      throw new LibraryException(nfe);
    }
  }
  @Override
  public HashSet<ScoredAnnotation> solveSa2W(String text) throws AnnotationException {
    HashSet<ScoredAnnotation> res;
    try {
      res = new HashSet<ScoredAnnotation>();
      lastTime = Calendar.getInstance().getTimeInMillis();

      URL wikiApi = new URL(url);
      String parameters =
          "references=true&repeatMode=all&minProbability=0.0&source="
              + URLEncoder.encode(text, "UTF-8");
      HttpURLConnection slConnection = (HttpURLConnection) wikiApi.openConnection();
      slConnection.setRequestProperty("accept", "text/xml");
      slConnection.setDoOutput(true);
      slConnection.setDoInput(true);
      slConnection.setRequestMethod("POST");
      slConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
      slConnection.setRequestProperty("charset", "utf-8");
      slConnection.setRequestProperty(
          "Content-Length", "" + Integer.toString(parameters.getBytes().length));
      slConnection.setUseCaches(false);

      DataOutputStream wr = new DataOutputStream(slConnection.getOutputStream());
      wr.writeBytes(parameters);
      wr.flush();
      wr.close();
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      DocumentBuilder builder = factory.newDocumentBuilder();
      Document doc = builder.parse(slConnection.getInputStream());

      /*			URL wikiApi = new URL(url+"?references=true&repeatMode=all&minProbability=0.0&source="+URLEncoder.encode(text, "UTF-8"));
      			URLConnection wikiConnection = wikiApi.openConnection();
      			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      			DocumentBuilder builder = factory.newDocumentBuilder();
      			Document doc = builder.parse(wikiConnection.getInputStream());
      */

      lastTime = Calendar.getInstance().getTimeInMillis() - lastTime;

      XPathFactory xPathfactory = XPathFactory.newInstance();
      XPath xpath = xPathfactory.newXPath();
      XPathExpression idExpr = xpath.compile("//detectedTopic/@id");
      XPathExpression weightExpr = xpath.compile("//detectedTopic/@weight");
      XPathExpression referenceExpr = xpath.compile("//detectedTopic/references");

      NodeList ids = (NodeList) idExpr.evaluate(doc, XPathConstants.NODESET);
      NodeList weights = (NodeList) weightExpr.evaluate(doc, XPathConstants.NODESET);
      NodeList references = (NodeList) referenceExpr.evaluate(doc, XPathConstants.NODESET);

      for (int i = 0; i < weights.getLength(); i++) {
        if (weights.item(i).getNodeType() != Node.TEXT_NODE) {
          int id = Integer.parseInt(ids.item(i).getNodeValue());
          float weight = Float.parseFloat(weights.item(i).getNodeValue());
          //					System.out.println("ID="+ids.item(i).getNodeValue()+" weight="+weight);
          XPathExpression startExpr =
              xpath.compile("//detectedTopic[@id=" + id + "]/references/reference/@start");
          XPathExpression endExpr =
              xpath.compile("//detectedTopic[@id=" + id + "]/references/reference/@end");
          NodeList starts =
              (NodeList) startExpr.evaluate(references.item(i), XPathConstants.NODESET);
          NodeList ends = (NodeList) endExpr.evaluate(references.item(i), XPathConstants.NODESET);
          for (int j = 0; j < starts.getLength(); j++) {
            int start = Integer.parseInt(starts.item(j).getNodeValue());
            int end = Integer.parseInt(ends.item(j).getNodeValue());
            int len = end - start;
            res.add(new ScoredAnnotation(start, len, id, weight));
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      throw new AnnotationException(
          "An error occurred while querying Wikipedia Miner API. Message: " + e.getMessage());
    }
    return res;
  }