private String getConfigValue(String setting, String name, Document doc) throws XPathExpressionException { XPathFactory xPathfactory = XPathFactory.newInstance(); XPath xpath = xPathfactory.newXPath(); XPathExpression userExpr = xpath.compile( "wikipediaminer/setting[@name=\"" + setting + "\"]/param[@name=\"" + name + "\"]/@value"); return userExpr.evaluate(doc); }
private NodeList findNodes(Document doc, Node operation) { List<Node> xpaths = getChildNodes(operation, "xpath"); if (xpaths.isEmpty()) { return null; } String xpathExpression = xpaths.get(0).getTextContent(); if (xpathExpression == null) { return null; } XPathFactory xPathfactory = XPathFactory.newInstance(); XPath xpath = xPathfactory.newXPath(); NodeList nl = null; try { XPathExpression expr = xpath.compile(xpathExpression); nl = (NodeList) expr.evaluate(doc, XPathConstants.NODESET); } catch (XPathExpressionException ex) { Utils.onError(new Error.WrongXpathExpression(xpathExpression)); } return nl; }
public static void main(String[] args) { try { DocumentBuilderFactory db = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = db.newDocumentBuilder(); Document dom = builder.parse("data/geographic-area-data.html"); XPathFactory xpath = XPathFactory.newInstance(); XPath path = xpath.newXPath(); XPathExpression table = path.compile("//div[@id='mw-content-text']/table[contains(@class,'wikitable')]/tr"); NodeList wikiData = (NodeList) table.evaluate(dom, XPathConstants.NODESET); NodeList children; String currentData, cleanData; /* Open output stream */ FileWriter fstream = new FileWriter("data/parsed.yaml"); BufferedWriter out = new BufferedWriter(fstream); for (int i = 0; i < wikiData.getLength(); i++) { if (i == 0) { continue; } out.write(new Integer(i).toString() + ":\n"); children = wikiData.item(i).getChildNodes(); for (int j = 0; j < children.getLength(); j++) { currentData = (String) children.item(j).getTextContent(); switch (j) { case 0: /* Current Data is empty */ break; case 1: cleanData = decompose(currentData).trim().replaceAll("[^a-zA-Z\\s]+", ""); out.write("\t\"Geographic entity\": \"" + cleanData + "\"\n"); break; case 2: /* Current Data is empty */ break; case 3: cleanData = decompose(currentData).trim().replaceAll(",", ""); out.write("\t\"Area\": \"" + cleanData + "\"\n"); break; case 4: /* Current Data is empty */ break; case 5: cleanData = decompose(currentData).trim(); out.write("\t\"Notes\": \"" + cleanData + "\"\n"); break; case 6: /* Current Data is empty */ break; default: /* System.out.println("[" + j + "] Hit default case statement. Current Data is: " + currentData); */ break; } } } /* Close output stream */ out.close(); } catch (Exception e) { System.out.println(e); } }
protected Document parse(InputStream inputStream) throws LibraryException { Map<Integer, SongBean> songMap = new HashMap<Integer, SongBean>(); Map<Integer, AlbumBean> albumMap = new HashMap<Integer, AlbumBean>(); Map<Integer, AlbumBean> secondaryAlbumMap = new HashMap<Integer, AlbumBean>(); warningList.clear(); try { // Create a builder factory DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); factory.setIgnoringElementContentWhitespace(true); factory.setIgnoringComments(true); // Create the builder and parse the file DocumentBuilder builder = factory.newDocumentBuilder(); // Set an error listener and parse the document builder.setErrorHandler(new iTradeTunesLibraryErrorHandler()); builder.setEntityResolver(new iTradeTunesLibraryResolver()); Document document = builder.parse(inputStream); synchronized (libraryList) { // Query the library document and build the library list XPath xPath = XPathFactory.newInstance().newXPath(); XPathExpression xPathExpression = xPath.compile(XPATH_LIBRARY_LIST); NodeList nodelist = (NodeList) xPathExpression.evaluate(document, XPathConstants.NODESET); // Process the elements in the nodelist SongBean song = null; for (int i = 0; i < nodelist.getLength(); i++) { boolean isTrackID = false; // Get element and child nodes Element elem = (Element) nodelist.item(i); NodeList list = elem.getChildNodes(); // Get node value Node childKey = list.item(0); String key = childKey.getNodeValue(); // Check if we have to create a new bean if (SongBean.NAME_TRACK_ID.equals(key)) { isTrackID = true; SongBean previousSong = song; song = new SongBean(); if (previousSong != null && !("AAC audio file".equals(previousSong.getKind()) || "MPEG audio file".equals(previousSong.getKind()))) { songMap.remove(previousSong.getTrack_ID()); } else { // Add an album bean addOrUpdateAlbum(albumMap, previousSong, false); } } // The first parameter is the key String prop = childKey.getNodeValue().replace(' ', '_'); // The second parameter is the value i++; // Get element and child nodes elem = (Element) nodelist.item(i); // Check for boolean properties Object value = null; // Get node value list = elem.getChildNodes(); childKey = list.item(0); value = (childKey == null) ? elem.getNodeName() : childKey.getNodeValue(); if (isTrackID) { isTrackID = false; } // Set the property of the song bean Statement stmt = new Statement(song, "set" + prop, new Object[] {value}); try { stmt.execute(); } catch (Exception e) { // Ignore that field, we do not have it in our bean } // If the property is the track ID, add the song to the hash // map if (SongBean.NAME_TRACK_ID.equals(key)) { int trackID = Integer.valueOf((String) value); songMap.put(trackID, song); } } // Update album for last song addOrUpdateAlbum(albumMap, song, false); // Check the album map for inconsistencies Iterator<AlbumBean> albums = albumMap.values().iterator(); while (albums.hasNext()) { AlbumBean album = albums.next(); if (album.checkConsistency()) { libraryList.add(album); album.setHashCode(); } else { // Add an inconsistent album only using the album title SongBean[] songs = album.getSongs(); for (int i = 0; i < songs.length; i++) { addOrUpdateAlbum(secondaryAlbumMap, songs[i], true); } } } // Check secondary album map for consistency albums = secondaryAlbumMap.values().iterator(); while (albums.hasNext()) { AlbumBean album = albums.next(); if (album.checkConsistency()) { libraryList.add(album); album.setHashCode(); } else { // This album cannot be matched // TODO: Add to warning message } } setChanged(); } return document; } catch (IOException ioe) { // Log an expected connect exception throw new LibraryException(ioe); } catch (SAXException se) { // Catch all other exceptions throw new LibraryException(se); } catch (ParserConfigurationException pce) { // Catch all other exceptions Utils.logSevere(pce); throw new LibraryException(pce); } catch (XPathExpressionException xpe) { // Catch all other exceptions Utils.logSevere(xpe); throw new LibraryException(xpe); } catch (NumberFormatException nfe) { // Catch all other exceptions throw new LibraryException(nfe); } }
@Override public HashSet<ScoredAnnotation> solveSa2W(String text) throws AnnotationException { HashSet<ScoredAnnotation> res; try { res = new HashSet<ScoredAnnotation>(); lastTime = Calendar.getInstance().getTimeInMillis(); URL wikiApi = new URL(url); String parameters = "references=true&repeatMode=all&minProbability=0.0&source=" + URLEncoder.encode(text, "UTF-8"); HttpURLConnection slConnection = (HttpURLConnection) wikiApi.openConnection(); slConnection.setRequestProperty("accept", "text/xml"); slConnection.setDoOutput(true); slConnection.setDoInput(true); slConnection.setRequestMethod("POST"); slConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); slConnection.setRequestProperty("charset", "utf-8"); slConnection.setRequestProperty( "Content-Length", "" + Integer.toString(parameters.getBytes().length)); slConnection.setUseCaches(false); DataOutputStream wr = new DataOutputStream(slConnection.getOutputStream()); wr.writeBytes(parameters); wr.flush(); wr.close(); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(slConnection.getInputStream()); /* URL wikiApi = new URL(url+"?references=true&repeatMode=all&minProbability=0.0&source="+URLEncoder.encode(text, "UTF-8")); URLConnection wikiConnection = wikiApi.openConnection(); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(wikiConnection.getInputStream()); */ lastTime = Calendar.getInstance().getTimeInMillis() - lastTime; XPathFactory xPathfactory = XPathFactory.newInstance(); XPath xpath = xPathfactory.newXPath(); XPathExpression idExpr = xpath.compile("//detectedTopic/@id"); XPathExpression weightExpr = xpath.compile("//detectedTopic/@weight"); XPathExpression referenceExpr = xpath.compile("//detectedTopic/references"); NodeList ids = (NodeList) idExpr.evaluate(doc, XPathConstants.NODESET); NodeList weights = (NodeList) weightExpr.evaluate(doc, XPathConstants.NODESET); NodeList references = (NodeList) referenceExpr.evaluate(doc, XPathConstants.NODESET); for (int i = 0; i < weights.getLength(); i++) { if (weights.item(i).getNodeType() != Node.TEXT_NODE) { int id = Integer.parseInt(ids.item(i).getNodeValue()); float weight = Float.parseFloat(weights.item(i).getNodeValue()); // System.out.println("ID="+ids.item(i).getNodeValue()+" weight="+weight); XPathExpression startExpr = xpath.compile("//detectedTopic[@id=" + id + "]/references/reference/@start"); XPathExpression endExpr = xpath.compile("//detectedTopic[@id=" + id + "]/references/reference/@end"); NodeList starts = (NodeList) startExpr.evaluate(references.item(i), XPathConstants.NODESET); NodeList ends = (NodeList) endExpr.evaluate(references.item(i), XPathConstants.NODESET); for (int j = 0; j < starts.getLength(); j++) { int start = Integer.parseInt(starts.item(j).getNodeValue()); int end = Integer.parseInt(ends.item(j).getNodeValue()); int len = end - start; res.add(new ScoredAnnotation(start, len, id, weight)); } } } } catch (Exception e) { e.printStackTrace(); throw new AnnotationException( "An error occurred while querying Wikipedia Miner API. Message: " + e.getMessage()); } return res; }